In [724]:
import joblib
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats

from datetime import datetime


import streamlit as st
from joblib import dump

from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import IsolationForest

from catboost import CatBoostRegressor

from transformers import pipeline
qa_pipeline = pipeline('question-answering', model='timpal0l/mdeberta-v3-base-squad2')


import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', None)
%matplotlib inline

## Препроцессинг и генерация признаков

In [883]:
def preprocessing(df=df):
    df.loc[:, ['is_complete']] = df.apply(lambda x:
    int(x.completion_year) < 2024 if pd.isna(x.is_complete) and not pd.isna(x.completion_year) else x.is_complete,
    axis=1)

    df.loc[:, ['is_complete']] = df.apply(lambda x: 
    int(x.build_date) < 2024 if pd.isna(x.is_complete) and not pd.isna(x.build_date) else x.is_complete, 
    axis=1)
    
    df.loc[:, ['is_complete']] = df.apply(lambda x: 
    x.is_rosreestr_checked if pd.isna(x.is_complete) and not pd.isna(x.is_rosreestr_checked) else x.is_complete, 
    axis=1)
    
    df.loc[:, ['is_complete']] = df.apply(lambda x: 
    x.from_developer == 0 if pd.isna(x.is_complete) and not pd.isna(x.from_developer) else x.is_complete, 
    axis=1) 
    
    df.loc[:, ['is_complete']] = df.apply(lambda x:
    int(x.is_auction) if pd.isna(x.is_complete) else x.is_complete, 
    axis=1)
    
    df.loc[:, ['decoration']] = df.apply(lambda x:
    'without' if pd.isna(x.decoration) and not x.is_complete else x.decoration, 
    axis=1)
    
    df.loc[:, ['decoration']] = df.apply(lambda x:
    'rough' if pd.isna(x.decoration) and x.is_auction
    else x.decoration,
    axis=1)
    
    df.loc[:, ['decoration']] = df.apply(lambda x:
    'fine' if pd.isna(x.decoration) and x.is_apartments and x.is_complete else x.decoration, 
    axis=1)
    
    df.loc[:, ['house_material']] = df.apply(lambda x:
    'panel' if pd.isna(x.house_material) and not pd.isna(x.build_date)
    and 1960 <= int(x.build_date) <= 1991 else x.house_material, 
    axis=1)
    
    df.loc[:, ['house_material']] = df.apply(lambda x:
    'brick' if pd.isna(x.house_material) and not pd.isna(x.build_date)
    and 1900 <= int(x.build_date) <= 1960 else x.house_material, 
    axis=1)

    df.loc[df['house_material'] == 'stalin', 'house_material'] = 'brick';
    df.loc[df['house_material'] == 'gasSilicateBlock', 'house_material'] = 'block';
    df.loc[df['house_material'] == 'aerocreteBlock', 'house_material'] = 'block';
    
    df.loc[:, ['passenger_elevator']] = df.apply(lambda x:
    0 if pd.isna(x.passenger_elevator) 
    and x.floors_number in np.arange(1, 4) else x.passenger_elevator, 
    axis=1)

    df.loc[:, ['passenger_elevator']] = df.apply(lambda x:
    0 if pd.isna(x.passenger_elevator) 
    and x.house_material in ('brick', 'panel') 
    and x.floors_number == 5 else x.passenger_elevator, 
    axis=1)

    df.loc[:, ['passenger_elevator']] = df.apply(lambda x:
    1 if pd.isna(x.passenger_elevator) 
    and x.floors_number in np.arange(6, 11) else x.passenger_elevator,
    axis=1)

    df.loc[:, ['passenger_elevator']] = df.apply(lambda x:
    2 if pd.isna(x.passenger_elevator) 
    and x.floors_number in np.arange(11, 21) else x.passenger_elevator, 
    axis=1)

    df.loc[:, ['passenger_elevator']] = df.apply(lambda x:
    3 if pd.isna(x.passenger_elevator) 
    and x.floors_number in np.arange(21, 26) else x.passenger_elevator, 
    axis=1)
    
    df.loc[:, ['passenger_elevator']] = df.apply(lambda x:
    4 if pd.isna(x.passenger_elevator) 
    and x.floors_number in np.arange(26, 100) else x.passenger_elevator, 
    axis=1)
    
    df.loc[:, ['cargo_elevator']] = df.apply(lambda x:
    0 if pd.isna(x.cargo_elevator) 
    and x.floors_number in np.arange(1, 10) else x.cargo_elevator, 
    axis=1)

    df.loc[:, ['cargo_elevator']] = df.apply(lambda x:
    1 if pd.isna(x.cargo_elevator) 
    and x.floors_number in np.arange(10, 13)
    and x.passenger_elevator == 0 else x.cargo_elevator, 
    axis=1)

    df.loc[:, ['cargo_elevator']] = df.apply(lambda x:
    1 if pd.isna(x.cargo_elevator) 
    and x.floors_number in np.arange(13, 23)
    and x.passenger_elevator in (1, 2) else x.cargo_elevator, 
    axis=1)

    df.loc[:, ['cargo_elevator']] = df.apply(lambda x:
    2 if pd.isna(x.cargo_elevator) 
    and x.floors_number in np.arange(23, 30)
    and x.passenger_elevator in (1, 2) else x.cargo_elevator, 
    axis=1)
        
    df.loc[:, ['parking']] = df.apply(lambda x:
    'ground' if pd.isna(x.parking) else x.parking, 
    axis=1)
    
    df.loc[:, ['balcony']] = df.apply(lambda x:
    0 if pd.isna(x.balcony) and x.is_apartments else x.balcony,
    axis=1)
    
    df.loc[:, ['balcony']] = df.apply(lambda x:
    1 if pd.isna(x.balcony) and x.rooms_count in np.arange(1, 3) else x.balcony,
    axis=1)
    
    df.loc[:, ['balcony']] = df.apply(lambda x:
    2 if pd.isna(x.balcony) and x.rooms_count >= 3 else x.balcony,
    axis=1)

    df.loc[:, ['kitchen_area']] = df.apply(lambda x:
    6.2 if pd.isna(x.kitchen_area) and x.house_material == 'panel' 
       and x.is_complete 
       and x.rooms_count <= 3 and x.total_area < 50 else x.kitchen_area,
    axis=1)
    
    df.loc[:, ['living_area']] = df.apply(lambda x:
    float(x.total_area) - float(x.kitchen_area) if pd.isna(x.living_area) and not pd.isna(x.kitchen_area) else x.living_area
    , axis=1)
    
    df.loc[:, ['kitchen_area']] = df.apply(lambda x:
    float(x.total_area) - float(x.living_area) if pd.isna(x.kitchen_area) and not pd.isna(x.living_area) else x.kitchen_area,
    axis=1)
    
    return df


def feature_engineering(df=df):
    df['build_date'] = pd.to_numeric(df['build_date'], errors='coerce')
    df['house_age'] = 2024 - df['build_date']

    df.loc[:, ['house_age']] = df.apply(lambda x:
     -1 if pd.isna(x.house_age) and x.is_complete == 0 else x.house_age, 
    axis=1)

    df.loc[:, ['house_age']] = df.apply(lambda x:
    2024 - int(x.completion_year) if pd.isna(x.house_age) and x.is_complete == 1 else x.house_age, 
    axis=1)
    
    df.loc[:, ['is_first_floor']] = df.apply(lambda x:
    1 if x.floor == 1 else 0, axis=1)
    
    df.loc[:, ['is_last_floor']] = df.apply(lambda x:
    1 if x.floor == x.floors_number else 0, axis=1)
    
    df.loc[:, ['has_metro']] = df.apply(lambda x:
    0 if pd.isna(x.metro) else 1, axis=1)
    
    df.loc[:, ['mean_metro']] = df.apply(lambda x:
    0 if x.has_metro == 0 else sum(map(float, x.metro_distance.split(',')))/3, 
    axis=1)
    
    df['metro_dist'] = df.apply(calculate_mean_distance, axis=1)
    
    df['district'] = df['district'].astype('str')
    df['rate'] = df.apply(lambda row: apply_rating(row['district'], globals()[f'rate_{row["region"]}']), 
    axis=1)
        
    return df


def calculate_mean_distance(row):
    distances = [int(x) for x in str(row['metro_distance']).split(',') if x.isdigit()]
    transports = str(row['metro_transport']).split(',')
    
    total_distance = 0
    total_transport_time = 0
    
    for i in range(len(distances)):
        if transports[i] == 'walk':
            total_distance += distances[i] / 60 * 5  # Скорость ходьбы: ~5 км/час
        elif transports[i] == 'transport':
            total_distance += distances[i] / 60 * 40  # Скорость транспорта: ~40 км/час
    
    if len(distances) > 0:
        return total_distance / len(distances)
    else:
        return None


rate_msk = {
  'ЦАО' : 1,
  'ЗАО' : 2,
  'ЮАО' : 6,
  'САО' : 5,
  'ЮЗАО' : 3,
  'ВАО' : 8,
  'СЗАО' : 4,
  'СВАО' : 7,
  'ЮВАО' : 9,
  'ТАО (Троицкий)' : 12,
  'НАО (Новомосковский)' : 11,
  'Московский,НАО (Новомосковский)' : 10,
  'Крюково,ЗелАО' : 13
}

rate_spb = {
    'Красногвардейский' : 16,
    'Петроградский' : 7,
    'Выборгский' : 15,
    'Московский' : 4,
    'Василеостровский' : 10,
    'Адмиралтейский' : 12,
    'Центральный' : 1,
    'Курортный' : 14,
    'Фрунзенский' : 17,
    'Приморский' : 5,
    'Невский' : 3,
    'Калининский' : 6,
    'Красносельский' : 13,
    'Парнас,Выборгский' : 15,
    'Петродворцовый' : 8,
    'Пушкинский' : 9,
    'Кировский' : 2,
    'Колпинский' : 11
}

rate_ekb = {
    'Железнодорожный' : 8,
    'Орджоникидзевский' : 6,
    'Кировский' : 7,
    'Ленинский' : 3,
    'Чкаловский' : 2,
    'Верх-Исетский' : 5,
    'Октябрьский' : 4,
    'Академический' : 3, #часть ленинского
    'Центр' : 1,
    'Автовокзал' : 3, #часть ленинского
    'Юго-Западный' : 3 #часть ленинского
}

rate_nsk = {
    'Заельцовский' : 4,
    'Ленинский' : 6,
    'Октябрьский' : 10, #нет инфы про него как-то
    'Железнодорожный' : 7,
    'Кировский' : 8,
    'Калининский' : 5,
    'Дзержинский' : 9,
    'Советский' : 1,
    'Первомайский' : 2,
    'Центральный' : 3,
}

rate_kzn = {
    'Московский' : 6,
    'Приволжский' : 5,
    'Советский' : 3,
    'Ново-Савиновский' : 4,
    'Кировский' : 1,
    'Вахитовский' : 2,
    'Авиастроительный' : 7
}

rate_nng = {
    'Канавинский' : 5, 
    'Советский' : 3, 
    'Нижегородский' : 1, 
    'Приокский' : 2,
    'Ленинский' : 6, 
    'Московский' : 7, 
    'Сормовский' : 8, 
    'Автозаводский' : 4
}


def apply_rating(district, rate_dict):
    districts = district.split(',')
    for d in districts:
        d = d.strip()  # Удаляем лишние пробелы
        if d in rate_dict:
            return rate_dict[d]
    return None  # Если район отсутствует в словаре


def imputing_encoding(df=df, prediction=False):
    num_f = [
        'total_area',
        'floor',
        'floors_number',
        'house_age',
        'living_area',
        'kitchen_area',
        'longitude',
        'latitude',
        'mean_metro',
        'metro_dist',
    ]

    cat_f = [
        'region',
        'decoration',
        'balcony',
        'rooms_count',
        'is_complete',
        'house_material',
        'parking',
        'is_apartments',
        'is_auction',
        'from_developer',
        'is_first_floor',
        'is_last_floor',
        'has_metro',
        'rate',
        'passenger_elevator',
        'cargo_elevator',
    ]

    df[cat_f] = df[cat_f].astype('str')
    for f in num_f:
        df[f] = pd.to_numeric(df[f], errors='coerce')
        
    num_imp = KNNImputer(n_neighbors=5)
    cat_imp = SimpleImputer(strategy='most_frequent')

    if prediction:
        df = df[num_f+cat_f].copy()
        if len(df) > 5:
            for f in num_f:
                df[f] = StandardScaler().fit_transform(pd.DataFrame(num_imp.fit_transform(df[[f]])))
        else:
            df[num_f] = StandardScaler().fit_transform(df[num_f])
        
        for f in cat_f:
            df[f] = OrdinalEncoder().fit_transform(pd.DataFrame(cat_imp.fit_transform(df[[f]])))
        return df

    else: 
        target = ['price']
        df = df[target+num_f+cat_f].copy()

        for f in num_f:
            df[f] = StandardScaler().fit_transform(pd.DataFrame(num_imp.fit_transform(df[[f]])))

        for f in cat_f:
            df[f] = OrdinalEncoder().fit_transform(pd.DataFrame(cat_imp.fit_transform(df[[f]])))

        df.to_csv('preprocessed_data.csv')
        return df


def modeling(df=df, cross_val=False):
    X = df.copy().drop(['price'], axis=1)
    y = df['price']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=43)
    
    cat = CatBoostRegressor(silent=True, eval_metric='RMSE')
    model = cat.fit(X_train, y_train)
    
    y_pred = model.predict(X_test)

    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    if cross_val:
        cv_score = np.sqrt((-1)*cross_val_score(cat, X_train, y_train, scoring='neg_mean_squared_error', cv=5))
        print(cv_score)

    return rmse, r2, model

In [930]:
def plot_with_prediction_highlight(df, prediction_df):
    sns.set(style="whitegrid")
    
    plt.figure(figsize=(10, 6))
    sns.scatterplot(x='total_area', y='price', data=df, color='blue', alpha=0.6, label='Общая выборка')

    sns.scatterplot(x='total_area', y='price_pred', data=prediction_df, color='red', s=100, label='Предсказание', edgecolor='black', zorder=5)
    
    plt.title('Общая площадь и цена')
    plt.xlabel('Общая площадь (кв.м)')
    plt.ylabel('Цена, руб.')
    plt.legend()
    plt.show()
    
def plot_dist_with_prediction(df, prediction_df):
    sns.set(style="whitegrid")
    
    plt.figure(figsize=(10, 6))
    sns.histplot(df['price'], kde=True, color="blue", label='Распределение цены', bins=30)
    
    for _, row in prediction_df.iterrows():
        plt.axvline(x=row['price_pred'], color='red', linestyle='--', linewidth=2, label='Предсказанная цена')
    
    handles, labels = plt.gca().get_legend_handles_labels()
    by_label = dict(zip(labels, handles))
    plt.legend(by_label.values(), by_label.keys())
    
    plt.title('Распределение цены и предсказанная цена')
    plt.xlabel('Цена, руб.')
    plt.ylabel('Плотность')
    plt.show()

def compare_price_to_region(df, prediction_df, region):
    regional_prices = df[df['region'] == region]['price']

    predicted_price = prediction_df['price_pred'].iloc[0]

    cheaper_than_predicted = regional_prices[regional_prices < predicted_price].count()
    percentage_cheaper = (cheaper_than_predicted / regional_prices.count()) * 100

    regions = {
        'msk': 'Москве',
        'spb': 'Санкт-Петербургу',
        'ekb': 'Екатеринбургу',
        'nsk': 'Новосибирску',
        'nng': 'Нижнему Новгороду',
        'kzn': 'Казани',
    }

    if percentage_cheaper > 50:
        message = f"Ваша квартира дороже, чем {percentage_cheaper:.2f}% квартир в выборке по {regions[region]}."
    else:
        message = f"Ваша квартира дешевле, чем {100-percentage_cheaper:.2f}% квартир в выборке по {regions[region]}."

    return message


def predict_price(model, prediction_df):
    pred_df1 = preprocessing(prediction_df)
    pred_df2 = feature_engineering(pred_df1)
    pred_df3 = imputing_encoding(pred_df2, prediction=True)
    y_pred = model.predict(pred_df3)
    prediction_df['price_pred'] = y_pred

    if len(pred_df3) > 1:
        prediction_df.to_csv('prediction_results.csv')
        return 'Результаты сохранены в prediction_results.csv!'
        
    if len(pred_df3) == 1:
        plot_with_prediction_highlight(df, prediction_df)
        plot_dist_with_prediction(df, prediction_df)
        print(compare_price_to_region(df, prediction_df, prediction_df.iloc[0]['region']))
    return prediction_df

In [931]:
df = pd.read_csv('./csv/data.csv', index_col='cian_id')
df = df.drop(['Unnamed: 0'], axis=1)
df = df.replace('empty', np.nan)

df1 = preprocessing(df)
df2 = feature_engineering(df1)
df3 = imputing_encoding(df2)
model_results = modeling(df3) # (5564187.147870915, 0.9111942523805219)
model = model_results[-1]
model_results[:2]

(5048064.937906024, 0.91756754190788)

In [933]:
test = pd.read_csv('./csv/test_samples.csv')
price = test['price']
test = test.drop(['Unnamed: 0', 'price'], axis=1)
test = test.replace('empty', np.nan)

predict_price(model, test)

'Результаты сохранены в prediction_results.csv!'

In [874]:
df = pd.read_csv('./csv/data.csv', index_col='cian_id')
df = df.drop(['Unnamed: 0'], axis=1)
df = df.replace('empty', np.nan)

clusters = {
    '1': df[df['rooms_count']==1],
    '2': df[df['rooms_count']==2],
    '3': df[df['rooms_count']==3],
    '4': df[df['rooms_count']==4],
    '5': df[df['rooms_count']==5],
    '6': df[df['rooms_count']==6],
}

for c, data in clusters.items():
    data1 = preprocessing(data)
    data2 = feature_engineering(data1)
    data3 = imputing_encoding(data2)
    model = modeling(data3)
    print(f'Room cluster: {c} with RMSE={model[0]:.2f}, R2={model[1]:.2f}')
    
# Room cluster: 1 with RMSE=1793393.66, R2=0.92
# Room cluster: 2 with RMSE=5136147.64, R2=0.85
# Room cluster: 3 with RMSE=7772741.65, R2=0.86
# Room cluster: 4 with RMSE=13158707.53, R2=0.81
# Room cluster: 5 with RMSE=19169449.41, R2=0.54
# Room cluster: 6 with RMSE=18136221.78, R2=0.67

Room cluster: 1 with RMSE=1824671.76, R2=0.92
Room cluster: 2 with RMSE=5205360.92, R2=0.84
Room cluster: 3 with RMSE=7654041.86, R2=0.87
Room cluster: 4 with RMSE=12864814.27, R2=0.81
Room cluster: 5 with RMSE=19205715.28, R2=0.54
Room cluster: 6 with RMSE=17709096.19, R2=0.68


In [934]:
pd.read_csv('prediction_results.csv')

Unnamed: 0.1,Unnamed: 0,cian_id,creation_date,region,address,description,total_area,kitchen_area,living_area,rooms_count,floor,floors_number,build_date,is_complete,completion_year,house_material,parking,decoration,balcony,longitude,latitude,passenger_elevator,cargo_elevator,metro,metro_distance,metro_transport,district,is_apartments,is_auction,from_developer,is_rosreestr_checked,house_age,is_first_floor,is_last_floor,has_metro,mean_metro,metro_dist,rate,price_pred
0,0,299093643,2024-02-29T12:26:17.823,msk,"Москва, Дубининская улица, 59к6",О квартире:\n\n2-комн. квартира с первичной от...,58,34.0,24.0,2,3,27,,0,2024,,ground,without,1,38,56,4,,"Серпуховская,Павелецкая,Добрынинская",18204.0,"walk,walk,transport","Даниловский,ЮАО",0,0,1,0,-1,0,0,1,14,2.0,6,28703986
1,1,297849170,2024-01-29T16:40:02.273,spb,"Санкт-Петербург, Заневский проспект, 65А",Продается 1-комн. апартамент на 11-м этаже в А...,46,27.0,15.0,1,11,15,,0,2024,,ground,fine,0,30,60,2,1.0,,,,"Малая Охта,Красногвардейский",1,0,1,0,-1,0,0,0,0,,16,20435165
2,2,296793202,2023-12-28T05:37:35.423,ekb,"Свердловская область, Екатеринбург, жилой райо...","Продаётся 2-комн. квартира площадью 45,4 кв.м ...",45,15.0,21.0,2,2,24,,0,2025,monolith,open,without,1,61,57,4,4.0,"Ботаническая,Чкаловская,Геологическая",1435.0,"walk,transport,transport",Октябрьский,0,0,1,0,-1,0,0,1,7,2.0,4,5347206
3,3,296750701,2023-12-26T19:09:01.67,nsk,"Новосибирск, улица Аэропорт",Квартира с кухней-гостиной и одной спальней в ...,41,,,2,4,15,,0,2025,monolith,ground,without,1,83,55,1,1.0,"Заельцовская,Гагаринская,Сибирская",357.0,"transport,transport,transport","Аэропорт,Заельцовский",0,0,1,0,-1,0,0,1,5,3.0,4,3305866
4,4,297261879,2024-01-15T17:05:26.773,nng,"Нижний Новгород, улица Коперника",Продаётся 2-комнатная квартира в строящемся до...,52,,,2,8,18,,0,2025,monolithBrick,ground,without,1,44,56,2,1.0,"Буревестник,Бурнаковская,Канавинская",7910.0,"transport,transport,transport","Кооперативный поселок,Сормовский",0,0,1,0,-1,0,0,1,9,6.0,8,11971148
5,5,297224202,2024-01-15T05:48:30.097,kzn,"Республика Татарстан, Казань, жилой комплекс Н...",Продаётся 2-комн. квартира площадью 53 кв.м на...,53,18.0,23.0,2,5,17,,0,2025,panel,open,without,1,49,56,2,,"Горки,Аметьево,Суконная слобода",899.0,"transport,transport,transport","Малые Клыки,Советский",0,0,1,0,-1,0,0,1,9,6.0,3,14926304
