# **Машинное обучение ИБ-2024**

# **Домашнее задание 1.**
# Регрессия, KNN, LinearRegression.

В данной домашней работе мы будем строить модели для предсказания цены квартиры в России. Ниже приведено описание некоторых колонок набора данных.

date - дата публикации объявления

price - цена в рублях

level- этаж, на котором находится квартира

levels - количество этажей в квартире

rooms - количество комнат в квартире. Если значение -1, то квартира считается апартаментами.

area - площадь квартиры.

kitchen_area - площадь кухни.

geo_lat - Latitude

geo_lon - Longitude

building_type - материал застройки. 0 - Don't know. 1 - Other. 2 - Panel. 3 - Monolithic. 4 - Brick. 5 - Blocky. 6 - Wooden

# Часть 0. Начало работы

Для начала работы с данными импортируем библиотеки, которые понадобятся в данном задании.

In [38]:
%pip install pandas numpy matplotlib scikit-learn seaborn

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'c:\Users\vanya\AppData\Local\Programs\Python\Python39\python.exe -m pip install --upgrade pip' command.


In [39]:
import math
import pandas as pd
import numpy as np
import matplotlib as plt
import sklearn
import seaborn as sns

Загрузим библиотеку folium для отображения данных на карте по координатам.

In [40]:
%pip install folium

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'c:\Users\vanya\AppData\Local\Programs\Python\Python39\python.exe -m pip install --upgrade pip' command.


Загрузим данные из csv файла в датафрейм.

In [41]:
df = pd.read_csv('input_data.csv', sep=';')

df

Unnamed: 0,date,price,level,levels,rooms,area,kitchen_area,geo_lat,geo_lon,building_type,object_type,postal_code,street_id,id_region,house_id
0,2021-01-01,2451300,15,31,1,30.3,0.0,56.780112,60.699355,0,2,620000.0,,66,1632918.0
1,2021-01-01,1450000,5,5,1,33.0,6.0,44.608154,40.138381,0,0,385000.0,,1,
2,2021-01-01,10700000,4,13,3,85.0,12.0,55.540060,37.725112,3,0,142701.0,242543.0,50,681306.0
3,2021-01-01,3100000,3,5,3,82.0,9.0,44.608154,40.138381,0,0,385000.0,,1,
4,2021-01-01,2500000,2,3,1,30.0,9.0,44.738685,37.713668,3,2,353960.0,439378.0,23,1730985.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11358145,2021-12-31,6099000,4,9,3,65.0,0.0,56.041539,92.753133,0,0,660030.0,581436.0,24,857003.0
11358146,2021-12-31,2490000,1,10,2,56.9,0.0,55.169949,61.519210,0,0,454079.0,274414.0,74,1820769.0
11358147,2021-12-31,850000,2,2,2,37.0,5.0,55.946206,43.088179,0,0,606101.0,190983.0,52,958329.0
11358148,2021-12-31,4360000,5,5,1,36.0,9.0,61.256383,73.435919,0,0,628406.0,581702.0,86,2156710.0


Отобразим на карте координаты наших построек.

In [42]:
import folium
from IPython.display import display

map_df = df.loc[:1000]

m = folium.Map(location=[55.751244, 37.618423], zoom_start=10)


lats = map_df['geo_lat'].loc[:1000]
longs = map_df['geo_lon'].loc[:1000]

for point in zip(lats, longs):
    folium.Marker(
        location=[point[0], point[1]]
    ).add_to(m)

display(m)

# Часть 1. Подготовим данные для обработки моделями машинного обучения.

**0.5 Балл**. География наших наблюдений в наборе данных крайне большая. Однако мы знаем, что стоимость квартир в Москве и Санкт-Петербурге намного выше, чем в среднем по России. Давайте сделаем признаки, который показывают, находится ли квартира в 20 килиметрах от центра Москвы или находится ли квартира в 20 килиметрах от центра Санкт-Петербурга.

Создайте два признака is_Moscow и is_Saint_Peterburg. Для нахождения расстояния по координатам используйте функцию haversine_distance.

In [43]:
def haversine_distance(lat1, lon1, lat2, lon2):
    R = 6371
    phi1, phi2 = np.radians(lat1), np.radians(lat2)
    delta_phi = np.radians(lat2 - lat1)
    delta_lambda = np.radians(lon2 - lon1)

    a = np.sin(delta_phi / 2) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lambda / 2) ** 2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    return R * c

MOSCOW_CENTER = (55.7558, 37.6176)
SPB_CENTER = (59.9343, 30.3351)

df['is_Moscow'] = df.apply(
    lambda row: haversine_distance(row['geo_lat'], row['geo_lon'], *MOSCOW_CENTER) <= 20, axis=1
)
df['is_Saint_Peterburg'] = df.apply(
    lambda row: haversine_distance(row['geo_lat'], row['geo_lon'], *SPB_CENTER) <= 20, axis=1
)

df

Unnamed: 0,date,price,level,levels,rooms,area,kitchen_area,geo_lat,geo_lon,building_type,object_type,postal_code,street_id,id_region,house_id,is_Moscow,is_Saint_Peterburg
0,2021-01-01,2451300,15,31,1,30.3,0.0,56.780112,60.699355,0,2,620000.0,,66,1632918.0,False,False
1,2021-01-01,1450000,5,5,1,33.0,6.0,44.608154,40.138381,0,0,385000.0,,1,,False,False
2,2021-01-01,10700000,4,13,3,85.0,12.0,55.540060,37.725112,3,0,142701.0,242543.0,50,681306.0,False,False
3,2021-01-01,3100000,3,5,3,82.0,9.0,44.608154,40.138381,0,0,385000.0,,1,,False,False
4,2021-01-01,2500000,2,3,1,30.0,9.0,44.738685,37.713668,3,2,353960.0,439378.0,23,1730985.0,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11358145,2021-12-31,6099000,4,9,3,65.0,0.0,56.041539,92.753133,0,0,660030.0,581436.0,24,857003.0,False,False
11358146,2021-12-31,2490000,1,10,2,56.9,0.0,55.169949,61.519210,0,0,454079.0,274414.0,74,1820769.0,False,False
11358147,2021-12-31,850000,2,2,2,37.0,5.0,55.946206,43.088179,0,0,606101.0,190983.0,52,958329.0,False,False
11358148,2021-12-31,4360000,5,5,1,36.0,9.0,61.256383,73.435919,0,0,628406.0,581702.0,86,2156710.0,False,False


**0.5 Балла**. В нашем наборе данных есть признаки, которые мы теоретически можем использовать, например postal_code, но мы это будем делать в рамках домашней работы очень-очень долго. Поэтому предлагается удалить ненужные признаки из датафрейма.

Удалим geo_lat,	geo_lon,	object_type,	postal_code,	street_id,	id_region,	house_id.

In [44]:
df.drop(['geo_lat', 'geo_lon', 'object_type', 'postal_code', 'street_id', 'id_region', 'house_id'], axis=1, inplace=True)

df

Unnamed: 0,date,price,level,levels,rooms,area,kitchen_area,building_type,is_Moscow,is_Saint_Peterburg
0,2021-01-01,2451300,15,31,1,30.3,0.0,0,False,False
1,2021-01-01,1450000,5,5,1,33.0,6.0,0,False,False
2,2021-01-01,10700000,4,13,3,85.0,12.0,3,False,False
3,2021-01-01,3100000,3,5,3,82.0,9.0,0,False,False
4,2021-01-01,2500000,2,3,1,30.0,9.0,3,False,False
...,...,...,...,...,...,...,...,...,...,...
11358145,2021-12-31,6099000,4,9,3,65.0,0.0,0,False,False
11358146,2021-12-31,2490000,1,10,2,56.9,0.0,0,False,False
11358147,2021-12-31,850000,2,2,2,37.0,5.0,0,False,False
11358148,2021-12-31,4360000,5,5,1,36.0,9.0,0,False,False


**0.5 Балл**. Для начала Вам предлагается проанализировать Ваши оставшиеся признаки (колонки) в наборе данных. Какие колонки категориальные? Какие числовые?

Категориальные: building_type, is_Moscow, is_Saint_Peterburg

Числовые: price, level, levels, rooms, area, kitchen_area

Давайте закодируем категориальные признаки с помощью OneHot-Encoding. https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html

In [45]:
from sklearn.preprocessing import OneHotEncoder

cat_feats = ['building_type', 'is_Moscow', 'is_Saint_Peterburg']
encoder = OneHotEncoder(sparse_output=False, drop='first')
encoded = encoder.fit_transform(df[cat_feats])
encoded_df = pd.DataFrame(encoded, columns=encoder.get_feature_names_out(cat_feats))

df = pd.concat([df.drop(cat_feats, axis=1), encoded_df], axis=1)

df

Unnamed: 0,date,price,level,levels,rooms,area,kitchen_area,building_type_1,building_type_2,building_type_3,building_type_4,building_type_5,building_type_6,is_Moscow_True,is_Saint_Peterburg_True
0,2021-01-01,2451300,15,31,1,30.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2021-01-01,1450000,5,5,1,33.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2021-01-01,10700000,4,13,3,85.0,12.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,2021-01-01,3100000,3,5,3,82.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2021-01-01,2500000,2,3,1,30.0,9.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11358145,2021-12-31,6099000,4,9,3,65.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11358146,2021-12-31,2490000,1,10,2,56.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11358147,2021-12-31,850000,2,2,2,37.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11358148,2021-12-31,4360000,5,5,1,36.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


**0.5 Балл**. Поработаем с числовыми признаками:


1.   Добавьте в ваш датасет два признака: количество дней со дня первого наблюдения (разница между датами объявлений). Возможно, для предсказания цены не так важен этаж, как важно отношение этажа квартиры на количество этажей в доме, добавьте этот признак. После добавления нового признака колонку date можно удалить.
2.   Числовые признаки могут иметь разные порядки. Давайте отнормируем числовые признаки с помощью StandartScaller https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html.



In [46]:
from sklearn.preprocessing import StandardScaler


df['date'] = pd.to_datetime(df['date'])


first_date = df['date'].min()
df['days_since_first_observation'] = (df['date'] - first_date).dt.days


df = df[df['levels'] != 0]
df.dropna(subset=['levels', 'level'], inplace=True)
df['floor_ratio'] = df['level'] / df['levels']

df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.drop('date', axis=1, inplace=True)


numerical_features = df.select_dtypes(include=['int64', 'float64']).columns
df.dropna(subset=numerical_features, inplace=True)


scaler = StandardScaler()
df[numerical_features] = scaler.fit_transform(df[numerical_features])

df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(subset=['levels', 'level'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['floor_ratio'] = df['level'] / df['levels']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.replace([np.inf, -np.inf], np.nan, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexin

Unnamed: 0,price,level,levels,rooms,area,kitchen_area,building_type_1,building_type_2,building_type_3,building_type_4,building_type_5,building_type_6,is_Moscow_True,is_Saint_Peterburg_True,days_since_first_observation,floor_ratio
0,-0.021928,1.622845,2.665137,-0.621436,-0.840594,0.082515,-0.150462,-0.348511,-0.259975,-0.380956,-0.11943,-0.044735,-0.315404,-0.289108,-1.858321,-0.308580
1,-0.026992,-0.270032,-0.937314,-0.621436,-0.741061,0.267578,-0.150462,-0.348511,-0.259975,-0.380956,-0.11943,-0.044735,-0.315404,-0.289108,-1.858321,1.452761
2,0.019788,-0.459320,0.171132,1.106218,1.175876,0.452641,-0.150462,-0.348511,3.846517,-0.380956,-0.11943,-0.044735,-0.315404,-0.289108,-1.858321,-0.909807
3,-0.018648,-0.648608,-0.937314,1.106218,1.065284,0.360110,-0.150462,-0.348511,-0.259975,-0.380956,-0.11943,-0.044735,-0.315404,-0.289108,-1.858321,0.087722
4,-0.021682,-0.837895,-1.214426,-0.621436,-0.851654,0.360110,-0.150462,-0.348511,3.846517,-0.380956,-0.11943,-0.044735,-0.315404,-0.289108,-1.858321,0.315229
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11358145,-0.003481,-0.459320,-0.383091,1.106218,0.438593,0.082515,-0.150462,-0.348511,-0.259975,-0.380956,-0.11943,-0.044735,-0.315404,-0.289108,1.752631,-0.443127
11358146,-0.021733,-1.027183,-0.244535,0.242391,0.139993,0.082515,-0.150462,-0.348511,-0.259975,-0.380956,-0.11943,-0.044735,-0.315404,-0.289108,1.752631,-1.618577
11358147,-0.030027,-0.837895,-1.352982,0.242391,-0.593604,0.236734,-0.150462,-0.348511,-0.259975,-0.380956,-0.11943,-0.044735,-0.315404,-0.289108,1.752631,1.452761
11358148,-0.012275,-0.270032,-0.937314,-0.621436,-0.630469,0.360110,-0.150462,-0.348511,-0.259975,-0.380956,-0.11943,-0.044735,-0.315404,-0.289108,1.752631,1.452761


**2 Балла**. Реализуйте класс KNNRegressor, который должен делать регрессию методом k ближайших соседей.

In [47]:
from scipy.spatial.distance import cdist

class KNNRegressor:
    def __init__(self, n_neighbors=5, metric='euclidean'):
        self.n_neighbors = n_neighbors
        self.metric = metric

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        distances = cdist(X, self.X_train, metric=self.metric)
        neighbors_idx = np.argsort(distances, axis=1)[: , :self.n_neighbors]
        neighbors_y = self.y_train[neighbors_idx]
        y_pred = np.mean(neighbors_y, axis=1)
        return y_pred

**3 Балла**. Реализуйте класс LinearRegression, поддерживающий обучение градиентными спусками SGD, Momentum, AdaGrad. Используйте градиент для оптимизации функции потерь MSE.

In [48]:
class LinearRegression:
    def __init__(self, learning_rate=0.01, optimization='SGD', epsilon=1e-8, decay_rate=0.9, max_iter=1000):
        self.learning_rate = learning_rate
        self.optimization = optimization
        self.epsilon = epsilon
        self.decay_rate = decay_rate
        self.max_iter = max_iter
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0

        if self.optimization == 'Momentum':
            velocity_w = np.zeros(n_features)
            velocity_b = 0
            beta = self.decay_rate
        elif self.optimization == 'AdaGrad':
            cache_w = np.zeros(n_features)
            cache_b = 0


        for _ in range(self.max_iter):
            y_pred = np.dot(X, self.weights) + self.bias

            error = y_pred - y
            grad_w = (2 / n_samples) * np.dot(X.T, error)
            grad_b = (2 / n_samples) * np.sum(error)

            if self.optimization == 'SGD':
                self.weights -= self.learning_rate * grad_w
                self.bias -= self.learning_rate * grad_b

            elif self.optimization == 'Momentum':
                velocity_w = beta * velocity_w + (1 - beta) * grad_w
                velocity_b = beta * velocity_b + (1 - beta) * grad_b
                self.weights -= self.learning_rate * velocity_w
                self.bias -= self.learning_rate * velocity_b

            elif self.optimization == 'AdaGrad':
                cache_w += grad_w ** 2
                cache_b += grad_b ** 2
                self.weights -= (self.learning_rate / (np.sqrt(cache_w) + self.epsilon)) * grad_w
                self.bias -= (self.learning_rate / (np.sqrt(cache_b) + self.epsilon)) * grad_b

            else:
                raise ValueError("Ничего кроме 'SGD', 'Momentum' и 'AdaGrad' не знаю.")

    def predict(self, X):
        return np.dot(X, self.weights) + self.bias


# Часть 2. Эксперименты с моделями машинного обучения.

**3 Балла**. Проведите эксперименты с написанными Вами методами машинного обучения. Выделите обучающую и тестовую выборки в отношении 0,8 и 0,2 соответственно. Измерьте ошибку MSE, MAE, RMSE. Заиспользуйте методы KNNRegressor и LinearRegression из библиотеки sklearn, сравните качество Ваших решений и библиотечных.

In [49]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from math import sqrt


# Берем 1% данных чтобы не умирать из за попытки аллоцировать 150 террабайт озу
df_sample = df.sample(frac=0.01, random_state=14)

X = df_sample.drop(columns=['price']).values
y = df_sample['price'].values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=14
)

In [50]:
# Наш KNN
knn_custom = KNNRegressor(n_neighbors=5)
knn_custom.fit(X_train, y_train)

y_pred_knn_custom = knn_custom.predict(X_test)

mse_knn_custom = mean_squared_error(y_test, y_pred_knn_custom)
mae_knn_custom = mean_absolute_error(y_test, y_pred_knn_custom)
rmse_knn_custom = sqrt(mse_knn_custom)

print("Custom KNN:")
print(f"MSE: {mse_knn_custom}")
print(f"MAE: {mae_knn_custom}")
print(f"RMSE: {rmse_knn_custom}")

Custom KNN:
MSE: 0.006923055054419407
MAE: 0.012592825415944484
RMSE: 0.08320489801940392


In [51]:
# Забугорный KNN из sklearn
from sklearn.neighbors import KNeighborsRegressor

knn_sklearn = KNeighborsRegressor(n_neighbors=5)
knn_sklearn.fit(X_train, y_train)

y_pred_knn_sklearn = knn_sklearn.predict(X_test)

mse_knn_sklearn = mean_squared_error(y_test, y_pred_knn_sklearn)
mae_knn_sklearn = mean_absolute_error(y_test, y_pred_knn_sklearn)
rmse_knn_sklearn = sqrt(mse_knn_sklearn)

print("Sklearn KNN:")
print(f"MSE: {mse_knn_sklearn}")
print(f"MAE: {mae_knn_sklearn}")
print(f"RMSE: {rmse_knn_sklearn}")

Sklearn KNN:
MSE: 0.006923040447986779
MAE: 0.012592314462630663
RMSE: 0.08320481024548268


In [52]:
# Наш LinearRegression

lr_custom = LinearRegression(learning_rate=0.01, optimization='SGD', max_iter=1000)
lr_custom.fit(X_train, y_train)

y_pred_lr_custom = lr_custom.predict(X_test)

mse_lr_custom = mean_squared_error(y_test, y_pred_lr_custom)
mae_lr_custom = mean_absolute_error(y_test, y_pred_lr_custom)
rmse_lr_custom = sqrt(mse_lr_custom)

print("Custom LinearRegression:")
print(f"MSE: {mse_lr_custom}")
print(f"MAE: {mae_lr_custom}")
print(f"RMSE: {rmse_lr_custom}")

Custom LinearRegression:
MSE: 0.002428515714464662
MAE: 0.01879524312186193
RMSE: 0.04927997275227191


In [53]:
# Забугорный LinearRegression из sklearn

from sklearn.linear_model import LinearRegression as SklearnLinearRegression

lr_sklearn = SklearnLinearRegression()
lr_sklearn.fit(X_train, y_train)

y_pred_lr_sklearn = lr_sklearn.predict(X_test)

mse_lr_sklearn = mean_squared_error(y_test, y_pred_lr_sklearn)
mae_lr_sklearn = mean_absolute_error(y_test, y_pred_lr_sklearn)
rmse_lr_sklearn = sqrt(mse_lr_sklearn)

print("Sklearn LinearRegression")
print(f"MSE: {mse_lr_sklearn}")
print(f"MAE: {mae_lr_sklearn}")
print(f"RMSE: {rmse_lr_sklearn}")

Sklearn LinearRegression
MSE: 0.0024286818098406644
MAE: 0.018808192242371794
RMSE: 0.04928165794533159


In [54]:
%pip install tabulate

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'c:\Users\vanya\AppData\Local\Programs\Python\Python39\python.exe -m pip install --upgrade pip' command.


In [55]:
from tabulate import tabulate


table = [
    ["Custom KNN", mse_knn_custom, mae_knn_custom, rmse_knn_custom],
    ["Sklearn KNN", mse_knn_sklearn, mae_knn_sklearn, rmse_knn_sklearn],
    ["Custom LinearReg", mse_lr_custom, mae_lr_custom, rmse_lr_custom],
    ["Sklearn LinearReg", mse_lr_sklearn, mae_lr_sklearn, rmse_lr_sklearn],
]

headers = ["Модель", "MSE", "MAE", "RMSE"]

print(tabulate(table, headers, floatfmt=".6f", tablefmt="github"))

| Модель            |      MSE |      MAE |     RMSE |
|-------------------|----------|----------|----------|
| Custom KNN        | 0.006923 | 0.012593 | 0.083205 |
| Sklearn KNN       | 0.006923 | 0.012592 | 0.083205 |
| Custom LinearReg  | 0.002429 | 0.018795 | 0.049280 |
| Sklearn LinearReg | 0.002429 | 0.018808 | 0.049282 |


# ⚠️ Внимание!

Спасибо за внимание