### **Часть 0: ПОДГОТОВКА**

Добавляем библиотеки

In [None]:
import math
import pandas as pd
import numpy as np
import matplotlib as plt
import sklearn
import seaborn as sns

Далее добавляем библиотеку folium для отображения данных на карте по координатам.

In [None]:
!pip install folium



Загрузка архива, проверка и распаковка

In [None]:
!ls

archive.zip  sample_data


In [None]:
!unzip archive.zip

Archive:  archive.zip
  inflating: input_data.csv          


In [None]:
!ls

archive.zip  input_data.csv  sample_data


Далее загрузим данные из csv файла в датафрейм.

In [2]:
import pandas as pd
# The file is semicolon-delimited, so the separator is passed explicitly.
df = pd.read_csv('input_data.csv', sep=";")
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,date,price,level,levels,rooms,area,kitchen_area,geo_lat,geo_lon,building_type,object_type,postal_code,street_id,id_region,house_id
0,2021-01-01,2451300,15,31,1,30.3,0.0,56.780112,60.699355,0,2,620000.0,,66,1632918.0
1,2021-01-01,1450000,5,5,1,33.0,6.0,44.608154,40.138381,0,0,385000.0,,1,
2,2021-01-01,10700000,4,13,3,85.0,12.0,55.54006,37.725112,3,0,142701.0,242543.0,50,681306.0
3,2021-01-01,3100000,3,5,3,82.0,9.0,44.608154,40.138381,0,0,385000.0,,1,
4,2021-01-01,2500000,2,3,1,30.0,9.0,44.738685,37.713668,3,2,353960.0,439378.0,23,1730985.0


Далее проверяем пропуски и смотрим краткую информацию о данных.

In [None]:
# Print the column dtypes and the memory-usage summary.
df.info()
# Count the missing values in every column.
df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11358150 entries, 0 to 11358149
Data columns (total 15 columns):
 #   Column         Dtype  
---  ------         -----  
 0   date           object 
 1   price          int64  
 2   level          int64  
 3   levels         int64  
 4   rooms          int64  
 5   area           float64
 6   kitchen_area   float64
 7   geo_lat        float64
 8   geo_lon        float64
 9   building_type  int64  
 10  object_type    int64  
 11  postal_code    float64
 12  street_id      float64
 13  id_region      int64  
 14  house_id       float64
dtypes: float64(7), int64(7), object(1)
memory usage: 1.3+ GB


Unnamed: 0,0
date,0
price,0
level,0
levels,0
rooms,0
area,0
kitchen_area,0
geo_lat,0
geo_lon,0
building_type,0


Отобразим на карте координаты наших построек.

In [None]:
import folium
from IPython.display import display

# Label-based slice on the default integer index: keeps the rows labelled 0..1000.
map_df = df.loc[:1000]

# The initial view uses the same coordinates the notebook later calls `moscow_center`.
m = folium.Map(location=[55.751244, 37.618423], zoom_start=10)

# Latitude / longitude of every sampled listing
lats = map_df['geo_lat']
longs = map_df['geo_lon']

# Put one marker on the map per listing
for lat, lon in zip(lats, longs):
    folium.Marker(location=[lat, lon]).add_to(m)

display(m)

### **Часть 1: Подготовим данные для обработки моделями машинного обучения.**

1. Создадим функцию, которая будет определять, насколько далеко находятся наши квартиры от центра.

In [3]:
import numpy as np

def haversine_distance(lat1, lon1, lat2, lon2, radius=6371.0):
    """
    Great-circle distance between two points on a sphere (haversine formula).

    All coordinate arguments are in degrees and may be scalars, NumPy arrays or
    pandas Series; they are combined with NumPy broadcasting, so a whole column
    can be compared against a single reference point in one call.

    :param lat1: Latitude(s) of the first point, in degrees.
    :param lon1: Longitude(s) of the first point, in degrees.
    :param lat2: Latitude(s) of the second point, in degrees.
    :param lon2: Longitude(s) of the second point, in degrees.
    :param radius: Sphere radius. The default 6371.0 is the Earth's mean radius
        in kilometres, so the result is in kilometres by default.
    :return: Distance(s) in the same unit as ``radius``.
    """
    lat1_rad, lon1_rad = np.radians(lat1), np.radians(lon1)
    lat2_rad, lon2_rad = np.radians(lat2), np.radians(lon2)

    dlat = lat2_rad - lat1_rad
    dlon = lon2_rad - lon1_rad

    a = np.sin(dlat / 2)**2 + np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(dlon / 2)**2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would turn arcsin(sqrt(a)) into NaN.
    a = np.minimum(np.maximum(a, 0.0), 1.0)
    c = 2 * np.arcsin(np.sqrt(a))

    return radius * c

In [4]:
# City-centre reference points as (latitude, longitude)
moscow_center = (55.751244, 37.618423)
spb_center = (59.934280, 30.335099)

# Distance from every listing to each of the two centres
listing_lat, listing_lon = df['geo_lat'], df['geo_lon']
distances_to_moscow = haversine_distance(listing_lat, listing_lon, *moscow_center)
distances_to_spb = haversine_distance(listing_lat, listing_lon, *spb_center)

# Binary flags: 1 when the computed distance to the centre is at most 20
df['is_Moscow'] = (distances_to_moscow <= 20).astype(int)
df['is_Saint_Peterburg'] = (distances_to_spb <= 20).astype(int)

df[['geo_lat', 'geo_lon', 'is_Moscow', 'is_Saint_Peterburg']].head(20)

Unnamed: 0,geo_lat,geo_lon,is_Moscow,is_Saint_Peterburg
0,56.780112,60.699355,0,0
1,44.608154,40.138381,0,0
2,55.54006,37.725112,0,0
3,44.608154,40.138381,0,0
4,44.738685,37.713668,0,0
5,48.511172,44.566846,0,0
6,55.009914,82.934859,0,0
7,51.834703,107.600571,0,0
8,45.003869,39.086511,0,0
9,53.164362,45.033956,0,0


2. Теперь удаляем ненужные признаки geo_lat, geo_lon, object_type, postal_code, street_id, id_region, house_id.

In [5]:
# Columns the notebook no longer needs once the city flags have been derived
columns_to_drop = [
    'geo_lat', 'geo_lon',
    'object_type', 'postal_code', 'street_id', 'id_region', 'house_id',
]
df = df.drop(columns_to_drop, axis=1)


In [6]:
df.head()

Unnamed: 0,date,price,level,levels,rooms,area,kitchen_area,building_type,is_Moscow,is_Saint_Peterburg
0,2021-01-01,2451300,15,31,1,30.3,0.0,0,0,0
1,2021-01-01,1450000,5,5,1,33.0,6.0,0,0,0
2,2021-01-01,10700000,4,13,3,85.0,12.0,3,0,0
3,2021-01-01,3100000,3,5,3,82.0,9.0,0,0,0
4,2021-01-01,2500000,2,3,1,30.0,9.0,3,0,0


3. Теперь проанализируем то, что осталось: определим, какие колонки категориальные, а какие числовые, и закодируем категориальные признаки с помощью One-Hot Encoding.

In [19]:

# Columns handled as categorical features
categorical_columns = [
    'building_type',
    'is_Moscow',
    'is_Saint_Peterburg',
]
# Columns handled as numerical features
numerical_columns = [
    'price',
    'level',
    'levels',
    'rooms',
    'area',
    'kitchen_area',
    'date',
]

# Report both groups, one line each
for _label, _columns in (
    ("Категориальные колонки:", categorical_columns),
    ("Числовые колонки:", numerical_columns),
):
    print(_label, _columns)

Категориальные колонки: ['building_type', 'is_Moscow', 'is_Saint_Peterburg']
Числовые колонки: ['price', 'level', 'levels', 'rooms', 'area', 'kitchen_area', 'date']


In [7]:
from sklearn.preprocessing import OneHotEncoder
categorical_columns = [
    'building_type',
    'is_Moscow',
    'is_Saint_Peterburg',
]
numerical_columns = [
    'price', 'level', 'levels', 'rooms', 'area', 'kitchen_area', 'date',
]

# drop='first' leaves out the first category of every encoded feature
encoder = OneHotEncoder(drop='first')

# Fit on the categorical columns and densify the encoder output into a NumPy array
encoded_features = encoder.fit_transform(df[categorical_columns]).toarray()
encoded_columns = encoder.get_feature_names_out(categorical_columns)
encoded_df = pd.DataFrame(data=encoded_features, columns=encoded_columns)

# Swap the original categorical columns for their encoded counterparts;
# both frames get a fresh positional index so the rows line up on concat.
df = df.drop(columns=categorical_columns)
df = pd.concat(
    [frame.reset_index(drop=True) for frame in (df, encoded_df)],
    axis=1,
)

print("Результат One-Hot Encoding:")
print(df.head())


Результат One-Hot Encoding:
         date     price  level  levels  rooms  area  kitchen_area  \
0  2021-01-01   2451300     15      31      1  30.3           0.0   
1  2021-01-01   1450000      5       5      1  33.0           6.0   
2  2021-01-01  10700000      4      13      3  85.0          12.0   
3  2021-01-01   3100000      3       5      3  82.0           9.0   
4  2021-01-01   2500000      2       3      1  30.0           9.0   

   building_type_1  building_type_2  building_type_3  building_type_4  \
0              0.0              0.0              0.0              0.0   
1              0.0              0.0              0.0              0.0   
2              0.0              0.0              1.0              0.0   
3              0.0              0.0              0.0              0.0   
4              0.0              0.0              1.0              0.0   

   building_type_5  building_type_6  is_Moscow_1  is_Saint_Peterburg_1  
0              0.0              0.0          

In [8]:
df.head()

Unnamed: 0,date,price,level,levels,rooms,area,kitchen_area,building_type_1,building_type_2,building_type_3,building_type_4,building_type_5,building_type_6,is_Moscow_1,is_Saint_Peterburg_1
0,2021-01-01,2451300,15,31,1,30.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2021-01-01,1450000,5,5,1,33.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2021-01-01,10700000,4,13,3,85.0,12.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,2021-01-01,3100000,3,5,3,82.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2021-01-01,2500000,2,3,1,30.0,9.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


4. Поработаем с числовыми признаками:

Добавьте в ваш датасет два признака: количество дней со дня первого наблюдения (разница между датами объявлений). Возможно, для предсказания цены не так важен этаж, как важно отношение этажа квартиры на количество этажей в доме, добавьте этот признак. После добавления нового признака колонку date можно удалить.
Числовые признаки могут иметь разные порядки. Давайте отнормируем числовые признаки с помощью StandardScaler: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html.

In [9]:
from sklearn.preprocessing import StandardScaler
# Parse the 'date' column into datetime values
df['date'] = pd.to_datetime(df['date'])

# Earliest date in the data serves as the reference point
first_date = df['date'].min()

# Whole days elapsed since the reference date
elapsed = df['date'] - first_date
df['days_after_checkpoint'] = elapsed.dt.days

# Floor of the flat relative to the number of floors in the building
df['floor_ratio'] = df['level'].div(df['levels'])

# The raw date column is no longer needed once the day count exists
del df['date']


In [None]:
df.head()


Unnamed: 0,price,level,levels,rooms,area,kitchen_area,building_type_1,building_type_2,building_type_3,building_type_4,building_type_5,building_type_6,is_Moscow_1,is_Saint_Peterburg_1,days_since_first_observation,floor_ratio
0,2451300,15,31,1,30.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.483871
1,1450000,5,5,1,33.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0
2,10700000,4,13,3,85.0,12.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0,0.307692
3,3100000,3,5,3,82.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.6
4,2500000,2,3,1,30.0,9.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0,0.666667


5. Реализуйте класс KNNRegressor, который должен делать регрессию методом k ближайших соседей.

In [12]:
import numpy as np
from sklearn.metrics import pairwise_distances

class KNNRegressor:
    """k-nearest-neighbours regressor: predicts the mean target of the closest training rows."""

    def __init__(self, n_neighbors=5, distance_metric='euclidean'):
        """
        Initialise the KNN regressor.

        :param n_neighbors: Number of nearest neighbours averaged for each prediction.
        :param distance_metric: Metric name forwarded to ``pairwise_distances``.
        """
        self.n_neighbors = n_neighbors
        self.distance_metric = distance_metric
        self.X_train = None
        self.y_train = None

    def fit(self, features, target):
        """
        Memorise the training data (KNN has no real training step).

        :param features: Training features, stored as given.
        :param target: Target values to predict; stored as a NumPy array.
        :return: The fitted regressor itself, so calls can be chained.
        """
        self.X_train = features
        # predict() selects targets with a 2-D array of *positional* indices.
        # A pandas Series would interpret them as labels (and rejects 2-D keys),
        # so the target is always kept as a plain ndarray.
        self.y_train = np.asarray(target)
        return self

    def predict(self, new_data):
        """
        Predict target values for new rows.

        :param new_data: Features of the rows to predict.
        :return: For every row, the mean target of its nearest training rows.
        :raises RuntimeError: If called before ``fit``.
        """
        if self.X_train is None or self.y_train is None:
            raise RuntimeError("KNNRegressor.predict() called before fit()")

        # Distances from every new row to every training row
        distances = pairwise_distances(new_data, self.X_train, metric=self.distance_metric)

        # Positions of the n_neighbors closest training rows for each new row
        nearest_indices = np.argsort(distances, axis=1)[:, :self.n_neighbors]

        # Target values of those neighbours
        nearest_values = self.y_train[nearest_indices]

        # The prediction is the average over the neighbours
        return np.mean(nearest_values, axis=1)


3 Балла. Реализуйте класс LinearRegression, поддерживающий обучение градиентными спусками SGD, Momentum, AdaGrad. Используйте градиент для оптимизации функции потерь MSE.