In [40]:
import pandas as pd
import os

# Input directory with scraped CSV files and output directory for cleaned ones.
raw_dir = '../data/raw'
processed_dir = '../data/processed'
os.makedirs(processed_dir, exist_ok=True)

# Columns removed before saving. Names that are absent from a given file are
# skipped (errors='ignore' below), which is why both the 'commissions' and
# 'comissions' spellings are listed.
columns_to_drop = ['author', 'commissions', 'author_type', 'url', 'location',
                   'house_number', 'street', 'residential_complex', 'ID',
                   'price_per_month', 'comissions', 'accommodation_type',
                   'deal_type', 'underground', 'district']

for filename in os.listdir(raw_dir):
    if not filename.endswith('.csv'):
        continue

    df = pd.read_csv(os.path.join(raw_dir, filename))

    # NOTE(review): this drops a row when ANY column is missing, including
    # columns that are discarded a few lines below; it also makes the later
    # dropna(subset=...) a no-op safeguard. Confirm this strictness is intended.
    df = df.dropna()

    # Add a 'url_id' column holding the row's index label, but only when the
    # file does not already provide one.
    if 'url_id' not in df.columns:
        df['url_id'] = df.index

    df = df.drop(columns=columns_to_drop, errors='ignore')

    # Keep only rows with a plausible price and area (bounds are exclusive).
    df = df.dropna(subset=['price', 'total_meters'])
    df = df[(df['price'] > 100_000) & (df['price'] < 100_000_000)]
    df = df[(df['total_meters'] > 10) & (df['total_meters'] < 100)]

    processed_file = os.path.join(processed_dir, f"processed_{filename}")
    df.to_csv(processed_file, index=False)
    print(f"Processed data saved to {processed_file}")

Processed data saved to ../data/processed/processed_raw_2025-05-12_21-11.csv
Processed data saved to ../data/processed/processed_1_2025-04-17_20-03.csv
Processed data saved to ../data/processed/processed_1_2025-05-12_19-47.csv
Processed data saved to ../data/processed/processed_raw_2025-05-12_21-09.csv


In [33]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import joblib
import os


# Directory holding the cleaned CSV files to train on.
processed_dir = '../data/processed'

# Load every processed CSV and stack them into a single frame.
dataframes = [pd.read_csv(os.path.join(processed_dir, f)) for f in os.listdir(processed_dir) if f.endswith('.csv')]

if not dataframes:
    # pd.concat would raise a less helpful "No objects to concatenate" error.
    raise ValueError(f"No CSV files found in {processed_dir}")

# NOTE(review): fillna(False) also fills gaps in numeric columns (gaps appear
# when the files do not share the same columns) - confirm those columns are
# still numeric afterwards, otherwise get_dummies will one-hot encode them.
df = pd.concat(dataframes, ignore_index=True).fillna(False)

if 'price' not in df.columns:
    raise ValueError("Column 'price' is missing in the processed data.")

# Target is the price; every other column is used as a feature.
# NOTE(review): this includes 'url_id' when the processed files contain it -
# confirm a row identifier is really meant to be a predictor.
y = df['price']
X = df.drop(columns=['price'])

# One-hot encode the non-numeric feature columns.
X = pd.get_dummies(X)

# X and y are sliced from the same frame, so this alignment is a safeguard
# that keeps only the rows present in both.
X, y = X.align(y, join='inner', axis=0)

# Fixed random_state keeps the 80/20 split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)

# Evaluate on the held-out 20 %.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse:.2f}")

model_path = '../models/linear_model.pkl'
# Create the target directory first: joblib.dump does not create missing
# parent directories and would fail on a fresh checkout.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(model, model_path)
print(f"Model saved to {model_path}")


Mean Squared Error: 284160017030807.69
Model saved to ../models/linear_model.pkl


In [41]:
from math import radians, cos, sin, asin


def distance_haversine(point_1: tuple, point_2: tuple) -> float:
    """Return the great-circle distance between two points using the haversine formula.

    Args:
        point_1: ``(latitude, longitude)`` of the first point, in degrees.
        point_2: ``(latitude, longitude)`` of the second point, in degrees.

    Returns:
        The distance in kilometres, based on an Earth radius of 6372.8 km.
    """
    d_earth = 2.0 * 6372.8  # Earth diameter: the formula is 2 * R * asin(sqrt(h))
    lat1, long1 = (radians(c) for c in point_1)
    lat2, long2 = (radians(c) for c in point_2)
    d = sin((lat2 - lat1) / 2.0) ** 2.0 + cos(lat1) * cos(lat2) * sin(
        (long2 - long1) / 2.0) ** 2.0
    # Rounding can push the haversine term a hair above 1.0 for (nearly)
    # antipodal points, which would make asin raise "math domain error".
    # Clamping leaves every in-range value untouched.
    return d_earth * asin(min(1.0, d ** 0.5))


def find_nearest(point_1: tuple, points: dict):
    """Find the entry of ``points`` that lies closest to ``point_1``.

    Args:
        point_1: ``(latitude, longitude)`` of the query location.
        points: mapping of a name to its ``(latitude, longitude)`` pair.

    Returns:
        A dict with the closest entry's ``'name'``, its ``'distance'`` as
        returned by ``distance_haversine``, and a ``'dist_coef'`` of 3 when the
        distance is at most 1.0, 2 when it is below 2.0, and 1 otherwise.
    """
    measured = [(label, distance_haversine(point_1, coords))
                for label, coords in points.items()]
    # min() keeps the first of equally distant entries, i.e. insertion order.
    name, dist = min(measured, key=lambda pair: pair[1])

    if dist <= 1.0:
        coef = 3
    elif dist < 2.0:
        coef = 2
    else:
        coef = 1

    return {'name': name, 'distance': dist, 'dist_coef': coef}


# Sample metro stations: name -> (latitude, longitude) in degrees.
metro_points = {
    'Новокосино': (55.745113, 37.864052),
    'Перово': (55.75098, 37.78422),
    'Ховрино': (55.8777, 37.4877),
}

# Query location to match against the stations above.
point_1 = (55.741298984107324, 37.415756143334846)
nearest_station = find_nearest(point_1, metro_points)
print(nearest_station)
# Expected output:
# {'name': 'Ховрино', 'distance': 15.823760672698684, 'dist_coef': 1}

{'name': 'Ховрино', 'distance': 15.823760672698684, 'dist_coef': 1}
