In [5]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression

df = pd.read_csv('huh.csv')

Вначале вновь проведем label encoding

In [6]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
for c in df.columns:
    df[c] = le.fit_transform(df[c])
df.head()

Unnamed: 0.1,Unnamed: 0,id,hour,day,month,source,destination,name,price,distance,...,icon,dewPoint,pressure,windBearing,cloudCover,uvIndex,ozone,moonPhase,precipIntensityMax,temperatureMin
0,0,165097,9,9,1,5,7,7,4,15,...,5,173,230,21,58,0,131,5,52,113
1,1,188776,2,13,0,5,7,2,20,15,...,6,258,98,40,82,0,69,9,53,118
2,2,378853,1,14,0,5,7,5,8,15,...,1,149,28,113,3,0,157,10,36,78
3,3,485300,4,16,0,5,7,4,53,15,...,1,61,158,163,0,0,69,12,0,67
4,4,558249,3,15,0,5,7,6,14,15,...,5,115,39,156,37,0,237,11,1,51


Повторим прошлые операции для анализа успешности оптимизации гиперпараметров

In [7]:
X_to_split = df.drop(['price','surge_multiplier'], inplace=False, axis=1)
Y_to_split = df['price']

X_train,  X_test, Y_train, Y_test = train_test_split(X_to_split, Y_to_split, random_state=100)

dummy1 = DummyRegressor(strategy='mean')
dummy2 = DummyRegressor(strategy='median')

lr = LinearRegression()

dummy1.fit(X_train, Y_train)
dummy2.fit(X_train, Y_train)
lr.fit(X_train, Y_train)

In [8]:
from sklearn.metrics import mean_squared_error

y_preds_dummy1 = dummy1.predict(X_test)
print(f'RMSE Dummy1 : {mean_squared_error(Y_test, y_preds_dummy1, squared=True)}')
y_preds_dummy2 = dummy2.predict(X_test)
print(f'RMSE Dummy2 : {mean_squared_error(Y_test, y_preds_dummy2, squared=True)}')
y_preds_lr = lr.predict(X_test)
print(f'RMSE Linear : {mean_squared_error(Y_test, y_preds_lr, squared=True)}')

RMSE Dummy1 : 443.5180559696383
RMSE Dummy2 : 473.85158689355086
RMSE Linear : 232.16233744815963


Для начала проведем рескейлинг и оптимизацию гиперпараметров линейной регрессии

In [9]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

pipeline = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("clf", LinearRegression()),
    ]
)

model = pipeline.fit(X_train, Y_train)

parameters = {
    'scaler__with_mean': [True, False],
    'clf__copy_X': [True, False],
    'clf__fit_intercept': [True, False],
    'clf__n_jobs': np.arange(0, 10),
}

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    n_jobs=2,
    verbose=1,
)

grid_search.fit(X_train, Y_train)

best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print(f"{param_name}: {best_parameters[param_name]}")

y_preds_lr_opt = grid_search.predict(X_test)

Fitting 5 folds for each of 80 candidates, totalling 400 fits
clf__copy_X: True
clf__fit_intercept: True
clf__n_jobs: 0
scaler__with_mean: True


In [10]:
y_preds_lr_opt = grid_search.predict(X_test)
print(f'RMSE Linear : {mean_squared_error(Y_test, y_preds_lr_opt, squared=True)}')

RMSE Linear : 232.16233744815958


In [11]:
pipelineDummy = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("clf", dummy2),
    ]
)

modelDummy = pipelineDummy.fit(X_train, Y_train)

parametersDummy = {
    'scaler__with_mean': [True, False],
}

grid_searchDummy = GridSearchCV(
    estimator=pipelineDummy,
    param_grid=parametersDummy,
    n_jobs=2,
    verbose=1,
)

grid_searchDummy.fit(X_train, Y_train)

y_preds_dummy1_opt = grid_searchDummy.predict(X_test)
print(f'RMSE Dummy : {mean_squared_error(Y_test, y_preds_dummy1_opt, squared=True)}')

Fitting 5 folds for each of 2 candidates, totalling 10 fits
RMSE Dummy : 473.85158689355086


При попытке оптимизации и рескейлинге мы получили изменение абсолютной величины ошибки (причем в большую сторону), но отношение ошибок между ошибкой бейслайна и ошибки линейной регрессией сохраняется.

Попробуем другие методы регрессии

In [12]:
from sklearn.neighbors import KNeighborsRegressor

nr = KNeighborsRegressor()

pipelineNR = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("clf", nr),
    ]
)

modelNR = pipelineNR.fit(X_train, Y_train)

parametersNR = {
    'clf__n_neighbors': [5,3,7],
    'clf__weights': ['uniform', 'distance']
}

grid_searchNR = GridSearchCV(
    estimator=pipelineNR,
    param_grid=parametersNR,
    n_jobs=-1,
    verbose=1,
)

grid_searchNR.fit(X_train, Y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


KeyError: 'clf__copy_X'

In [13]:
best_parametersNR = grid_searchNR.best_estimator_.get_params()
for param_name in sorted(parametersNR.keys()):
    print(f"{param_name}: {best_parametersNR[param_name]}")


y_preds_NR_opt = grid_searchNR.predict(X_test)
print(f'RMSE Neighbors : {mean_squared_error(Y_test, y_preds_NR_opt, squared=True)}')

clf__n_neighbors: 7
clf__weights: uniform
RMSE Neighbors : 211.306968236026


Лучшие результаты (211.3) показала модель с параметром n_neighbors = 7 и weights = 'uniform'. Попробуем еще раз увеличить количество соседей

In [14]:
nr = KNeighborsRegressor()

pipelineNR = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("clf", nr),
    ]
)

modelNR = pipelineNR.fit(X_train, Y_train)

parametersNR = {
    'clf__n_neighbors': [10],
    'clf__weights': ['uniform']
}

grid_searchNR = GridSearchCV(
    estimator=pipelineNR,
    param_grid=parametersNR,
    n_jobs=-1,
    verbose=1,
)

grid_searchNR.fit(X_train, Y_train)

best_parametersNR = grid_searchNR.best_estimator_.get_params()
for param_name in sorted(parametersNR.keys()):
    print(f"{param_name}: {best_parametersNR[param_name]}")


y_preds_NR_opt = grid_searchNR.predict(X_test)
print(f'RMSE Neighbors : {mean_squared_error(Y_test, y_preds_NR_opt, squared=True)}')

Fitting 5 folds for each of 1 candidates, totalling 5 fits
clf__n_neighbors: 10
clf__weights: uniform
RMSE Neighbors : 204.5676245501398


Увеличение количества соседей позволило еще немного сократить ошибку (204.5)

Возможно, одной из причин сохранения ошибки (значения около 200) связано с неправильной обработкой категориальных данных (возможно, вместо label encoding стоило просто отбросить такие столбцы)