Данные: технические характеристики разных конфигураций автомобилей

Таргет: длина колесной базы конфигурации (в миллиметрах).

Задача: добиться наименьшего MSE на test.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import date
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.linear_model import LassoCV, RidgeCV

In [2]:
# Load the training table of car-configuration technical parameters
# straight from the course repository on GitHub.
df = pd.read_csv(
    "https://raw.githubusercontent.com/katarina74/ml_lessons/main/lesson_2/data/techparams_train.csv",
)
# Bare last expression: the notebook renders a truncated preview of the frame.
df

Unnamed: 0,index,back-suspension,battery-capacity,charge-time,compression,consumption-mixed,cylinders-order,cylinders-value,engine-feeding,engine-start,...,configurations_front-brake,configurations_safety-rating,configurations_seats,configurations_tank-volume,supergen_year-stop,models_country-from,models_group,models_light-and-commercial,models_male,target
0,0,9,-1.0,36457,9.0,4.3,0,3,4,2006,...,1,2,13,40.0,2018.0,16,3,0,1,2360
1,2,3,-1.0,44872,8.0,-1.0,3,7,4,1982,...,4,2,13,108.0,1993.0,34,3,0,1,3060
2,4,3,-1.0,55927,16.0,4.2,0,4,5,2014,...,4,2,13,55.0,2019.0,35,3,0,1,2648
3,5,0,-1.0,41405,10.3,-1.0,0,4,4,2000,...,4,2,13,55.0,2003.0,10,3,0,1,2513
4,7,8,-1.0,22523,19.0,-1.0,0,4,8,2000,...,4,1,13,62.0,2005.0,10,3,0,1,2703
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43240,61224,9,-1.0,34558,11.2,6.6,0,4,4,2004,...,1,2,13,47.0,2016.0,15,3,0,1,2510
43241,61225,3,-1.0,38268,9.6,5.8,0,4,2,2013,...,4,2,13,65.0,2015.0,10,3,0,1,2808
43242,61226,3,-1.0,8958,10.0,9.3,0,4,4,1991,...,4,2,13,55.0,1999.0,16,3,0,1,2520
43243,61227,3,-1.0,20293,8.5,-1.0,0,5,4,2002,...,1,2,13,70.0,2004.0,28,3,0,1,2755


In [3]:
# The 'index' column is only a row identifier (it mirrors the automatic
# DataFrame index), so it is dropped before modelling.
# errors='ignore' keeps this cell idempotent: because `df` is overwritten in
# place, re-running the cell would otherwise raise KeyError once the column
# has already been removed.
df = df.drop(columns='index', errors='ignore')

#### Linear regression

In [4]:
# Baseline: plain linear regression on the raw, unscaled features.

# Separate the regression target from the feature matrix.
y = df["target"]
X = df.drop(columns=["target"])

# Hold out 33% of the rows for evaluation; the fixed seed makes the split
# reproducible.
train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

reg = LinearRegression()
reg.fit(train_X, train_y)

In [5]:
# Report train/test MSE of the baseline model.
# mean_squared_error takes (y_true, y_pred); the arguments were previously
# passed in the reverse order. MSE is symmetric, so the printed values are
# unchanged, but the call now matches the documented signature and the other
# report cells in this notebook.
print(
    f'Train MSE: {mean_squared_error(train_y, reg.predict(train_X))};\n'
    f'Test MSE: {mean_squared_error(test_y, reg.predict(test_X))}.'
)

Train MSE: 29843.04207439402;
Test MSE: 31015.75310911948.


#### Через MinMaxScaler

In [6]:
# Linear regression on features rescaled with MinMaxScaler.

# Separate the regression target from the feature matrix.
y = df["target"]
X = df.drop(columns=["target"])

# Hold out 33% of the rows for evaluation (seed fixed for reproducibility).
train_X, test_X, train_y, test_y = train_test_split(
    X, y,
    test_size=0.33,
    random_state=42,
)

# The scaler is fitted on the training part only and then applied to both
# parts, so no information from the held-out rows enters the transform.
scaler = MinMaxScaler()
scaler.fit(train_X)
train_X_scaled = scaler.transform(train_X)
test_X_scaled = scaler.transform(test_X)

reg = LinearRegression()
reg.fit(train_X_scaled, train_y)

In [7]:
# Train/test MSE of the model fitted on MinMax-scaled features.
print(
    f'Train MSE: {mean_squared_error(train_y, reg.predict(train_X_scaled))};\n'
    f'Test MSE: {mean_squared_error(test_y, reg.predict(test_X_scaled))}.'
)

Train MSE: 29843.042074394023;
Test MSE: 31015.753109119363.


#### Через StandardScaler

In [8]:
# Linear regression on features standardised with StandardScaler.

# Separate the regression target from the feature matrix.
y = df["target"]
X = df.drop(columns=["target"])

# Hold out 33% of the rows for evaluation (seed fixed for reproducibility).
train_X, test_X, train_y, test_y = train_test_split(
    X, y,
    test_size=0.33,
    random_state=42,
)

# Fit the scaler on the training part only, then reuse it for the test part.
scaler = StandardScaler()
scaler.fit(train_X)
train_X_scaled = scaler.transform(train_X)
test_X_scaled = scaler.transform(test_X)

reg = LinearRegression()
reg.fit(train_X_scaled, train_y)

In [9]:
# Train/test MSE of the model fitted on standardised features.
print(
    f'Train MSE: {mean_squared_error(train_y, reg.predict(train_X_scaled))};\n'
    f'Test MSE: {mean_squared_error(test_y, reg.predict(test_X_scaled))}.'
)

Train MSE: 29843.042074394023;
Test MSE: 31015.75310911936.


#### Через RobustScaler

In [10]:
# Linear regression on features rescaled with RobustScaler.

# Separate the regression target from the feature matrix.
y = df["target"]
X = df.drop(columns=["target"])

# Hold out 33% of the rows for evaluation (seed fixed for reproducibility).
train_X, test_X, train_y, test_y = train_test_split(
    X, y,
    test_size=0.33,
    random_state=42,
)

# Fit the scaler on the training part only, then reuse it for the test part.
scaler = RobustScaler()
scaler.fit(train_X)
train_X_scaled = scaler.transform(train_X)
test_X_scaled = scaler.transform(test_X)

reg = LinearRegression()
reg.fit(train_X_scaled, train_y)

In [11]:
# Train/test MSE of the model fitted on RobustScaler-transformed features.
print(
    f'Train MSE: {mean_squared_error(train_y, reg.predict(train_X_scaled))};\n'
    f'Test MSE: {mean_squared_error(test_y, reg.predict(test_X_scaled))}.'
)

Train MSE: 29843.042074394023;
Test MSE: 31015.753109119352.


#### Lasso

In [12]:
# Lasso regression (library-default regularisation strength) on
# standardised features.

# Separate the regression target from the feature matrix.
y = df["target"]
X = df.drop(columns=["target"])

# Hold out 33% of the rows for evaluation (seed fixed for reproducibility).
train_X, test_X, train_y, test_y = train_test_split(
    X, y,
    test_size=0.33,
    random_state=42,
)

# Fit the scaler on the training part only, then reuse it for the test part.
scaler = StandardScaler()
scaler.fit(train_X)
train_X_scaled = scaler.transform(train_X)
test_X_scaled = scaler.transform(test_X)

reg = Lasso()
reg.fit(train_X_scaled, train_y)

In [13]:
# Train/test MSE of the Lasso model.
print(
    f'Train MSE: {mean_squared_error(train_y, reg.predict(train_X_scaled))};\n'
    f'Test MSE: {mean_squared_error(test_y, reg.predict(test_X_scaled))}.'
)

Train MSE: 29883.97039909283;
Test MSE: 31070.93628571238.


#### Ridge

In [14]:
# Ridge regression (library-default regularisation strength) on
# standardised features.

# Separate the regression target from the feature matrix.
y = df["target"]
X = df.drop(columns=["target"])

# Hold out 33% of the rows for evaluation (seed fixed for reproducibility).
train_X, test_X, train_y, test_y = train_test_split(
    X, y,
    test_size=0.33,
    random_state=42,
)

# Fit the scaler on the training part only, then reuse it for the test part.
scaler = StandardScaler()
scaler.fit(train_X)
train_X_scaled = scaler.transform(train_X)
test_X_scaled = scaler.transform(test_X)

reg = Ridge()
reg.fit(train_X_scaled, train_y)

In [15]:
# Train/test MSE of the Ridge model.
print(
    f'Train MSE: {mean_squared_error(train_y, reg.predict(train_X_scaled))};\n'
    f'Test MSE: {mean_squared_error(test_y, reg.predict(test_X_scaled))}.'
)

Train MSE: 29843.04245888063;
Test MSE: 31015.77143866261.


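Формально наименьший Test MSE из проделанных способов получился при RobustScaler — 31015.753109119352. Однако отличие от обычной линейной регрессии и от вариантов с MinMaxScaler и StandardScaler проявляется лишь в 10–11-м знаке после запятой, то есть на уровне погрешности вычислений с плавающей точкой: масштабирование признаков практически не влияет на качество линейной регрессии без регуляризации. Lasso и Ridge с параметрами по умолчанию дали чуть больший Test MSE (≈31070.94 и ≈31015.77 соответственно).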