In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import ListedColormap

from sklearn.metrics import (mean_absolute_error, r2_score, mean_squared_error)

from sklearn.linear_model import LinearRegression

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler

  from pandas.core import (


In [2]:
df = pd.read_csv('data/sales.csv')
df.head()

Unnamed: 0,SalePrice,GrLivArea,GarageArea,OverallQual,Street,SaleCondition
0,208500,1710,548,7,Pave,Normal
1,181500,1262,460,6,Pave,Normal
2,223500,1786,608,7,Pave,Normal
3,140000,1717,642,7,Pave,Abnorml
4,250000,2198,836,8,Pave,Normal


### Урок 2. Регрессия на практике

### Пусть у нас имеется некоторый набор данных sales.csv о продажах квартир.
* цена продажи (SalePrice, зависимая переменная),
* наземная жилая зона (GrLivArea),
* общее качество (Overall Quality),
* территория гаража (GarageArea),
* условия сделки (SaleCondition).
* Реализуйте построение модели линейной регрессии на этих данных. Проверьте качество модели на обучающей и тестовой выборках с помощью MAE, MSE.

In [3]:
df['Street'].value_counts()

Street
Pave    1454
Grvl       6
Name: count, dtype: int64

In [4]:
df['SaleCondition'].value_counts()

SaleCondition
Normal     1198
Partial     125
Abnorml     101
Family       20
Alloca       12
AdjLand       4
Name: count, dtype: int64

In [5]:
# обработаем категориальные признаки 'Street', 'SaleCondition'
for col in df[['Street', 'SaleCondition']]:
    df = pd.concat([df, pd.get_dummies(df[col], prefix=col)], axis=1)

In [6]:
df.head()

Unnamed: 0,SalePrice,GrLivArea,GarageArea,OverallQual,Street,SaleCondition,Street_Grvl,Street_Pave,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,208500,1710,548,7,Pave,Normal,False,True,False,False,False,False,True,False
1,181500,1262,460,6,Pave,Normal,False,True,False,False,False,False,True,False
2,223500,1786,608,7,Pave,Normal,False,True,False,False,False,False,True,False
3,140000,1717,642,7,Pave,Abnorml,False,True,True,False,False,False,False,False
4,250000,2198,836,8,Pave,Normal,False,True,False,False,False,False,True,False


### Построение модели

In [7]:
select_features = df.drop(['Street', 'SaleCondition'], axis=1)
features_scaled = StandardScaler().fit_transform(select_features)

In [8]:
df_sales = pd.DataFrame(features_scaled, columns=select_features.columns)

In [9]:
df_sales.head()

Unnamed: 0,SalePrice,GrLivArea,GarageArea,OverallQual,Street_Grvl,Street_Pave,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,0.347273,0.370333,0.351,0.651479,-0.064238,0.064238,-0.272616,-0.052414,-0.091035,-0.117851,0.467651,-0.305995
1,0.007288,-0.482512,-0.060731,-0.071836,-0.064238,0.064238,-0.272616,-0.052414,-0.091035,-0.117851,0.467651,-0.305995
2,0.536154,0.515013,0.631726,0.651479,-0.064238,0.064238,-0.272616,-0.052414,-0.091035,-0.117851,0.467651,-0.305995
3,-0.515281,0.383659,0.790804,0.651479,-0.064238,0.064238,3.668167,-0.052414,-0.091035,-0.117851,-2.138345,-0.305995
4,0.869843,1.299326,1.698485,1.374795,-0.064238,0.064238,-0.272616,-0.052414,-0.091035,-0.117851,0.467651,-0.305995


In [10]:
X = df_sales.drop(['SalePrice'], axis=1)
Y = df_sales['SalePrice']

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [11]:
model_lr = LinearRegression().fit(X_train, y_train)

train_pred = model_lr.predict(X_train)

print('TRAIN')
print(f'MSE: {mean_squared_error(y_train, train_pred)}')
print(f'MAE: {mean_absolute_error(y_train, train_pred)}')
print(f'R2: {r2_score(y_train, train_pred)}')

TRAIN
MSE: 0.24579056495437268
MAE: 0.32966233268212247
R2: 0.7401062895372108


In [12]:
test_pred = model_lr.predict(X_test)

print('TEST')
print(f'MSE: {mean_squared_error(y_test, test_pred)}')
print(f'MAE: {mean_absolute_error(y_test, test_pred)}')
print(f'R2: {r2_score(y_test, test_pred)}')

TEST
MSE: 0.2799408084781353
MAE: 0.3432518297232531
R2: 0.7698236753178695
