In [5]:
import numpy as np #для матричных вычислений
import pandas as pd #для анализа и предобработки данных
import matplotlib.pyplot as plt #для визуализации
import seaborn as sns #для визуализации

from sklearn import linear_model #линейные модели
from sklearn import metrics #метрики
from sklearn import preprocessing #предобработка
from sklearn.model_selection import train_test_split #разделение выборки

In [25]:
# Read the insurance dataset into a DataFrame and print its structural summary
# (column names, non-null counts, dtypes, memory usage).
data = pd.read_csv(filepath_or_buffer='data/insurance.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [26]:
# Encode the binary categorical features as 0/1 integers.
# The dtype guard keeps this cell idempotent: an unguarded encoding, when re-run on the
# already-encoded column, would compare integers to 'no'/'female' and silently turn
# every value into 1.
if not pd.api.types.is_numeric_dtype(data['smoker']):
    data['smoker'] = (data['smoker'] != 'no').astype('int64')  # 'no' -> 0, anything else -> 1
if not pd.api.types.is_numeric_dtype(data['sex']):
    data['sex'] = (data['sex'] != 'female').astype('int64')  # 'female' -> 0, anything else -> 1
# One-hot encode the remaining categorical columns. Numeric/boolean columns pass through
# unchanged, so running this again on an already-encoded frame changes nothing.
data = pd.get_dummies(data)
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,charges,region_northeast,region_northwest,region_southeast,region_southwest
0,19,0,27.9,0,1,16884.924,False,False,False,True
1,18,1,33.77,1,0,1725.5523,False,False,True,False
2,28,1,33.0,3,0,4449.462,False,False,True,False
3,33,1,22.705,0,0,21984.47061,False,True,False,False
4,32,1,28.88,0,0,3866.8552,False,True,False,False


In [27]:
# Every column except the target 'charges' is used as a predictor.
features = data.columns.drop('charges')
X = data[features]     # feature matrix
y = data['charges']    # regression target

In [28]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=40,
)
# Show the resulting shapes as a quick sanity check on the split sizes.
print(f'Train: {X_train.shape} {y_train.shape}')
print(f'Test: {X_test.shape} {y_test.shape}')

Train: (1070, 9) (1070,)
Test: (268, 9) (268,)


In [29]:
# Fit a linear regression on the training features.
# fit() returns the estimator itself, so construction and fitting are chained.
lr_lstat = linear_model.LinearRegression().fit(X_train, y_train)

# Print the learned parameters: the intercept w0, then the whole coefficient vector.
print(f'w0: {lr_lstat.intercept_}')
print(f'w1: {lr_lstat.coef_}')  # array with the remaining parameters w1, w2, ..., wm

# Predictions for both splits.
y_train_predict = lr_lstat.predict(X_train)
y_test_predict = lr_lstat.predict(X_test)

w0: -12198.740769442573
w1: [  254.74380593  -109.93652974   326.37904839   583.2708384
 23771.28324974   536.70936675    47.11043104  -261.25319814
  -322.56659965]


In [30]:
# Quality of the linear model on the TRAINING split.
# NOTE(review): the unit label was changed from 'thou. $' to '$' — `charges` values such as
# 16884.924 look like plain dollar amounts, not thousands; confirm against the dataset docs.

# Mean absolute error, in the same units as the target.
print(f'MAE score: {metrics.mean_absolute_error(y_train, y_train_predict):.3f} $')
# Mean absolute percentage error; sklearn returns a fraction, hence the * 100.
print(f'MAPE score: {metrics.mean_absolute_percentage_error(y_train, y_train_predict) * 100:.3f} %')
# Coefficient of determination.
print(f'R2 score: {metrics.r2_score(y_train, y_train_predict):.3f}')

MAE score: 4103.695 thou. $
MAPE score: 40.747 %
R2 score: 0.755


In [31]:
# Quality of the linear model on the held-out TEST split.
# NOTE(review): the unit label was changed from 'thou. $' to '$' — `charges` values such as
# 16884.924 look like plain dollar amounts, not thousands; confirm against the dataset docs.

# Mean absolute error, in the same units as the target.
print(f'MAE score: {metrics.mean_absolute_error(y_test, y_test_predict):.3f} $')
# Mean absolute percentage error; sklearn returns a fraction, hence the * 100.
print(f'MAPE score: {metrics.mean_absolute_percentage_error(y_test, y_test_predict) * 100:.3f} %')
# Coefficient of determination.
print(f'R2 score: {metrics.r2_score(y_test, y_test_predict):.3f}')

MAE score: 4358.281 thou. $
MAPE score: 44.263 %
R2 score: 0.734


In [32]:
# Min-max scale the features. The scaler is fitted on the training split only and then
# applied to the test split, so no test-set statistics leak into the transformation.
scaler = preprocessing.MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Expand the scaled features with all degree-2 terms (squares and pairwise products);
# include_bias=False omits the constant column.
poly = preprocessing.PolynomialFeatures(degree=2, include_bias=False)
X_train_scaled_poly = poly.fit_transform(X_train_scaled)
X_test_scaled_poly = poly.transform(X_test_scaled)
print(X_train_scaled_poly.shape)

(1070, 54)


In [34]:
# Refit a plain linear regression, this time on the polynomial feature set.
# Note that this rebinds `lr_lstat`, replacing whatever estimator it referred to before.
lr_lstat = linear_model.LinearRegression().fit(X_train_scaled_poly, y_train)

# Predictions of the polynomial model for both splits.
y_test_predict_poly = lr_lstat.predict(X_test_scaled_poly)
y_train_predict_poly = lr_lstat.predict(X_train_scaled_poly)

In [35]:
# Coefficient of determination of the polynomial predictions on the test split.
print(f'R2 score: {metrics.r2_score(y_test, y_test_predict_poly):.3f}')

R2 score: 0.847


In [36]:
# Display the fitted coefficient vector of the estimator currently bound to `lr_lstat`
# (presumably the polynomial-feature regression fitted just before — relies on cell order).
lr_lstat.coef_

array([ 1.08172430e+14,  1.55580550e+17, -6.52226617e+15, -4.29119480e+16,
        7.18584476e+15,  1.53877675e+15,  5.62018978e+16, -1.41735832e+16,
        1.09296760e+16,  7.41987891e+03, -1.83250000e+02,  2.62510938e+03,
       -1.24517969e+03, -2.16093750e+02, -1.08172430e+14, -1.08172430e+14,
       -1.08172430e+14, -1.08172430e+14, -1.45841119e+17,  1.76557617e+02,
       -1.58572168e+03,  3.76708984e+02, -9.73943150e+15, -9.73943150e+15,
       -9.73943150e+15, -9.73943150e+15, -9.47328625e+03, -6.27657043e+02,
        5.39794662e+04,  6.52226617e+15,  6.52226617e+15,  6.52226617e+15,
        6.52226617e+15, -1.26400000e+03, -6.80000000e+02,  4.29119480e+16,
        4.29119480e+16,  4.29119480e+16,  4.29119480e+16, -9.66470108e+15,
        2.47885631e+15,  2.47885631e+15,  2.47885631e+15,  2.47885631e+15,
       -5.90095120e+15,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
       -6.05640722e+16,  0.00000000e+00,  0.00000000e+00,  9.81140879e+15,
        0.00000000e+00, -

In [40]:
# Fit an L2-regularised (Ridge) regression with default settings on the polynomial features
# and evaluate it on the test split.
# NOTE(review): the variable is called `lasso_lr_poly` but holds a Ridge estimator; the name
# is kept so that any later reference keeps working — consider renaming it.
lasso_lr_poly = linear_model.Ridge()
lasso_lr_poly.fit(X_train_scaled_poly, y_train)
# NOTE(review): this overwrites `y_test_predict_poly`, so any predictions previously stored
# under that name are lost from here on.
y_test_predict_poly = lasso_lr_poly.predict(X_test_scaled_poly)
print("Test R^2: {:.3f}".format(metrics.r2_score(y_test, y_test_predict_poly)))
# NOTE(review): the unit label was changed from 'thou. $' to '$' — `charges` values such as
# 16884.924 look like plain dollar amounts, not thousands; confirm against the dataset docs.
print('MAE score: {:.3f} $'.format(metrics.mean_absolute_error(y_test, y_test_predict_poly)))
# MAPE is returned as a fraction, hence the * 100.
print('MAPE score: {:.3f} %'.format(metrics.mean_absolute_percentage_error(y_test, y_test_predict_poly) * 100))

Test R^2: 0.840
MAE score: 2964.869 thou. $
MAPE score: 31.093 %
