In [35]:
import pandas as pd
import numpy as np
from math import sqrt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, ElasticNetCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error

## Подготовка данных

In [2]:
df = pd.read_csv('data/students_preprocessed.csv', sep = ',')
students_df = df[list(('school', 'sex', 'age', 'Pstatus', 'studytime', \
                                  'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'higher', \
                                  'internet', 'absences', 'G1', 'G2', 'G3'))]

In [3]:
students_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 395 entries, 0 to 394
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   school      395 non-null    int64
 1   sex         395 non-null    int64
 2   age         395 non-null    int64
 3   Pstatus     395 non-null    int64
 4   studytime   395 non-null    int64
 5   failures    395 non-null    int64
 6   schoolsup   395 non-null    int64
 7   famsup      395 non-null    int64
 8   paid        395 non-null    int64
 9   activities  395 non-null    int64
 10  higher      395 non-null    int64
 11  internet    395 non-null    int64
 12  absences    395 non-null    int64
 13  G1          395 non-null    int64
 14  G2          395 non-null    int64
 15  G3          395 non-null    int64
dtypes: int64(16)
memory usage: 49.5 KB


In [4]:
X = students_df.drop('G3', axis = 1)
y = students_df.G3

Разбиение данных на обучающую и тестовую выборки

In [5]:
(X_train, X_test, y_train, y_test) = train_test_split(X, y, test_size = 0.2)

In [6]:
scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Обучение моделей

### Линейная регрессия

Реализация в sklearn: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html

In [7]:
model_lr = LinearRegression().fit(X_train, y_train)

Оценка качества модели линейной регрессии

In [8]:
#R^2
r_sq_lr = model_lr.score(X_train, y_train)
print('coefficient of determination:', r_sq_lr)

coefficient of determination: 0.8515061651034703


In [9]:
# предсказания на основе тестовой выборки
y_pred_lr = model_lr.predict(X_test)

In [10]:
# MSE, MAE
mse_lr = mean_squared_error(y_test, y_pred_lr)
mae_lr = mean_absolute_error(y_test, y_pred_lr)
print('mse: %.3f, mae: %.3f' % (mse_lr, mae_lr))

mse: 5.389, mae: 1.426


In [11]:
# RMSE
rmse_lr = sqrt(mse_lr)
print('rmse: %.3f' % (rmse_lr))

rmse: 2.321


Подбор гиперпараметров

### Регрессия дерева решений

Реализация в sklearn: https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html

In [12]:
model_tree = DecisionTreeRegressor(random_state = 0).fit(X_train, y_train)

Оценка качества модели регрессии дерева решений

In [13]:
#R^2
r_sq_tree = model_tree.score(X_train, y_train)
print('coefficient of determination:', r_sq_tree)

coefficient of determination: 0.9999227963739704


In [14]:
# предсказания на основе тестовой выборки
y_pred_tree = model_tree.predict(X_test)

In [15]:
# MSE, MAE
mse_tree = mean_squared_error(y_test, y_pred_tree)
mae_tree = mean_absolute_error(y_test, y_pred_tree)
print('mse: %.3f, mae: %.3f' % (mse_tree, mae_tree))

mse: 2.956, mae: 0.924


In [16]:
# RMSE
rmse_tree = sqrt(mse_tree)
print('rmse: %.3f' % (rmse_tree))

rmse: 1.719


### LASSO

Реализация в sklearn: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LassoCV.html#sklearn.linear_model.LassoCV

In [17]:
model_lasso = LassoCV().fit(X_train, y_train)

Оценка качества модели регрессии дерева решений

In [18]:
#R^2
r_sq_lasso = model_lasso.score(X_train, y_train)
print('coefficient of determination:', r_sq_lasso)

coefficient of determination: 0.8483462998595697


In [19]:
# предсказания на основе тестовой выборки
y_pred_lasso = model_lasso.predict(X_test)

In [20]:
# MSE, MAE
mse_lasso = mean_squared_error(y_test, y_pred_lasso)
mae_lasso = mean_absolute_error(y_test, y_pred_lasso)
print('mse: %.3f, mae: %.3f' % (mse_lasso, mae_lasso))

mse: 5.668, mae: 1.384


In [21]:
# RMSE
rmse_lasso = sqrt(mse_lasso)
print('rmse: %.3f' % (rmse_lasso))

rmse: 2.381


### Модель гребневой регрессии

Реализация в sklearn: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html

In [33]:
model_ridge = RidgeCV().fit(X_train, y_train)

Оценка качества модели регрессии дерева решений

In [23]:
#R^2
r_sq_ridge = model_ridge.score(X_train, y_train)
print('coefficient of determination:', r_sq_ridge)

coefficient of determination: 0.8514876775464333


In [24]:
# предсказания на основе тестовой выборки
y_pred_ridge = model_ridge.predict(X_test)

In [25]:
# MSE, MAE
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
mae_ridge = mean_absolute_error(y_test, y_pred_ridge)
print('mse: %.3f, mae: %.3f' % (mse_ridge, mae_ridge))

mse: 5.387, mae: 1.423


In [26]:
# RMSE
rmse_ridge = sqrt(mse_ridge)
print('rmse: %.3f' % (rmse_ridge))

rmse: 2.321


### Модель ElasticNet регрессии

Реализация в sklearn: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNet.html

In [27]:
model_en = ElasticNetCV().fit(X_train, y_train)

Оценка качества модели регрессии дерева решений

In [28]:
#R^2
r_sq_en = model_en.score(X_train, y_train)
print('coefficient of determination:', r_sq_en)

coefficient of determination: 0.8506891883127189


In [29]:
# предсказания на основе тестовой выборки
y_pred_en = model_en.predict(X_test)

In [30]:
# MSE, MAE
mse_en = mean_squared_error(y_test, y_pred_en)
mae_en = mean_absolute_error(y_test, y_pred_en)
print('mse: %.3f, mae: %.3f' % (mse_en, mae_en))

mse: 5.458, mae: 1.395


In [31]:
# RMSE
rmse_en = sqrt(mse_en)
print('rmse: %.3f' % (rmse_en))

rmse: 2.336


### Подбор наилучших гиперпараметров

In [71]:
parametrs = {
    'max_depth': range(1, 13, 2),
    'min_samples_leaf': range(1, 8),
    'min_samples_split': range(2, 10, 2)
}

In [72]:
grid_tree = GridSearchCV(model_tree, parametrs, cv=5)
grid_tree.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=DecisionTreeRegressor(random_state=0),
             param_grid={'max_depth': range(1, 13, 2),
                         'min_samples_leaf': range(1, 8),
                         'min_samples_split': range(2, 10, 2)})

In [73]:
grid_tree.best_params_

{'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 2}