In [2]:
import numpy as np
import pandas as pd
from sklearn.ensemble import BaggingClassifier, GradientBoostingClassifier, \
StackingClassifier, BaggingRegressor, GradientBoostingRegressor, StackingRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.metrics import confusion_matrix, classification_report, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings('ignore')

# Classification

### Подготовка данных

In [3]:
df = pd.read_csv('data/students_preprocessed.csv', sep = ',')
students_df = df[list(('school', 'sex', 'age', 'Pstatus', 'studytime', \
                                  'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'higher', \
                                  'internet', 'absences', 'G1', 'G2', 'G3'))]

In [4]:
students_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 395 entries, 0 to 394
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   school      395 non-null    int64
 1   sex         395 non-null    int64
 2   age         395 non-null    int64
 3   Pstatus     395 non-null    int64
 4   studytime   395 non-null    int64
 5   failures    395 non-null    int64
 6   schoolsup   395 non-null    int64
 7   famsup      395 non-null    int64
 8   paid        395 non-null    int64
 9   activities  395 non-null    int64
 10  higher      395 non-null    int64
 11  internet    395 non-null    int64
 12  absences    395 non-null    int64
 13  G1          395 non-null    int64
 14  G2          395 non-null    int64
 15  G3          395 non-null    int64
dtypes: int64(16)
memory usage: 49.5 KB


In [5]:
X = students_df.drop('G3', axis = 1)
y = students_df.G3

#### Разбиение данных на обучающую и тестовую выборки

In [22]:
(X_train, X_test, y_train, y_test) = train_test_split(X, y, test_size = 0.25)

In [7]:
scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [8]:
def print_classification_metrics(estimator, y_test, y_pred):
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    print(estimator.score(X_test, y_test))

### DTC

### Анализ базовой модели DecisionTreeClassifier

In [23]:
%%time
tree_params = {
    'max_depth': range(1, 17, 2),
    'min_samples_leaf': range(1, 8),
    'min_samples_split': range(2, 10, 2)
}

tree = DecisionTreeClassifier(random_state=0)
tree_base = GridSearchCV(tree, tree_params, cv=5).fit(X_train, y_train)
tree_base.best_params_

Wall time: 11.8 s


{'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 2}

In [24]:
print_classification_metrics(tree_base, y_test, tree_base.predict(X_test))

[[10  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  3  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  1  0  0  0  0  0  0  0  0  0  0  0]
 [ 2  0  0  3  0  4  0  0  0  0  0  0  0  0  0]
 [ 1  0  0  0  0  1  0  0  0  0  0  0  0  0  0]
 [ 4  0  0  0  0  8  4  0  0  0  0  0  0  0  0]
 [ 1  0  0  0  0  5 10  0  1  0  0  0  0  0  0]
 [ 0  0  0  0  0  1  5  0  1  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  2  0  7  0  1  0  0  0  0]
 [ 0  0  0  0  0  0  2  0  3  0  6  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  7  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  2  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  1  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  1  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  2  0]]
              precision    recall  f1-score   support

           0       0.56      1.00      0.71        10
           6       0.00      0.00      0.00         3
           7       0.00      0.00      0.00         1
           8       0.43      0.33      0.38         9
        

In [11]:
ensemble_parameters = {'n_estimators': np.arange(20,101,20),
                       'max_features': np.arange(3,24,10)}

### BaggingClassifier

In [25]:
%%time
bagging = BaggingClassifier(base_estimator=DecisionTreeClassifier(max_depth=tree_base.best_params_['max_depth']))
bagging_model = GridSearchCV(bagging, ensemble_parameters).fit(X_train, y_train)
bagging_model.best_params_

Wall time: 12.1 s


{'max_features': 13, 'n_estimators': 80}

In [26]:
print_classification_metrics(bagging_model, y_test, bagging_model.predict(X_test))

[[ 9  0  0  0  0  1  0  0  0  0  0  0  0  0  0]
 [ 0  3  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  1  0  0  0  0  0  0  0  0  0  0  0]
 [ 2  1  0  2  0  4  0  0  0  0  0  0  0  0  0]
 [ 1  0  0  0  0  1  0  0  0  0  0  0  0  0  0]
 [ 1  0  0  0  0 11  4  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  6  5  5  1  0  0  0  0  0  0]
 [ 0  0  0  0  0  1  3  2  1  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  2  7  0  1  0  0  0  0]
 [ 0  0  0  0  0  0  0  2  3  0  6  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  7  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  2  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  1  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  1  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  2  0]]
              precision    recall  f1-score   support

           0       0.69      0.90      0.78        10
           6       0.75      1.00      0.86         3
           7       0.00      0.00      0.00         1
           8       0.67      0.22      0.33         9
        

### GradientBoostingClassifier

In [27]:
%%time
gbc = GradientBoostingClassifier()
gbc_model = GridSearchCV(gbc, ensemble_parameters).fit(X_train, y_train)
gbc_model.best_params_

Wall time: 1min 20s


{'max_features': 3, 'n_estimators': 100}

In [28]:
print_classification_metrics(gbc_model, y_test, gbc_model.predict(X_test))

[[6 0 0 3 1 0 0 0 0 0 0 0 0 0 0]
 [0 3 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 0 0 0 0 0 0 0]
 [0 0 1 4 3 1 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 1 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 3 8 2 2 0 0 0 0 0 0 0]
 [0 0 0 0 2 1 9 3 2 0 0 0 0 0 0]
 [0 0 0 0 0 0 3 2 2 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 2 4 4 0 0 0 0 0]
 [0 0 0 0 0 0 0 1 4 3 3 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 5 2 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 1 1 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 1 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 1 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 2 0]]
              precision    recall  f1-score   support

           0       0.75      0.60      0.67        10
           6       1.00      1.00      1.00         3
           7       0.00      0.00      0.00         1
           8       0.57      0.44      0.50         9
           9       0.10      0.50      0.17         2
          10       0.73      0.50      0.59        16
          11       0.64      0.53      0.58        17
          12       0.20      0.29      0.24         7
          13     

### StackingClassifier

In [29]:
%%time
sc_model = StackingClassifier(estimators=[('bag',bagging), ('gbc',gbc)],
                           final_estimator=tree_base).fit(X_train, y_train)

Wall time: 28.6 s


In [30]:
print_classification_metrics(sc_model, y_test, sc_model.predict(X_test))

[[3 0 0 5 0 1 1 0 0 0 0 0 0 0 0]
 [0 1 0 2 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 1 0 0 0 0 0 0 0 0 0 0 0]
 [0 1 0 4 2 2 0 0 0 0 0 0 0 0 0]
 [0 0 0 1 0 1 0 0 0 0 0 0 0 0 0]
 [0 0 0 1 2 6 4 1 0 2 0 0 0 0 0]
 [0 0 0 0 0 5 5 5 0 2 0 0 0 0 0]
 [0 0 0 0 0 1 0 5 1 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 2 5 3 0 0 0 0 0]
 [0 0 0 0 0 0 0 2 1 7 1 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 7 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 2 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 1 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 1 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 2 0]]
              precision    recall  f1-score   support

           0       1.00      0.30      0.46        10
           6       0.50      0.33      0.40         3
           7       0.00      0.00      0.00         1
           8       0.29      0.44      0.35         9
           9       0.00      0.00      0.00         2
          10       0.38      0.38      0.38        16
          11       0.50      0.29      0.37        17
          12       0.33      0.71      0.45         7
          13     

# Regression

### Подготовка данных

In [23]:
df = pd.read_csv('data/students_preprocessed.csv', sep = ',')
students_df = df[list(('school', 'sex', 'age', 'Pstatus', 'studytime', \
                                  'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'higher', \
                                  'internet', 'absences', 'G1', 'G2', 'G3'))]

In [24]:
X = students_df.drop('G3', axis = 1)
y = students_df.G3

In [31]:
(X_train, X_test, y_train, y_test) = train_test_split(X, y, test_size = 0.15)

In [32]:
scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [33]:
def get_regression_metrics(estimator, y_test, y_pred):
    print(f"coefficient of determination: {estimator.score(X_train, y_train)}")
    print(f'MSE: {mean_squared_error(y_test, y_pred)}')
    print(f'RMSE: {mean_squared_error(y_test, y_pred, squared=False)}')
    print(f'MAE: {mean_absolute_error(y_test, y_pred)}')

### DTR

### Анализ базовой модели DecisionTreeRegressor

In [34]:
%%time
tr = DecisionTreeRegressor(random_state=0).fit(X_train, y_train)
tr_base = GridSearchCV(tr, tree_params, cv=5).fit(X_train, y_train)
tr_base.best_params_

Wall time: 5.02 s


{'max_depth': 9, 'min_samples_leaf': 2, 'min_samples_split': 8}

In [35]:
get_regression_metrics(tr_base, y_test, tr_base.predict(X_test))

coefficient of determination: 0.9532906378863759
MSE: 1.3830036356140254
RMSE: 1.1760117497771974
MAE: 0.81989898989899


### BaggingRegressor

In [36]:
%%time
br = BaggingRegressor(base_estimator=DecisionTreeRegressor(max_depth=tr_base.best_params_['max_depth']))
br_model = GridSearchCV(br, ensemble_parameters).fit(X_train, y_train)
br_model.best_params_

Wall time: 12.3 s


{'max_features': 13, 'n_estimators': 100}

In [37]:
get_regression_metrics(br_model, y_test, br_model.predict(X_test))

coefficient of determination: 0.9781124758432185
MSE: 1.1012516243764934
RMSE: 1.0494053670419707
MAE: 0.7502030676091604


### GradientBoostingRegressor

In [38]:
%%time
gbr = GradientBoostingRegressor()
gbr_model = GridSearchCV(gbr, ensemble_parameters).fit(X_train, y_train)
gbr_model.best_params_

Wall time: 2.68 s


{'max_features': 13, 'n_estimators': 80}

In [39]:
get_regression_metrics(gbr_model, y_test, gbr_model.predict(X_test))

coefficient of determination: 0.9459987702263646
MSE: 1.7150369182548448
RMSE: 1.3095941807502218
MAE: 0.95679223856274


### StackingRegressor

In [40]:
%%time
sr = StackingRegressor(estimators=[('br',br), ('gbr',gbr)],
                           final_estimator=tr_base).fit(X_train, y_train)

Wall time: 4.67 s


In [41]:
get_regression_metrics(sr, y_test, sr.predict(X_test))

coefficient of determination: 0.9329813392947637
MSE: 3.2800049134992406
RMSE: 1.811078384140024
MAE: 1.1215765755912668


### Выводы

Для обучения композиционных моделей необходимо больше времени, но использование композиций алгоритмов позволяет повысить качество моделей.