In [72]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import tree, naive_bayes, svm

# Подготовка и нормализация данных

In [58]:
df = pd.read_csv('data/students_preprocessed.csv', sep = ',')
students_df = df[list(('school', 'sex', 'age', 'Pstatus', 'studytime', \
                                  'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'higher', \
                                  'internet', 'absences', 'G1', 'G2', 'G3'))]

In [60]:
students_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 395 entries, 0 to 394
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   school      395 non-null    int64
 1   sex         395 non-null    int64
 2   age         395 non-null    int64
 3   Pstatus     395 non-null    int64
 4   studytime   395 non-null    int64
 5   failures    395 non-null    int64
 6   schoolsup   395 non-null    int64
 7   famsup      395 non-null    int64
 8   paid        395 non-null    int64
 9   activities  395 non-null    int64
 10  higher      395 non-null    int64
 11  internet    395 non-null    int64
 12  absences    395 non-null    int64
 13  G1          395 non-null    int64
 14  G2          395 non-null    int64
 15  G3          395 non-null    int64
dtypes: int64(16)
memory usage: 49.5 KB


In [61]:
X = students_df.drop('G3', axis = 1)
y = students_df.G3

Разбиение данных на обучающую и тестовую выборки

In [62]:
(X_train, X_test, y_train, y_test) = train_test_split(X, y, test_size = 0.2)

In [63]:
scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Обучение алгоритмов

## KNN (метод k-ближайших соседей)

In [73]:
clf_KNN = KNeighborsClassifier(n_neighbors = 16)
clf_KNN.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=16)

In [79]:
y_pred_KNN = clf_KNN.predict(X_test)
print(classification_report(y_test, y_pred_KNN))

              precision    recall  f1-score   support

           0       0.54      0.64      0.58        11
           5       0.00      0.00      0.00         2
           6       1.00      0.50      0.67         4
           7       0.00      0.00      0.00         2
           8       0.17      0.20      0.18         5
           9       0.00      0.00      0.00         1
          10       0.33      0.58      0.42        12
          11       0.14      0.08      0.11        12
          12       0.00      0.00      0.00         9
          13       0.00      0.00      0.00         2
          14       0.25      0.33      0.29         3
          15       0.29      0.33      0.31         6
          16       0.00      0.00      0.00         4
          17       0.00      0.00      0.00         2
          18       0.00      0.00      0.00         2
          19       0.00      0.00      0.00         2

    accuracy                           0.27        79
   macro avg       0.17   

  _warn_prf(average, modifier, msg_start, len(result))


## Decision Tree Classifier (DTC)

In [75]:
clf_DTC = tree.DecisionTreeClassifier()
clf_DTC = clf_DTC.fit(X_train, y_train)

In [77]:
y_pred_DTC = clf_DTC.predict(X_test)
print(classification_report(y_test, y_pred_DTC))

              precision    recall  f1-score   support

           0       1.00      0.82      0.90        11
           4       0.00      0.00      0.00         0
           5       0.50      0.50      0.50         2
           6       1.00      0.50      0.67         4
           7       0.00      0.00      0.00         2
           8       0.43      0.60      0.50         5
           9       0.20      1.00      0.33         1
          10       0.50      0.50      0.50        12
          11       0.17      0.17      0.17        12
          12       0.00      0.00      0.00         9
          13       0.00      0.00      0.00         2
          14       0.33      0.33      0.33         3
          15       0.40      0.33      0.36         6
          16       0.40      0.50      0.44         4
          17       0.00      0.00      0.00         2
          18       0.00      0.00      0.00         2
          19       0.00      0.00      0.00         2
          20       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Naive Bayes (NB)

In [81]:
clf_NB = naive_bayes.GaussianNB()
clf_NB.fit(X_train, y_train)

GaussianNB()

In [83]:
y_pred_NB = clf_NB.predict(X_test)
print(classification_report(y_test, y_pred_NB))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        11
           5       0.33      0.50      0.40         2
           6       0.19      0.75      0.30         4
           7       0.00      0.00      0.00         2
           8       0.11      0.20      0.14         5
           9       0.00      0.00      0.00         1
          10       0.00      0.00      0.00        12
          11       0.50      0.42      0.45        12
          12       0.00      0.00      0.00         9
          13       0.00      0.00      0.00         2
          14       0.00      0.00      0.00         3
          15       0.00      0.00      0.00         6
          16       0.06      0.25      0.09         4
          17       0.00      0.00      0.00         2
          18       0.20      0.50      0.29         2
          19       0.50      0.50      0.50         2

    accuracy                           0.30        79
   macro avg       0.18   

  _warn_prf(average, modifier, msg_start, len(result))


## Support Vector Machines (SVM)

In [84]:
clf_SVM = svm.SVC()
clf_SVM.fit(X_train, y_train)

SVC()

In [86]:
y_pred_SVM = clf_SVM.predict(X_test)
print(classification_report(y_test, y_pred_SVM))

              precision    recall  f1-score   support

           0       0.80      0.73      0.76        11
           5       0.00      0.00      0.00         2
           6       1.00      0.25      0.40         4
           7       0.00      0.00      0.00         2
           8       0.14      0.20      0.17         5
           9       0.00      0.00      0.00         1
          10       0.31      0.67      0.42        12
          11       0.29      0.17      0.21        12
          12       0.00      0.00      0.00         9
          13       0.00      0.00      0.00         2
          14       0.00      0.00      0.00         3
          15       0.33      0.83      0.48         6
          16       0.00      0.00      0.00         4
          17       0.00      0.00      0.00         2
          18       0.00      0.00      0.00         2
          19       0.00      0.00      0.00         2

    accuracy                           0.32        79
   macro avg       0.18   

  _warn_prf(average, modifier, msg_start, len(result))


## Logistic Regression (LR)

In [87]:
clf_LR = LogisticRegression()
clf_LR.fit(X_train, y_train)

LogisticRegression()

In [88]:
y_pred_LR = clf_LR.predict(X_test)
print(classification_report(y_test, y_pred_LR))

              precision    recall  f1-score   support

           0       0.82      0.82      0.82        11
           5       0.00      0.00      0.00         2
           6       1.00      0.75      0.86         4
           7       0.00      0.00      0.00         2
           8       0.40      0.80      0.53         5
           9       0.00      0.00      0.00         1
          10       0.38      0.42      0.40        12
          11       0.40      0.33      0.36        12
          12       0.67      0.22      0.33         9
          13       0.12      0.50      0.20         2
          14       0.33      0.33      0.33         3
          15       0.40      0.67      0.50         6
          16       0.00      0.00      0.00         4
          17       0.00      0.00      0.00         2
          18       0.00      0.00      0.00         2
          19       0.00      0.00      0.00         2

    accuracy                           0.42        79
   macro avg       0.28   

  _warn_prf(average, modifier, msg_start, len(result))


# Подбор гиперпараметров

### KNN

In [112]:
%%time
#KNN
parameters = {'n_neighbors': range(1, 17)}

knn = KNeighborsClassifier()
clf_hp_KNN = GridSearchCV(knn, parameters)
clf_hp_KNN.fit(X_train, y_train)



Wall time: 629 ms


GridSearchCV(estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': range(1, 17)})

In [113]:
print(clf_hp_KNN.best_estimator_)

KNeighborsClassifier(n_neighbors=13)


### DTC

In [101]:
%%time
#DTC
parameters = {
    'max_depth': [1, 3, 5, 10, 100],
    'min_samples_split': [2, 3, 5, 8]
}

dtc = tree.DecisionTreeClassifier()
clf_hp_DTC = GridSearchCV(dtc, parameters)
clf_hp_DTC.fit(X_train, y_train)



Wall time: 291 ms


GridSearchCV(estimator=DecisionTreeClassifier(),
             param_grid={'max_depth': [1, 3, 5, 10, 100],
                         'min_samples_split': [2, 3, 5, 8]})

In [102]:
print(clf_hp_DTC.best_estimator_)

DecisionTreeClassifier(max_depth=5)


### NB

In [103]:
%%time
#NB
parameters = {
    'var_smoothing': [1e-10, 1e-9, 1e-8, 1e-5, 1e-2, 1e-1, 1, 10, 100]
}

nb = naive_bayes.GaussianNB()
clf_hp_NB = GridSearchCV(nb, parameters)
clf_hp_NB.fit(X_train, y_train)



Wall time: 256 ms


GridSearchCV(estimator=GaussianNB(),
             param_grid={'var_smoothing': [1e-10, 1e-09, 1e-08, 1e-05, 0.01,
                                           0.1, 1, 10, 100]})

In [105]:
print(clf_hp_NB.best_estimator_)

GaussianNB(var_smoothing=0.01)


### SVM

In [106]:
%%time
#SVM
parameters = {
    'kernel': ['linear', 'rbf']
}

svm = svm.SVC()
clf_hp_SVM = GridSearchCV(svm, parameters, cv=3)
clf_hp_SVM.fit(X_train, y_train)

Wall time: 125 ms




GridSearchCV(cv=3, estimator=SVC(), param_grid={'kernel': ['linear', 'rbf']})

In [107]:
print(clf_hp_SVM.best_estimator_)

SVC(kernel='linear')


### LR

In [108]:
%%time
#LR
parameters = {
    'class_weight': ['balanced'],
    'solver': ['liblinear', 'lbfgs', 'newton-cg'],
    'penalty': ['l1', 'l2'],
}

lr = LogisticRegression()

clf_hp_LR = GridSearchCV(lr, parameters, cv=3)
clf_hp_LR.fit(X_train, y_train)

Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1304, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 442, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1304, in fit
    solver = _check_solver(self.solver, self.

Wall time: 593 ms


GridSearchCV(cv=3, estimator=LogisticRegression(),
             param_grid={'class_weight': ['balanced'], 'penalty': ['l1', 'l2'],
                         'solver': ['liblinear', 'lbfgs', 'newton-cg']})

In [109]:
print(clf_hp_LR.best_estimator_)

LogisticRegression(class_weight='balanced')
