In [None]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import tree, naive_bayes, svm

from sklearn.feature_selection import SelectKBest, chi2, RFE
from sklearn.decomposition import PCA
from sklearn.ensemble import ExtraTreesClassifier 

#Подготовка и нормализация данных

In [None]:
df = pd.read_csv('data/students_preprocessed.csv', sep = ',')
students_df = df[list(('school', 'sex', 'age', 'Pstatus', 'studytime', \
                                  'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'higher', \
                                  'internet', 'absences', 'G1', 'G2', 'G3'))]

In [None]:
students_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 395 entries, 0 to 394
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   school      395 non-null    int64
 1   sex         395 non-null    int64
 2   age         395 non-null    int64
 3   Pstatus     395 non-null    int64
 4   studytime   395 non-null    int64
 5   failures    395 non-null    int64
 6   schoolsup   395 non-null    int64
 7   famsup      395 non-null    int64
 8   paid        395 non-null    int64
 9   activities  395 non-null    int64
 10  higher      395 non-null    int64
 11  internet    395 non-null    int64
 12  absences    395 non-null    int64
 13  G1          395 non-null    int64
 14  G2          395 non-null    int64
 15  G3          395 non-null    int64
dtypes: int64(16)
memory usage: 49.5 KB


In [None]:
X = students_df.drop('G3', axis = 1)
y = students_df.G3

In [None]:
(X_train, X_test, y_train, y_test) = train_test_split(X, y, test_size = 0.2)

In [None]:
scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
minmax_scaler = MinMaxScaler()
minmax_scaler.fit(X_train)
X_train_positive = minmax_scaler.transform(X_train)
X_test_positive = minmax_scaler.transform(X_test)

#1. Одномерный отбор признаков

Признаки, имеющие наиболее выраженную взаимосвязь с целевой переменной, могут быть отобраны с помощью статистических критериев. Библиотека scikit-learn содержит класс SelectKBest, реализующий одномерный отбор признаков (univariate feature selection). Этот класс можно применять совместно с различными статистическими критериями для отбора заданного количества признаков.

В примере ниже используется критерий хи-квадрат (chi-squared test) для неотрицательных признаков, чтобы отобрать 4 лучших признака.

In [None]:
fit = SelectKBest(score_func=chi2, k=4).fit(X_train_positive, y_train)

np.set_printoptions(precision=3)
features = fit.transform(X_train_positive)
X_train_K_best = fit.transform(X_train_positive)
X_test_K_best = fit.transform(X_test_positive)

print(fit.scores_)

[10.387  7.779  3.517 16.809  4.358 41.915 22.013  6.728 14.857  6.134
  0.528  2.314  7.723 21.853 18.459]


#2. Рекурсивное исключение признаков

Метод рекурсивного исключения признаков (recursive feature elimination, RFE) реализует следующий алгоритм: модель обучается на исходном наборе признаков и оценивает их значимость, затем исключается один или несколько наименее значимых признаков, модель обучается на оставшихся признаках, и так далее, пока не останется заданное количество лучших признаков. В документации scikit-learn вы можете подробнее прочитать о классе RFE.

В примере ниже метод RFE применяется в сочетании с логистической регрессией для отбора 3-х лучших признаков. Для совместного использования с RFE можно выбирать различные модели, важно лишь, чтобы они были достаточно эффективны и совместимы с RFE.

In [None]:
fit = RFE(LogisticRegression(), 3).fit(X_train, y_train)

X_train_recursive = fit.transform(X_train_positive)
X_test_recursive = fit.transform(X_test_positive)

print('Num features: {}'.format(fit.n_features_))
print('Selected Features: {}'.format(fit.support_))
print('Feature Ranking: {}'.format(fit.ranking_))

Num features: 3
Selected Features: [False False False False False False False False False False False False
  True  True  True]
Feature Ranking: [ 9 12  7  6  3  2  8  5  4 11 13 10  1  1  1]


#3. Метод главных компонент

Метод главных компонент (principal component analysis, PCA) позволяет уменьшить размерность данных с помощью преобразования на основе линейной алгебры. Пользователь может задать требуемое количество измерений (главных компонент) в результирующих данных.

В примере ниже мы выделяем 3 главных компоненты с помощью PCA.

In [None]:
fit = PCA(n_components=3).fit(X_train)
features = fit.transform(X_train)

X_train_pca = fit.transform(X_train_positive)
X_test_pca = fit.transform(X_test_positive)

print(fit.explained_variance_ratio_)
print(features[0:5,:])

[0.157 0.125 0.1  ]
[[ 3.825  1.881  1.172]
 [ 0.065 -0.753 -1.222]
 [ 0.237 -1.884 -1.453]
 [ 0.289 -0.898  0.461]
 [-1.256  1.321  0.248]]


#4. Отбор на основе важности признаков

Ансамблевые алгоритмы на основе деревьев решений, такие как случайный лес (random forest), позволяют оценить важность признаков.

В представленном ниже примере мы обучаем классификатор ExtraTreesClassifier, чтобы с его помощью определить важность признаков. Подробнее о классе ExtraTreesClassifier можно узнать из документации scikit-learn.

In [None]:
fit = ExtraTreesClassifier().fit(X_train, y_train)
print(fit.feature_importances_)

[0.02  0.045 0.093 0.023 0.07  0.044 0.023 0.046 0.034 0.049 0.008 0.032
 0.127 0.155 0.231]


In [None]:
important = fit.feature_importances_ > 0.1
X_train_etc = X_train_positive[:, important]
X_test_etc = X_test_positive[:, important]

#Обучение модели

In [None]:
def fit_predict(X_train, X_test, y_train, y_test, model, init_parameters={}, **parameters):
  classifier = model(**init_parameters)
  clf_model = GridSearchCV(classifier, parameters)
  clf_model.fit(X_train, y_train)
  y_pred = clf_model.best_estimator_.predict(X_test)

  print('GridSearchCV - best estimator: {}'.format(clf_model.best_estimator_))
  print('best parameters: {}'.format(clf_model.best_params_))
  
  print(confusion_matrix(y_test, y_pred))
  print(classification_report(y_test, y_pred))

In [None]:
def selections_features(model, init_parameters={}, **parameters):
  print('Без отбора признаков\n')
  fit_predict(X_train, X_test, y_train, y_test, model=model, init_parameters=init_parameters, **parameters)

  print('\n\n\nОдномерный отбор признаков\n')
  fit_predict(X_train_K_best, X_test_K_best, y_train, y_test, model=model, init_parameters=init_parameters, **parameters)

  print('\n\n\nРекурсивное исключение признаков\n')
  fit_predict(X_train_recursive, X_test_recursive, y_train, y_test, model=model, init_parameters=init_parameters, **parameters)

  print('\n\n\nМетод главных компонент\n')
  fit_predict(X_train_pca, X_test_pca, y_train, y_test, model=model, init_parameters=init_parameters, **parameters)

  print('\n\n\nОтбор на основе важности признаков\n')
  fit_predict(X_train_etc, X_test_etc, y_train, y_test, model=model, init_parameters=init_parameters, **parameters)

#Decision Tree Classifier (DTC)

In [None]:
parameters_dtc = {
    'max_depth': [1, 3, 5, 10, 30],
    'min_samples_split': [2, 3, 5, 8]
}
selections_features(model=tree.DecisionTreeClassifier, **parameters_dtc)

Без отбора признаков

GridSearchCV - best estimator: DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=5, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=5,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')
best parameters: {'max_depth': 5, 'min_samples_split': 5}
[[6 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 2 0 2 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 2 1 2 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 2 1 1 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 1 2 4 5 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 3 1 2 2 0 0 0 0 0 0]
 [0 0 0 0 0 0 1 0 4 2 2 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 3 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 2 3 2 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 6 1 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 2 2 0 0 0]
 [0 0 0 0 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.86      1.00      0.92         6
           5       1.00      1.00      1.00         1
           6       0.67      0.50      0.57         4
           7       0.00      0.00      0.00         2
           8       0.25      0.33      0.29         6
           9       0.25      0.25      0.25         4
          10       0.36      0.33      0.35        12
          11       0.17      0.12      0.14         8
          12       0.50      0.44      0.47         9
          13       0.33      0.75      0.46         4
          14       0.60      0.38      0.46         8
          15       0.60      0.86      0.71         7
          16       0.50      0.50      0.50         4
          17       0.00      0.00      0.00         1
          18       0.67      1.00      0.80         2
          19       0.00      0.00      0.00         1

    accuracy                           0.47        79
   macro avg       0.42   

  _warn_prf(average, modifier, msg_start, len(result))


GridSearchCV - best estimator: DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=30, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=8,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')
best parameters: {'max_depth': 30, 'min_samples_split': 8}
[[2 0 0 0 3 0 1 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0]
 [3 0 0 0 2 0 0 1 0 0 0 0 0 0 0 0]
 [1 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0]
 [1 0 1 0 0 1 5 4 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 2 5 0 1 0 0 0 0 0 0]
 [1 0 0 0 2 0 0 2 1 2 1 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 1 0 1 1 0 0 0]
 [0 0 0 0 0 0 0 1 0 2 2 1 2 0 0 0]
 [0 0 0 0 0 0 0 1 0 0 1 4 1 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 1]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#Support Vector Machines (SVM)

In [17]:
parameters_svm = {
    'kernel': ['linear', 'rbf']
}
selections_features(model=svm.SVC, **parameters_svm)

Без отбора признаков

GridSearchCV - best estimator: SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
best parameters: {'kernel': 'linear'}
[[4 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 1 2 1 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0]
 [1 1 0 0 0 3 1 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 1 0 1 1 1 0 0 0 0 0 0 0 0]
 [0 1 0 0 0 0 1 7 3 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 4 0 3 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 3 4 1 1 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 1 1 2 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 1 1 3 3 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 2 5 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 2 1 0 1 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0]]
              precision   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


GridSearchCV - best estimator: SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
best parameters: {'kernel': 'rbf'}
[[5 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 1 3 0 0 0 0 0 0 0 0]
 [0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0]
 [3 0 0 0 1 0 1 1 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 2 1 0 0 0 0 0 0 0 0]
 [0 0 0 0 1 0 5 6 0 0 0 0 0 0 0 0]
 [0 0 0 0 1 0 0 6 0 1 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 4 0 5 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 1 0 3 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 2 0 3 0 3 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 1 0 6 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 4 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0]]
              precision    recall  f1-score   support

           0       0.50      0.83      0.62         6
           5  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


GridSearchCV - best estimator: SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
best parameters: {'kernel': 'rbf'}
[[5 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0]
 [0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [3 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 3 0 2 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 3 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 1 0 8 3 0 0 0 0 0 0 0 0]
 [0 0 0 0 1 0 0 6 0 1 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 4 0 5 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 1 0 3 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 4 0 4 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 1 0 6 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 4 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0]]
              precision    recall  f1-score   support

           0       0.45      0.83      0.59         6
           5  

  _warn_prf(average, modifier, msg_start, len(result))


#Naive Bayes (NB)

In [19]:
parameters_nb = {
    'var_smoothing': [1e-10, 1e-9, 1e-8, 1e-5, 1e-2, 1e-1, 1, 10, 100]
}
selections_features(model=naive_bayes.GaussianNB, **parameters_nb)

Без отбора признаков

GridSearchCV - best estimator: GaussianNB(priors=None, var_smoothing=0.01)
best parameters: {'var_smoothing': 0.01}
[[5 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0]
 [0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 1 2 0 0 0 1 0 0 0 0 0 0 0 0]
 [0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 1 1 2 0 1 1 0 0 0 0 0 0 0 0]
 [0 0 0 1 0 0 0 3 0 0 0 0 0 0 0 0]
 [0 0 0 0 1 0 3 4 2 0 2 0 0 0 0 0]
 [0 0 0 0 0 1 0 3 0 2 2 0 0 0 0 0]
 [0 0 0 0 0 0 0 1 1 1 6 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 0 3 0 0 0 0 0]
 [0 0 0 0 0 0 0 1 0 0 7 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 1 0 6 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 4 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0]]
              precision    recall  f1-score   support

           0       1.00      0.83      0.91         6
           5       0.50      1.00      0.67         1
           6       0.33      0.25      0.29         4
           7       0.00      0.00      0.00         2
           8       0.67      0

  _warn_prf(average, modifier, msg_start, len(result))


GridSearchCV - best estimator: GaussianNB(priors=None, var_smoothing=0.01)
best parameters: {'var_smoothing': 0.01}
[[5 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 3 0 1 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 1 0 3 0 0 1 0 0 0 0 0 0 0 0]
 [0 0 0 1 0 1 1 1 0 0 0 0 0 0 0 0]
 [0 0 0 1 1 1 2 7 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 2 3 0 0 3 0 0 0 0 0]
 [0 0 0 0 0 0 1 1 0 1 6 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 4 0 0 0 0 0]
 [0 0 0 0 0 0 0 1 0 0 7 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 1 0 6 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 4 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1]]
              precision    recall  f1-score   support

           0       0.71      0.83      0.77         6
           5       0.00      0.00      0.00         1
           6       0.50      0.75      0.60         4
           7       0.00      0.00      0.00         2
           8       0.60      0.50      0.55         

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


GridSearchCV - best estimator: GaussianNB(priors=None, var_smoothing=0.1)
best parameters: {'var_smoothing': 0.1}
[[2 0 0 1 0 0 2 1 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [2 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0]
 [1 0 0 1 0 0 2 2 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 2 1 0 0 0 0 0 0 0 0]
 [2 0 0 0 0 0 3 6 0 0 0 1 0 0 0 0]
 [0 0 0 0 0 0 2 6 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 5 1 1 0 1 0 0 0 0]
 [0 0 0 0 0 0 1 3 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 4 1 0 0 3 0 0 0 0]
 [0 0 0 0 0 0 0 2 0 0 0 5 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 1 0 3 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0]]
              precision    recall  f1-score   support

           0       0.18      0.33      0.24         6
           5       0.00      0.00      0.00         1
           6       0.00      0.00      0.00         4
           7       0.00      0.00      0.00         2
           8       0.00      0.00      0.00         6


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#Вывод

В данной работе были применены четыре предложенных метода отбора признаков  к своим данным для классификации. Оценено качество работы двух алгоритмов классификации на этих данных до и после отбора признаков.