# Подготовка данных

## Импорт данных

In [1]:
import pandas as pd

# Location of the preprocessed CSV, relative to the notebook.
data_path = '../data/adult_preprocessed.csv'
dataset = pd.read_csv(data_path)

# Show the first rows as the cell output.
dataset.head()

Unnamed: 0,age,fnlwgt,education-num,sex,capital-gain,capital-loss,hours-per-week,salary,workclass_Federal-gov,workclass_Local-gov,...,native-country_Portugal,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia
0,39,77516,13,0,2174,0,40,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,50,83311,13,0,0,0,13,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,38,215646,9,0,0,0,40,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,53,234721,7,0,0,0,40,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,28,338409,13,1,0,0,40,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Аномалии

Исходя из данных, полученных в файле 1_visualization, можно сделать вывод, что аномалии имеются у параметров *hours-per-week* и *capital-gain*. Проверим, что произойдет с данными, если отбросить значения на расстоянии более 1.5\*IQR от квартилей Q1 и Q3.

In [None]:
# Outlier check for `hours-per-week`: keep only the rows lying strictly inside
# the fences (Q1 - k*IQR, Q3 + k*IQR) and report how many rows survive.
IQR_MULTIPLIER = 1.5  # the 1.5*IQR rule described in the text of this section

hours = dataset['hours-per-week']

# quantile() yields the same quartiles as describe() without computing the
# whole summary twice.
first_quartile = hours.quantile(0.25)
third_quartile = hours.quantile(0.75)

iqr = third_quartile - first_quartile

lower_bound = first_quartile - IQR_MULTIPLIER * iqr
upper_bound = third_quartile + IQR_MULTIPLIER * iqr

# .info() prints the number of remaining entries.
dataset[(hours > lower_bound) & (hours < upper_bound)].info()

Для параметра *hours-per-week* это приводит к потере значительной части данных, кроме того, нельзя исключать, что люди действительно столько работают. Проверим для второго параметра.

In [3]:
# Same outlier check for `capital-gain`: keep only the rows lying strictly
# inside the fences (Q1 - k*IQR, Q3 + k*IQR) and report how many rows survive.
IQR_MULTIPLIER = 1.5  # the 1.5*IQR rule described in the text of this section

capital_gain = dataset['capital-gain']

# quantile() yields the same quartiles as describe() without computing the
# whole summary twice.
first_quartile = capital_gain.quantile(0.25)
third_quartile = capital_gain.quantile(0.75)

iqr = third_quartile - first_quartile

lower_bound = first_quartile - IQR_MULTIPLIER * iqr
upper_bound = third_quartile + IQR_MULTIPLIER * iqr

# The comparisons are strict: if Q1 == Q3 (IQR == 0) both fences coincide and
# no value can satisfy them, so the filtered frame is empty.
dataset[(capital_gain > lower_bound) & (capital_gain < upper_bound)].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 0 entries
Data columns (total 88 columns):
 #   Column                                     Non-Null Count  Dtype
---  ------                                     --------------  -----
 0   age                                        0 non-null      int64
 1   fnlwgt                                     0 non-null      int64
 2   education-num                              0 non-null      int64
 3   sex                                        0 non-null      int64
 4   capital-gain                               0 non-null      int64
 5   capital-loss                               0 non-null      int64
 6   hours-per-week                             0 non-null      int64
 7   salary                                     0 non-null      int64
 8   workclass_Federal-gov                      0 non-null      int64
 9   workclass_Local-gov                        0 non-null      int64
 10  workclass_Private                          0 non-null      int

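Для второго параметра это приводит к полной потере данных: судя по результату, Q1 и Q3 совпадают (IQR = 0), поэтому строгие неравенства не выполняются ни для одного значения.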

## Разбиение данных

Для начала разобьем данные на множество описаний объектов и множество меток:

In [4]:
# Feature matrix: every column except the target.
# `columns=` replaces the positional axis argument (`drop('salary', 1)`),
# which was deprecated in pandas 1.3 and is rejected by pandas 2.0.
X = dataset.drop(columns='salary').values

# Target vector.
y = dataset['salary'].values

Разобьем на обучающую и тестовую выборки:

In [5]:
from sklearn.model_selection import train_test_split

# Stratified 80/20 split. The seed is fixed so that the split, and every
# metric computed from it, is reproducible after Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=42
)

Проверим соотношение разбиения:

In [9]:
from collections import Counter

# Class counts in each part: with a stratified split the class proportions
# of the training and test parts should be (almost) identical.
print('train:', Counter(y_train))
print('test: ', Counter(y_test))

# Share of the test part in the whole data set (expected to be about 0.20).
print('test share:', len(y_test) / (len(y_train) + len(y_test)))

Нормализуем параметры:

In [10]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training part only, then apply it to both parts.
scaler = StandardScaler().fit(X_train)

X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Классификация

## KNN

Произведем обучение и проверим точность классификации алгоритма KNN

In [12]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Baseline model: k-nearest neighbours with k = 5.
clf = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Predict the hold-out part, then print the confusion matrix and the
# per-class report.
y_pred = clf.predict(X_test)
for report in (confusion_matrix, classification_report):
    print(report(y_test, y_pred))

[[4118  413]
 [ 661  841]]
              precision    recall  f1-score   support

           0       0.86      0.91      0.88      4531
           1       0.67      0.56      0.61      1502

    accuracy                           0.82      6033
   macro avg       0.77      0.73      0.75      6033
weighted avg       0.81      0.82      0.82      6033



Произведем кросс-валидацию

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# 10-fold stratified cross-validation of KNN (k = 5) on the full data set.
skf = StratifiedKFold(n_splits=10)
for train_index, test_index in skf.split(X, y):
    # Fold-local names, so the hold-out split (X_train / X_test / y_train /
    # y_test) created earlier is not overwritten by the loop.
    X_fold_train, X_fold_test = X[train_index], X[test_index]
    y_fold_train, y_fold_test = y[train_index], y[test_index]

    # X holds the raw, unscaled features and KNN is distance-based, so the
    # scaler is fitted on the training part of each fold inside a pipeline.
    fold_clf = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=5))
    fold_clf.fit(X_fold_train, y_fold_train)

    # Per-fold report followed by the fold accuracy.
    fold_pred = fold_clf.predict(X_fold_test)
    print(classification_report(y_fold_test, fold_pred))
    print(fold_clf.score(X_fold_test, y_fold_test))

Полученные значения на разных итерациях примерно равны. Можно сделать вывод, что данные устойчивы

Произведем поиск гиперпараметров:

In [15]:
from sklearn.model_selection import GridSearchCV

from sklearn.neighbors import KNeighborsClassifier

# Fresh stratified 80/20 split; the scaler is fitted on the training part only.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.20)
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Candidate numbers of neighbours: 1..10.
parameters = {'n_neighbors': list(range(1, 11))}

knn = KNeighborsClassifier()

# Exhaustive search with GridSearchCV's default cross-validation settings.
clf = GridSearchCV(knn, parameters).fit(X_train, y_train)

clf.best_params_

{'n_neighbors': 10}

Проверим работу алгоритма с k = 9

In [None]:
# Refit KNN with k = 9 on the split prepared for the grid search.
# NOTE(review): the stored grid-search output reports n_neighbors = 10;
# k = 9 presumably comes from a different run (the split is not seeded) — confirm.
clf = KNeighborsClassifier(n_neighbors=9).fit(X_train, y_train)

y_pred = clf.predict(X_test)

matrix = confusion_matrix(y_test, y_pred)
summary = classification_report(y_test, y_pred)
print(matrix)
print(summary)

Точность работы алгоритма немного увеличилась относительно k = 5

## DTC

Произведем обучение и проверим точность классификации алгоритма DTC

In [None]:
from sklearn import tree

# Fresh stratified 80/20 split; the scaler is fitted on the training part only.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.20)
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Decision tree with default settings.
clf = tree.DecisionTreeClassifier().fit(X_train, y_train)

y_pred = clf.predict(X_test)
for report in (confusion_matrix, classification_report):
    print(report(y_test, y_pred))

Произведем кросс-валидацию

In [None]:
# 10-fold stratified cross-validation of the default decision tree on X, y.
# NOTE: the loop reassigns X_train / X_test / y_train / y_test with the
# (unscaled) data of the current fold.
skf = StratifiedKFold(n_splits=10)
for train_idx, test_idx in skf.split(X, y):
    X_train, y_train = X[train_idx], y[train_idx]
    X_test, y_test = X[test_idx], y[test_idx]

    clf = tree.DecisionTreeClassifier().fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # Per-fold report followed by the fold accuracy.
    print(classification_report(y_test, y_pred))
    print(clf.score(X_test, y_test))

Полученные значения на разных итерациях примерно равны. Можно сделать вывод, что данные устойчивы

Произведем поиск гиперпараметров:

In [None]:
# Fresh stratified 80/20 split; the scaler is fitted on the training part only.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.20)
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Search grid: tree depth 1..10 and 4..18 features considered per split.
parameters = {
    'max_depth': range(1, 11),
    'max_features': range(4, 19),
}

dtc = tree.DecisionTreeClassifier()

# Exhaustive search with GridSearchCV's default cross-validation settings.
clf = GridSearchCV(dtc, parameters).fit(X_train, y_train)

clf.best_params_

Проверим работу алгоритма с max_depth = 9 и max_features = 17

In [None]:
# New stratified 80/20 split, standardised with the training-part statistics.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.20)
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Decision tree with the hyperparameters named in the text: depth 9, 17 features.
tuned_params = {'max_depth': 9, 'max_features': 17}
clf = tree.DecisionTreeClassifier(**tuned_params).fit(X_train, y_train)

y_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Точность ощутимо повысилась

## NB

Произведем обучение и проверим точность классификации алгоритма NB

In [None]:
from sklearn.naive_bayes import GaussianNB

# Fresh stratified 80/20 split; the scaler is fitted on the training part only.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.20)
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Gaussian naive Bayes with default settings.
clf = GaussianNB().fit(X_train, y_train)

y_pred = clf.predict(X_test)
for report in (confusion_matrix, classification_report):
    print(report(y_test, y_pred))

Произведем кросс-валидацию

In [None]:
# 10-fold stratified cross-validation of the default GaussianNB on X, y.
# NOTE: the loop reassigns X_train / X_test / y_train / y_test with the
# (unscaled) data of the current fold.
skf = StratifiedKFold(n_splits=10)
for train_idx, test_idx in skf.split(X, y):
    X_train, y_train = X[train_idx], y[train_idx]
    X_test, y_test = X[test_idx], y[test_idx]

    clf = GaussianNB().fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # Per-fold report followed by the fold accuracy.
    print(classification_report(y_test, y_pred))
    print(clf.score(X_test, y_test))

Полученные значения на разных итерациях примерно равны. Можно сделать вывод, что данные устойчивы

Произведем поиск гиперпараметров:

In [None]:
import numpy as np

# Fresh stratified 80/20 split; the scaler is fitted on the training part only.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.20)
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# 100 log-spaced candidates for var_smoothing, from 1 down to 1e-9.
smoothing_grid = np.logspace(0, -9, num=100)
parameters = {'var_smoothing': smoothing_grid}

gnb = GaussianNB()

# Exhaustive search with GridSearchCV's default cross-validation settings.
clf = GridSearchCV(gnb, parameters).fit(X_train, y_train)

clf.best_params_

Проверим работу алгоритма с var_smoothing = 0.0657933224657568

In [None]:
# New stratified 80/20 split, standardised with the training-part statistics.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.20)
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# GaussianNB with the var_smoothing value named in the text.
clf = GaussianNB(var_smoothing=0.0657933224657568).fit(X_train, y_train)

y_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Точность примерно равна точности со значением по умолчанию

## SVM

Произведем обучение и проверим точность классификации алгоритма SVM

In [13]:
from sklearn import svm

# Fresh stratified 80/20 split; the scaler is fitted on the training part only.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.20)
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Support vector classifier with default settings.
clf = svm.SVC().fit(X_train, y_train)

y_pred = clf.predict(X_test)
for report in (confusion_matrix, classification_report):
    print(report(y_test, y_pred))

[[4214  317]
 [ 633  869]]
              precision    recall  f1-score   support

           0       0.87      0.93      0.90      4531
           1       0.73      0.58      0.65      1502

    accuracy                           0.84      6033
   macro avg       0.80      0.75      0.77      6033
weighted avg       0.84      0.84      0.84      6033



Произведем кросс-валидацию

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# 10-fold stratified cross-validation of the default SVC on the full data set.
skf = StratifiedKFold(n_splits=10)
for train_index, test_index in skf.split(X, y):
    # Fold-local names, so the hold-out split (X_train / X_test / y_train /
    # y_test) created earlier is not overwritten by the loop.
    X_fold_train, X_fold_test = X[train_index], X[test_index]
    y_fold_train, y_fold_test = y[train_index], y[test_index]

    # X holds the raw, unscaled features and the default SVC kernel is
    # sensitive to feature scale, so the scaler is fitted on the training
    # part of each fold inside a pipeline (as the hold-out experiments do).
    fold_clf = make_pipeline(StandardScaler(), svm.SVC())
    fold_clf.fit(X_fold_train, y_fold_train)

    # Per-fold report followed by the fold accuracy.
    fold_pred = fold_clf.predict(X_fold_test)
    print(classification_report(y_fold_test, fold_pred))
    print(fold_clf.score(X_fold_test, y_fold_test))

Полученные значения на разных итерациях примерно равны. Можно сделать вывод, что данные устойчивы

Произведем поиск гиперпараметров:

In [None]:
# Fresh stratified 80/20 split; the scaler is fitted on the training part only.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.20)
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Only the regularisation strength C is searched; other SVC settings stay default.
parameters = {'C': [0.1, 1, 10, 100]}

svc = svm.SVC()

# Exhaustive search with GridSearchCV's default cross-validation settings.
clf = GridSearchCV(svc, parameters).fit(X_train, y_train)

clf.best_params_

Проверим работу алгоритма с C = , gamma = , kernel = 

In [None]:
# Evaluate the SVC selected by the grid search in the previous cell.
# The original cell re-split the data and trained a default svm.SVC(), so the
# searched hyperparameters were never used. GridSearchCV (refit=True by
# default) has already refitted the best parameter combination on the whole
# training part, so that estimator is scored directly on the hold-out part of
# the same split — no second, expensive SVC fit is needed.
print(clf.best_params_)
best_svc = clf.best_estimator_

y_pred = best_svc.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

## LR

Произведем обучение и проверим точность классификации алгоритма LR

In [None]:
from sklearn.linear_model import LogisticRegression

# Fresh stratified 80/20 split; the scaler is fitted on the training part only.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.20)
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Logistic regression with default settings.
clf = LogisticRegression().fit(X_train, y_train)

y_pred = clf.predict(X_test)
for report in (confusion_matrix, classification_report):
    print(report(y_test, y_pred))

Произведем кросс-валидацию

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# 10-fold stratified cross-validation of the default logistic regression
# on the full data set.
skf = StratifiedKFold(n_splits=10)
for train_index, test_index in skf.split(X, y):
    # Fold-local names, so the hold-out split (X_train / X_test / y_train /
    # y_test) created earlier is not overwritten by the loop.
    X_fold_train, X_fold_test = X[train_index], X[test_index]
    y_fold_train, y_fold_test = y[train_index], y[test_index]

    # X holds the raw, unscaled features; the scaler is fitted on the training
    # part of each fold inside a pipeline, matching the hold-out experiments
    # and helping the solver converge.
    fold_clf = make_pipeline(StandardScaler(), LogisticRegression())
    fold_clf.fit(X_fold_train, y_fold_train)

    # Per-fold report followed by the fold accuracy.
    fold_pred = fold_clf.predict(X_fold_test)
    print(classification_report(y_fold_test, fold_pred))
    print(fold_clf.score(X_fold_test, y_fold_test))

Полученные значения на разных итерациях примерно равны. Можно сделать вывод, что данные устойчивы

Произведем поиск гиперпараметров:

In [None]:
import numpy as np

# Stratified 80/20 split with a fixed seed (reproducible search result);
# the scaler is fitted on the training part only.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=42
)
scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# 50 log-spaced values of C between 1e-4 and 1e4, each with L1 and L2 penalty.
parameters = {'C': np.logspace(-4, 4, 50), 'penalty': ['l1', 'l2']}

# The default 'lbfgs' solver does not support the L1 penalty, so every 'l1'
# candidate would fail to fit; 'liblinear' handles both 'l1' and 'l2'.
lr = LogisticRegression(solver='liblinear')

clf = GridSearchCV(lr, parameters)

clf.fit(X_train, y_train)

clf.best_params_

Проверим работу алгоритма с C = , penalty = 

In [None]:
# Evaluate the logistic regression selected by the grid search in the
# previous cell. The original cell re-split the data and trained a default
# LogisticRegression(), so the searched hyperparameters were never used.
# GridSearchCV (refit=True by default) has already refitted the best parameter
# combination on the whole training part, so that estimator is scored directly
# on the hold-out part of the same split.
print(clf.best_params_)
best_lr = clf.best_estimator_

y_pred = best_lr.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

## Выводы по работе алгоритмов на данной модели