### Отчет по лабораторной работе №4 "Подготовка обучающей и тестовой выборки, кросс-валидация и подбор гиперпараметров на примере метода ближайших соседей"

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris, load_boston
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import KFold, RepeatedKFold, LeaveOneOut, LeavePOut, ShuffleSplit, StratifiedKFold
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_squared_log_error, median_absolute_error, r2_score 
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import learning_curve, validation_curve
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
%matplotlib inline 
sns.set(style="ticks")

### Подготовка датасета

In [2]:
# Data of a third-party travel insurance company based in Singapore.
data = pd.read_csv('лр4.csv', sep=',')

In [3]:
# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,Agency,Agency Type,Distribution Channel,Product Name,Claim,Duration,Destination,Net Sales,Commision (in value),Gender,Age
0,CBH,Travel Agency,Offline,Comprehensive Plan,No,186,MALAYSIA,-29.0,9.57,F,81
1,CBH,Travel Agency,Offline,Comprehensive Plan,No,186,MALAYSIA,-29.0,9.57,F,71
2,CWT,Travel Agency,Online,Rental Vehicle Excess Insurance,No,65,AUSTRALIA,-49.5,29.7,,32
3,CWT,Travel Agency,Online,Rental Vehicle Excess Insurance,No,60,AUSTRALIA,-39.6,23.76,,32
4,CWT,Travel Agency,Online,Rental Vehicle Excess Insurance,No,79,ITALY,-19.8,11.88,,41


In [4]:
# (rows, columns) of the raw dataset.
data.shape

(63326, 11)

In [5]:
# Column dtypes before any preprocessing; the 'object' columns hold text values.
data.dtypes

Agency                   object
Agency Type              object
Distribution Channel     object
Product Name             object
Claim                    object
Duration                  int64
Destination              object
Net Sales               float64
Commision (in value)    float64
Gender                   object
Age                       int64
dtype: object

In [6]:
# Handle missing values by dropping the 'Gender' column entirely
# (no rows are removed), then count the remaining nulls per column.
data = data.drop(columns='Gender')
data.isnull().sum()

Agency                  0
Agency Type             0
Distribution Channel    0
Product Name            0
Claim                   0
Duration                0
Destination             0
Net Sales               0
Commision (in value)    0
Age                     0
dtype: int64

In [7]:
# Type conversion: label-encode every categorical (object-dtype) column in place.
# `cat_coll` keeps the names of the columns that get encoded.
cat_coll = [col for col in data.columns if data[col].dtype == 'object']
# `en_cat` maps column name -> fitted LabelEncoder, so the integer codes can
# later be mapped back to the original labels via `inverse_transform`.
en_cat = {}
for col in cat_coll:
    le = LabelEncoder()
    # Assign to the column itself (a Series target) rather than to a
    # one-column frame selection; the encoder returns a 1-D integer array.
    data[col] = le.fit_transform(data[col])
    en_cat[col] = le

In [8]:
data.dtypes

Agency                    int64
Agency Type               int64
Distribution Channel      int64
Product Name              int64
Claim                     int64
Duration                  int64
Destination               int64
Net Sales               float64
Commision (in value)    float64
Age                       int64
dtype: object

In [9]:
# Prepare the data for the train/test split by separating it into two frames:
# `x` holds the independent variables (every column except the target) and
# `y` holds the dependent variable 'Age' as a single-column DataFrame.
# Columns are selected by name, so the split does not depend on column order.
x, y = data.drop(columns='Age'), data[['Age']]

In [10]:
# Feature columns and their dtypes.
x.dtypes

Agency                    int64
Agency Type               int64
Distribution Channel      int64
Product Name              int64
Claim                     int64
Duration                  int64
Destination               int64
Net Sales               float64
Commision (in value)    float64
dtype: object

In [11]:
# Target column and its dtype.
y.dtypes

Age    int64
dtype: object

In [12]:
# Hold out a fraction of the rows for testing; the fixed seed keeps the
# split reproducible between runs.
test_size = 0.2
state = 42
xTrain, xTest, yTrain, yTest = train_test_split(
    x,
    y,
    test_size=test_size,
    random_state=state,
)
# Sizes of the four resulting parts, in the order they were returned.
tuple(len(part) for part in (xTrain, xTest, yTrain, yTest))

(50660, 12666, 50660, 12666)

### Обучение модели при произвольно заданном гиперпараметре k

In [13]:
# Age is an integer-valued target (0 to 118 per the original author's note),
# so every distinct age is treated as a class and a classifier is used.
n_age_classes = data['Age'].unique().size
print('количество классов:', n_age_classes)

количество классов: 89


In [14]:
# Create a k-nearest-neighbours classifier for an arbitrarily chosen
# hyperparameter k; the last line displays the estimator's parameters.
k = 3
KNeighborsClassifierObj = KNeighborsClassifier(n_neighbors=k)
KNeighborsClassifierObj

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [15]:
# Fit on the training part. The target is a single-column frame, so it is
# flattened to a 1-D array of labels before being passed to the estimator.
train_labels = np.ravel(yTrain)
KNeighborsClassifierObj.fit(xTrain, train_labels)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [16]:
# Predict the target for the held-out test features with the fitted model.
y_pred = KNeighborsClassifierObj.predict(xTest)
y_pred

array([36, 31, 35, ..., 36, 26, 22])

In [17]:
# True target values of the test part, for visual comparison with y_pred.
yTest['Age'].values

array([36, 68, 36, ..., 36, 61, 36])

In [18]:
# Evaluate the classifier: share of exactly matching predictions, in percent.
accuracy_percent = accuracy_score(yTest, y_pred) * 100
print('Accuracy: {} %'.format(accuracy_percent))

Accuracy: 36.009789988946785 %


In [19]:
# Confusion matrix of the test predictions. The printed (Russian) header says:
# columns are predicted values, rows are true values.
print('матрица ошибок: столбцы – предсказанное значение, строки – истинное значение')
print(confusion_matrix(yTest, y_pred))

матрица ошибок: столбцы – предсказанное значение, строки – истинное значение
[[  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 ...
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   1   0 ...   0   0 145]]


In [20]:
# NOTE(review): this installs an "ignore everything" filter for the rest of the
# session, not just for this cell - presumably meant to hide undefined-metric
# warnings for classes that are never predicted; confirm.
warnings.filterwarnings("ignore")
# Precision averaged over classes, weighted by class support.
precision_score(yTest, y_pred, average='weighted')

0.33407827460523043

In [21]:
# Silence all warnings (the filter is global and persists across cells).
warnings.filterwarnings("ignore")
# F1 score averaged over classes, weighted by class support.
f1_score(yTest, y_pred, average='weighted')

0.3415855080250703

### Оценка качества модели с использованием кросс-валидации

In [22]:
# With an integer `cv`, scikit-learn chooses the splitting strategy itself
# (per its documentation, stratified K-fold for a classifier with
# multiclass targets).
warnings.filterwarnings("ignore")  # global "ignore all warnings" filter
scores = cross_val_score(KNeighborsClassifierObj, x, y.values.ravel(), cv=3)  # 3 folds
scores  # per-fold score with the estimator's default scoring (accuracy)

array([0.36716828, 0.35783292, 0.34903691])

In [23]:
# Average score over the folds.
scores.mean()

0.35801270300862303

In [24]:
# Cross-validate with three support-weighted metrics at once (3 folds) and
# keep the train-side scores so they can be compared with the test-side ones.
warnings.filterwarnings("ignore")
scoring = dict(
    precision='precision_weighted',
    recall='recall_weighted',
    f1='f1_weighted',
)
scores = cross_validate(
    KNeighborsClassifierObj,
    x,
    y.values.ravel(),
    scoring=scoring,
    cv=3,
    return_train_score=True,
)
scores

{'fit_time': array([0.04571533, 0.03696084, 0.03635406]),
 'score_time': array([2.02484369, 2.02546597, 1.96897411]),
 'test_precision': array([0.31211565, 0.3422917 , 0.33507145]),
 'train_precision': array([0.53744392, 0.53315717, 0.54169843]),
 'test_recall': array([0.36716828, 0.35783292, 0.34903691]),
 'train_recall': array([0.53991089, 0.53506278, 0.54381272]),
 'test_f1': array([0.33180252, 0.34690815, 0.33772616]),
 'train_f1': array([0.50527049, 0.49931282, 0.50993471])}

In [25]:
# K-fold strategy: 12 consecutive folds.
# NOTE(review): KFold is used without shuffle, so the folds follow the row
# order of the file. The per-fold scores recorded below fall steadily, down to
# ~0.07 on the last fold, which suggests the rows are ordered - consider
# KFold(n_splits=12, shuffle=True, random_state=state) and re-running.
scores = cross_val_score(KNeighborsClassifierObj, x, y.values.ravel(), cv=KFold(n_splits=12))
scores

array([0.42307692, 0.43198181, 0.42675763, 0.41500853, 0.41083949,
       0.37976123, 0.36687512, 0.36573811, 0.34451393, 0.36592761,
       0.30452909, 0.06992609])

In [26]:
# Average score over the 12 folds.
scores.mean()

0.3587446298601736

In [27]:
# Same three support-weighted metrics, evaluated on 12 consecutive
# (unshuffled) K-fold splits; train-side scores are returned as well.
warnings.filterwarnings("ignore")
scoring = dict(
    precision='precision_weighted',
    recall='recall_weighted',
    f1='f1_weighted',
)
scores = cross_validate(
    KNeighborsClassifierObj,
    x,
    y.values.ravel(),
    scoring=scoring,
    cv=KFold(n_splits=12),
    return_train_score=True,
)
scores

{'fit_time': array([0.05430412, 0.05568624, 0.05644512, 0.05918002, 0.0708642 ,
        0.07882404, 0.05157709, 0.05013418, 0.05748987, 0.04997611,
        0.05724716, 0.05000305]),
 'score_time': array([0.54452896, 0.55925202, 0.56093073, 0.70453525, 1.24807501,
        0.80153489, 0.6492188 , 0.53815699, 0.60757995, 0.60229611,
        0.55408096, 0.53841782]),
 'test_precision': array([0.40360718, 0.41881941, 0.42575506, 0.38617808, 0.41218985,
        0.38881253, 0.3757869 , 0.37324142, 0.33309092, 0.35547952,
        0.27797665, 0.12045868]),
 'train_precision': array([0.5288413 , 0.52517154, 0.52217002, 0.52866397, 0.52395988,
        0.53000046, 0.53086145, 0.52891541, 0.53410446, 0.53037896,
        0.5356074 , 0.55322713]),
 'test_recall': array([0.42307692, 0.43198181, 0.42675763, 0.41500853, 0.41083949,
        0.37976123, 0.36687512, 0.36573811, 0.34451393, 0.36592761,
        0.30452909, 0.06992609]),
 'train_recall': array([0.52678818, 0.52592682, 0.52366104, 0.52927699, 

In [None]:
# LeavePOut(2) uses every possible pair of rows as a test set, i.e.
# n * (n - 1) / 2 separate fits. On the full 63326-row dataset that is about
# 2 * 10**9 fits and does not finish in practice, so the strategy is
# demonstrated on a small, reproducible random subsample instead.
lpo_sample_size = min(50, len(x))  # 50 rows -> 1225 splits
x_lpo = x.sample(n=lpo_sample_size, random_state=state)
y_lpo = y.loc[x_lpo.index]  # the same rows as x_lpo, matched by index label
scores = cross_val_score(KNeighborsClassifierObj, x_lpo, y_lpo.values.ravel(), cv=LeavePOut(2))
scores