# Учебная практика
## Тема: Сравнительный анализ методов машинного обучения, применяемых для прогнозирования в задачах регрессии и классификации

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
import warnings
warnings.filterwarnings('ignore')

###### Ссылки на датасеты:
- [Датасет для предсказания дохода людей](https://www.kaggle.com/wenruliu/adult-income-dataset) (задача классификации)
- [Датасет для предсказания выплаты кредита](https://www.kaggle.com/uciml/default-of-credit-card-clients-dataset) (задача классификации)
- [Датасет для предсказания стоимости бриллиантов](https://www.kaggle.com/shivam2503/diamonds) (задача регрессии)
- [Датасет для предсказания стоимости домов](https://www.kaggle.com/harlfoxem/housesalesprediction/version/1) (задача регрессии)

In [3]:
adult_df = pd.read_csv('data/adult.csv')
credit_df = pd.read_csv('data/UCI_Credit_Card.csv')
diam_df = pd.read_csv('data/diamonds.csv')
house_df = pd.read_csv('data/kc_house_data.csv')

In [4]:
credit_df = pd.DataFrame(credit_df.drop("ID", axis=1))
diam_df = pd.DataFrame(diam_df.drop("Unnamed: 0", axis=1))

### Изучаем датасеты, подготавливаем данные для применения машинного обучения

In [5]:
house_df.sample(5)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
10443,3438502501,20140729T000000,400000.0,5,2.5,2510,7525,1.5,0,0,...,7,1710,800,1929,0,98106,47.5422,-122.359,1270,6741
15198,1422069069,20150205T000000,426500.0,4,2.75,2100,88426,1.0,0,0,...,6,2100,0,1990,0,98038,47.399,-122.011,2150,63162
17274,7893801760,20141117T000000,368000.0,3,1.75,2120,11340,1.0,0,3,...,7,1060,1060,1966,0,98198,47.4109,-122.329,1830,8650
21488,1853080150,20140811T000000,890776.0,5,2.75,3170,8093,2.0,0,0,...,9,3170,0,2014,0,98075,47.5933,-122.06,3210,7062
6682,8899000430,20150217T000000,325500.0,4,1.75,2290,8142,1.0,0,0,...,7,1490,800,1969,0,98055,47.4564,-122.211,1840,8142


In [6]:
credit_df.sample(5)

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
3386,360000.0,1,1,2,29,-1,-1,-1,-1,-1,...,969.0,2500.0,0.0,6159.0,1870.0,969.0,2500.0,0.0,0.0,0
13768,200000.0,2,2,1,26,0,0,0,0,0,...,49361.0,50564.0,51572.0,2800.0,2000.0,2000.0,2000.0,2000.0,2000.0,0
11606,460000.0,2,1,1,38,-1,-1,-1,-1,-1,...,8226.0,12497.0,4503.0,21833.0,13779.0,8226.0,12497.0,4503.0,5844.0,0
25726,160000.0,2,1,2,30,-2,-2,-2,-2,-1,...,1000.0,12234.0,11960.0,1000.0,1000.0,1000.0,12234.0,1200.0,65157.0,1
3488,30000.0,2,1,1,30,0,0,0,0,0,...,22692.0,24169.0,23782.0,2013.0,1440.0,800.0,1800.0,108.0,1130.0,0


In [7]:
diam_df.sample(5)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
2208,0.7,Ideal,E,VS2,61.9,56.0,3142,5.69,5.72,3.53
1338,0.73,Premium,F,VS1,62.7,56.0,2961,5.75,5.73,3.6
43364,0.52,Ideal,H,VS1,61.4,56.0,1408,5.2,5.23,3.2
14748,1.14,Ideal,H,VS2,61.7,57.0,5937,6.73,6.68,4.14
34590,0.34,Ideal,J,VS1,61.9,53.9,469,4.5,4.53,2.79


In [8]:
adult_df.sample(5)

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
23354,56,Private,160932,HS-grad,9,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,44,United-States,>50K
39980,34,Private,184147,HS-grad,9,Separated,Sales,Unmarried,Black,Female,0,0,20,United-States,<=50K
23586,45,Private,170846,Assoc-voc,11,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,>50K
9859,51,State-gov,71691,Bachelors,13,Divorced,Exec-managerial,Not-in-family,White,Male,0,0,60,United-States,>50K
282,23,Private,145917,HS-grad,9,Never-married,Handlers-cleaners,Not-in-family,White,Female,0,0,25,United-States,<=50K


In [9]:
house_df.shape

(21613, 21)

In [10]:
credit_df.shape

(30000, 24)

In [11]:
diam_df.shape

(53940, 10)

In [12]:
adult_df.shape

(48842, 15)

In [13]:
house_df.describe(include="all")

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
count,21613.0,21613,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,...,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0
unique,,372,,,,,,,,,...,,,,,,,,,,
top,,20140623T000000,,,,,,,,,...,,,,,,,,,,
freq,,142,,,,,,,,,...,,,,,,,,,,
mean,4580302000.0,,540088.1,3.370842,2.114757,2079.899736,15106.97,1.494309,0.007542,0.234303,...,7.656873,1788.390691,291.509045,1971.005136,84.402258,98077.939805,47.560053,-122.213896,1986.552492,12768.455652
std,2876566000.0,,367127.2,0.930062,0.770163,918.440897,41420.51,0.539989,0.086517,0.766318,...,1.175459,828.090978,442.575043,29.373411,401.67924,53.505026,0.138564,0.140828,685.391304,27304.179631
min,1000102.0,,75000.0,0.0,0.0,290.0,520.0,1.0,0.0,0.0,...,1.0,290.0,0.0,1900.0,0.0,98001.0,47.1559,-122.519,399.0,651.0
25%,2123049000.0,,321950.0,3.0,1.75,1427.0,5040.0,1.0,0.0,0.0,...,7.0,1190.0,0.0,1951.0,0.0,98033.0,47.471,-122.328,1490.0,5100.0
50%,3904930000.0,,450000.0,3.0,2.25,1910.0,7618.0,1.5,0.0,0.0,...,7.0,1560.0,0.0,1975.0,0.0,98065.0,47.5718,-122.23,1840.0,7620.0
75%,7308900000.0,,645000.0,4.0,2.5,2550.0,10688.0,2.0,0.0,0.0,...,8.0,2210.0,560.0,1997.0,0.0,98118.0,47.678,-122.125,2360.0,10083.0


In [14]:
credit_df.describe(include="all")

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
count,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,...,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0
mean,167484.322667,1.603733,1.853133,1.551867,35.4855,-0.0167,-0.133767,-0.1662,-0.220667,-0.2662,...,43262.948967,40311.400967,38871.7604,5663.5805,5921.163,5225.6815,4826.076867,4799.387633,5215.502567,0.2212
std,129747.661567,0.489129,0.790349,0.52197,9.217904,1.123802,1.197186,1.196868,1.169139,1.133187,...,64332.856134,60797.15577,59554.107537,16563.280354,23040.87,17606.96147,15666.159744,15278.305679,17777.465775,0.415062
min,10000.0,1.0,0.0,0.0,21.0,-2.0,-2.0,-2.0,-2.0,-2.0,...,-170000.0,-81334.0,-339603.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,50000.0,1.0,1.0,1.0,28.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,2326.75,1763.0,1256.0,1000.0,833.0,390.0,296.0,252.5,117.75,0.0
50%,140000.0,2.0,2.0,2.0,34.0,0.0,0.0,0.0,0.0,0.0,...,19052.0,18104.5,17071.0,2100.0,2009.0,1800.0,1500.0,1500.0,1500.0,0.0
75%,240000.0,2.0,2.0,2.0,41.0,0.0,0.0,0.0,0.0,0.0,...,54506.0,50190.5,49198.25,5006.0,5000.0,4505.0,4013.25,4031.5,4000.0,0.0
max,1000000.0,2.0,6.0,3.0,79.0,8.0,8.0,8.0,8.0,8.0,...,891586.0,927171.0,961664.0,873552.0,1684259.0,896040.0,621000.0,426529.0,528666.0,1.0


In [15]:
diam_df.describe(include="all")

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
count,53940.0,53940,53940,53940,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0
unique,,5,7,8,,,,,,
top,,Ideal,G,SI1,,,,,,
freq,,21551,11292,13065,,,,,,
mean,0.79794,,,,61.749405,57.457184,3932.799722,5.731157,5.734526,3.538734
std,0.474011,,,,1.432621,2.234491,3989.439738,1.121761,1.142135,0.705699
min,0.2,,,,43.0,43.0,326.0,0.0,0.0,0.0
25%,0.4,,,,61.0,56.0,950.0,4.71,4.72,2.91
50%,0.7,,,,61.8,57.0,2401.0,5.7,5.71,3.53
75%,1.04,,,,62.5,59.0,5324.25,6.54,6.54,4.04


In [16]:
adult_df.describe(include="all")

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
count,48842.0,48842,48842.0,48842,48842.0,48842,48842,48842,48842,48842,48842.0,48842.0,48842.0,48842,48842
unique,,9,,16,,7,15,6,5,2,,,,42,2
top,,Private,,HS-grad,,Married-civ-spouse,Prof-specialty,Husband,White,Male,,,,United-States,<=50K
freq,,33906,,15784,,22379,6172,19716,41762,32650,,,,43832,37155
mean,38.643585,,189664.1,,10.078089,,,,,,1079.067626,87.502314,40.422382,,
std,13.71051,,105604.0,,2.570973,,,,,,7452.019058,403.004552,12.391444,,
min,17.0,,12285.0,,1.0,,,,,,0.0,0.0,1.0,,
25%,28.0,,117550.5,,9.0,,,,,,0.0,0.0,40.0,,
50%,37.0,,178144.5,,10.0,,,,,,0.0,0.0,40.0,,
75%,48.0,,237642.0,,12.0,,,,,,0.0,0.0,45.0,,


In [17]:
credit_df.dtypes

LIMIT_BAL                     float64
SEX                             int64
EDUCATION                       int64
MARRIAGE                        int64
AGE                             int64
PAY_0                           int64
PAY_2                           int64
PAY_3                           int64
PAY_4                           int64
PAY_5                           int64
PAY_6                           int64
BILL_AMT1                     float64
BILL_AMT2                     float64
BILL_AMT3                     float64
BILL_AMT4                     float64
BILL_AMT5                     float64
BILL_AMT6                     float64
PAY_AMT1                      float64
PAY_AMT2                      float64
PAY_AMT3                      float64
PAY_AMT4                      float64
PAY_AMT5                      float64
PAY_AMT6                      float64
default.payment.next.month      int64
dtype: object

In [18]:
diam_df.dtypes

carat      float64
cut         object
color       object
clarity     object
depth      float64
table      float64
price        int64
x          float64
y          float64
z          float64
dtype: object

In [19]:
adult_df.dtypes

age                 int64
workclass          object
fnlwgt              int64
education          object
educational-num     int64
marital-status     object
occupation         object
relationship       object
race               object
gender             object
capital-gain        int64
capital-loss        int64
hours-per-week      int64
native-country     object
income             object
dtype: object

In [20]:
house_df.isna().sum()

id               0
date             0
price            0
bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
grade            0
sqft_above       0
sqft_basement    0
yr_built         0
yr_renovated     0
zipcode          0
lat              0
long             0
sqft_living15    0
sqft_lot15       0
dtype: int64

In [21]:
credit_df.isna().sum()

LIMIT_BAL                     0
SEX                           0
EDUCATION                     0
MARRIAGE                      0
AGE                           0
PAY_0                         0
PAY_2                         0
PAY_3                         0
PAY_4                         0
PAY_5                         0
PAY_6                         0
BILL_AMT1                     0
BILL_AMT2                     0
BILL_AMT3                     0
BILL_AMT4                     0
BILL_AMT5                     0
BILL_AMT6                     0
PAY_AMT1                      0
PAY_AMT2                      0
PAY_AMT3                      0
PAY_AMT4                      0
PAY_AMT5                      0
PAY_AMT6                      0
default.payment.next.month    0
dtype: int64

In [22]:
diam_df.isna().sum()

carat      0
cut        0
color      0
clarity    0
depth      0
table      0
price      0
x          0
y          0
z          0
dtype: int64

In [23]:
adult_df.isna().sum()

age                0
workclass          0
fnlwgt             0
education          0
educational-num    0
marital-status     0
occupation         0
relationship       0
race               0
gender             0
capital-gain       0
capital-loss       0
hours-per-week     0
native-country     0
income             0
dtype: int64

In [24]:
adult_df['income'] = adult_df['income'].map({'<=50K': 0, '>50K': 1})
adult_df['gender'] = adult_df['gender'].map({'Male': 1, 'Female': 0})

### Метрики и функция разбиения выборок

In [25]:
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, mean_squared_error, mean_absolute_error, r2_score

from sklearn.model_selection import train_test_split

## Задача классификации

### Метрики для задачи __классификации__:
- `accuracy_score` (Отношение всех правильных предсказаний к размеру выборки)
- `roc_auc_score` (Area under ROC-curve)
- `f1_score` (Среднее гармоническое между `precision` и `recall`)  

### Accuracy
$$Accuracy = \frac{TP + TN}{TP + TN + FP + FN},$$  

* $TP$ - True Positive (Классификатор верно предсказал единицу)
* $TN$ - True Negative (Классификатор верно предсказал ноль)
* $FP$ - False Positive (Ошибка 1-го рода или $y_{pred} = 1$, хотя $y_{true} = 0$)
* $FN$ - False Negative (Ошибка 2-го рода или $y_{pred} = 0$, хотя $y_{true} = 1$)

### ROC AUC
ROC_AUC это площадь под ROC-кривой (Area under ROC-curve)  
ROC кривая - кривая в координатах `FPR` (False Positive Rate), `TPR` (True Positive Rate)  

###### True Positive Rate
$$TPR = \frac{TP}{TP + FN}$$

###### False Positive Rate
$$FPR = \frac{FP}{FP + TN}$$

![Alt text](https://pbs.twimg.com/media/C-MJ6SLXkAAwhV6.jpg "a title")

### F1 score (F-мера)
$$F1 =  \frac{2 * Precision * Recall}{Precision + Recall}$$

###### Precison
$$Precision = \frac{TP}{TP + FP}$$

###### Recall (или True Positive Rate)
$$Recall = \frac{TP}{TP + FN}$$

In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier

#### Датасет с доходами людей

In [27]:
X = pd.concat([adult_df[['age', 'capital-gain', 'capital-loss', 'fnlwgt', 'educational-num', 'gender', 'hours-per-week']], 
               pd.get_dummies(adult_df['workclass'], prefix='work'),
               pd.get_dummies(adult_df['education'], prefix='edu'),
               pd.get_dummies(adult_df['marital-status']),
               pd.get_dummies(adult_df['occupation'], prefix='job'),
               pd.get_dummies(adult_df['relationship']),
               pd.get_dummies(adult_df['race']),
               pd.get_dummies(adult_df['native-country'], prefix='nation')], axis=1)
y = adult_df['income']

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [29]:
X_train.shape

(32724, 107)

In [30]:
X_test.shape

(16118, 107)

### Логистическая регрессия

In [31]:
%%time
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

CPU times: user 2.01 s, sys: 1.07 s, total: 3.08 s
Wall time: 445 ms


LogisticRegression()

In [32]:
%%time
log_reg_preds = log_reg.predict(X_test)

CPU times: user 77.5 ms, sys: 104 ms, total: 182 ms
Wall time: 26.5 ms


In [33]:
accuracy_score(y_test, log_reg_preds)

0.8040079414319394

In [34]:
roc_auc_score(y_test, log_reg_preds)

0.6185618960226571

In [35]:
f1_score(y_test, log_reg_preds)

0.3916811091854419

### K-ближайших соседей

In [36]:
%%time
knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train, y_train)

CPU times: user 55.8 ms, sys: 86.5 ms, total: 142 ms
Wall time: 23.6 ms


KNeighborsClassifier()

In [37]:
%%time
knn_clf_preds = knn_clf.predict(X_test)

CPU times: user 21.5 s, sys: 49.1 s, total: 1min 10s
Wall time: 15.7 s


In [38]:
accuracy_score(y_test, knn_clf_preds)

0.7789427968730612

In [39]:
roc_auc_score(y_test, knn_clf_preds)

0.6232520823299612

In [40]:
f1_score(y_test, knn_clf_preds)

0.4121432106913051

### Метод опорных векторов

In [41]:
%%time
svm_clf = SVC()
svm_clf.fit(X_train, y_train)

CPU times: user 55.2 s, sys: 593 ms, total: 55.8 s
Wall time: 55.9 s


SVC()

In [42]:
%%time
svm_clf_preds = svm_clf.predict(X_test)

CPU times: user 25.2 s, sys: 125 ms, total: 25.3 s
Wall time: 25.3 s


In [43]:
accuracy_score(y_test, svm_clf_preds)

0.8016503288249163

In [44]:
roc_auc_score(y_test, svm_clf_preds)

0.58165436679794

In [45]:
f1_score(y_test, svm_clf_preds)

0.2820570401976196

### Наивный байесовский классификатор

In [46]:
%%time
nb_clf = GaussianNB()
nb_clf.fit(X_train, y_train)

CPU times: user 65.8 ms, sys: 1.29 ms, total: 67 ms
Wall time: 64.9 ms


GaussianNB()

In [47]:
%%time
nb_clf_preds = nb_clf.predict(X_test)

CPU times: user 30 ms, sys: 1.2 ms, total: 31.2 ms
Wall time: 29.2 ms


In [48]:
accuracy_score(y_test, nb_clf_preds)

0.7999131405881623

In [49]:
roc_auc_score(y_test, nb_clf_preds)

0.6327011996386921

In [50]:
f1_score(y_test, nb_clf_preds)

0.42707408065375735

### Случайный лес

In [51]:
%%time
rf_clf = RandomForestClassifier(random_state=42)
rf_clf.fit(X_train, y_train)

CPU times: user 4.14 s, sys: 22.4 ms, total: 4.17 s
Wall time: 4.17 s


RandomForestClassifier(random_state=42)

In [52]:
%%time
rf_clf_preds = rf_clf.predict(X_test)

CPU times: user 436 ms, sys: 2.77 ms, total: 439 ms
Wall time: 437 ms


In [53]:
accuracy_score(y_test, rf_clf_preds)

0.8624519171113041

In [54]:
roc_auc_score(y_test, rf_clf_preds)

0.7859000638798165

In [55]:
f1_score(y_test, rf_clf_preds)

0.687350162177408

### Градиентный бустинг

In [56]:
%%time
cat_clf = CatBoostClassifier(random_seed=42, verbose=0)
cat_clf.fit(X_train, y_train)

CPU times: user 27.2 s, sys: 2.77 s, total: 29.9 s
Wall time: 4.18 s


<catboost.core.CatBoostClassifier at 0x7f982e5b12b0>

In [57]:
%%time
cat_clf_preds = cat_clf.predict(X_test) 

CPU times: user 76.7 ms, sys: 10.9 ms, total: 87.6 ms
Wall time: 28.1 ms


In [58]:
accuracy_score(y_test, cat_clf_preds)

0.8792654175456012

In [59]:
roc_auc_score(y_test, cat_clf_preds)

0.8048113683179688

In [60]:
f1_score(y_test, cat_clf_preds)

0.7217615098655991

### Результаты классификации

In [61]:
clf_results = pd.DataFrame.from_dict({
    'classifier': ['Log-Reg', 'k-NN', 'SVM', 'Naive-Bayes', 'Random-Forest', 'Grad-Boost'],
    'accuracy_score': [accuracy_score(y_test, log_reg_preds),
                       accuracy_score(y_test, knn_clf_preds),
                       accuracy_score(y_test, svm_clf_preds),
                       accuracy_score(y_test, nb_clf_preds),
                       accuracy_score(y_test, rf_clf_preds),
                       accuracy_score(y_test, cat_clf_preds)],
    'roc_auc_score': [roc_auc_score(y_test, log_reg_preds),
                      roc_auc_score(y_test, knn_clf_preds),
                      roc_auc_score(y_test, svm_clf_preds),
                      roc_auc_score(y_test, nb_clf_preds),
                      roc_auc_score(y_test, rf_clf_preds),
                      roc_auc_score(y_test, cat_clf_preds)],
    'f1_score': [f1_score(y_test, log_reg_preds),
                 f1_score(y_test, knn_clf_preds),
                 f1_score(y_test, svm_clf_preds),
                 f1_score(y_test, nb_clf_preds),
                 f1_score(y_test, rf_clf_preds),
                 f1_score(y_test, cat_clf_preds)]
})

In [62]:
clf_results.round(3)

Unnamed: 0,classifier,accuracy_score,roc_auc_score,f1_score
0,Log-Reg,0.804,0.619,0.392
1,k-NN,0.779,0.623,0.412
2,SVM,0.802,0.582,0.282
3,Naive-Bayes,0.8,0.633,0.427
4,Random-Forest,0.862,0.786,0.687
5,Grad-Boost,0.879,0.805,0.722


#### Датасет с банкнотами

In [63]:
credit_df

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,20000.0,2,2,1,24,2,2,-1,-1,-2,...,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,120000.0,2,2,2,26,-1,2,0,0,0,...,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,90000.0,2,2,2,34,0,0,0,0,0,...,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,50000.0,2,2,1,37,0,0,0,0,0,...,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,50000.0,1,2,1,57,-1,0,-1,0,0,...,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,220000.0,1,3,1,39,0,0,0,0,0,...,88004.0,31237.0,15980.0,8500.0,20000.0,5003.0,3047.0,5000.0,1000.0,0
29996,150000.0,1,3,2,43,-1,-1,-1,-1,0,...,8979.0,5190.0,0.0,1837.0,3526.0,8998.0,129.0,0.0,0.0,0
29997,30000.0,1,2,2,37,4,3,2,-1,0,...,20878.0,20582.0,19357.0,0.0,0.0,22000.0,4200.0,2000.0,3100.0,1
29998,80000.0,1,3,1,41,1,-1,0,0,0,...,52774.0,11855.0,48944.0,85900.0,3409.0,1178.0,1926.0,52964.0,1804.0,1


In [64]:
X = credit_df.drop('default.payment.next.month', axis=1)
y = credit_df['default.payment.next.month']

In [65]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

### Логистическая регрессия

In [66]:
%%time
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

CPU times: user 620 ms, sys: 886 ms, total: 1.51 s
Wall time: 232 ms


LogisticRegression()

In [67]:
%%time
log_reg_preds = log_reg.predict(X_test)

CPU times: user 6.07 ms, sys: 15.1 ms, total: 21.2 ms
Wall time: 3.42 ms


In [68]:
accuracy_score(y_test, log_reg_preds)

0.7822222222222223

In [69]:
roc_auc_score(y_test, log_reg_preds)

0.5004633920296571

In [70]:
f1_score(y_test, log_reg_preds)

0.001851851851851852

### K-ближайших соседей

In [71]:
%%time
knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train, y_train)

CPU times: user 5.04 ms, sys: 30.5 ms, total: 35.6 ms
Wall time: 7.21 ms


KNeighborsClassifier()

In [72]:
%%time
knn_clf_preds = knn_clf.predict(X_test)

CPU times: user 6.18 s, sys: 15.8 s, total: 22 s
Wall time: 5.72 s


In [73]:
accuracy_score(y_test, knn_clf_preds)

0.7515151515151515

In [74]:
roc_auc_score(y_test, knn_clf_preds)

0.5446674722258068

In [75]:
f1_score(y_test, knn_clf_preds)

0.23791821561338292

### Метод опорных векторов

In [76]:
%%time
svm_clf = SVC()
svm_clf.fit(X_train, y_train)

CPU times: user 12.2 s, sys: 61.5 ms, total: 12.3 s
Wall time: 12.3 s


SVC()

In [77]:
%%time
svm_clf_preds = svm_clf.predict(X_test)

CPU times: user 5.32 s, sys: 16.5 ms, total: 5.34 s
Wall time: 5.34 s


In [78]:
accuracy_score(y_test, svm_clf_preds)

0.7820202020202021

In [79]:
roc_auc_score(y_test, svm_clf_preds)

0.5

In [80]:
f1_score(y_test, svm_clf_preds)

0.0

### Наивный байесовский классификатор

In [81]:
%%time
nb_clf = GaussianNB()
nb_clf.fit(X_train, y_train)

CPU times: user 12.5 ms, sys: 710 µs, total: 13.2 ms
Wall time: 11 ms


GaussianNB()

In [82]:
%%time
nb_clf_preds = nb_clf.predict(X_test)

CPU times: user 7.75 ms, sys: 0 ns, total: 7.75 ms
Wall time: 5.45 ms


In [83]:
accuracy_score(y_test, nb_clf_preds)

0.37747474747474746

In [84]:
roc_auc_score(y_test, nb_clf_preds)

0.5556858716785948

In [85]:
f1_score(y_test, nb_clf_preds)

0.3790428211586902

### Случайный лес

In [86]:
%%time
rf_clf = RandomForestClassifier(random_state=42)
rf_clf.fit(X_train, y_train)

CPU times: user 5.22 s, sys: 21.1 ms, total: 5.24 s
Wall time: 5.25 s


RandomForestClassifier(random_state=42)

In [87]:
%%time
rf_clf_preds = rf_clf.predict(X_test)

CPU times: user 206 ms, sys: 2.15 ms, total: 208 ms
Wall time: 206 ms


In [88]:
accuracy_score(y_test, rf_clf_preds)

0.8176767676767677

In [89]:
roc_auc_score(y_test, rf_clf_preds)

0.6571567553124886

In [90]:
f1_score(y_test, rf_clf_preds)

0.47113975974216227

### Градиентный бустинг

In [91]:
%%time
cat_clf = CatBoostClassifier(random_seed=42, verbose=0)
cat_clf.fit(X_train, y_train)

CPU times: user 26.8 s, sys: 2.83 s, total: 29.6 s
Wall time: 4.15 s


<catboost.core.CatBoostClassifier at 0x7f982e5732e0>

In [92]:
%%time
cat_clf_preds = cat_clf.predict(X_test) 

CPU times: user 40.5 ms, sys: 2.76 ms, total: 43.2 ms
Wall time: 12.9 ms


In [93]:
accuracy_score(y_test, cat_clf_preds)

0.8207070707070707

In [94]:
roc_auc_score(y_test, cat_clf_preds)

0.6580915598486787

In [95]:
f1_score(y_test, cat_clf_preds)

0.473450014832394

### Результаты классификации

In [96]:
clf_results = pd.DataFrame.from_dict({
    'classifier': ['Log-Reg', 'k-NN', 'SVM', 'Naive-Bayes', 'Random-Forest', 'Grad-Boost'],
    'accuracy_score': [accuracy_score(y_test, log_reg_preds),
                       accuracy_score(y_test, knn_clf_preds),
                       accuracy_score(y_test, svm_clf_preds),
                       accuracy_score(y_test, nb_clf_preds),
                       accuracy_score(y_test, rf_clf_preds),
                       accuracy_score(y_test, cat_clf_preds)],
    'roc_auc_score': [roc_auc_score(y_test, log_reg_preds),
                      roc_auc_score(y_test, knn_clf_preds),
                      roc_auc_score(y_test, svm_clf_preds),
                      roc_auc_score(y_test, nb_clf_preds),
                      roc_auc_score(y_test, rf_clf_preds),
                      roc_auc_score(y_test, cat_clf_preds)],
    'f1_score': [f1_score(y_test, log_reg_preds),
                 f1_score(y_test, knn_clf_preds),
                 f1_score(y_test, svm_clf_preds),
                 f1_score(y_test, nb_clf_preds),
                 f1_score(y_test, rf_clf_preds),
                 f1_score(y_test, cat_clf_preds)]
})

In [97]:
clf_results.round(3)

Unnamed: 0,classifier,accuracy_score,roc_auc_score,f1_score
0,Log-Reg,0.782,0.5,0.002
1,k-NN,0.752,0.545,0.238
2,SVM,0.782,0.5,0.0
3,Naive-Bayes,0.377,0.556,0.379
4,Random-Forest,0.818,0.657,0.471
5,Grad-Boost,0.821,0.658,0.473


## Задача регресии

### Метрики для задачи __регрессии__:
- `Root mean squared error` (Квадратный корень из суммы квадратов ошибок, поделенной на размер выборки)
- `Mean absolute error` (Сумма модулей ошибок, поделенная на размер выборки)
- `$R^2$ (Коэффициент детерминации)

###### RMSE (Root Mean Squared Error)
$$RMSE = \sqrt{\frac{1}{n} * \sum_{i=1}^{n} (y_i - \hat{y_i})^2}$$

###### MAE (Mean Absolute Error)
$$MAE = \frac{1}{n} * \sum_{i=1}^{n} \lvert{y_i - \hat{y}\rvert}$$

###### R^2 (Коэффициент детерминации)
$$R^2 = \frac{\sum (\hat{y_i} - \bar{y})^2}{\sum (y_i - \bar{y})^2}$$

In [98]:
def rmse(y_true, y_pred):
    """
    Root mean squared error
    """
    return mean_squared_error(y_true, y_pred) ** (1/2)

In [99]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor

#### Датасет с ценами домов

In [100]:
X = house_df.drop(['id', 'date', 'price'], axis=1)
y = house_df['price']

In [101]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [102]:
X_train.shape

(14480, 18)

In [103]:
X_test.shape

(7133, 18)

### Линейная регрессия

In [104]:
%%time
lr = LinearRegression()
lr.fit(X_train, y_train)

CPU times: user 14.3 ms, sys: 19.2 ms, total: 33.5 ms
Wall time: 18.9 ms


LinearRegression()

In [105]:
%%time
lr_preds = lr.predict(X_test)

CPU times: user 24.7 ms, sys: 43 ms, total: 67.7 ms
Wall time: 10.5 ms


In [106]:
rmse(y_test, lr_preds)

213134.00452624427

In [107]:
mean_absolute_error(y_test, lr_preds)

126908.676658051

In [108]:
r2_score(y_test, lr_preds)

0.6966159937240252

### K-ближайших соседей

In [109]:
%%time
knn_reg = KNeighborsRegressor()
knn_reg.fit(X_train, y_train)

CPU times: user 0 ns, sys: 19.8 ms, total: 19.8 ms
Wall time: 2.78 ms


KNeighborsRegressor()

In [110]:
%%time
knn_reg_preds = knn_reg.predict(X_test)

CPU times: user 2.83 s, sys: 6.93 s, total: 9.77 s
Wall time: 2.71 s


In [111]:
rmse(y_test, knn_reg_preds)

280386.5898978201

In [112]:
mean_absolute_error(y_test, knn_reg_preds)

164370.81163605777

In [113]:
r2_score(y_test, knn_reg_preds)

0.47494878273087826

### Метод опорных векторов

In [114]:
%%time
svm_reg = SVR()
svm_reg.fit(X_train, y_train)

CPU times: user 8.77 s, sys: 51.4 ms, total: 8.83 s
Wall time: 8.83 s


SVR()

In [115]:
%%time
svm_reg_preds = svm_reg.predict(X_test)

CPU times: user 4.82 s, sys: 13.9 ms, total: 4.84 s
Wall time: 4.84 s


In [116]:
rmse(y_test, svm_reg_preds)

398365.58463666885

In [117]:
mean_absolute_error(y_test, svm_reg_preds)

227197.43181035056

In [118]:
r2_score(y_test, svm_reg_preds)

-0.05986564849943177

### Случайный лес

In [119]:
%%time
rf_reg = RandomForestRegressor(random_state=42)
rf_reg.fit(X_train, y_train)

CPU times: user 10.4 s, sys: 25.1 ms, total: 10.4 s
Wall time: 10.4 s


RandomForestRegressor(random_state=42)

In [120]:
%%time
rf_reg_preds = rf_reg.predict(X_test)

CPU times: user 200 ms, sys: 780 µs, total: 201 ms
Wall time: 199 ms


In [121]:
rmse(y_test, rf_reg_preds)

148309.97295270977

In [122]:
mean_absolute_error(y_test, rf_reg_preds)

74717.87312561557

In [123]:
r2_score(y_test, rf_reg_preds)

0.8530979339996936

### Градиентный бустинг

In [124]:
%%time
cat_reg = CatBoostRegressor(random_seed=42, verbose=0)
cat_reg.fit(X_train, y_train)

CPU times: user 14.1 s, sys: 1.17 s, total: 15.3 s
Wall time: 2.24 s


<catboost.core.CatBoostRegressor at 0x7f982e5d53a0>

In [125]:
%%time
cat_reg_preds = cat_reg.predict(X_test)

CPU times: user 38.2 ms, sys: 876 µs, total: 39 ms
Wall time: 11 ms


In [126]:
rmse(y_test, cat_reg_preds)

123987.32561929805

In [127]:
mean_absolute_error(y_test, cat_reg_preds)

65331.75880429799

In [128]:
r2_score(y_test, cat_reg_preds)

0.8973304219461172

In [129]:
reg_results = pd.DataFrame.from_dict({
    'regression': ['Lin-Reg', 'k-NN', 'SVM', 'Random-Forest', 'Grad-Boost'],
    'RMSE': [rmse(y_test, lr_preds),
             rmse(y_test, knn_reg_preds),
             rmse(y_test, svm_reg_preds),
             rmse(y_test, rf_reg_preds),
             rmse(y_test, cat_reg_preds)],
    'MAE': [mean_absolute_error(y_test, lr_preds),
             mean_absolute_error(y_test, knn_reg_preds),
             mean_absolute_error(y_test, svm_reg_preds),
             mean_absolute_error(y_test, rf_reg_preds),
             mean_absolute_error(y_test, cat_reg_preds)],
    'R^2': [r2_score(y_test, lr_preds),
            r2_score(y_test, knn_reg_preds),
            r2_score(y_test, svm_reg_preds),
            r2_score(y_test, rf_reg_preds),
            r2_score(y_test, cat_reg_preds)]
})

In [130]:
reg_results.round(3)

Unnamed: 0,regression,RMSE,MAE,R^2
0,Lin-Reg,213134.005,126908.677,0.697
1,k-NN,280386.59,164370.812,0.475
2,SVM,398365.585,227197.432,-0.06
3,Random-Forest,148309.973,74717.873,0.853
4,Grad-Boost,123987.326,65331.759,0.897


#### Датасет с бриллиантами

In [131]:
diam_df

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50
53936,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61
53937,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56
53938,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74


In [132]:
X = pd.concat([diam_df[['depth', 'table', 'x', 'y', 'z']],
               pd.get_dummies(diam_df['cut']),
               pd.get_dummies(diam_df['color']),
               pd.get_dummies(diam_df['clarity'])], axis=1)
y = diam_df['price']

In [133]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [134]:
X_train

Unnamed: 0,depth,table,x,y,z,Fair,Good,Ideal,Premium,Very Good,...,I,J,I1,IF,SI1,SI2,VS1,VS2,VVS1,VVS2
241,64.5,58.0,6.29,6.21,4.03,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
17398,61.4,57.0,4.41,4.38,2.70,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
36608,62.5,59.0,4.43,4.46,2.78,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
44731,61.8,55.0,5.30,5.34,3.29,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
18104,61.0,57.0,6.48,6.50,3.96,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11284,62.4,59.0,6.48,6.51,4.05,0,0,0,0,1,...,1,0,0,0,0,0,0,1,0,0
44732,61.0,55.0,5.03,5.01,3.06,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
38158,60.3,58.0,4.49,4.46,2.70,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
860,62.8,59.0,6.13,6.03,3.82,0,0,0,1,0,...,0,1,0,0,1,0,0,0,0,0


In [135]:
y_train[:5]

241      2788
17398     612
36608     477
44731    1616
18104    7324
Name: price, dtype: int64

### Линейная регрессия

In [136]:
%%time
lr = LinearRegression()
lr.fit(X_train, y_train)

CPU times: user 45.1 ms, sys: 44.1 ms, total: 89.2 ms
Wall time: 27.6 ms


LinearRegression()

In [137]:
%%time
lr_preds = lr.predict(X_test)

CPU times: user 29.4 ms, sys: 53.5 ms, total: 82.9 ms
Wall time: 12.8 ms


In [138]:
rmse(y_test, lr_preds)

1589.1516823617717

In [139]:
mean_absolute_error(y_test, lr_preds)

1204.3856494511401

In [140]:
r2_score(y_test, lr_preds)

0.838247784063835

### K-ближайших соседей

In [141]:
%%time
knn_reg = KNeighborsRegressor()
knn_reg.fit(X_train, y_train)

CPU times: user 22.6 ms, sys: 38.1 ms, total: 60.6 ms
Wall time: 9.37 ms


KNeighborsRegressor()

In [142]:
%%time
knn_reg_preds = knn_reg.predict(X_test)

CPU times: user 18.9 s, sys: 59.1 s, total: 1min 17s
Wall time: 19 s


In [143]:
rmse(y_test, knn_reg_preds)

987.9057206349817

In [144]:
mean_absolute_error(y_test, knn_reg_preds)

527.42804336835

In [145]:
r2_score(y_test, knn_reg_preds)

0.9374898507811031

### Метод опорных векторов

In [146]:
%%time
svm_reg = SVR()
svm_reg.fit(X_train, y_train)

CPU times: user 1min 43s, sys: 440 ms, total: 1min 44s
Wall time: 1min 44s


SVR()

In [147]:
%%time
svm_reg_preds = svm_reg.predict(X_test)

CPU times: user 35.2 s, sys: 96.4 ms, total: 35.3 s
Wall time: 35.4 s


In [148]:
rmse(y_test, svm_reg_preds)

4198.905854464755

In [149]:
mean_absolute_error(y_test, svm_reg_preds)

2748.9866472154135

In [150]:
r2_score(y_test, svm_reg_preds)

-0.1292544470359802

### Случайный лес

In [151]:
%%time
rf_reg = RandomForestRegressor(random_state=42)
rf_reg.fit(X_train, y_train)

CPU times: user 14.2 s, sys: 107 ms, total: 14.3 s
Wall time: 14.4 s


RandomForestRegressor(random_state=42)

In [152]:
%%time
rf_reg_preds = rf_reg.predict(X_test)

CPU times: user 480 ms, sys: 5.52 ms, total: 486 ms
Wall time: 483 ms


In [153]:
rmse(y_test, rf_reg_preds)

592.816460619321

In [154]:
mean_absolute_error(y_test, rf_reg_preds)

283.1482259949727

In [155]:
r2_score(y_test, rf_reg_preds)

0.977490800868683

### Градиентный бустинг

In [156]:
%%time
cat_reg = CatBoostRegressor(random_seed=42, verbose=0)
cat_reg.fit(X_train, y_train)

CPU times: user 17.4 s, sys: 1.26 s, total: 18.6 s
Wall time: 2.7 s


<catboost.core.CatBoostRegressor at 0x7f982e5d58e0>

In [157]:
%%time
cat_reg_preds = cat_reg.predict(X_test)

CPU times: user 57.2 ms, sys: 7.24 ms, total: 64.4 ms
Wall time: 15.4 ms


In [158]:
rmse(y_test, cat_reg_preds)

550.1572228538828

In [159]:
mean_absolute_error(y_test, cat_reg_preds)

286.77897818369075

In [160]:
r2_score(y_test, cat_reg_preds)

0.9806137783830206

In [161]:
reg_results = pd.DataFrame.from_dict({
    'regression': ['Lin-Reg', 'k-NN', 'SVM', 'Random-Forest', 'Grad-Boost'],
    'RMSE': [rmse(y_test, lr_preds),
             rmse(y_test, knn_reg_preds),
             rmse(y_test, svm_reg_preds),
             rmse(y_test, rf_reg_preds),
             rmse(y_test, cat_reg_preds)],
    'MAE': [mean_absolute_error(y_test, lr_preds),
             mean_absolute_error(y_test, knn_reg_preds),
             mean_absolute_error(y_test, svm_reg_preds),
             mean_absolute_error(y_test, rf_reg_preds),
             mean_absolute_error(y_test, cat_reg_preds)],
    'R^2': [r2_score(y_test, lr_preds),
            r2_score(y_test, knn_reg_preds),
            r2_score(y_test, svm_reg_preds),
            r2_score(y_test, rf_reg_preds),
            r2_score(y_test, cat_reg_preds)]
})

In [162]:
reg_results.round(3)

Unnamed: 0,regression,RMSE,MAE,R^2
0,Lin-Reg,1589.152,1204.386,0.838
1,k-NN,987.906,527.428,0.937
2,SVM,4198.906,2748.987,-0.129
3,Random-Forest,592.816,283.148,0.977
4,Grad-Boost,550.157,286.779,0.981
