In [1081]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score, recall_score, precision_score,\
                            confusion_matrix, mean_absolute_error, mean_squared_error, r2_score,\
                            explained_variance_score
from scipy import stats

import warnings
warnings.simplefilter('ignore')
%matplotlib inline

# Laptop Prices

In [1082]:
data = pd.read_csv('laptops.csv', sep=',', encoding='latin-1')
data.drop(['Unnamed: 0'], axis=1, inplace=True)
data.shape

(1303, 12)

In [1085]:
data['Weight_kg'] = data['Weight'].apply(lambda x: float(x[:-2]))
data['Ram_GB'] = data['Ram'].apply(lambda x: float(x[:-2]))
data.drop(['Weight','Ram'], axis=1,inplace=True)

In [1086]:
def Get_Memory(string_memory, type_memory):
    memory = string_memory.split('+')
    value_memory=0    
    for m in memory:
        index_memory = m.find("GB "+type_memory)
        if index_memory!=-1:
            value_memory = m[:index_memory]
        
    return int(value_memory)

In [1087]:
data['SSD'] = data['Memory'].apply(lambda x: Get_Memory(x, 'SSD'))
data['HDD'] = data['Memory'].apply(lambda x: Get_Memory(x, 'HDD'))
data['Flash'] = data['Memory'].apply(lambda x: Get_Memory(x, 'Flash'))
data.drop('Memory',axis=1,inplace=True)

In [1088]:
def get_screen_resol(screen):
    resolution = screen.split()[-1]    
    width, height = resolution.split('x')     
    return float(width)*float(height)    

In [1089]:
#признак в виде произведения кол.пикселей по высоте на кол.пиксеоей по ширине
data['Resolution'] = data['ScreenResolution'].apply(get_screen_resol)

In [1090]:
def get_ghz(cpu):
    return float(cpu.split()[-1][:-3])

def get_cpu_series(cpu):
    cpu_split = cpu.split()[0:3]
        
    return ' '.join(cpu.split()[0:3])

In [1091]:
data['Cpu_Ghz'] = data['Cpu'].apply(get_ghz)
data['Cpu'] = data['Cpu'].apply(get_cpu_series)
data['Gpu'] = data['Gpu'].apply(get_cpu_series)

In [1092]:
# OneHotEncoding для типа ноутбука
ohe = pd.get_dummies(data['TypeName'])
data = data.join(ohe)

In [1099]:
data = data[data['Price_euros']<4000]

In [1108]:
data['Cpu_company'] = data['Cpu'].apply(lambda x: x.split()[0])

In [1113]:
data['Gpu_'] = data['Gpu'].apply(lambda gpu: gpu.split()[0])

#### OneHotEncoding для категориальных признаков

In [1117]:
data.drop(['TypeName','Gpu_','ScreenResolution','Cpu_company'], axis=1,inplace=True)

In [1118]:
ohe = pd.get_dummies(data['OpSys'])
data = data.join(ohe)
data.drop('OpSys',axis=1,inplace=True)

In [1119]:
ohe = pd.get_dummies(data['Cpu'])
data = data.join(ohe)
data.drop('Cpu',axis=1,inplace=True)

In [1120]:
ohe = pd.get_dummies(data['Company'])
data = data.join(ohe)
data.drop('Company',axis=1,inplace=True)

In [1121]:
ohe = pd.get_dummies(data['Product'])
data = data.join(ohe)
data.drop('Product',axis=1,inplace=True)

In [1122]:
ohe = pd.get_dummies(data['Gpu'])
data = data.join(ohe)
data.drop('Gpu',axis=1,inplace=True)

In [1124]:
data_class = data.drop('Price_euros',axis=1)
target_class = data['Price_euros']>data['Price_euros'].mean()
target_class = target_class.apply(int)

In [1132]:
X_train, X_test, y_train, y_test = train_test_split(data_class,
                                                    target_class,
                                                    test_size = 0.2,
                                                    random_state=17,
                                                    shuffle=True)

In [1147]:
scalar = StandardScaler()

X_train_scale_class = scalar.fit_transform(X_train)
X_test_scale_class = scalar.transform(X_test)

y_train_class = y_train
y_test_class = y_test

# Assignment 5.

In [1178]:
import lightgbm as lgb
import xgboost as xgb
import catboost as cb

from sklearn.ensemble import AdaBoostClassifier

In [1179]:
predictions=[]

#### 5.1. AdaBoostClassifier

In [1180]:
ab = AdaBoostClassifier(learning_rate=0.05)

In [1181]:
ab.fit(X_train_scale_class, y_train_class)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=0.05, n_estimators=50, random_state=None)

In [1182]:
ab_pred = ab.predict(X_test_scale_class)
predictions.append(('AdaBoostClassifier',ab_pred))

#### 5.2. Xgboost

In [1183]:
xgb_model = xgb.XGBClassifier(
                    max_depth=10,
                    learning_rate=0.1,
                    n_estimators=100,
)

In [1184]:
xgb_model.fit(X_train_scale_class, y_train_class)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=10, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [1185]:
xgb_pred = xgb_model.predict(X_test_scale_class)
predictions.append(('XGBClassifier',xgb_pred))

#### 5.3. Lightgbm

In [1186]:
bst = lgb.LGBMClassifier(max_depth=10, learning_rate=0.1)

bst.fit(X_train_scale_class,y_train_class)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=10,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
        random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [1187]:
lgb_pred = bst.predict(X_test_scale_class)
predictions.append(('Lightgbm', lgb_pred))

#### 5.4. CatBoost

In [1188]:
cb_model = cb.CatBoostClassifier(depth=10, learning_rate=0.1, n_estimators=100)

In [1189]:
cb_model.fit(X_train_scale_class, y_train_class)

0:	learn: 0.6039074	total: 183ms	remaining: 18.1s
1:	learn: 0.5433950	total: 390ms	remaining: 19.1s
2:	learn: 0.5045931	total: 639ms	remaining: 20.7s
3:	learn: 0.4690543	total: 779ms	remaining: 18.7s
4:	learn: 0.4460179	total: 984ms	remaining: 18.7s
5:	learn: 0.4205295	total: 1.17s	remaining: 18.3s
6:	learn: 0.4085795	total: 1.34s	remaining: 17.8s
7:	learn: 0.3896932	total: 1.51s	remaining: 17.4s
8:	learn: 0.3748641	total: 1.69s	remaining: 17.1s
9:	learn: 0.3638789	total: 1.82s	remaining: 16.4s
10:	learn: 0.3499962	total: 1.98s	remaining: 16s
11:	learn: 0.3418443	total: 2.11s	remaining: 15.5s
12:	learn: 0.3314468	total: 2.27s	remaining: 15.2s
13:	learn: 0.3251211	total: 2.44s	remaining: 15s
14:	learn: 0.3208362	total: 2.58s	remaining: 14.6s
15:	learn: 0.3151775	total: 2.73s	remaining: 14.3s
16:	learn: 0.3109645	total: 2.89s	remaining: 14.1s
17:	learn: 0.3079371	total: 3.02s	remaining: 13.8s
18:	learn: 0.3038725	total: 3.2s	remaining: 13.7s
19:	learn: 0.2990453	total: 3.36s	remaining: 1

<catboost.core.CatBoostClassifier at 0x7f6ba8e754a8>

In [1190]:
cb_pred = cb_model.predict(X_test_scale_class)
predictions.append(('CatBoost',lgb_pred))

In [1191]:
for name_model, pred_model in predictions:
    print(name_model, 'accuracy #', accuracy_score(y_test_class, pred_model))

AdaBoostClassifier accuracy # 0.823076923076923
XGBClassifier accuracy # 0.8769230769230769
Lightgbm accuracy # 0.8769230769230769
CatBoost accuracy # 0.8769230769230769


Алгоритмы XGBClassifier, Lightgbm, CatBoost показали идентичные accuracy, но эти модели имели одинаковые параметры и не были оптимально настроены.

#### Обучим алгоримты с различными параметрами

In [1192]:
bst = lgb.LGBMClassifier(max_depth=15, learning_rate=0.01)
bst.fit(X_train_scale_class,y_train_class)
predictions.append(('LGBMClassifier(max_depth=15, lr=0.01)', bst.predict(X_test_scale_class)))

In [1193]:
xgb_model = xgb.XGBClassifier(
                    max_depth=5,
                    learning_rate=0.1,
                    n_estimators=100,
)
xgb_model.fit(X_train_scale_class, y_train_class)
xgb_pred = xgb_model.predict(X_test_scale_class)
predictions.append(('XGBClassifier(max_depth=5, lr=0.1)',xgb_pred))

In [1194]:
cb_model = cb.CatBoostClassifier(depth=2, learning_rate=0.5, n_estimators=5)
cb_model.fit(X_train_scale_class, y_train_class)
cb_pred = cb_model.predict(X_test_scale_class)
predictions.append(('CatBoost(depth=2, lr=0.5, n_estim = 5)',cb_pred))

0:	learn: 0.5581426	total: 11.3ms	remaining: 45.2ms
1:	learn: 0.4505916	total: 21.2ms	remaining: 31.8ms
2:	learn: 0.4174039	total: 30.6ms	remaining: 20.4ms
3:	learn: 0.3775601	total: 42.2ms	remaining: 10.6ms
4:	learn: 0.3559214	total: 51.3ms	remaining: 0us


In [1195]:
for name_model, pred_model in predictions:
    print(name_model, 'accuracy #', accuracy_score(y_test_class, pred_model))

AdaBoostClassifier accuracy # 0.823076923076923
XGBClassifier accuracy # 0.8769230769230769
Lightgbm accuracy # 0.8769230769230769
CatBoost accuracy # 0.8769230769230769
LGBMClassifier(max_depth=15, lr=0.01) accuracy # 0.8538461538461538
XGBClassifier(max_depth=5, lr=0.1) accuracy # 0.8923076923076924
CatBoost(depth=2, lr=0.5, n_estim = 5) accuracy # 0.823076923076923
