In [1081]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score, recall_score, precision_score,\
                            confusion_matrix, mean_absolute_error, mean_squared_error, r2_score,\
                            explained_variance_score
from scipy import stats

import warnings
warnings.simplefilter('ignore')
%matplotlib inline

# Laptop Prices

In [1082]:
data = pd.read_csv('laptops.csv', sep=',', encoding='latin-1')
data.drop(['Unnamed: 0'], axis=1, inplace=True)
data.shape

(1303, 12)

In [1085]:
data['Weight_kg'] = data['Weight'].apply(lambda x: float(x[:-2]))
data['Ram_GB'] = data['Ram'].apply(lambda x: float(x[:-2]))
data.drop(['Weight','Ram'], axis=1,inplace=True)

In [1086]:
def Get_Memory(string_memory, type_memory):
    memory = string_memory.split('+')
    value_memory=0    
    for m in memory:
        index_memory = m.find("GB "+type_memory)
        if index_memory!=-1:
            value_memory = m[:index_memory]
        
    return int(value_memory)

In [1087]:
data['SSD'] = data['Memory'].apply(lambda x: Get_Memory(x, 'SSD'))
data['HDD'] = data['Memory'].apply(lambda x: Get_Memory(x, 'HDD'))
data['Flash'] = data['Memory'].apply(lambda x: Get_Memory(x, 'Flash'))
data.drop('Memory',axis=1,inplace=True)

In [1088]:
def get_screen_resol(screen):
    resolution = screen.split()[-1]    
    width, height = resolution.split('x')     
    return float(width)*float(height)    

In [1089]:
#признак в виде произведения кол.пикселей по высоте на кол.пиксеоей по ширине
data['Resolution'] = data['ScreenResolution'].apply(get_screen_resol)

In [1090]:
def get_ghz(cpu):
    return float(cpu.split()[-1][:-3])

def get_cpu_series(cpu):
    cpu_split = cpu.split()[0:3]
        
    return ' '.join(cpu.split()[0:3])

In [1091]:
data['Cpu_Ghz'] = data['Cpu'].apply(get_ghz)
data['Cpu'] = data['Cpu'].apply(get_cpu_series)
data['Gpu'] = data['Gpu'].apply(get_cpu_series)

In [1092]:
# OneHotEncoding для типа ноутбука
ohe = pd.get_dummies(data['TypeName'])
data = data.join(ohe)

In [1099]:
data = data[data['Price_euros']<4000]

In [1108]:
data['Cpu_company'] = data['Cpu'].apply(lambda x: x.split()[0])

In [1113]:
data['Gpu_'] = data['Gpu'].apply(lambda gpu: gpu.split()[0])

In [1117]:
data.drop(['TypeName','Gpu_','ScreenResolution','Cpu_company'], axis=1,inplace=True)

In [1118]:
ohe = pd.get_dummies(data['OpSys'])
data = data.join(ohe)
data.drop('OpSys',axis=1,inplace=True)

In [1119]:
ohe = pd.get_dummies(data['Cpu'])
data = data.join(ohe)
data.drop('Cpu',axis=1,inplace=True)

In [1120]:
ohe = pd.get_dummies(data['Company'])
data = data.join(ohe)
data.drop('Company',axis=1,inplace=True)

In [1121]:
ohe = pd.get_dummies(data['Product'])
data = data.join(ohe)
data.drop('Product',axis=1,inplace=True)

In [1122]:
ohe = pd.get_dummies(data['Gpu'])
data = data.join(ohe)
data.drop('Gpu',axis=1,inplace=True)

In [1124]:
data_class = data.drop('Price_euros',axis=1)
target_class = data['Price_euros']>data['Price_euros'].mean()
target_class = target_class.apply(int)

Разделим выборку на train и test

In [1132]:
X_train, X_test, y_train, y_test = train_test_split(data_class,
                                                    target_class,
                                                    test_size = 0.2,
                                                    random_state=17,
                                                    shuffle=True)

In [1146]:
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

In [1147]:
scalar = StandardScaler()

X_train_scale_class = scalar.fit_transform(X_train)
X_test_scale_class = scalar.transform(X_test)

y_train_class = y_train
y_test_class = y_test

# Assignment 4.

In [1161]:
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Lasso, Ridge, LinearRegression

In [1162]:
X = data.drop('Price_euros',axis=1)
y = data['Price_euros']

In [1163]:
X_train, X_test, y_train, y_test = train_test_split(X, y ,test_size=0.2,random_state=17)

In [1164]:
scalar = StandardScaler()
X_train_scale = scalar.fit_transform(X_train)
X_test_scale = scalar.transform(X_test)

#### 4.1. BaggingRegressor

In [1165]:
# base estimator is a decision tree
br = BaggingRegressor().fit(X_train_scale, y_train)
br_pred = br.predict(X_test_scale)

#### 4.2. RandomForestRegressor

In [1166]:
rfr = RandomForestRegressor().fit(X_train_scale,y_train)
rfr_pred = rfr.predict(X_test_scale)

#### 4.3. Nearest Neighbors Regression

In [1167]:
knr = KNeighborsRegressor().fit(X_train_scale,y_train)
knr_pred = knr.predict(X_test_scale)

#### 4.4 Regression models

In [1168]:
ridge = Ridge(alpha=100).fit(X_train_scale, y_train)
ridge_pred=ridge.predict(X_test_scale)

In [1169]:
lasso = Lasso(alpha=0.5).fit(X_train_scale,y_train)
lasso_pred=lasso.predict(X_test_scale)

In [1170]:
lr = LinearRegression().fit(X_train_scale, y_train)
le_pred  = lr.predict(X_test_scale)

#### 4.5. Compare models accuracy

In [1171]:
print('BaggingRegressor acc =',mean_absolute_error(y_test, br_pred))
print('RandomForestRegressor acc =',mean_absolute_error(y_test, rfr_pred))
print('KNeighborsRegressor acc =',mean_absolute_error(y_test, knr_pred))
print('Lasso acc =',mean_absolute_error(y_test, lasso_pred))
print('Ridge acc =',mean_absolute_error(y_test, ridge_pred))
print('LinearRegression acc =',mean_absolute_error(y_test, le_pred))

BaggingRegressor acc = 184.65811227106227
RandomForestRegressor acc = 186.6887976923077
KNeighborsRegressor acc = 283.0591692307692
Lasso acc = 186.83926180960785
Ridge acc = 196.6015595909555
LinearRegression acc = 5351731366729691.0


In [1172]:
lr.coef_.sort()
lr.coef_[:7]

array([-9.86100243e+15, -9.57790540e+15, -9.48631702e+15, -8.50873295e+15,
       -7.19428332e+15, -6.95741743e+15, -6.95713141e+15])

LinearRegression переобучилась, об этом говорят веса, они очень большие по модулю. Скорее всего, это из-за не выполнения условий МНК, Ridge и Lasso компенисровали переобучение штрафом и избежали такого результата.

#### 4.6. Create an ensemble of models (?)

In [1173]:
comb_pred = (br_pred + rfr_pred + lasso_pred)/3

#### 4.7. Display different accuracy metrics

In [1174]:
mean_absolute_error(y_test, comb_pred)

169.5599959686433

In [1175]:
mean_squared_error(y_test,comb_pred)

72725.68276429357

In [1176]:
r2_score(y_test, comb_pred)

0.825604458531433

In [1177]:
explained_variance_score(y_test, comb_pred)

0.8273116757580057