## Practice

### Q1, Q2

In [1]:
from sklearn.datasets import load_wine
# Load the wine dataset as a (features, target) pair.
X, y = load_wine(return_X_y=True)

In [2]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the samples for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [3]:
# Sanity-check the shapes produced by the split.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((124, 13), (54, 13), (124,), (54,))

In [4]:
from sklearn.ensemble import AdaBoostClassifier
# Baseline AdaBoost: only the seed is set, every other hyperparameter is left at its default.
adc = AdaBoostClassifier(random_state=1)
adc.fit(X_train, y_train)

AdaBoostClassifier(random_state=1)

In [5]:
# Report the fitted model's score on the training split first, then on the test split.
for features, labels in ((X_train, y_train), (X_test, y_test)):
  print(adc.score(features, labels))

0.6774193548387096
0.5370370370370371


### Q3, Q4, Q5

In [6]:
# `random` is not used in the visible cells; it is still imported, but from the public
# scipy.sparse namespace, because scipy.sparse.construct is a deprecated private module.
from scipy.sparse import random
from sklearn.model_selection import GridSearchCV

# Candidate hyperparameters: every (n_estimators, learning_rate) pair is evaluated.
parameter_grid = {'n_estimators':[100, 500, 1000], 'learning_rate': [0.5, 1, 2]}

# 4-fold cross-validated grid search over a seeded AdaBoost classifier; train scores
# are kept alongside the validation scores.
gcv = GridSearchCV(AdaBoostClassifier(random_state=1), param_grid=parameter_grid, cv=4,return_train_score=True)
gcv.fit(X_train, y_train)

GridSearchCV(cv=4, estimator=AdaBoostClassifier(random_state=1),
             param_grid={'learning_rate': [0.5, 1, 2],
                         'n_estimators': [100, 500, 1000]},
             return_train_score=True)

In [7]:
# Evaluate the grid-search result on the held-out test split.
gcv.score(X_test, y_test)

0.9629629629629629

In [8]:
# Show which hyperparameter combination the grid search selected.
gcv.best_estimator_

AdaBoostClassifier(learning_rate=0.5, n_estimators=100, random_state=1)

### Q6, Q7

In [9]:
def training_classifier(estimator, X, y, *, test_size=0.3, random_state=1):
  """Fit a freshly constructed estimator on a train split and print its scores.

  Parameters
  ----------
  estimator : callable
      Estimator class (not an instance); it is instantiated as
      ``estimator(random_state=random_state)``.
  X, y
      Features and target, forwarded to ``train_test_split``.
  test_size : float, default 0.3
      Fraction forwarded to ``train_test_split``.
  random_state : int, default 1
      Seed used for both the estimator and the split.

  Prints the score on the training split, then the score on the test split.
  Returns None.
  """
  model = estimator(random_state=random_state)
  X_train, X_test, y_train, y_test = train_test_split(
      X, y, test_size=test_size, random_state=random_state)
  model.fit(X_train, y_train)
  print(model.score(X_train, y_train))
  print(model.score(X_test, y_test))

In [10]:
from sklearn.ensemble import BaggingClassifier, GradientBoostingClassifier, RandomForestClassifier
# Bagging ensemble on the wine data; prints the train score, then the test score.
training_classifier(BaggingClassifier, X, y)

1.0
0.9814814814814815


In [11]:
# Gradient boosting on the wine data; prints the train score, then the test score.
training_classifier(GradientBoostingClassifier, X, y)

1.0
0.9629629629629629


In [12]:
# Random forest on the wine data; prints the train score, then the test score.
training_classifier(RandomForestClassifier, X, y)

1.0
0.9814814814814815


## Graded

In [13]:
import pandas as pd

In [14]:
# Read the raw cars table; the path is relative to the notebook's working directory.
cars = pd.read_csv('cars_data.csv')

In [15]:
# Discard every row that contains at least one missing value.
cars = cars.dropna()

In [16]:
# Remove the 'Invoice' column. errors='ignore' makes the cell safe to re-run:
# a second execution no longer raises KeyError once the column is already gone.
cars = cars.drop(columns='Invoice', errors='ignore')

In [17]:
# List the columns that remain after dropping 'Invoice'.
cars.columns

Index(['Make', 'Model', 'Type', 'Origin', 'DriveTrain', 'MSRP', 'EngineSize',
       'Cylinders', 'Horsepower', 'MPG_City', 'MPG_Highway', 'Weight',
       'Wheelbase', 'Length'],
      dtype='object')

In [18]:
# Split off the regression target: pop() removes 'MSRP' from `cars` in place and returns it.
# NOTE(review): this rebinds `y`, which held the wine labels earlier in the notebook,
# and the cell cannot be re-run without reloading `cars` (the column is gone afterwards).
y = cars.pop('MSRP')

In [19]:
# Strip the currency formatting ('$' signs and ',' separators) in one raw-string regex,
# then convert to float so the regressors receive numbers rather than strings.
# The raw string avoids the invalid '\$' escape sequence in the original pattern.
y = y.replace(to_replace=r'[$,]', value='', regex=True).astype(float)

In [20]:
# `X` is bound to the same DataFrame object as `cars` (no copy is made here).
X = cars

In [21]:
# Inspect the feature frame before any encoding is applied.
X

Unnamed: 0,Make,Model,Type,Origin,DriveTrain,EngineSize,Cylinders,Horsepower,MPG_City,MPG_Highway,Weight,Wheelbase,Length
0,Acura,MDX,SUV,Asia,All,3.5,6.0,265,17,23,4451,106,189
1,Acura,RSX Type S 2dr,Sedan,Asia,Front,2.0,4.0,200,24,31,2778,101,172
2,Acura,TSX 4dr,Sedan,Asia,Front,2.4,4.0,200,22,29,3230,105,183
3,Acura,TL 4dr,Sedan,Asia,Front,3.2,6.0,270,20,28,3575,108,186
4,Acura,3.5 RL 4dr,Sedan,Asia,Front,3.5,6.0,225,18,24,3880,115,197
...,...,...,...,...,...,...,...,...,...,...,...,...,...
423,Volvo,C70 LPT convertible 2dr,Sedan,Europe,Front,2.4,5.0,197,21,28,3450,105,186
424,Volvo,C70 HPT convertible 2dr,Sedan,Europe,Front,2.3,5.0,242,20,26,3450,105,186
425,Volvo,S80 T6 4dr,Sedan,Europe,Front,2.9,6.0,268,19,26,3653,110,190
426,Volvo,V40,Wagon,Europe,Front,1.9,4.0,170,22,29,2822,101,180


Try get_dummies on column 'Make'

In [22]:
# One indicator column per distinct 'Make'. dtype=int is requested explicitly because
# newer pandas returns boolean dummies by default, which would turn the 0/1 strings
# printed in the next cell into True/False.
arr = pd.get_dummies(cars['Make'], dtype=int)

In [23]:
# Check how the encoding is done for the column 'Make': print one 0/1 string per row.
# The loop bound comes from `arr` itself (not from `X`), and values are cast to int
# first so boolean dummies still print as 0/1.
for i in range(arr.shape[0]):
  print(''.join(arr.iloc[i, :].astype(int).astype(str)))

10000000000000000000000000000000000000
10000000000000000000000000000000000000
10000000000000000000000000000000000000
10000000000000000000000000000000000000
10000000000000000000000000000000000000
10000000000000000000000000000000000000
10000000000000000000000000000000000000
01000000000000000000000000000000000000
01000000000000000000000000000000000000
01000000000000000000000000000000000000
01000000000000000000000000000000000000
01000000000000000000000000000000000000
01000000000000000000000000000000000000
01000000000000000000000000000000000000
01000000000000000000000000000000000000
01000000000000000000000000000000000000
01000000000000000000000000000000000000
01000000000000000000000000000000000000
01000000000000000000000000000000000000
01000000000000000000000000000000000000
01000000000000000000000000000000000000
01000000000000000000000000000000000000
01000000000000000000000000000000000000
01000000000000000000000000000000000000
01000000000000000000000000000000000000
0100000000000000000000000

Is `get_dummies` the same as `OneHotEncoder`? The answer is yes; see the comparison below.

In [24]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder

In [25]:
# One-hot encode 'Make' with scikit-learn, then wrap the densified integer matrix in a
# DataFrame so it can be printed the same way as the get_dummies result.
make_column = cars['Make'].to_numpy().reshape(-1,1)
make_encoded = OneHotEncoder(handle_unknown='ignore').fit_transform(make_column)
arr = pd.DataFrame(make_encoded.todense().astype(np.int64))

In [26]:
# Display the OneHotEncoder result (columns are positional integers, not make names).
arr

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,28,29,30,31,32,33,34,35,36,37
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
421,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
422,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
423,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
424,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [27]:
# Check how the encoding is done for the column 'Make' (OneHotEncoder version):
# print one 0/1 string per row. The loop bound comes from `arr` itself rather than `X`.
for i in range(arr.shape[0]):
  print(''.join(arr.iloc[i, :].astype(str)))

10000000000000000000000000000000000000
10000000000000000000000000000000000000
10000000000000000000000000000000000000
10000000000000000000000000000000000000
10000000000000000000000000000000000000
10000000000000000000000000000000000000
10000000000000000000000000000000000000
01000000000000000000000000000000000000
01000000000000000000000000000000000000
01000000000000000000000000000000000000
01000000000000000000000000000000000000
01000000000000000000000000000000000000
01000000000000000000000000000000000000
01000000000000000000000000000000000000
01000000000000000000000000000000000000
01000000000000000000000000000000000000
01000000000000000000000000000000000000
01000000000000000000000000000000000000
01000000000000000000000000000000000000
01000000000000000000000000000000000000
01000000000000000000000000000000000000
01000000000000000000000000000000000000
01000000000000000000000000000000000000
01000000000000000000000000000000000000
01000000000000000000000000000000000000
0100000000000000000000000

In [28]:
# One-hot encode the listed categorical columns. The source is `cars` (the object `X`
# was bound to) rather than `X` itself, so re-running this cell rebuilds the same frame
# instead of raising KeyError on the already-encoded `X`.
categorical_columns = ['Make','Model','Type','Origin', 'DriveTrain']
X = pd.get_dummies(cars, columns=categorical_columns)

In [29]:
# Check how many columns the one-hot encoding produced.
X.shape

(426, 481)

In [30]:
def training_regressor(estimator, X, y, *, test_size=0.3, random_state=1):
  """Fit a freshly constructed regressor on a train split and print its scores.

  Parameters
  ----------
  estimator : callable
      Regressor class (not an instance); it is instantiated as
      ``estimator(random_state=random_state)``.
  X, y
      Features and target, forwarded to ``train_test_split``.
  test_size : float, default 0.3
      Fraction forwarded to ``train_test_split``.
  random_state : int, default 1
      Seed used for both the regressor and the split.

  Prints the score on the training split, then the score on the test split.
  Returns None.
  """
  model = estimator(random_state=random_state)
  X_train, X_test, y_train, y_test = train_test_split(
      X, y, test_size=test_size, random_state=random_state)
  model.fit(X_train, y_train)
  print(model.score(X_train, y_train))
  print(model.score(X_test, y_test))

In [31]:
from sklearn.ensemble import BaggingRegressor, GradientBoostingRegressor, RandomForestRegressor, AdaBoostRegressor
# Bagging regressor on the encoded cars features; prints the train score, then the test score.
training_regressor(BaggingRegressor, X, y)

0.9567015457407546
0.7949163237522345


In [32]:
# Random forest regressor on the encoded cars features; prints the train score, then the test score.
training_regressor(RandomForestRegressor, X, y)

0.978182315656497
0.8369440882741959


In [33]:
# Gradient boosting regressor on the encoded cars features; prints the train score, then the test score.
training_regressor(GradientBoostingRegressor, X, y)

0.9841884742397355
0.8270485242007507


In [34]:
# AdaBoost regressor on the encoded cars features; prints the train score, then the test score.
training_regressor(AdaBoostRegressor, X, y)

0.9008410947504886
0.711903287964231
