# Exercise 11

## Car Price Prediction

Predict if the price of a car is low or high

In [1]:
%matplotlib inline
import pandas as pd

data = pd.read_csv('https://github.com/albahnsen/PracticalMachineLearningClass/raw/master/datasets/dataTrain_carListings.zip')
data = data.loc[data['Model'].str.contains('Camry')].drop(['Make', 'State'], axis=1)
data = data.join(pd.get_dummies(data['Model'], prefix='M'))
data['HighPrice'] = (data['Price'] > data['Price'].mean()).astype(int)
data = data.drop(['Model', 'Price'], axis=1)
data.head()

Unnamed: 0,Year,Mileage,M_Camry,M_Camry4dr,M_CamryBase,M_CamryL,M_CamryLE,M_CamrySE,M_CamryXLE,HighPrice
15,2016,29242,0,0,0,0,1,0,0,1
47,2015,26465,0,0,0,0,1,0,0,1
85,2012,46739,0,1,0,0,0,0,0,1
141,2017,41722,0,0,0,0,0,1,0,1
226,2014,77669,0,0,0,0,0,0,1,0


In [2]:
data.shape

(13150, 10)

In [3]:
data.head()

Unnamed: 0,Year,Mileage,M_Camry,M_Camry4dr,M_CamryBase,M_CamryL,M_CamryLE,M_CamrySE,M_CamryXLE,HighPrice
15,2016,29242,0,0,0,0,1,0,0,1
47,2015,26465,0,0,0,0,1,0,0,1
85,2012,46739,0,1,0,0,0,0,0,1
141,2017,41722,0,0,0,0,0,1,0,1
226,2014,77669,0,0,0,0,0,0,1,0


In [4]:
y = data['HighPrice']
X = data.drop(['HighPrice'], axis=1)

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Exercise 11.1

Estimate a Decision Tree Classifier Manually using the code created in the Notebook #13

Evaluate the accuracy on the testing set

In [6]:
import numpy as np
max_depth = None
num_pct = 10
max_features = None
min_gain=0.001

In [7]:
j = 1
print(X.columns[j])

Mileage


In [8]:
# Split the variable in num_ctp points
splits = np.percentile(X.iloc[:, j], np.arange(0, 100, 100.0 / num_pct).tolist())
splits

array([5.00000e+00, 1.58728e+04, 2.32508e+04, 2.98747e+04, 3.56432e+04,
       4.16580e+04, 4.83404e+04, 6.16152e+04, 8.07292e+04, 1.06371e+05])

Split the data using split 5

In [9]:
k = 5

In [10]:
filter_l = X.iloc[:, j] < splits[k]

y_l = y.loc[filter_l]
y_r = y.loc[~filter_l]

#### Gini 

The Gini Impurity of a node is the probability that a randomly chosen sample in a node would be incorrectly labeled if it was labeled by the distribution of samples in the node.

In [11]:
def gini(y):
    if y.shape[0] == 0:
        return 0
    else:
        return 1 - (y.mean()**2 + (1 - y.mean())**2)

In [12]:
gini_l = gini(y_l)
gini_l

0.20562506325087826

In [13]:
gini_r = gini(y_r)
gini_r

0.3991431537249346

#### Putting all in a function

In [14]:
def gini_impurity(X_col, y, split):
    "Calculate the gain of an split k on feature j"
    
    filter_l = X_col < split
    y_l = y.loc[filter_l]
    y_r = y.loc[~filter_l]
    
    n_l = y_l.shape[0]
    n_r = y_r.shape[0]
    
    gini_y = gini(y)
    gini_l = gini(y_l)
    gini_r = gini(y_r)
    
    gini_impurity_ = gini_y - (n_l / (n_l + n_r) * gini_l + n_r / (n_l + n_r) * gini_r)
    
    return gini_impurity_

In [15]:
gini_impurity(X.iloc[:, j], y, splits[k])

0.18496148274516044

#### Test all splits on all features

In [16]:
def best_split(X, y, num_pct=10):
    
    features = range(X.shape[1])
    
    best_split = [0, 0, 0]  # j, split, gain
    
    # For all features
    for j in features:
        
        splits = np.percentile(X.iloc[:, j], np.arange(0, 100, 100.0 / (num_pct+1)).tolist())
        splits = np.unique(splits)[1:]
        
        # For all splits
        for split in splits:
            gain = gini_impurity(X.iloc[:, j], y, split)
                        
            if gain > best_split[2]:
                best_split = [j, split, gain]
    
    return best_split

In [17]:
j, split, gain = best_split(X, y, 5)
j, split, gain

(0, 2014.0, 0.23223870086324505)

In [18]:
filter_l = X.iloc[:, j] < split

y_l = y.loc[filter_l]
y_r = y.loc[~filter_l]

In [19]:
y.shape[0], y_l.shape[0], y_r.shape[0]

(13150, 4169, 8981)

In [20]:
y.mean(), y_l.mean(), y_r.mean()

(0.5795437262357415, 0.07939553849844087, 0.8117136176372342)

### Recursively grow the tree 

In [21]:
def tree_grow(X, y, level=0, min_gain=0.001, max_depth=None, num_pct=10):
    
    # If only one observation
    if X.shape[0] == 1:
        tree = dict(y_pred=y.iloc[:1].values[0], y_prob=0.5, level=level, split=-1, n_samples=1, gain=0)
        return tree
    
    # Calculate the best split
    j, split, gain = best_split(X, y, num_pct)
    
    # save tree and estimate prediction
    y_pred = int(y.mean() >= 0.5) 
    y_prob = (y.sum() + 1.0) / (y.shape[0] + 2.0)  # Laplace correction
    
    tree = dict(y_pred=y_pred, y_prob=y_prob, level=level, split=-1, n_samples=X.shape[0], gain=-1)
    
    # Check stooping criteria
    if gain < min_gain:
        return tree
    if max_depth is not None:
        if level >= max_depth:
            return tree   
    
    # No stooping criteria was meet, then continue to create the partition
    filter_l = X.iloc[:, j] < split
    X_l, y_l = X.loc[filter_l], y.loc[filter_l]
    X_r, y_r = X.loc[~filter_l], y.loc[~filter_l]
    tree['split'] = [j, split]
    tree['gain'] = gain

    # Next iteration to each split
    
    tree['sl'] = tree_grow(X_l, y_l, level + 1, min_gain=min_gain, max_depth=max_depth, num_pct=num_pct)
    tree['sr'] = tree_grow(X_r, y_r, level + 1, min_gain=min_gain, max_depth=max_depth, num_pct=num_pct)
    
    return tree

In [41]:
tree = tree_grow(X, y, level=0, min_gain=0.001, max_depth=3, num_pct=10)

### Prediction

In [42]:
#Recorrer el árbol para ver dónde llegué
def tree_predict(X, tree, proba=False):
    
    predicted = np.ones(X.shape[0])

    # Check if final node
    if tree['split'] == -1:
        if not proba:
            predicted = predicted * tree['y_pred']
        else:
            predicted = predicted * tree['y_prob']
            
    else:
        
        j, split = tree['split']
        filter_l = (X.iloc[:, j] < split)
        X_l = X.loc[filter_l]
        X_r = X.loc[~filter_l]

        if X_l.shape[0] == 0:  # If left node is empty only continue with right
            predicted[~filter_l] = tree_predict(X_r, tree['sr'], proba)
        elif X_r.shape[0] == 0:  # If right node is empty only continue with left
            predicted[filter_l] = tree_predict(X_l, tree['sl'], proba)
        else:
            predicted[filter_l] = tree_predict(X_l, tree['sl'], proba)
            predicted[~filter_l] = tree_predict(X_r, tree['sr'], proba)

    return predicted    

In [43]:
tree_predict(X, tree)

array([1., 1., 0., ..., 1., 1., 0.])

In [44]:
from sklearn import metrics
print('Accuracy:', round(metrics.accuracy_score(y,tree_predict(X,tree))*100,5), '%.')

Accuracy: 86.41825 %.


# Exercise 11.2

Estimate a Bagging of 10 Decision Tree Classifiers Manually using the code created in the Notebook #13

Evaluate the accuracy on the testing set

In [31]:
n_estimators = 10
# set a seed for reproducibility
np.random.seed(123)

n_samples = X_train.shape[0]

# create bootstrap samples (will be used to select rows from the DataFrame)
samples = [np.random.choice(a=n_samples, size=n_samples, replace=True) for _ in range(n_estimators)]
samples[1]

array([ 431, 3426, 8463, ..., 8218,  237, 8326])

In [32]:
np.random.seed(123) 
max_depth = np.random.randint(2, 7, size=n_estimators)
max_depth

array([4, 6, 4, 3, 5, 4, 5, 3, 3, 2])

In [33]:
trees = {}
for i in range(n_estimators):
    trees[i] =tree_grow(X_train.iloc[samples[i]], y_train.iloc[samples[i]], level=0, min_gain=0.001, max_depth=max_depth[i], num_pct=10)


In [34]:
y_pred= pd.DataFrame(index=X_test.index, columns=list(range(n_estimators)))
for i in range(n_estimators):
    y_pred.iloc[:, i] = tree_predict(X_test, trees[i])
    
y_pred.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
332784,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
146436,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
130476,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
85618,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
75474,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
y_predict = ( y_pred.sum(axis=1) >= (n_estimators / 2)).astype(np.int)

print('Accuracy:', round(metrics.accuracy_score(y_predict, y_test)*100,5), '%.')

Accuracy: 87.18894 %.


# Exercise 11.3

Implement the variable max_features on the Decision Tree Classifier created in 11.1.

Compare the impact in the results by varing the parameter max_features

Evaluate the accuracy on the testing set

In this point is necessary to modify the tree_grow function defined above

In [38]:
def tree_grow_(X, y, level=0, min_gain=0.001, max_depth=None, num_pct=10, max_features=None):
    
       
    # If only one observation
    if X.shape[0] == 1:
        tree = dict(y_pred=y.iloc[:1].values[0], y_prob=0.5, level=level, split=-1, n_samples=1, gain=0)
        return tree
    
    # Calculate max features
    Xsample=X.sample(n=max_features, frac=None, replace=False, weights=None, random_state=None, axis=1)
    j, split, gain = best_split(Xsample, y, num_pct)
    
    
    # save tree and estimate prediction
    y_pred = int(y.mean() >= 0.5) 
    y_prob = (y.sum() + 1.0) / (y.shape[0] + 2.0)  # Laplace correction
    
    tree = dict(y_pred=y_pred, y_prob=y_prob, level=level, split=-1, n_samples=X.shape[0], gain=-1)
    
    # Check stooping criteria
    if gain < min_gain:
        return tree
    if max_depth is not None:
        if level >= max_depth:
            return tree   

    # No stooping criteria was meet, then continue to create the partition
    filter_l = Xsample.iloc[:, j] < split
    X_l, y_l = X.loc[filter_l], y.loc[filter_l]
    X_r, y_r = X.loc[~filter_l], y.loc[~filter_l]
    tree['split'] = [j, split]

    # Next iteration to each split
    
    tree['sl'] = tree_grow_(X_l, y_l, level + 1, min_gain=min_gain, max_depth=max_depth, num_pct=num_pct,max_features=max_features)
    tree['sr'] = tree_grow_(X_r, y_r, level + 1, min_gain=min_gain, max_depth=max_depth, num_pct=num_pct,max_features=max_features)
    
    return tree

In [46]:
tree_max =tree_grow_(X_train, y_train, level=0, min_gain=0.001, max_depth=3, num_pct=10, max_features=5)
y_pred_max=tree_predict(X_test, tree_max)
y_pred_max

array([0., 1., 1., ..., 1., 1., 0.])

In [48]:
print('Accuracy:', round(metrics.accuracy_score(y_pred_max, y_test)*100,5), '%.')

Accuracy: 84.95392 %.


Al modificar el número máximo de variables se disminuye la precisión de la estimación (84.95), comparado con aquellos valores encontrados en el ejercicio 11,1 (86.41) y 11,2 (87.18) 

# Exercise 11.4

Estimate a Bagging of 10 Decision Tree Classifiers with `max_features = log(n_features)`

Evaluate the accuracy on the testing set

In [60]:
max_features_log2=round(np.log2(X_train.shape[1]),0).astype(int)
max_features_log2

3

In [65]:
trees_max = {}
for i in range(n_estimators):
    trees_max[i] =tree_grow_(X_train.iloc[samples[i]], y_train.iloc[samples[i]], level=0, min_gain=0.001, max_depth=3, num_pct=10,max_features=max_features_log2 )


In [66]:
y_pred_fb= pd.DataFrame(index=X_test.index, columns=list(range(n_estimators)))
for i in range(n_estimators):
    y_pred_fb.iloc[:, i] = tree_predict(X_test, trees_max[i])
    
y_pred_fb.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
332784,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0
146436,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0
130476,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0
85618,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0
75474,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0


In [68]:
y_pred_maxbag = (y_pred_fb.sum(axis=1) >= (n_estimators / 2)).astype(np.int)
print('Accuracy:', round(metrics.accuracy_score(y_pred_maxbag, y_test)*100,5), '%.')

Accuracy: 58.24885 %.


# Exercise 11.5

Using sklearn, train a RandomForestClassifier

Evaluate the accuracy on the testing set

In [20]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=1000, random_state=1, n_jobs=-1)
clf.fit(X_train, y_train);
clf_predict = clf.predict(X_test)
accuracy_score = (cross_val_score(estimator = clf, X = X_train,
                             y = y_train, cv = 10, scoring='accuracy')).mean()


In [87]:
#print("Accuracy:"+" "+str(accuracy_score))
print('Accuracy:', round(accuracy_score(y_test,clf_predict)*100,5), '%.')

Accuracy: 83.7788 %.


# Exercise 11.6

Find the best parameters of the RandomForestClassifier (max_depth, max_features, n_estimators)

Evaluate the accuracy on the testing set

# Random search with cross validation

In [47]:
from sklearn.model_selection import RandomizedSearchCV
import numpy as np
from pprint import pprint

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['sqrt']
max_features.append(None)
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth}

pprint(random_grid)

{'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['sqrt', None],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}


In [48]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier(random_state=1)
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid,
                              n_iter = 100, scoring='accuracy', 
                              cv = 3, verbose=2, random_state=1, n_jobs=-1,
                              return_train_score=True)

# Fit the random search model
rf_random.fit(X_train, y_train);
rf_random.best_params_

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  9.9min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 18.8min finished


{'n_estimators': 1200, 'max_features': 'sqrt', 'max_depth': 10}

In [58]:
# Mean cross-validated score of the best_estimator.
print('Accuracy:', round(rf_random.best_score_*100, 5), '%.')

Accuracy: 88.00227 %.


# Grid search

We can now perform grid search building on the result from the random search. 
We will test a range of hyperparameters around the best values returned by random search. 

In [62]:
from sklearn.model_selection import GridSearchCV

# Create the parameter grid based on the results of random search 
param_grid = {
    'max_depth': [7, 8, 9, 11, 12, 13, 14, 15, 16],
    'max_features': [3, 4, 5, 6, 7, 8],
    'n_estimators': [1000, 1100, 1300]
}

# Create a base model
rf = RandomForestClassifier(random_state = 1)

# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2, return_train_score=True, scoring='accuracy')

In [63]:
# Fit the random search model
grid_search.fit(X_train, y_train);
grid_search.best_params_

Fitting 3 folds for each of 162 candidates, totalling 486 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  6.6min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed: 16.7min
[Parallel(n_jobs=-1)]: Done 486 out of 486 | elapsed: 23.9min finished


{'max_depth': 7, 'max_features': 5, 'n_estimators': 1300}

In [64]:
# Mean cross-validated score of the best_estimator.
print('Accuracy:', round(grid_search.best_score_*100, 5), '%.')

Accuracy: 88.4109 %.


In [65]:
print('Improvement of {:0.2f}%.'.format( 100 * (grid_search.best_score_ - rf_random.best_score_) / rf_random.best_score_))

Improvement of 0.46%.


So, we will use the following model 

In [67]:
final_model = grid_search.best_estimator_

print('Final Model Parameters:\n')
pprint(final_model.get_params())

Final Model Parameters:

{'bootstrap': True,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': 7,
 'max_features': 5,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 1300,
 'n_jobs': 1,
 'oob_score': False,
 'random_state': 1,
 'verbose': 0,
 'warm_start': False}


In [None]:
################