# Exercise 11

## Car Price Prediction

Predict if the price of a car is low or high

In [14]:
%matplotlib inline
import pandas as pd
import numpy as np

data = pd.read_csv('https://github.com/albahnsen/PracticalMachineLearningClass/raw/master/datasets/dataTrain_carListings.zip')
data = data.loc[data['Model'].str.contains('Camry')].drop(['Make', 'State'], axis=1)
data = data.join(pd.get_dummies(data['Model'], prefix='M'))
data['HighPrice'] = (data['Price'] > data['Price'].mean()).astype(int)
data = data.drop(['Model', 'Price'], axis=1)

data.head()

Unnamed: 0,Year,Mileage,M_Camry,M_Camry4dr,M_CamryBase,M_CamryL,M_CamryLE,M_CamrySE,M_CamryXLE,HighPrice
15,2016,29242,0,0,0,0,1,0,0,1
47,2015,26465,0,0,0,0,1,0,0,1
85,2012,46739,0,1,0,0,0,0,0,1
141,2017,41722,0,0,0,0,0,1,0,1
226,2014,77669,0,0,0,0,0,0,1,0


In [16]:
data.shape

(13150, 10)

In [17]:
y = data['HighPrice']
X = data.drop(['HighPrice'], axis=1)

In [18]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)


In [127]:
X_train.head()

Unnamed: 0,Year,Mileage,M_Camry,M_Camry4dr,M_CamryBase,M_CamryL,M_CamryLE,M_CamrySE,M_CamryXLE
475678,2012,141137,1,0,0,0,0,0,0
23079,2015,13671,0,0,0,0,1,0,0
291651,2013,58614,0,0,0,0,0,1,0
277196,2011,47704,0,1,0,0,0,0,0
14923,2016,37504,0,0,0,0,1,0,0


# Exercise 11.1

Estimate a Decision Tree Classifier Manually using the code created in the Notebook #13

Evaluate the accuracy on the testing set

<font color=blue>*Se traen aquí las funciones brindadas en el notebook #13*
- gini()
- gini_impurity()
- best_split()
- tree_grow()
- tree_predict()    
</font> 

In [128]:
def gini(y):
    if y.shape[0] == 0:
        return 0
    else:
        return 1 - (y.mean()**2 + (1 - y.mean())**2)

In [129]:
def gini_impurity(X_col, y, split):
    "Calculate the gain of an split k on feature j"
    
    filter_l = X_col < split
    y_l = y.loc[filter_l]
    y_r = y.loc[~filter_l]
    
    n_l = y_l.shape[0]
    n_r = y_r.shape[0]
    
    gini_y = gini(y)
    gini_l = gini(y_l)
    gini_r = gini(y_r)
    
    gini_impurity_ = gini_y - (n_l / (n_l + n_r) * gini_l + n_r / (n_l + n_r) * gini_r)
    
    return gini_impurity_

In [130]:
def best_split(X, y, num_pct=10):
    
    features = range(X.shape[1])
    
    best_split = [0, 0, 0]  # j, split, gain
    
    # For all features
    for j in features:
        
        splits = np.percentile(X.iloc[:, j], np.arange(0, 100, 100.0 / (num_pct+1)).tolist())
        splits = np.unique(splits)[1:]
        
        # For all splits
        for split in splits:
            gain = gini_impurity(X.iloc[:, j], y, split)
                        
            if gain > best_split[2]:
                best_split = [j, split, gain]
    
    return best_split

In [131]:
def tree_grow(X, y, level=0, min_gain=0.001, max_depth=None, num_pct=10):
    
    # If only one observation
    if X.shape[0] == 1:
        tree = dict(y_pred=y.iloc[:1].values[0], y_prob=0.5, level=level, split=-1, n_samples=1, gain=0)
        return tree
    
    # Calculate the best split
    
    j, split, gain = best_split(X, y, num_pct)
    
    # save tree and estimate prediction
    y_pred = int(y.mean() >= 0.5) 
    y_prob = (y.sum() + 1.0) / (y.shape[0] + 2.0)  # Laplace correction
    
    tree = dict(y_pred=y_pred, y_prob=y_prob, level=level, split=-1, n_samples=X.shape[0], gain=gain)
    
    # Check stooping criteria
    if gain < min_gain:
        return tree
    if max_depth is not None:
        if level >= max_depth:
            return tree   
    
    # No stooping criteria was meet, then continue to create the partition
    filter_l = X.iloc[:, j] < split
    X_l, y_l = X.loc[filter_l], y.loc[filter_l]
    X_r, y_r = X.loc[~filter_l], y.loc[~filter_l]
    tree['split'] = [j, split]

    # Next iteration to each split
    
    tree['sl'] = tree_grow(X_l, y_l, level + 1, min_gain=min_gain, max_depth=max_depth, num_pct=num_pct)
    tree['sr'] = tree_grow(X_r, y_r, level + 1, min_gain=min_gain, max_depth=max_depth, num_pct=num_pct)
    
    return tree

In [132]:
def tree_predict(X, tree, proba=False):
    
    predicted = np.ones(X.shape[0])

    # Check if final node
    if tree['split'] == -1:
        if not proba:
            predicted = predicted * tree['y_pred']
        else:
            predicted = predicted * tree['y_prob']
            
    else:
        
        j, split = tree['split']
        filter_l = (X.iloc[:, j] < split)
        X_l = X.loc[filter_l]
        X_r = X.loc[~filter_l]

        if X_l.shape[0] == 0:  # If left node is empty only continue with right
            predicted[~filter_l] = tree_predict(X_r, tree['sr'], proba)
        elif X_r.shape[0] == 0:  # If right node is empty only continue with left
            predicted[filter_l] = tree_predict(X_l, tree['sl'], proba)
        else:
            predicted[filter_l] = tree_predict(X_l, tree['sl'], proba)
            predicted[~filter_l] = tree_predict(X_r, tree['sr'], proba)

    return predicted    

<font color=blue> 
Entrenamiento del arbol
</font> 

In [133]:
tree =tree_grow(X_train, y_train, level=0, min_gain=0.001, max_depth=3, num_pct=10)
tree

{'y_pred': 1,
 'y_prob': 0.5780753517930095,
 'level': 0,
 'split': [1, 52187.63636363637],
 'n_samples': 8810,
 'gain': 0.23872134898880762,
 'sl': {'y_pred': 1,
  'y_prob': 0.8391583452211127,
  'level': 1,
  'split': [0, 2014.0],
  'n_samples': 5606,
  'gain': 0.03317687167496233,
  'sl': {'y_pred': 0,
   'y_prob': 0.36828644501278773,
   'level': 2,
   'split': [0, 2012.0],
   'n_samples': 389,
   'gain': 0.05908490521197157,
   'sl': {'y_pred': 0,
    'y_prob': 0.08,
    'level': 3,
    'split': -1,
    'n_samples': 98,
    'gain': 0.01707452211653898},
   'sr': {'y_pred': 0,
    'y_prob': 0.46757679180887374,
    'level': 3,
    'split': -1,
    'n_samples': 291,
    'gain': 0.036947257392555666}},
  'sr': {'y_pred': 1,
   'y_prob': 0.8743054224947308,
   'level': 2,
   'split': [0, 2015.0],
   'n_samples': 5217,
   'gain': 0.014933378976312917,
   'sl': {'y_pred': 1,
    'y_prob': 0.7348484848484849,
    'level': 3,
    'split': -1,
    'n_samples': 1450,
    'gain': 0.024495220

In [134]:
y_pred=tree_predict(X_test, tree)
y_pred

array([0., 1., 1., ..., 1., 1., 0.])

<font color=blue>*Accuracy y F1 Score*    
</font> 

In [227]:
from sklearn import metrics
metrics.accuracy_score(y_pred, y_test),metrics.f1_score(y_pred, y_test)

(0.8682027649769585, 0.8909229595728452)

# Exercise 11.2

Estimate a Bagging of 10 Decision Tree Classifiers Manually using the code created in the Notebook #13

Evaluate the accuracy on the testing set

<font color=blue>*Estimadores 10 y observaciones con reemplazamiento* 
</font> 

In [229]:

n_estimators = 10
# set a seed for reproducibility
np.random.seed(123)

n_samples = X_train.shape[0]

# create bootstrap samples (will be used to select rows from the DataFrame)
samples = [np.random.choice(a=n_samples, size=n_samples, replace=True) for _ in range(n_estimators)]
samples[1]

array([ 431, 3426, 8463, ..., 8218,  237, 8326])

In [137]:
np.random.seed(123) 
max_depth = np.random.randint(2, 7, size=n_estimators)
max_depth

array([4, 6, 4, 3, 5, 4, 5, 3, 3, 2])

In [138]:
trees = {}
for i in range(n_estimators):
    trees[i] =tree_grow(X_train.iloc[samples[i]], y_train.iloc[samples[i]], level=0, min_gain=0.001, max_depth=max_depth[i], num_pct=10)


In [139]:
y_pred_df= pd.DataFrame(index=X_test.index, columns=list(range(n_estimators)))
for i in range(n_estimators):
    y_pred_df.iloc[:, i] = tree_predict(X_test, trees[i])
    
y_pred_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
332784,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
146436,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
130476,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
85618,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
75474,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


<font color=blue>*Tenemos un Accuracy y F1 Score mayor pero similar que en el literal 11.1*
- One Decision Tree Classifier --> (0.8682027649769585, 0.8909229595728452)
- Bagging Decision Tree Classifier -->(0.871889400921659, 0.8948164964055998)
</font> 

In [140]:
y_pred = ( y_pred_df.sum(axis=1) >= (n_estimators / 2)).astype(np.int)
metrics.accuracy_score(y_pred, y_test),metrics.f1_score(y_pred, y_test)

(0.871889400921659, 0.8948164964055998)

# Exercise 11.3

Implement the variable max_features on the Decision Tree Classifier created in 11.1.

Compare the impact in the results by varing the parameter max_features

Evaluate the accuracy on the testing set

<font color=blue>*Validando como obtener un subconjunto de features de manera aleatoria del dataset*</font> 

In [141]:
train_random=X_train.sample(n=5, frac=None, replace=False, weights=None, random_state=None, axis=1)
train_random.head()

Unnamed: 0,M_CamryBase,M_Camry,M_CamrySE,M_Camry4dr,M_CamryL
475678,0,1,0,0,0
23079,0,0,0,0,0
291651,0,0,1,0,0
277196,0,0,0,1,0
14923,0,0,0,0,0


<font color=blue>[max_features] is the size of the random subsets of features to consider when splitting a node.

*Modifico la funcion por 'tree_grow_mxft' que va nodo por nodo construyendo el arbol, donde se seleccionan aleatoriamente 'max_features' variables explicativas de todas las variables y las usará para decidir cuál de éstas es mejor para hacer la partición. Cuando vaya al siguiente nodo, seleccionará aleatoriamente otras 'max_features' de todas las variables*. 
</font> 

In [162]:
def tree_grow_mxft(X, y, level=0, min_gain=0.001, max_depth=None, num_pct=10, max_features=None):
    
       
    # If only one observation
    if X.shape[0] == 1:
        tree = dict(y_pred=y.iloc[:1].values[0], y_prob=0.5, level=level, split=-1, n_samples=1, gain=0)
        return tree
    
    # Calculate max features and the best split 
    Xrand=X.sample(n=max_features, frac=None, replace=False, weights=None, random_state=None, axis=1)
    j, split, gain = best_split(Xrand, y, num_pct)
    #print('max_features:',list(Xrand.columns.values),'-->Best:',Xrand.columns[j]) 
    
    # save tree and estimate prediction
    y_pred = int(y.mean() >= 0.5) 
    y_prob = (y.sum() + 1.0) / (y.shape[0] + 2.0)  # Laplace correction
    
    tree = dict(y_pred=y_pred, y_prob=y_prob, level=level, split=-1, n_samples=X.shape[0], gain=gain)
    
    # Check stooping criteria
    if gain < min_gain:
        return tree
    if max_depth is not None:
        if level >= max_depth:
            return tree   

    # No stooping criteria was meet, then continue to create the partition
    filter_l = Xrand.iloc[:, j] < split
    X_l, y_l = X.loc[filter_l], y.loc[filter_l]
    X_r, y_r = X.loc[~filter_l], y.loc[~filter_l]
    tree['split'] = [j, split]

    # Next iteration to each split
    
    tree['sl'] = tree_grow_mxft(X_l, y_l, level + 1, min_gain=min_gain, max_depth=max_depth, num_pct=num_pct,max_features=max_features)
    tree['sr'] = tree_grow_mxft(X_r, y_r, level + 1, min_gain=min_gain, max_depth=max_depth, num_pct=num_pct,max_features=max_features)
    
    return tree

<font color=blue>Valido que el recorrido del arbol, nodo por nodo, evalua un subconjunto aleatorio de variables y selecciona la mejor para dividr</font> 

In [145]:
tree_mxft =tree_grow_mxft(X_train, y_train, level=0, min_gain=0.001, max_depth=3, num_pct=10, max_features=5)
tree_mxft

max_features: ['Year', 'M_CamryBase', 'M_Camry4dr', 'M_Camry', 'M_CamryLE'] -->Best: Year
max_features: ['M_CamryLE', 'M_Camry', 'M_CamrySE', 'M_CamryBase', 'M_Camry4dr'] -->Best: M_Camry4dr
max_features: ['Year', 'M_CamrySE', 'M_CamryXLE', 'M_CamryL', 'M_CamryBase'] -->Best: Year
max_features: ['M_CamryL', 'M_CamryLE', 'M_Camry4dr', 'M_CamrySE', 'Year'] -->Best: M_CamryLE
max_features: ['M_Camry', 'M_CamryLE', 'Year', 'Mileage', 'M_CamryL'] -->Best: Mileage
max_features: ['Mileage', 'M_Camry4dr', 'M_Camry', 'M_CamrySE', 'Year'] -->Best: Mileage
max_features: ['M_CamrySE', 'M_CamryXLE', 'M_Camry', 'M_CamryL', 'M_Camry4dr'] -->Best: M_CamrySE
max_features: ['M_CamryLE', 'M_CamrySE', 'M_Camry4dr', 'M_CamryXLE', 'M_Camry'] -->Best: M_CamryLE
max_features: ['M_CamryXLE', 'M_CamryLE', 'M_CamryL', 'M_CamryBase', 'Year'] -->Best: Year
max_features: ['M_CamryBase', 'M_CamrySE', 'Year', 'M_Camry4dr', 'M_Camry'] -->Best: M_CamrySE
max_features: ['Year', 'M_CamryXLE', 'M_CamryBase', 'M_Camry', 'M

{'y_pred': 1,
 'y_prob': 0.5780753517930095,
 'level': 0,
 'split': [0, 2014.0],
 'n_samples': 8810,
 'gain': 0.2302237574807685,
 'sl': {'y_pred': 0,
  'y_prob': 0.08339247693399574,
  'level': 1,
  'split': [4, 1.0],
  'n_samples': 2816,
  'gain': 0.0031263197904484175,
  'sl': {'y_pred': 0,
   'y_prob': 0.12305516265912306,
   'level': 2,
   'split': [0, 2012.0],
   'n_samples': 1412,
   'gain': 0.030786354343971395,
   'sl': {'y_pred': 0,
    'y_prob': 0.01272264631043257,
    'level': 3,
    'split': -1,
    'n_samples': 784,
    'gain': 0.00015053143563278915},
   'sr': {'y_pred': 0,
    'y_prob': 0.2619047619047619,
    'level': 3,
    'split': -1,
    'n_samples': 628,
    'gain': 0.06086810446770108}},
  'sr': {'y_pred': 0,
   'y_prob': 0.044096728307254626,
   'level': 2,
   'split': [0, 50140.0],
   'n_samples': 1404,
   'gain': 0.008559552719096919,
   'sl': {'y_pred': 0,
    'y_prob': 0.25384615384615383,
    'level': 3,
    'split': -1,
    'n_samples': 128,
    'gain': 0

In [146]:
y_pred_mxft=tree_predict(X_test, tree_mxft)
y_pred_mxft

array([0., 1., 1., ..., 1., 1., 0.])

<font color=blue>*Tenemos un Accuracy y F1 Score menor pero similar que en el literal 11.1*
- One Decision Tree Classifier --> (0.8682027649769585, 0.8909229595728452)
- Bagging Decision Tree Classifier -->(0.871889400921659, 0.8948164964055998)
- Max_Feature=5 Decision Tree Classifier -->(0.8495391705069124, 0.8815956482320942)
</font> 

In [235]:
metrics.accuracy_score(y_pred_mxft, y_test),metrics.f1_score(y_pred_mxft, y_test)

(0.8495391705069124, 0.8815956482320942)

# Exercise 11.4

Estimate a Bagging of 10 Decision Tree Classifiers with `max_features = log(n_features)`

Evaluate the accuracy on the testing set

<font color=blue>Calculamos el logaritmo del número de variables explicativas y lo ingresamos como parámetro en la función modificada, que se itera por el número de estimadores</font> 

In [200]:
max_features_log2=round(np.log2(X_train.shape[1]),0).astype(int)
max_features_log2

3

In [201]:
trees_mxft = {}
for i in range(n_estimators):
    trees_mxft[i] =tree_grow_mxft(X_train.iloc[samples[i]], y_train.iloc[samples[i]], level=0, min_gain=0.001, max_depth=3, num_pct=10,max_features=max_features_log2 )


In [202]:
trees_mxft

{0: {'y_pred': 1,
  'y_prob': 0.5795506128007263,
  'level': 0,
  'split': [1, 51677.09090909091],
  'n_samples': 8810,
  'gain': 0.24026545942700211,
  'sl': {'y_pred': 1,
   'y_prob': 0.8414764621968617,
   'level': 1,
   'split': [1, 32410.63636363635],
   'n_samples': 5606,
   'gain': 0.0170053139197619,
   'sl': {'y_pred': 1,
    'y_prob': 0.9254901960784314,
    'level': 2,
    'split': [0, 22565.272727272724],
    'n_samples': 3058,
    'gain': 0.004449261804659321,
    'sl': {'y_pred': 1,
     'y_prob': 0.9682634730538923,
     'level': 3,
     'split': -1,
     'n_samples': 1668,
     'gain': 0.00011467454663698085},
    'sr': {'y_pred': 1,
     'y_prob': 0.8735632183908046,
     'level': 3,
     'split': -1,
     'n_samples': 1390,
     'gain': 0.028326514413888282}},
   'sr': {'y_pred': 1,
    'y_prob': 0.7403921568627451,
    'level': 2,
    'split': [0, 1.0],
    'n_samples': 2548,
    'gain': 0.006562787526482983,
    'sl': {'y_pred': 1,
     'y_prob': 0.772609819121447,


<font color=blue>Se crea un dataframe con las prediciones del número de estimadores.</font> 

In [203]:
y_pred_dfb= pd.DataFrame(index=X_test.index, columns=list(range(n_estimators)))
for i in range(n_estimators):
    y_pred_dfb.iloc[:, i] = tree_predict(X_test, trees_mxft[i])
    
y_pred_dfb.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
332784,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0
146436,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0
130476,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0
85618,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0
75474,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0


<font color=blue>*Tenemos un Accuracy y F1 Score menor pero se puede decir son valores buenos de clasificacion*
- One Decision Tree Classifier --> (0.8682027649769585, 0.8909229595728452)
- Bagging Decision Tree Classifier -->(0.871889400921659, 0.8948164964055998)
- Max_Feature=5 Decision Tree Classifier -->(0.8495391705069124, 0.8815956482320942)
- Bagging Max_Feature=log(n_features) Decision Tree Classifier -->(0.8013824884792626, 0.8525991792065662)
</font> 

In [204]:
y_pred_mxft_bag = ( y_pred_dfb.sum(axis=1) >= (n_estimators / 2)).astype(np.int)
metrics.accuracy_score(y_pred_mxft_bag, y_test),metrics.f1_score(y_pred_mxft_bag, y_test)

(0.8013824884792626, 0.8525991792065662)

# Exercise 11.5

Using sklearn, train a RandomForestClassifier

Evaluate the accuracy on the testing set

In [223]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(bootstrap=True,  max_depth=3,max_features="log2",
                        n_estimators=10, n_jobs=-1, min_impurity_decrease=0.001)


<font color=blue>*Tenemos un Accuracy y F1 Score no igual al RandomForest construido manualmente pero sí similar*
- Bagging Max_Feature=log2(n_features) Decision Tree Classifier -->(0.8013824884792626, 0.8525991792065662)
- RandomForestClassifier (max_depth=3,max_features="log2",n_estimators=10) -->(0.8654377880184332, 0.8893520272830617)
</font> 

In [224]:
clf.fit(X_train, y_train)
y_pred_rf = clf.predict(X_test)

metrics.accuracy_score(y_pred_rf, y_test),metrics.f1_score(y_pred_rf, y_test)

(0.8654377880184332, 0.8893520272830617)

# Exercise 11.6

Find the best parameters of the RandomForestClassifier (max_depth, max_features, n_estimators)

Evaluate the accuracy on the testing set

In [253]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(bootstrap=True,  max_depth=6,max_features=6,
                        n_estimators=200, n_jobs=-1)

In [254]:
clf.fit(X_train, y_train)
y_pred_rf = clf.predict(X_test)

metrics.accuracy_score(y_pred_rf, y_test),metrics.f1_score(y_pred_rf, y_test)

(0.881336405529954, 0.9013598927408543)

<font color=blue>*Tenemos un accuracy mayor:* 
- B- RandomForestClassifier (max_depth=3,max_features="log2",n_estimators=10) -->(0.8654377880184332, 0.8893520272830617)
- RandomForestClassifier (max_depth=6,n_estimators=200,max_features=6) -->(0.881336405529954, 0.9013598927408543)
</font> 

<font color=blue>*Se encuentra para el ejercicio que los parámetros que optimizan la clasificacion son:*
   - max_depth=6
   - n_estimators=200
   - max_features=6</font>

Por: Ana Milena Rodriguez G