In [1]:
#imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import train_test_split

In [2]:
# Load the cleaned train and test sets
us_income_train, us_income_test = (
    "datasets/cleaned/data_train.csv",
    "datasets/cleaned/data_test.csv",
)
df, df_test = (pd.read_csv(csv_path) for csv_path in (us_income_train, us_income_test))

In [3]:
# Preview the first rows. The output includes an "Unnamed: 0" column that looks
# like a row index saved along with the CSV rather than a real feature.
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,7,77516,9,13,4,1,1,4,1,2174,0,40,39,0
1,50,6,83311,9,13,2,4,0,4,1,0,0,13,39,0
2,38,4,215646,11,9,0,6,1,4,1,0,0,40,39,0
3,53,4,234721,1,7,2,6,0,2,1,0,0,40,39,0
4,28,4,338409,9,13,2,10,5,2,0,0,0,40,5,0


In [4]:
# Count rows per income label to check the class balance.
df.income.value_counts()

0    24720
1     7841
Name: income, dtype: int64

In [5]:
#splitting
# Separate the features from the target. "Unnamed: 0" (visible in df.head())
# is the row index that was written out with the CSV; it is dropped together
# with the target so the model cannot learn from row order.
# errors="ignore" keeps this cell working if the CSV is re-exported without
# that index column.
X = df.drop(columns=["income", "Unnamed: 0"], errors="ignore")
y = df.income.values

In [6]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=2
)

In [7]:
# Baseline forest with default hyperparameters. Seeding it (same value as the
# train/test split) makes the baseline score repeatable between runs.
rfc = RandomForestClassifier(random_state=2)

In [8]:
# Train the baseline forest on the training split.
rfc.fit(X_train,y_train)

RandomForestClassifier()

In [9]:
# Score the baseline forest on the held-out split.
baseline_accuracy =rfc.score(X_test,y_test)

In [10]:
# Display the baseline score computed in the previous cell.
baseline_accuracy

0.8488074521445389

------------------------------------

## Grid search with accuracy and ROC AUC as scoring

In [11]:
from sklearn.model_selection import GridSearchCV

In [12]:
# List the hyperparameter names the forest exposes, to pick from for the grid.
rfc.get_params().keys()

dict_keys(['bootstrap', 'ccp_alpha', 'class_weight', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'max_samples', 'min_impurity_decrease', 'min_impurity_split', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'n_estimators', 'n_jobs', 'oob_score', 'random_state', 'verbose', 'warm_start'])

In [13]:
# Hyperparameter grid explored by the grid searches
# (6 * 2 * 4 * 3 = 144 combinations).
params = {
    "n_estimators" : [10,20,30,40,50,100],
    "criterion" : ["gini","entropy"],
    # None means "no depth limit". 0 is not a valid tree depth in scikit-learn
    # (it must be >= 1 or None), so every candidate using it would fail to fit.
    "max_depth" : [None,2,5,10],
    "min_samples_leaf" : [1,10,20]
    #'max_features' : np.arange(0.1,1,0.1).tolist()
}

In [14]:
# Estimator tuned by the accuracy-scored grid search. Seeded so the search
# result is repeatable and comparable with the ROC AUC-scored search.
rfc_2 = RandomForestClassifier(random_state=2)

In [15]:
# Estimator tuned by the ROC AUC-scored grid search. Seeded so the search
# result is repeatable and comparable with the accuracy-scored search.
rfc_roc = RandomForestClassifier(random_state=2)

In [16]:
# Two exhaustive grid searches over `params`, identical except for the
# estimator instance and the metric used to rank the candidates.
#   cv=5      -> 5 cross-validation folds
#   verbose=4 -> progress logging while fitting
#   n_jobs=-1 -> run on all available CPU cores
_search_kwargs = dict(param_grid=params, cv=5, verbose=4, n_jobs=-1)

gridsearch = GridSearchCV(estimator=rfc_2, scoring="accuracy", **_search_kwargs)
gridsearch_roc = GridSearchCV(estimator=rfc_roc, scoring="roc_auc", **_search_kwargs)


In [None]:
# Run both grid searches on the training split: the ROC AUC-scored search
# first, then the accuracy-scored one. Each logs its own progress below.
result_roc = gridsearch_roc.fit(X_train, y_train)
result = gridsearch.fit(X_train, y_train)

Fitting 5 folds for each of 144 candidates, totalling 720 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    4.5s
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed:    6.1s
[Parallel(n_jobs=-1)]: Done 205 tasks      | elapsed:   19.0s
[Parallel(n_jobs=-1)]: Done 376 tasks      | elapsed:   46.6s
[Parallel(n_jobs=-1)]: Done 597 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 720 out of 720 | elapsed:  1.6min finished


Fitting 5 folds for each of 144 candidates, totalling 720 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 148 tasks      | elapsed:    7.0s


In [None]:
# Keep the best estimator found by each search (ROC AUC-scored, accuracy-scored)
roc_best, accuracy_best = result_roc.best_estimator_, result.best_estimator_

In [None]:
# Held-out score of the estimator selected by the ROC AUC-scored search.
# The print previously formatted an undefined name `score` (NameError).
score_with_roc_best = roc_best.score(X_test, y_test)
print("The generalization accuracy of the model is {:.2f}%".format(score_with_roc_best * 100))

In [None]:
# Held-out score of the estimator selected by the accuracy-scored search.
# The print previously formatted an undefined name `score` (NameError).
score_with_accuracy_best = accuracy_best.score(X_test, y_test)
print("The generalization accuracy of the model is {:.2f}%".format(score_with_accuracy_best * 100))

-------------------------------------

# Validation

## Crosstab

In [None]:
# Class predictions for the held-out split. `y_pred` was used here without ever
# being defined; it is also what the classification report cell below reads.
# NOTE(review): the accuracy-tuned model is evaluated here - switch to
# `roc_best` if that is the model meant to be validated.
y_pred = accuracy_best.predict(X_test)

# Confusion table of actual vs. predicted labels, with row/column totals.
pd.crosstab(y_test, y_pred,
            rownames=["Actual"],
            colnames=["Prediction"], margins=True)

## Classification Report

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Print the classification report comparing the true labels with y_pred.
print(classification_report(y_test, y_pred))

## ROC Curve Evaluation

In [None]:
#define y_pred

In [None]:
# calculate roc curve
# Rank the samples by the predicted probability of the positive class
# (column 1 of predict_proba, since the labels are 0/1). Hard 0/1 predictions
# would reduce the curve to a single operating point.
# NOTE(review): the accuracy-tuned model is used here - swap in `roc_best` to
# plot the other tuned model.
y_score = accuracy_best.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_score)

In [None]:
# Inspect the false positive rates returned by roc_curve.
fpr

In [None]:
# Inspect the true positive rates returned by roc_curve.
tpr

In [None]:
# Inspect the decision thresholds returned by roc_curve.
thresholds

In [None]:
# No-skill reference: give every sample the same score of 0 (the majority
# class), then compute the ROC curve of that constant prediction.
ns_probs = [0] * len(y_test)
ns_fpr, ns_tpr, _ = roc_curve(y_test, ns_probs)

In [None]:
# Draw the model's ROC curve together with the no-skill reference line.
roc_axes = plt.gca()
roc_axes.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')
roc_axes.plot(fpr, tpr, marker='.', label='RandomForestClassifier')
# Label the axes, add the legend, then render the figure.
roc_axes.set_xlabel('FALSE POSITIVE RATE')
roc_axes.set_ylabel('TRUE POSITIVE RATE')
roc_axes.legend()
plt.show()

In [None]:

def roc_curve(y_test, y_pred_1,y_pred_2):
    """Plot the ROC curves of two sets of scores against a no-skill baseline.

    NOTE: this function has the same name as ``sklearn.metrics.roc_curve`` and
    replaces it in the notebook namespace once this cell has run. The name is
    kept so existing callers keep working; the scikit-learn function is
    imported under an alias inside the body so the three curve computations
    reach scikit-learn instead of calling this function again with the wrong
    number of arguments.

    Parameters
    ----------
    y_test : array-like
        True binary labels.
    y_pred_1 : array-like
        Scores plotted under the label 'RandomForestClassifier_accuracy'.
    y_pred_2 : array-like
        Scores plotted under the label 'RandomForestClassifier_roc'.

    Returns
    -------
    bool
        Always True.
    """
    # Local imports: the module-level name `roc_curve` is this function, and
    # matplotlib is needed for the plotting calls below.
    import matplotlib.pyplot as plt
    from sklearn.metrics import roc_curve as sk_roc_curve

    # ROC curve of each model (thresholds are not needed for the plot).
    fpr_1, tpr_1, _ = sk_roc_curve(y_test, y_pred_1)
    fpr_2, tpr_2, _ = sk_roc_curve(y_test, y_pred_2)

    # No-skill reference: the same score of 0 for every sample.
    ns_probs = [0] * len(y_test)
    ns_fpr, ns_tpr, _ = sk_roc_curve(y_test, ns_probs)

    plt.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')
    plt.plot(fpr_1, tpr_1, marker='v', label='RandomForestClassifier_accuracy')
    plt.plot(fpr_2, tpr_2, marker='.', label='RandomForestClassifier_roc')

    plt.xlabel('FALSE POSITIVE RATE')
    plt.ylabel('TRUE POSITIVE RATE')
    plt.legend()
    plt.show()
    return True

def score_roc_auc (y_test, y_pred_1, y_pred_2):
    """Compute the ROC AUC of two sets of scores against the same labels.

    Parameters
    ----------
    y_test : array-like
        True binary labels.
    y_pred_1, y_pred_2 : array-like
        Scores of the first and second model.

    Returns
    -------
    list
        ``[auc_1, auc_2]`` - a list rather than a tuple, in argument order.
    """
    # Imported here because the notebook never imports roc_auc_score elsewhere.
    from sklearn.metrics import roc_auc_score

    auc_1 = roc_auc_score(y_test, y_pred_1)
    auc_2 = roc_auc_score(y_test, y_pred_2)
    return [auc_1, auc_2]