In [1]:
#imports
import pandas as pd
import numpy as np
from  sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
#imports sets
us_income_train = "datasets/cleaned/data_train.csv"
us_income_test = "datasets/cleaned/data_test.csv"
df = pd.read_csv(us_income_train)
df_test = pd.read_csv(us_income_test)

In [3]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,0,77516,0,13,0,0,0,0,0,2174,0,40,0,0
1,50,1,83311,0,13,1,1,1,0,0,0,0,13,0,0
2,38,2,215646,1,9,2,2,0,0,0,0,0,40,0,0
3,53,2,234721,2,7,1,2,1,1,0,0,0,40,0,0
4,28,2,338409,0,13,1,3,2,1,1,0,0,40,1,0


In [4]:
df.income.value_counts()

0    24720
1     7841
Name: income, dtype: int64

In [5]:
#splitting
X = df.drop("income",axis=1)
y = df.income.values

In [6]:
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=2, test_size=0.3)

In [7]:
rfc = RandomForestClassifier()

In [8]:
rfc.fit(X_train,y_train)

RandomForestClassifier()

In [9]:
baseline_accuracy =rfc.score(X_test,y_test)

In [10]:
baseline_accuracy

0.8501381922407616

------------------------------------

## With accuracy as scoring

In [11]:
from sklearn.model_selection import GridSearchCV

In [12]:
rfc.get_params().keys()

dict_keys(['bootstrap', 'ccp_alpha', 'class_weight', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'max_samples', 'min_impurity_decrease', 'min_impurity_split', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'n_estimators', 'n_jobs', 'oob_score', 'random_state', 'verbose', 'warm_start'])

In [13]:
params = {
    "n_estimators" : [10,20,30,40,50,100],
    "criterion" : ["gini","entropy"],
    "max_depth" : [0,2,5,10],
    "min_samples_leaf" : [1,10,20],
    'max_features' : np.arange(0.1,1,0.1).tolist()
}

In [14]:
rfc_2 = RandomForestClassifier()

In [15]:
rfc_roc = RandomForestClassifier()

In [16]:
# Setting up the grid search that will test every combination of parameters
gridsearch = GridSearchCV(estimator = rfc_2,
                        param_grid = params,
                        scoring = "accuracy",
                        cv = 5, # Use 5 folds
                        verbose = 4,
                        n_jobs = 2 #Use all but one CPU core
                        )
# Setting up the grid search that will test every combination of parameters
gridsearch_roc = GridSearchCV(estimator = rfc_roc,
                        param_grid = params,
                        scoring = "roc_auc",
                        cv = 5, # Use 5 folds
                        verbose = 4,
                        n_jobs = 2 #Use all but one CPU core
                        )

# As we are doing cross-validation on the training set, the testing set X_test is untouched
result_roc = gridsearch_roc.fit(X_train, y_train)

# As we are doing cross-validation on the training set, the testing set X_test is untouched
result = gridsearch.fit(X_train, y_train)

Fitting 5 folds for each of 1296 candidates, totalling 6480 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  22 tasks      | elapsed:    2.0s
[Parallel(n_jobs=2)]: Done 308 tasks      | elapsed:    8.5s
[Parallel(n_jobs=2)]: Done 800 tasks      | elapsed:   21.7s
[Parallel(n_jobs=2)]: Done 1082 tasks      | elapsed:   57.1s
[Parallel(n_jobs=2)]: Done 1303 tasks      | elapsed:  1.8min
[Parallel(n_jobs=2)]: Done 1572 tasks      | elapsed:  3.1min
[Parallel(n_jobs=2)]: Done 1891 tasks      | elapsed:  4.4min
[Parallel(n_jobs=2)]: Done 2258 tasks      | elapsed:  7.2min
[Parallel(n_jobs=2)]: Done 2675 tasks      | elapsed: 10.5min
[Parallel(n_jobs=2)]: Done 3140 tasks      | elapsed: 17.0min
[Parallel(n_jobs=2)]: Done 4179 tasks      | elapsed: 19.8min
[Parallel(n_jobs=2)]: Done 4742 tasks      | elapsed: 22.6min
[Parallel(n_jobs=2)]: Done 5355 tasks      | elapsed: 25.3min
[Parallel(n_jobs=2)]: Done 6016 tasks      | elapsed: 29.4min
[Parallel(n_jobs=2)]: Done 6480 out of 6480 | elapsed: 36.8m

Fitting 5 folds for each of 1296 candidates, totalling 6480 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  64 tasks      | elapsed:    1.6s
[Parallel(n_jobs=2)]: Done 356 tasks      | elapsed:    8.9s
[Parallel(n_jobs=2)]: Done 848 tasks      | elapsed:   25.5s
[Parallel(n_jobs=2)]: Done 1046 tasks      | elapsed:   57.1s
[Parallel(n_jobs=2)]: Done 1267 tasks      | elapsed:  1.7min
[Parallel(n_jobs=2)]: Done 1536 tasks      | elapsed:  2.9min
[Parallel(n_jobs=2)]: Done 1855 tasks      | elapsed:  4.3min
[Parallel(n_jobs=2)]: Done 2222 tasks      | elapsed:  6.6min
[Parallel(n_jobs=2)]: Done 2639 tasks      | elapsed:  9.3min
[Parallel(n_jobs=2)]: Done 3104 tasks      | elapsed: 13.7min
[Parallel(n_jobs=2)]: Done 4229 tasks      | elapsed: 17.0min
[Parallel(n_jobs=2)]: Done 4792 tasks      | elapsed: 19.9min
[Parallel(n_jobs=2)]: Done 5405 tasks      | elapsed: 24.2min
[Parallel(n_jobs=2)]: Done 6066 tasks      | elapsed: 31.5min
[Parallel(n_jobs=2)]: Done 6480 out of 6480 | elapsed: 36.7m

In [None]:
result.best_estimator_

In [18]:
result.best_score_

0.864250626157307

In [19]:
result.best_params_

{'criterion': 'gini',
 'max_depth': 10,
 'max_features': 0.7000000000000001,
 'min_samples_leaf': 1,
 'n_estimators': 100}

In [20]:
rfc_new_params = RandomForestClassifier(criterion = 'gini',
                                         max_depth = 10,
                                         min_samples_leaf= 1,
                                         n_estimators=20)

In [21]:
rfc_new_params.fit(X_train,y_train)

RandomForestClassifier(max_depth=10, n_estimators=20)

In [22]:
rfc_new_params.score(X_train,y_train)

0.8718409968409968

In [23]:
y_pred = rfc_new_params.predict(X_test)

In [24]:
accuracy_score(y_pred,y_test)

0.8519807554509161

## With roc as scoring

In [None]:
result_roc.best_estimator_

In [None]:
x = result_roc.best_estimator_
score = x.score(X_test, y_test)

In [None]:
score

In [None]:
result_roc.best_score_

In [None]:
result_roc.best_params_

In [None]:
rfc_with_roc = RandomForestClassifier(criterion = 'entropy',
                                         max_depth = 10,
                                         min_samples_leaf= 1,
                                         n_estimators=100)

In [None]:
rfc_with_roc.fit(X_train,y_train)

In [None]:
predict_roc = rfc_with_roc.predict(X_test)

In [None]:
accuracy_score(predict_roc,y_test)

# Crosstab

In [None]:
pd.crosstab(y_test, y_pred, 
            rownames=["Actual"], 
            colnames=["Prediction"], margins=True)

# Classification Report

In [None]:
from sklearn.metrics import classification_report

In [None]:
print(classification_report(y_test, y_pred))

## test

In [None]:
X_s = df_test.drop("income",axis=1)
y_s = df_test.income.values

In [None]:
y_s_pred = rfc_new_params.predict(X_s)

In [None]:
print(classification_report(y_s, y_s_pred))

In [None]:
accuracy_score(y_s, y_s_pred)

In [None]:
X_s = df_test.drop("income",axis=1)
y_s = df_test.income.values

In [None]:
y_s_pred_roc = rfc_with_roc.predict(X_s)

In [None]:
print(classification_report(y_s, y_s_pred_roc))

In [None]:
accuracy_score(y_s, y_s_pred)