In [15]:
import zipfile
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error,make_scorer

# import packages for hyperparameters tuning
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

In [16]:
zf = zipfile.ZipFile("playground-series-s4e1.zip")
zf.namelist()

['sample_submission.csv', 'test.csv', 'train.csv']

In [17]:
df_train = pd.read_csv(zf.open("train.csv"))
df_test = pd.read_csv(zf.open("test.csv"))

In [18]:
#Checking to see if any rows/columns are missing any data
df_train.head(5)

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0
2,2,15694510,Hsueh,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69,0
3,3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,15068.83,0


In [19]:
#Checking if any columns have words that need to be converted to integers/floats for prediction calculations

df_train.drop(['Surname', 'id', 'CustomerId'], axis =1, inplace=True)
df_train.head(5)

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0
2,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69,0
3,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,15068.83,0


In [20]:
df_dummy_train = pd.get_dummies(df_train, drop_first= False, dtype=float)

df_dummy_train

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male
0,668,33.0,3,0.00,2,1.0,0.0,181449.97,0,1.0,0.0,0.0,0.0,1.0
1,627,33.0,1,0.00,2,1.0,1.0,49503.50,0,1.0,0.0,0.0,0.0,1.0
2,678,40.0,10,0.00,2,1.0,0.0,184866.69,0,1.0,0.0,0.0,0.0,1.0
3,581,34.0,2,148882.54,1,1.0,1.0,84560.88,0,1.0,0.0,0.0,0.0,1.0
4,716,33.0,5,0.00,2,1.0,1.0,15068.83,0,0.0,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165029,667,33.0,2,0.00,1,1.0,1.0,131834.75,0,0.0,0.0,1.0,1.0,0.0
165030,792,35.0,3,0.00,1,0.0,0.0,131834.45,0,1.0,0.0,0.0,0.0,1.0
165031,565,31.0,5,0.00,1,1.0,1.0,127429.56,0,1.0,0.0,0.0,0.0,1.0
165032,554,30.0,7,161533.00,1,0.0,1.0,71173.03,0,0.0,0.0,1.0,1.0,0.0


In [21]:
y = df_dummy_train['Exited']
x = df_dummy_train.drop(['Exited'], axis =1)

In [22]:
x_train, x_test, y_train, y_test = train_test_split(x,y, stratify=y , random_state=42)

In [30]:
seed=2
def objective(params):
    est=int(params['n_estimators'])
    md=int(params['max_depth'])
    msl=int(params['min_samples_leaf'])
    mss=int(params['min_samples_split'])
    model=RandomForestClassifier(n_estimators=est,max_depth=md,min_samples_leaf=msl,min_samples_split=mss)
    model.fit(x_train,y_train)
    pred=model.predict(x_test)
    score=mean_squared_error(y_test,pred)
    return score

def optimize(trial):
    params={'n_estimators':hp.uniform('n_estimators',100,500),
           'max_depth':hp.uniform('max_depth',5,20),
           'min_samples_leaf':hp.uniform('min_samples_leaf',1,5),
           'min_samples_split':hp.uniform('min_samples_split',2,6)}
    best=fmin(fn=objective,space=params,algo=tpe.suggest,trials=trial,max_evals=10)
    return best

In [31]:
trials = Trials()

best_hyperparams = optimize(trials)

100%|██████████| 10/10 [06:23<00:00, 38.30s/trial, best loss: 0.13609151942606462]


In [32]:
print("The best hyperparameters are : ","\n")
print(best_hyperparams)

The best hyperparameters are :  

{'max_depth': 16.018799418319425, 'min_samples_leaf': 4.141101753516019, 'min_samples_split': 3.0546965623891063, 'n_estimators': 385.4832179632076}


In [37]:
model = RandomForestClassifier(max_depth = 16,
                               min_samples_leaf = 4,
                               min_samples_split = 3,
                               n_estimators =385,
                                random_state=25 )


In [38]:
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

print(mean_squared_error(y_test, y_pred,squared=False))
print(model.score(x_train,y_train))

0.36956225747343907
0.900432235911937


In [42]:
#Check feature Importance
importances = model.feature_importances_
columns = x.columns
i = 0

while i< len(columns):
    print (f" The importance of feature '{columns[i]}' is {round(importances[i]*100, 2)}%.")
    i += 1

 The importance of feature 'CreditScore' is 7.52%.
 The importance of feature 'Age' is 30.44%.
 The importance of feature 'Tenure' is 3.81%.
 The importance of feature 'Balance' is 9.33%.
 The importance of feature 'NumOfProducts' is 23.43%.
 The importance of feature 'HasCrCard' is 0.77%.
 The importance of feature 'IsActiveMember' is 7.8%.
 The importance of feature 'EstimatedSalary' is 7.91%.
 The importance of feature 'Geography_France' is 1.12%.
 The importance of feature 'Geography_Germany' is 4.05%.
 The importance of feature 'Geography_Spain' is 0.64%.
 The importance of feature 'Gender_Female' is 1.64%.
 The importance of feature 'Gender_Male' is 1.53%.


In [39]:
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

print('\nClassification Report:')
print(classification_report(y_test, y_pred))

Accuracy: 0.86

Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.95      0.92     32529
           1       0.75      0.53      0.62      8730

    accuracy                           0.86     41259
   macro avg       0.82      0.74      0.77     41259
weighted avg       0.86      0.86      0.85     41259



In [40]:

df_test.drop(['id', 'CustomerId', 'Surname'], axis = 1, inplace = True)

df_dummy_test = pd.get_dummies(df_test, drop_first= False, dtype=float)
df_dummy_test

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male
0,586,23.0,2,0.00,2,0.0,1.0,160976.75,1.0,0.0,0.0,1.0,0.0
1,683,46.0,2,0.00,1,1.0,0.0,72549.27,1.0,0.0,0.0,1.0,0.0
2,656,34.0,7,0.00,2,1.0,0.0,138882.09,1.0,0.0,0.0,1.0,0.0
3,681,36.0,8,0.00,1,1.0,0.0,113931.57,1.0,0.0,0.0,0.0,1.0
4,752,38.0,10,121263.62,1,1.0,0.0,139431.00,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
110018,570,29.0,7,116099.82,1,1.0,1.0,148087.62,0.0,0.0,1.0,0.0,1.0
110019,575,36.0,4,178032.53,1,1.0,1.0,42181.68,1.0,0.0,0.0,1.0,0.0
110020,712,31.0,2,0.00,2,1.0,0.0,16287.38,1.0,0.0,0.0,0.0,1.0
110021,709,32.0,3,0.00,1,1.0,1.0,158816.58,1.0,0.0,0.0,1.0,0.0


In [41]:
y_test_pred = model.predict(df_dummy_test)
y_test_prob = model.predict_proba(df_dummy_test)[:, 1]

y_test_pred
y_test_prob = y_test_prob.round(1)




In [43]:
index = pd.read_csv(zf.open("test.csv"))

In [44]:
df_submission = pd.DataFrame({
    'id': index['id'],
    'Exited': y_test_prob
})
df_submission

Unnamed: 0,id,Exited
0,165034,0.0
1,165035,0.9
2,165036,0.0
3,165037,0.2
4,165038,0.3
...,...,...
110018,275052,0.1
110019,275053,0.1
110020,275054,0.0
110021,275055,0.2


In [45]:
df_submission.to_csv('Submission_Sklearn_RF_Hypertuned.csv', index=False)