In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn import ensemble

# Load the labelled training data and the unlabelled rows to predict on.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Model inputs, in the column order the classifier is trained on. Shared by
# the training and test matrices so the two can never drift apart.
feature_columns = ["CreditScore", "Gender", "Tenure", "Balance", "NumOfProducts",
                   "HasCrCard", "IsActiveMember", "EstimatedSalary",
                   "France", "Germany", "Spain"]

# One indicator column per Geography value present in the training data.
geo = pd.get_dummies(train["Geography"])
train = pd.concat([train, geo], axis = 1)

# The encoder emits one column per gender category, but "Gender" is a single
# column, so keep only the first indicator (the first entry of
# gen.categories_). The default sparse output is densified with toarray(),
# which avoids the `sparse=` / `sparse_output=` keyword that differs between
# scikit-learn releases.
gen = OneHotEncoder()
train["Gender"] = gen.fit_transform(train[["Gender"]]).toarray()[:, 0]

# Cast to float: the indicator columns may be bool/uint8 depending on the
# pandas version, and a mixed frame could otherwise yield an object array.
x = train[feature_columns].values.astype(float)
y = train["Exited"].values


# Encode the test data with the categories learned from the training data.
# Reindexing guarantees the same Geography columns even when a country is
# absent from (or only present in) the test file.
geo_test = pd.get_dummies(test["Geography"]).reindex(columns = geo.columns, fill_value = 0)
test = pd.concat([test, geo_test], axis = 1)

# Reuse the encoder fitted on the training data (transform, not fit) so a
# given gender label maps to the same value in both matrices.
gen_test = gen
test["Gender"] = gen_test.transform(test[["Gender"]]).toarray()[:, 0]

x_test = test[feature_columns].values.astype(float)

# Hold out 25% of the labelled data for validation, keeping the class
# balance of `y` the same in both parts.
X_train, X_test, Y_train, Y_test = train_test_split(x, y,
                                                    test_size = 0.25,
                                                    random_state = 1234,
                                                    stratify = y)

# Seeded like the split above so repeated runs give the same model.
classifier = ensemble.RandomForestClassifier(random_state = 1234)
# NOTE(review): the line below looks like settings from an earlier run -- confirm.
# n_estimators = 14, max_features = "auto", min_samples_leaf = 10

# Candidate values for the randomized search.
n_estimators = list(range(1, 21))        # 1..20 trees
# "auto" was an alias of "sqrt" for classifiers and is rejected by newer
# scikit-learn releases, so list two distinct, always-valid options instead.
max_features = ["sqrt", "log2"]
min_samples_leaf = list(range(1, 11))    # 1..10 samples per leaf

random_grid = {"n_estimators": n_estimators,
               "max_features": max_features,
               "min_samples_leaf": min_samples_leaf}

# Sample 100 of the 400 grid combinations, each scored with 5-fold
# cross-validation on the training part only.
classifier_random = RandomizedSearchCV(classifier,
                                       param_distributions = random_grid,
                                       n_iter = 100,
                                       cv = 5,
                                       random_state = 1234)
classifier_random.fit(X_train, Y_train)
# Predict for the rows of test.csv (x_test). X_test is the validation split
# of the training data; its predictions do not belong to test["RowNumber"].
result = classifier_random.predict(x_test)

# Tie each prediction to its source row so the concat below aligns on the
# index of `test` rather than on a fresh default index.
result = pd.Series(result, name = "Exited", index = test.index)
result = result.to_frame()

out = pd.concat([test[["RowNumber"]], result], axis = 1)
out.to_csv("random_forest.csv")

# Accuracy of the tuned model on the held-out validation split.
classifier_random.score(X_test, Y_test)

0.8225