# Comparison of Different Classification Methods
## Topic: How many Customers Stay

### Table of Content:
1. [Data Gathering](#first_part)
2. [Data Exploration](#second_part)
3. [Data Preprocessing](#third_part)
4. [Model Building](#fourth_part)
5. [Evaluation](#fifth_part)

In [1]:
# Import packages
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from collections import Counter
from imblearn.over_sampling import SMOTE

## 1. Data Gathering <a name="first_part"></a>

In [2]:
train_data = pd.read_csv("train.csv")
train_data.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15682273,Burns,683,France,Female,38,5,127616.56,1,1,0,123846.07,0
1,1,15694321,Su,619,France,Female,28,3,0.0,2,1,0,53394.12,0
2,2,15807194,Iweobiegbulam,718,Spain,Male,34,5,113922.44,2,1,0,30772.22,0
3,3,15643966,Goforth,616,Germany,Male,45,3,143129.41,2,0,1,64327.26,0
4,4,15697686,Stewart,787,France,Female,40,6,0.0,2,1,1,84151.98,0


## 2. Data Exploration <a name="second_part"></a>

In [3]:
print(f'Dimension of train dataset: {train_data.shape}\n')

print(f'Missing values: \n{train_data.isnull().sum()}')

Dimension of train dataset: (7500, 14)

Missing values: 
RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64


## 3. Data Preprocessing <a name="third_part"></a>

In [4]:
# Cast training data into x(features) and y(label)
x = train_data.drop(["Exited"], axis=1)
y = train_data["Exited"]

print(f'Dimension of train dataset [features]: {x.shape}')
print(f'Dimension of test dataset [label]: {y.shape}')

Dimension of train dataset [features]: (7500, 13)
Dimension of test dataset [label]: (7500,)


In [5]:
# Remove useless features in x
x = x.drop(["RowNumber", "CustomerId", "Surname"], axis=1)
print(x.shape)

(7500, 10)


In [6]:
print(type(x))
x.head()

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,683,France,Female,38,5,127616.56,1,1,0,123846.07
1,619,France,Female,28,3,0.0,2,1,0,53394.12
2,718,Spain,Male,34,5,113922.44,2,1,0,30772.22
3,616,Germany,Male,45,3,143129.41,2,0,1,64327.26
4,787,France,Female,40,6,0.0,2,1,1,84151.98


In [7]:
# Process columns including Geography and Gender to number values using onehot encoder.

geo_label_model = LabelEncoder().fit(x["Geography"])
geo_label = geo_label_model.transform(x["Geography"]).reshape(-1,1)
geo_model = OneHotEncoder()
geo_model.fit(geo_label)
oh_geo = geo_model.transform(geo_label).toarray()

gender_label_model = LabelEncoder().fit(x["Gender"])
gender_label = gender_label_model.transform(x["Gender"]).reshape(-1,1)
gender_model = OneHotEncoder()
gender_model.fit(gender_label)
oh_gender = gender_model.transform(gender_label).toarray()


In [8]:
## Scale columns: CreditScore, Age, Balance, Tenure, NumOfProducts, EstimatedSalary.
from sklearn.preprocessing import MinMaxScaler, StandardScaler

credit_data = np.array(x["CreditScore"]).reshape(-1,1).astype("float64")
credit_model = StandardScaler().fit(credit_data)
credit_score_mm = credit_model.transform(credit_data)

age_data = np.array(x["Age"]).reshape(-1,1).astype("float64")
age_model = StandardScaler().fit(age_data)
age_mm = age_model.transform(age_data)

tenure_data = np.array(x["Tenure"]).reshape(-1,1).astype("float64")
tenure_model = StandardScaler().fit(tenure_data)
tenure_mm = tenure_model.transform(tenure_data)

balance_data = np.array(x["Balance"]).reshape(-1,1).astype("float64")
balance_model = StandardScaler().fit(balance_data)
balance_mm = balance_model.transform(balance_data)

num_of_product_data = np.array(x["NumOfProducts"]).reshape(-1,1).astype("float64")
num_of_product_model = StandardScaler().fit(num_of_product_data)
num_of_product_mm = num_of_product_model.transform(num_of_product_data)

estimated_salary_data = np.array(x["EstimatedSalary"]).reshape(-1,1).astype("float64")
estimated_salary_model = StandardScaler().fit(estimated_salary_data)
estimated_salary_mm = estimated_salary_model.transform(estimated_salary_data)

In [9]:
# Concat all the processed features in training set.

final_feature = np.hstack((credit_score_mm, oh_geo, oh_gender, age_mm, tenure_mm, balance_mm, num_of_product_mm, 
           np.array(x["HasCrCard"]).reshape(-1,1), np.array(x["IsActiveMember"]).reshape(-1,1), estimated_salary_mm))
final_feature.shape

(7500, 13)

In [10]:
# Feature selection
gbdt = GradientBoostingClassifier()
gbdt.fit(final_feature, y)

GradientBoostingClassifier()

In [11]:
gbdt.feature_importances_

array([1.90765914e-02, 1.11246631e-03, 5.59554073e-02, 4.52107397e-04,
       5.97462696e-03, 9.60120716e-03, 3.91916856e-01, 4.01205766e-03,
       8.03942246e-02, 2.98616624e-01, 3.00640584e-04, 1.15353508e-01,
       1.72336824e-02])

In [12]:
# Select importance >= 0.01
select_final_feature = final_feature[:, gbdt.feature_importances_>0.01]
select_final_feature.shape

(7500, 7)

In [13]:
Counter(y)

Counter({0: 5941, 1: 1559})

In [14]:
smo = SMOTE()
X_smo, y_smo = smo.fit_resample(final_feature, y)

In [15]:
Counter(y_smo)

Counter({0: 5941, 1: 5941})

## 4. Model Building <a name="fourth_part"></a>

In [16]:
np.mean(cross_val_score(GaussianNB(), X_smo, y_smo, cv=10, scoring="f1"))

0.7114059589114149

In [17]:
np.mean(cross_val_score(LogisticRegression(), X_smo, y_smo, cv=10, scoring="f1"))

0.7075059444280598

In [18]:
np.mean(cross_val_score(SVC(C=0.8, kernel="linear"), X_smo, y_smo, cv=10, scoring="f1"))

0.7185388056230615

In [19]:
np.mean(cross_val_score(RandomForestClassifier(n_estimators=180), X_smo, y_smo, cv=10, scoring="f1"))

0.9080985673979247

In [20]:
np.mean(cross_val_score(GradientBoostingClassifier(n_estimators=150), X_smo, y_smo, cv=10, scoring="f1"))

0.8730414720619756

In [21]:
np.mean(cross_val_score(MLPClassifier(max_iter=1000, solver="adam"), X_smo, y_smo, cv=10, scoring="f1"))

0.8580442240624089

In [22]:
# Train model
nb_model = GaussianNB().fit(X_smo, y_smo)

In [23]:
lr_model = LogisticRegression().fit(X_smo, y_smo)

In [24]:
rf_model = RandomForestClassifier(n_estimators=180).fit(X_smo, y_smo)

In [25]:
gbdt_model = GradientBoostingClassifier(n_estimators=180).fit(X_smo, y_smo)

In [26]:
svm_model = SVC().fit(X_smo, y_smo)

In [27]:
nn_model = MLPClassifier(max_iter=1000).fit(X_smo, y_smo)

In [28]:
# Voting classifier
from sklearn.ensemble import VotingClassifier

vote_model = VotingClassifier(estimators=[
    ('rf',RandomForestClassifier(n_estimators=180)),
    ("gbdt",GradientBoostingClassifier(n_estimators=150)),
    ("mlp",MLPClassifier(max_iter=1000))], voting="soft", weights=[1,2,1]).fit(X_smo, y_smo)

print(vote_model)

VotingClassifier(estimators=[('rf', RandomForestClassifier(n_estimators=180)),
                             ('gbdt',
                              GradientBoostingClassifier(n_estimators=150)),
                             ('mlp', MLPClassifier(max_iter=1000))],
                 voting='soft', weights=[1, 2, 1])


In [29]:
# Process test data

test_data = pd.read_csv("assignment-test.csv")
test_data.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,7500,15643361,Cullen,477,Germany,Male,34,8,139959.55,2,1,1,189875.83
1,7501,15660403,Fleming,827,Spain,Female,35,0,0.0,2,0,1,184514.01
2,7502,15734578,Craig,726,France,Female,53,1,113537.73,1,0,1,28367.21
3,7503,15583212,Chidozie,600,France,Female,43,5,134022.06,1,1,0,194764.83
4,7504,15694160,Sagese,624,France,Male,37,0,0.0,2,0,0,112104.55


In [30]:
test_row = test_data["RowNumber"]
test_data = test_data.drop(["RowNumber", "CustomerId", "Surname"], axis=1)

In [31]:
test_data.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,477,Germany,Male,34,8,139959.55,2,1,1,189875.83
1,827,Spain,Female,35,0,0.0,2,0,1,184514.01
2,726,France,Female,53,1,113537.73,1,0,1,28367.21
3,600,France,Female,43,5,134022.06,1,1,0,194764.83
4,624,France,Male,37,0,0.0,2,0,0,112104.55


In [32]:
test_credit = credit_model.transform(np.array(test_data["CreditScore"]).reshape(-1,1).astype("float64"))
test_age = age_model.transform(np.array(test_data["Age"]).reshape(-1,1).astype("float64"))
test_tenure = tenure_model.transform(np.array(test_data["Tenure"]).reshape(-1,1).astype("float64"))
test_balance = balance_model.transform(np.array(test_data["Balance"]).reshape(-1,1).astype("float64"))
test_num_of_product = num_of_product_model.transform(np.array(test_data["NumOfProducts"]).reshape(-1,1).astype("float64"))
test_estimate = estimated_salary_model.transform(np.array(test_data["EstimatedSalary"]).reshape(-1,1).astype("float64"))

test_geo_label = geo_label_model.transform(test_data["Geography"]).reshape(-1,1)
test_geo = geo_model.transform(test_geo_label).toarray()

test_gender_label = gender_label_model.transform(test_data["Gender"]).reshape(-1,1)
test_gender = gender_model.transform(test_gender_label).toarray()

In [33]:
# Concat all the processed features in testing set.
test_x = np.hstack((test_credit, test_geo, test_gender, test_age, test_tenure, test_balance, test_num_of_product, 
                   np.array(test_data["HasCrCard"]).reshape(-1,1), np.array(test_data["IsActiveMember"]).reshape(-1,1),
                   test_estimate))
# test_x

In [34]:
# Feature selection
select_test_x = test_x[:, gbdt.feature_importances_>0.01]

## 5. Evaluation <a name="fifth_part"></a>

In [35]:
nb_res = nb_model.predict(test_x)
nb_res = pd.DataFrame({"RowNumber":list(test_row), "Exited":list(nb_res)})
nb_res.to_csv("predictions/nb_res.csv", index=None)

In [36]:
lr_res = lr_model.predict(test_x)
lr_res = pd.DataFrame({"RowNumber":list(test_row), "Exited":list(lr_res)})
lr_res.to_csv("predictions/lr_res.csv", index=None) 

In [37]:
rf_res = rf_model.predict(test_x)
rf_res = pd.DataFrame({"RowNumber":list(test_row), "Exited":list(rf_res)})
rf_res.to_csv("predictions/rf_res.csv", index=None)

In [38]:
gbdt_res = gbdt_model.predict(test_x)
gbdt_res = pd.DataFrame({"RowNumber":list(test_row), "Exited":list(gbdt_res)})
gbdt_res.to_csv("predictions/gbdt_res.csv", index=None)

In [39]:
svm_res = svm_model.predict(test_x)
svm_res = pd.DataFrame({"RowNumber":list(test_row), "Exited":list(svm_res)})
svm_res.to_csv("predictions/svm_res.csv", index=None)

In [40]:
nn_res = nn_model.predict(test_x)
nn_res = pd.DataFrame({"RowNumber":list(test_row), "Exited":list(nn_res)})
nn_res.to_csv("predictions/nn_res.csv", index=None)

In [41]:
vote_res = vote_model.predict(test_x)
vote_res = pd.DataFrame({"RowNumber":list(test_row), "Exited":list(vote_res)})
vote_res.to_csv("predictions/vote_res.csv", index=None)