# Comparison of Different Classification Methods
## Topic: How many Customers Stay

### Table of Contents:
1. [Data Gathering](#first_part)
2. [Data Exploration](#second_part)
3. [Data Preprocessing](#third_part)
4. [Model Building](#fourth_part)
5. [Evaluation](#fifth_part)

In [5]:
# Import packages
import pandas as pd
import numpy as np
from pycaret.classification import *

## 1. Data Gathering <a name="first_part"></a>

In [8]:
# Load the training set from the working directory and preview its first rows.
train_data = pd.read_csv("train.csv")
train_data.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15682273,Burns,683,France,Female,38,5,127616.56,1,1,0,123846.07,0
1,1,15694321,Su,619,France,Female,28,3,0.0,2,1,0,53394.12,0
2,2,15807194,Iweobiegbulam,718,Spain,Male,34,5,113922.44,2,1,0,30772.22,0
3,3,15643966,Goforth,616,Germany,Male,45,3,143129.41,2,0,1,64327.26,0
4,4,15697686,Stewart,787,France,Female,40,6,0.0,2,1,1,84151.98,0


## 2. Data Exploration <a name="second_part"></a>

In [9]:
# Quick sanity check: overall shape and the number of missing values per column.
train_shape = train_data.shape
missing_per_column = train_data.isna().sum()

print(f'Dimension of train dataset: {train_shape}\n')
print(f'Missing values: \n{missing_per_column}')

Dimension of train dataset: (7500, 14)

Missing values: 
RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64


## 3. Data Preprocessing <a name="third_part"></a>

In [10]:
# Column roles handed to setup() in the next cell.
# Identifier-like columns that are excluded from modelling.
ignore_features = [
    'RowNumber',
    'CustomerId',
    'Surname',
]
# Columns treated as categorical (the two 0/1 flags are included on purpose).
categorical_features = [
    'Geography',
    'Gender',
    'HasCrCard',
    'IsActiveMember',
]
# Columns treated as numeric.
numeric_features = [
    'CreditScore',
    'Age',
    'Tenure',
    'Balance',
    'NumOfProducts',
    'EstimatedSalary',
]

In [12]:
# Initialise the pycaret experiment: 'Exited' is the target, 80% of the rows
# form the training split, and session_id fixes the random state.
exit_clf = setup(
    data=train_data,
    target='Exited',
    train_size=0.8,
    session_id=123,
    categorical_features=categorical_features,
    numeric_features=numeric_features,
    ignore_features=ignore_features,
)

Unnamed: 0,Description,Value
0,session_id,123
1,Target,Exited
2,Target Type,Binary
3,Label Encoded,"0: 0, 1: 1"
4,Original Data,"(7500, 14)"
5,Missing Values,False
6,Numeric Features,6
7,Categorical Features,4
8,Ordinal Features,False
9,High Cardinality Features,False


In [13]:
# Interactive lookup: prints the docstring of pycaret's get_config, listing the
# variable names it can return. NOTE(review): exploratory leftover; consider
# removing it from the final notebook.
help(get_config)

Help on function get_config in module pycaret.classification:

get_config(variable: str)
    This function retrieves the global variables created when initializing the 
    ``setup`` function. Following variables are accessible:
    
    - X: Transformed dataset (X)
    - y: Transformed dataset (y)  
    - X_train: Transformed train dataset (X)
    - X_test: Transformed test/holdout dataset (X)
    - y_train: Transformed train dataset (y)
    - y_test: Transformed test/holdout dataset (y)
    - seed: random state set through session_id
    - prep_pipe: Transformation pipeline
    - fold_shuffle_param: shuffle parameter used in Kfolds
    - n_jobs_param: n_jobs parameter used in model training
    - html_param: html_param configured through setup
    - create_model_container: results grid storage container
    - master_model_container: model storage container
    - display_container: results display container
    - exp_name_log: Name of experiment
    - logging_param: log_experiment par

In [14]:
# Display the transformed training features that setup() produced.
get_config('X_train')

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,EstimatedSalary,Geography_France,Geography_Germany,Geography_Spain,Gender_Male,HasCrCard_0,IsActiveMember_0
28,603.0,39.0,9.0,76769.679688,1.0,48224.718750,1.0,0.0,0.0,0.0,1.0,1.0
799,698.0,44.0,10.0,116363.367188,2.0,198059.156250,0.0,1.0,0.0,1.0,0.0,1.0
4495,588.0,41.0,2.0,131341.453125,2.0,7034.939941,0.0,1.0,0.0,1.0,1.0,0.0
658,537.0,26.0,7.0,106397.750000,1.0,103563.226562,1.0,0.0,0.0,1.0,1.0,1.0
1035,509.0,29.0,1.0,0.000000,2.0,69113.140625,0.0,0.0,1.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
5218,685.0,33.0,6.0,0.000000,1.0,58458.261719,1.0,0.0,0.0,1.0,0.0,1.0
4060,686.0,25.0,1.0,0.000000,2.0,16459.369141,1.0,0.0,0.0,1.0,1.0,0.0
1346,574.0,41.0,1.0,0.000000,2.0,70550.000000,1.0,0.0,0.0,1.0,1.0,1.0
3454,715.0,42.0,6.0,0.000000,2.0,128745.687500,0.0,0.0,1.0,1.0,0.0,0.0


In [None]:
# NOTE(review): this only binds a notebook-level variable; no visible cell reads
# it. If outlier removal was intended, it presumably has to be passed to
# setup() instead — confirm against the pycaret documentation.
remove_outliers = True

In [15]:
# Cross-validate the candidate classifiers with 5 folds and display the
# comparison table. The cell output also shows the top-ranked estimator, which
# is not stored in a variable here.
compare_models(fold = 5)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.8593,0.8646,0.4738,0.7531,0.5814,0.5022,0.5218,0.15
rf,Random Forest Classifier,0.8588,0.8492,0.4633,0.7592,0.5752,0.4965,0.5184,0.166
catboost,CatBoost Classifier,0.8575,0.8631,0.4811,0.738,0.5821,0.5008,0.5177,1.022
lightgbm,Light Gradient Boosting Machine,0.852,0.8503,0.4851,0.7064,0.5748,0.489,0.5019,0.486
et,Extra Trees Classifier,0.8517,0.8414,0.4375,0.7381,0.549,0.4672,0.4903,0.14
ada,Ada Boost Classifier,0.8508,0.8435,0.4641,0.714,0.5618,0.4769,0.4933,0.06
xgboost,Extreme Gradient Boosting,0.8458,0.8368,0.4811,0.6784,0.5628,0.4726,0.4829,0.42
ridge,Ridge Classifier,0.808,0.0,0.1558,0.6516,0.2493,0.1848,0.2484,0.008
lda,Linear Discriminant Analysis,0.807,0.7745,0.2535,0.5761,0.3513,0.2577,0.288,0.008
dt,Decision Tree Classifier,0.7937,0.6912,0.5165,0.5011,0.5083,0.3779,0.3782,0.014


GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=123, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

### Gradient Boosting Classifier

In [16]:
# Train a gradient boosting classifier with 5-fold cross-validation (per-fold
# metrics are displayed), then print the fitted estimator's parameters.
gbc = create_model('gbc', fold=5)
print(gbc)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8658,0.8751,0.498,0.7688,0.6044,0.528,0.5461
1,0.86,0.8675,0.4677,0.7632,0.58,0.5017,0.5234
2,0.8608,0.8689,0.4919,0.7485,0.5937,0.514,0.5305
3,0.8467,0.8529,0.4234,0.7192,0.533,0.4485,0.4711
4,0.8633,0.8586,0.4879,0.7658,0.5961,0.5186,0.5377
Mean,0.8593,0.8646,0.4738,0.7531,0.5814,0.5022,0.5218
SD,0.0067,0.0079,0.0272,0.0183,0.0255,0.0281,0.0264


GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=123, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)


In [None]:
# Hyperparameter search space for the Gradient Boosting Classifier.
# Every key must be a valid GradientBoostingClassifier constructor argument,
# and every value must be a non-empty collection of candidate settings.

gbc_params = {
    # Loss names accepted by the scikit-learn release whose estimator repr is
    # printed above (loss='deviance'); newer releases rename it to 'log_loss'.
    'loss': ['deviance', 'exponential'],
    'learning_rate': [1, 0.5, 0.25, 0.1, 0.05, 0.01],
    'n_estimators': [1, 2, 4, 8, 16, 32, 64, 100, 200],
    # Tree depth has to be an integer; np.linspace would produce floats.
    'max_depth': list(range(1, 33)),
    # Float values are interpreted as a fraction of the training samples.
    'min_samples_split': np.linspace(0.1, 1.0, 10, endpoint=True),
    'min_samples_leaf': np.linspace(0.1, 0.5, 5, endpoint=True),
}

In [None]:
# Tune the gradient boosting model over gbc_params with 5-fold CV, optimising
# F1 via grid search; the last line displays the tuned estimator.
tuned_gbc = tune_model(gbc, fold=5, custom_grid = gbc_params,  optimize = 'F1', search_algorithm = 'grid')
tuned_gbc

In [None]:
# Diagnostics for the tuned model: plot_model's default plot, then the
# confusion matrix.
plot_model(tuned_gbc)
plot_model(tuned_gbc, plot='confusion_matrix')

### Random Forest Classifier

In [17]:
# Train a random forest with 5-fold cross-validation (per-fold metrics are
# displayed), then print the fitted estimator's parameters.
rf = create_model('rf', fold=5)
print(rf)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8725,0.8631,0.498,0.8092,0.6165,0.5452,0.5684
1,0.8558,0.8608,0.4556,0.7483,0.5664,0.486,0.5075
2,0.8558,0.8463,0.4758,0.7329,0.577,0.4948,0.5116
3,0.8492,0.8251,0.4153,0.741,0.5323,0.4508,0.4776
4,0.8608,0.8507,0.4718,0.7647,0.5835,0.5056,0.5268
Mean,0.8588,0.8492,0.4633,0.7592,0.5752,0.4965,0.5184
SD,0.0078,0.0135,0.0275,0.0271,0.0272,0.0305,0.0296


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=123, verbose=0,
                       warm_start=False)


### CatBoost Classifier

In [18]:
# Train a CatBoost classifier with 5-fold cross-validation (per-fold metrics
# are displayed). print() only shows the object's default repr for this model.
catboost = create_model('catboost', fold=5)
print(catboost)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8683,0.878,0.5101,0.773,0.6146,0.5392,0.5562
1,0.855,0.8662,0.4718,0.7312,0.5735,0.491,0.5082
2,0.8517,0.8601,0.496,0.6989,0.5802,0.4932,0.5039
3,0.8458,0.8522,0.4315,0.7086,0.5363,0.4504,0.4703
4,0.8667,0.8593,0.496,0.7785,0.6059,0.5304,0.5499
Mean,0.8575,0.8631,0.4811,0.738,0.5821,0.5008,0.5177
SD,0.0087,0.0086,0.0277,0.0326,0.0276,0.0318,0.0318


<catboost.core.CatBoostClassifier object at 0x7fa4362740a0>


  return linalg.solve(A, Xy, sym_pos=True,
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  return linalg.solve(A, Xy, sym_pos=True,
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  return linalg.solve(A, Xy, sym_pos=True,
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


In [None]:
from datetime import date

# Date stamp built as <day, no zero padding><abbreviated month name><year>,
# e.g. "5Mar2024" in an English locale.
# date.today() is read once so that all three parts come from the same day,
# even if the cell happens to run across midnight.
today = date.today()
save_date = f"{today.day}{today.strftime('%b')}{today.year}"

In [10]:
# Concat all the processed features in training set.
# The column blocks are stacked side by side; the last line displays the shape.
# NOTE(review): none of the inputs (credit_score_mm, geo_model, gender_model,
# age_mm, ..., x) are defined in the cells visible in this notebook. They must
# come from preprocessing cells that are not shown; confirm that the notebook
# still runs top to bottom on a fresh kernel.
# NOTE(review): geo_model and gender_model are stacked here as arrays, while the
# test-set preprocessing cell calls geo_model.transform(...) and
# gender_model.transform(...). Confirm which objects these names are bound to.

final_feature = np.hstack((credit_score_mm, geo_model, gender_model, age_mm, tenure_mm, balance_mm, num_of_product_mm, 
           np.array(x["HasCrCard"]).reshape(-1,1), np.array(x["IsActiveMember"]).reshape(-1,1), estimated_salary_mm))
final_feature.shape

(7500, 14)

In [11]:
# Feature selection
# Fit a gradient boosting model on the full training matrix; its
# feature_importances_ drive the column mask applied in the following cells.
# random_state is fixed so that the importances, and therefore the mask, are
# reproducible between runs (123 matches the session_id given to setup()).
gbdt = GradientBoostingClassifier(random_state=123)
gbdt.fit(final_feature, y)

GradientBoostingClassifier()

In [12]:
# Show the per-column importances of the fitted model (one value per column of
# final_feature).
gbdt.feature_importances_

array([1.91091078e-02, 7.62158108e-04, 3.15125905e-02, 1.68546143e-04,
       1.26936023e-03, 2.19411770e-02, 3.69039663e-04, 3.94546224e-01,
       5.22629665e-03, 8.59283245e-02, 3.04250578e-01, 3.22021375e-04,
       1.18141417e-01, 1.64531587e-02])

In [13]:
# Select importance > 0.01 (strictly greater, as coded below)
# NOTE(review): in the visible cells, SMOTE and all model fits use
# final_feature rather than select_final_feature, so this selection does not
# affect the trained models.
select_final_feature = final_feature[:, gbdt.feature_importances_>0.01]
select_final_feature.shape

(7500, 8)

In [14]:
# Class balance of the target before any resampling.
Counter(y)

Counter({0: 5941, 1: 1559})

In [15]:
# oversample = SMOTE()
# X, y = oversample.fit_resample(final_feature, y)

In [16]:
# Oversample the minority class with SMOTE so that both classes are equally
# represented. random_state is fixed so the synthetic samples, and every
# score computed from them, are reproducible (123 matches the session_id
# given to setup()).
smo = SMOTE(random_state=123)
X_smo, y_smo = smo.fit_resample(final_feature, y)

In [17]:
# Class balance of the target after SMOTE resampling.
Counter(y_smo)

Counter({0: 5941, 1: 5941})

## 4. Model Building <a name="fourth_part"></a>

In [18]:
# Mean F1 over 10 cross-validation folds for Gaussian Naive Bayes.
# NOTE(review): X_smo / y_smo were oversampled with SMOTE before the folds are
# split, so validation folds contain synthetic samples derived from rows in the
# training folds. The reported F1 is therefore likely optimistic; consider
# resampling inside each fold instead.
np.mean(cross_val_score(GaussianNB(), X_smo, y_smo, cv=10, scoring="f1"))

0.6297337971250551

In [19]:
# Mean F1 over 10 cross-validation folds for logistic regression with default
# settings, evaluated on the SMOTE-resampled data.
np.mean(cross_val_score(LogisticRegression(), X_smo, y_smo, cv=10, scoring="f1"))

0.7016566489735775

In [None]:
# Mean F1 over 10 cross-validation folds for a linear-kernel SVM with C=0.8,
# evaluated on the SMOTE-resampled data.
np.mean(cross_val_score(SVC(C=0.8, kernel="linear"), X_smo, y_smo, cv=10, scoring="f1"))

In [None]:
# Mean F1 over 10 cross-validation folds for a 180-tree random forest on the
# SMOTE-resampled data. random_state is fixed so the score is repeatable
# (123 matches the session_id given to setup()).
np.mean(cross_val_score(RandomForestClassifier(n_estimators=180, random_state=123), X_smo, y_smo, cv=10, scoring="f1"))

In [None]:
# Mean F1 over 10 cross-validation folds for a 150-stage gradient boosting
# model on the SMOTE-resampled data. random_state is fixed so the score is
# repeatable (123 matches the session_id given to setup()).
np.mean(cross_val_score(GradientBoostingClassifier(n_estimators=150, random_state=123), X_smo, y_smo, cv=10, scoring="f1"))

In [None]:
# Mean F1 over 10 cross-validation folds for a multi-layer perceptron (adam
# solver, up to 1000 iterations) on the SMOTE-resampled data. random_state
# fixes the weight initialisation so the score is repeatable (123 matches the
# session_id given to setup()).
np.mean(cross_val_score(MLPClassifier(max_iter=1000, solver="adam", random_state=123), X_smo, y_smo, cv=10, scoring="f1"))

In [None]:
# Train model
# Fit Gaussian Naive Bayes on the full SMOTE-resampled training data.
nb_model = GaussianNB().fit(X_smo, y_smo)

In [None]:
# Fit logistic regression (default settings) on the full SMOTE-resampled
# training data.
lr_model = LogisticRegression().fit(X_smo, y_smo)

In [None]:
# Fit the 180-tree random forest on the full SMOTE-resampled training data.
# random_state is fixed so the fitted model and its predictions are
# reproducible (123 matches the session_id given to setup()).
rf_model = RandomForestClassifier(n_estimators=180, random_state=123).fit(X_smo, y_smo)

In [None]:
# Fit gradient boosting on the full SMOTE-resampled training data.
# n_estimators=150 matches the configuration that was cross-validated above and
# the one used inside the voting ensemble, so the reported F1 describes the
# model that is actually trained. random_state is fixed for reproducibility.
gbdt_model = GradientBoostingClassifier(n_estimators=150, random_state=123).fit(X_smo, y_smo)

In [None]:
# Fit the SVM with the same hyperparameters that were cross-validated above
# (C=0.8, linear kernel), so the reported F1 describes the model that is
# actually trained and used for prediction.
svm_model = SVC(C=0.8, kernel="linear").fit(X_smo, y_smo)

In [None]:
# Fit the multi-layer perceptron on the full SMOTE-resampled training data.
# random_state fixes the weight initialisation so the fitted model and its
# predictions are reproducible (123 matches the session_id given to setup()).
nn_model = MLPClassifier(max_iter=1000, random_state=123).fit(X_smo, y_smo)

In [None]:
# Voting classifier
from sklearn.ensemble import VotingClassifier

# Soft-voting ensemble of a random forest, gradient boosting and an MLP; the
# gradient boosting member is weighted twice as much as the other two.
# Each member gets a fixed random_state so the ensemble is reproducible
# (123 matches the session_id given to setup()).
vote_model = VotingClassifier(
    estimators=[
        ('rf', RandomForestClassifier(n_estimators=180, random_state=123)),
        ("gbdt", GradientBoostingClassifier(n_estimators=150, random_state=123)),
        ("mlp", MLPClassifier(max_iter=1000, random_state=123)),
    ],
    voting="soft",
    weights=[1, 2, 1],
).fit(X_smo, y_smo)

print(vote_model)

In [None]:
# Process test data
# Load the test set from the working directory and preview its first rows.

test_data = pd.read_csv("assignment-test.csv")
test_data.head()

In [None]:
# Keep the row identifiers for the result files, then remove the identifier
# columns from the test frame.
test_row = test_data["RowNumber"]
test_data = test_data.drop(columns=["RowNumber", "CustomerId", "Surname"])

In [None]:
# Preview the test frame after the identifier columns were dropped.
test_data.head()

In [None]:
def _scale_test_column(scaler, column):
    """Run one test-set column through an existing transformer.

    The column is converted to a single-feature 2-D float64 array before it is
    passed to ``scaler.transform``.
    """
    values = np.array(test_data[column]).reshape(-1, 1).astype("float64")
    return scaler.transform(values)


# Numeric columns. The *_model transformers are presumably the ones fitted on
# the training data in cells not shown here — TODO confirm.
test_credit = _scale_test_column(credit_model, "CreditScore")
test_age = _scale_test_column(age_model, "Age")
test_tenure = _scale_test_column(tenure_model, "Tenure")
test_balance = _scale_test_column(balance_model, "Balance")
test_num_of_product = _scale_test_column(num_of_product_model, "NumOfProducts")
test_estimate = _scale_test_column(estimated_salary_model, "EstimatedSalary")

# Categorical columns: a label transform first, then a second transform whose
# sparse result is converted to a dense array.
test_geo_label = geo_label_model.transform(test_data["Geography"]).reshape(-1, 1)
test_geo = geo_model.transform(test_geo_label).toarray()

test_gender_label = gender_label_model.transform(test_data["Gender"]).reshape(-1, 1)
test_gender = gender_model.transform(test_gender_label).toarray()

In [None]:
# Assemble the test feature matrix. The column blocks follow the same order as
# the training matrix built earlier in the notebook.
has_card_col = np.array(test_data["HasCrCard"]).reshape(-1, 1)
is_active_col = np.array(test_data["IsActiveMember"]).reshape(-1, 1)

test_blocks = [
    test_credit,
    test_geo,
    test_gender,
    test_age,
    test_tenure,
    test_balance,
    test_num_of_product,
    has_card_col,
    is_active_col,
    test_estimate,
]
test_x = np.hstack(test_blocks)

In [None]:
# Feature selection
# Apply the training-time importance mask (importance > 0.01) to the test matrix.
# NOTE(review): the prediction cells below pass test_x, not select_test_x, to
# the models, so this selection is not used by them.
select_test_x = test_x[:, gbdt.feature_importances_>0.01]

## 5. Evaluation <a name="fifth_part"></a>

In [None]:
# Predict test-set labels with the Gaussian Naive Bayes model and write them to
# a two-column CSV (RowNumber, Exited).
# The raw predictions get their own name so nb_res is only ever a DataFrame.
nb_pred = nb_model.predict(test_x)
nb_res = pd.DataFrame({"RowNumber": list(test_row), "Exited": list(nb_pred)})
nb_res.to_csv("nb_res.csv", index=False)  # index is documented as a bool; False omits the index column
# Resulting F1 score: 

In [None]:
# Predict test-set labels with the logistic regression model and write them to
# a two-column CSV (RowNumber, Exited).
# The raw predictions get their own name so lr_res is only ever a DataFrame.
lr_pred = lr_model.predict(test_x)
lr_res = pd.DataFrame({"RowNumber": list(test_row), "Exited": list(lr_pred)})
lr_res.to_csv("lr_res.csv", index=False)  # index is documented as a bool; False omits the index column
# Resulting F1 score: 

In [None]:
# Predict test-set labels with the random forest model and write them to a
# two-column CSV (RowNumber, Exited).
# The raw predictions get their own name so rf_res is only ever a DataFrame.
rf_pred = rf_model.predict(test_x)
rf_res = pd.DataFrame({"RowNumber": list(test_row), "Exited": list(rf_pred)})
rf_res.to_csv("rf_res.csv", index=False)  # index is documented as a bool; False omits the index column
# Resulting F1 score: 

In [None]:
# Predict test-set labels with the gradient boosting model and write them to a
# two-column CSV (RowNumber, Exited).
# The raw predictions get their own name so gbdt_res is only ever a DataFrame.
gbdt_pred = gbdt_model.predict(test_x)
gbdt_res = pd.DataFrame({"RowNumber": list(test_row), "Exited": list(gbdt_pred)})
gbdt_res.to_csv("gbdt_res.csv", index=False)  # index is documented as a bool; False omits the index column
# Resulting F1 score: 

In [None]:
# Predict test-set labels with the SVM model and write them to a two-column
# CSV (RowNumber, Exited).
# The raw predictions get their own name so svm_res is only ever a DataFrame.
svm_pred = svm_model.predict(test_x)
svm_res = pd.DataFrame({"RowNumber": list(test_row), "Exited": list(svm_pred)})
svm_res.to_csv("svm_res.csv", index=False)  # index is documented as a bool; False omits the index column
# Resulting F1 score: 

In [None]:
# Predict test-set labels with the MLP model and write them to a two-column
# CSV (RowNumber, Exited).
# The raw predictions get their own name so nn_res is only ever a DataFrame.
nn_pred = nn_model.predict(test_x)
nn_res = pd.DataFrame({"RowNumber": list(test_row), "Exited": list(nn_pred)})
nn_res.to_csv("nn_res.csv", index=False)  # index is documented as a bool; False omits the index column
# Resulting F1 score: 

In [None]:
# Predict test-set labels with the voting ensemble and write them to a
# two-column CSV (RowNumber, Exited).
# The raw predictions get their own name so vote_res is only ever a DataFrame.
vote_pred = vote_model.predict(test_x)
vote_res = pd.DataFrame({"RowNumber": list(test_row), "Exited": list(vote_pred)})
vote_res.to_csv("vote_res.csv", index=False)  # index is documented as a bool; False omits the index column
# Resulting F1 score: 