## Business Understanding

Goal(s):

Predict if the customer will subscribe (yes/no) to a term deposit (variable y)

Success Metric(s):

Hit %81 or above accuracy by evaluating with 5-fold cross validation and reporting the average performance score.

## Data Understanding

In [2]:
## Import libraries / modules
import pandas as pd
import numpy as np
from ydata_profiling import ProfileReport
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel
from imblearn.under_sampling import NearMiss
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

%matplotlib inline

In [3]:
"""
Load data from CSV
"""
def load_data(csv_file_path):
    df = pd.read_csv(csv_file_path)
    return df

"""
Auto EDA of dataset
"""
def auto_eda(df, title):
    profile = ProfileReport(df, title=title)
    profile.to_file(output_file='../reports/report.html')
    
"""
Change column names
"""
def change_column_name(df, col_index_label):
    column_names = df.columns.tolist()
    for key, value in col_index_label.items():
        df = df.rename(columns={column_names[key]: value})
    return df
    
"""
drop unnecessary columns
"""
def drop_columns_by_index(df, indices):
    df = df.drop(df.columns[indices], axis=1)
    return df

"""
drop rows
"""
def drop_rows(df, indices):
    df = df.drop(indices, axis=0)
    return df

"""
drop duplicate row ignoring numeric columns
"""
def drop_duplicate_rows(df):
    duplicated_rows = df.duplicated(keep='first')
    row_numbers = np.where(duplicated_rows)[0]
    df = drop_rows(df, row_numbers)
    print(pd.DataFrame(row_numbers))
    return df
    
"""
Scale numeric columns
"""
def scale_numeric_data(df, col_indices):
    scaler = MinMaxScaler()
    df.iloc[:, col_indices] = scaler.fit_transform(df.iloc[:, col_indices])
    return df

"""
Encode categorical columns
"""
def encode_cat_columns(df, col_names):
    df_encoded = pd.get_dummies(df, columns = col_names)
    return df_encoded

"""
Return columns
"""
def get_col_names(df):
    return df.columns

"""
Return column indices by name
"""
def get_col_indices(df, col_names):
    column_indices = [df.columns.get_loc(col) for col in col_names]
    return column_indices

"""
Get Test train split. default = 75%
"""
def get_test_train_split(df, target_col_index, random_state=42):
    X = df.drop(df.columns[target_col_index], axis=1)
    y = pd.DataFrame(df.iloc[:, target_col_index])
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)
    return [X_train, X_test, y_train, y_test]

"""
Return X and y only
"""
def get_X_y(df, target_col_index):
    X = df.drop(df.columns[target_col_index], axis=1)
    y = pd.DataFrame(df.iloc[:, target_col_index])
    return [X, y]
    

"""
Return Forest model for feature importance
"""

def get_forest_model(X_train, y_train):
    forest = RandomForestClassifier(random_state=0)
    forest.fit(X_train, y_train)
    return forest

"""
Retturn LinearSVC model
"""
def get_feature_selection_model(X_train, y_train):
    lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X_train, y_train)
    model = SelectFromModel(lsvc, prefit=True)
    return model

def get_grid_search_cv(model, param_dict, X_train, y_train, cv, refit = True, verbose = False, cv):
    grid = GridSearchCV(model, param_dict, refit=refit, verbose=verbose, cv=cv)
    return grid.fit(X_train, y_train)

def calculate_f1_score(y_test, y_pred, average='weighted'):
    return f1_score(y_test, y_pred, average="weighted")

In [4]:
df = load_data("../data/raw/term-deposit-marketing-2020.csv")
df

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39995,53,technician,married,tertiary,no,395,no,no,cellular,3,jun,107,1,no
39996,30,management,single,tertiary,no,3340,no,no,cellular,3,jun,238,3,yes
39997,54,admin,divorced,secondary,no,200,no,no,cellular,3,jun,170,1,yes
39998,34,management,married,tertiary,no,1047,no,no,cellular,3,jun,342,1,no


In [5]:
df2 = change_column_name(df, {4: 'has_credit_default', 6: 'has_housing_loan', 7: 'has_personal_loan', 
                                             8: 'contact_mode', 12: 'num_of_contacts'})
df2.head()

Unnamed: 0,age,job,marital,education,has_credit_default,balance,has_housing_loan,has_personal_loan,contact_mode,day,month,duration,num_of_contacts,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,no


### AutoEDA Report generation

In [6]:
auto_eda(df, "Term Deposit Marketing Report")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

### Data cleaning

In [7]:
## Drop day and month column as they do not affect target variable
df_ = drop_columns_by_index(df2, [9,10])
df_

Unnamed: 0,age,job,marital,education,has_credit_default,balance,has_housing_loan,has_personal_loan,contact_mode,duration,num_of_contacts,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,261,1,no
1,44,technician,single,secondary,no,29,yes,no,unknown,151,1,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,76,1,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,92,1,no
4,33,unknown,single,unknown,no,1,no,no,unknown,198,1,no
...,...,...,...,...,...,...,...,...,...,...,...,...
39995,53,technician,married,tertiary,no,395,no,no,cellular,107,1,no
39996,30,management,single,tertiary,no,3340,no,no,cellular,238,3,yes
39997,54,admin,divorced,secondary,no,200,no,no,cellular,170,1,yes
39998,34,management,married,tertiary,no,1047,no,no,cellular,342,1,no


### Feature Scaling

In [8]:
## Normalize numeric columns
df_scaled = scale_numeric_data(df_, [0, 5, 9, 10])
df_scaled

Unnamed: 0,age,job,marital,education,has_credit_default,balance,has_housing_loan,has_personal_loan,contact_mode,duration,num_of_contacts,y
0,0.513158,management,married,tertiary,no,0.092259,yes,no,unknown,0.053070,0.000000,no
1,0.328947,technician,single,secondary,no,0.073067,yes,no,unknown,0.030704,0.000000,no
2,0.184211,entrepreneur,married,secondary,no,0.072822,yes,yes,unknown,0.015453,0.000000,no
3,0.368421,blue-collar,married,unknown,no,0.086476,yes,no,unknown,0.018707,0.000000,no
4,0.184211,unknown,single,unknown,no,0.072812,no,no,unknown,0.040260,0.000000,no
...,...,...,...,...,...,...,...,...,...,...,...,...
39995,0.447368,technician,married,tertiary,no,0.076390,no,no,cellular,0.021757,0.000000,no
39996,0.144737,management,single,tertiary,no,0.103127,no,no,cellular,0.048394,0.032258,yes
39997,0.460526,admin,divorced,secondary,no,0.074619,no,no,cellular,0.034567,0.000000,yes
39998,0.197368,management,married,tertiary,no,0.082309,no,no,cellular,0.069540,0.000000,no


In [9]:
## Remove duplicates

## Find indices of duplicate rows while ignoring numeric column and then drop the duplicate rows from dataframe
df_cleaned = drop_duplicate_rows(df_scaled)
df_cleaned


       0
0  26091
1  38952


Unnamed: 0,age,job,marital,education,has_credit_default,balance,has_housing_loan,has_personal_loan,contact_mode,duration,num_of_contacts,y
0,0.513158,management,married,tertiary,no,0.092259,yes,no,unknown,0.053070,0.000000,no
1,0.328947,technician,single,secondary,no,0.073067,yes,no,unknown,0.030704,0.000000,no
2,0.184211,entrepreneur,married,secondary,no,0.072822,yes,yes,unknown,0.015453,0.000000,no
3,0.368421,blue-collar,married,unknown,no,0.086476,yes,no,unknown,0.018707,0.000000,no
4,0.184211,unknown,single,unknown,no,0.072812,no,no,unknown,0.040260,0.000000,no
...,...,...,...,...,...,...,...,...,...,...,...,...
39995,0.447368,technician,married,tertiary,no,0.076390,no,no,cellular,0.021757,0.000000,no
39996,0.144737,management,single,tertiary,no,0.103127,no,no,cellular,0.048394,0.032258,yes
39997,0.460526,admin,divorced,secondary,no,0.074619,no,no,cellular,0.034567,0.000000,yes
39998,0.197368,management,married,tertiary,no,0.082309,no,no,cellular,0.069540,0.000000,no


In [10]:
df_cleaned.job.unique()

array(['management', 'technician', 'entrepreneur', 'blue-collar',
       'unknown', 'retired', 'admin', 'services', 'self-employed',
       'unemployed', 'housemaid', 'student'], dtype=object)

#### One-hot encoding categorical columns

In [11]:
col_names = ['job', 'marital', 'education', 'has_credit_default', 'has_housing_loan', 'has_personal_loan', 'contact_mode', 'y']
df_encoded = encode_cat_columns(df_cleaned, col_names)
df_encoded

Unnamed: 0,age,balance,duration,num_of_contacts,job_admin,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,...,has_credit_default_yes,has_housing_loan_no,has_housing_loan_yes,has_personal_loan_no,has_personal_loan_yes,contact_mode_cellular,contact_mode_telephone,contact_mode_unknown,y_no,y_yes
0,0.513158,0.092259,0.053070,0.000000,0,0,0,0,1,0,...,0,0,1,1,0,0,0,1,1,0
1,0.328947,0.073067,0.030704,0.000000,0,0,0,0,0,0,...,0,0,1,1,0,0,0,1,1,0
2,0.184211,0.072822,0.015453,0.000000,0,0,1,0,0,0,...,0,0,1,0,1,0,0,1,1,0
3,0.368421,0.086476,0.018707,0.000000,0,1,0,0,0,0,...,0,0,1,1,0,0,0,1,1,0
4,0.184211,0.072812,0.040260,0.000000,0,0,0,0,0,0,...,0,1,0,1,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39995,0.447368,0.076390,0.021757,0.000000,0,0,0,0,0,0,...,0,1,0,1,0,1,0,0,1,0
39996,0.144737,0.103127,0.048394,0.032258,0,0,0,0,1,0,...,0,1,0,1,0,1,0,0,0,1
39997,0.460526,0.074619,0.034567,0.000000,1,0,0,0,0,0,...,0,1,0,1,0,1,0,0,0,1
39998,0.197368,0.082309,0.069540,0.000000,0,0,0,0,1,0,...,0,1,0,1,0,1,0,0,1,0


In [12]:
## Drop column with binary classes
columns = get_col_names(df_encoded)
print(columns)
#df_dropped_binary = drop_columns_by_name(df_encoded,  ['y_no', 'has_personal_loan_no', 'has_housing_loan_no', 'has_credit_default_no'])

Index(['age', 'balance', 'duration', 'num_of_contacts', 'job_admin',
       'job_blue-collar', 'job_entrepreneur', 'job_housemaid',
       'job_management', 'job_retired', 'job_self-employed', 'job_services',
       'job_student', 'job_technician', 'job_unemployed', 'job_unknown',
       'marital_divorced', 'marital_married', 'marital_single',
       'education_primary', 'education_secondary', 'education_tertiary',
       'education_unknown', 'has_credit_default_no', 'has_credit_default_yes',
       'has_housing_loan_no', 'has_housing_loan_yes', 'has_personal_loan_no',
       'has_personal_loan_yes', 'contact_mode_cellular',
       'contact_mode_telephone', 'contact_mode_unknown', 'y_no', 'y_yes'],
      dtype='object')


In [13]:
binary_cols = ['y_no', 'has_personal_loan_no', 'has_housing_loan_no', 'has_credit_default_no']
col_indices = get_col_indices(df_encoded, binary_cols)
col_indices

[32, 27, 25, 23]

In [14]:
df_binary_dropped = drop_columns_by_index(df_encoded, col_indices)
df_binary_dropped

Unnamed: 0,age,balance,duration,num_of_contacts,job_admin,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,...,education_secondary,education_tertiary,education_unknown,has_credit_default_yes,has_housing_loan_yes,has_personal_loan_yes,contact_mode_cellular,contact_mode_telephone,contact_mode_unknown,y_yes
0,0.513158,0.092259,0.053070,0.000000,0,0,0,0,1,0,...,0,1,0,0,1,0,0,0,1,0
1,0.328947,0.073067,0.030704,0.000000,0,0,0,0,0,0,...,1,0,0,0,1,0,0,0,1,0
2,0.184211,0.072822,0.015453,0.000000,0,0,1,0,0,0,...,1,0,0,0,1,1,0,0,1,0
3,0.368421,0.086476,0.018707,0.000000,0,1,0,0,0,0,...,0,0,1,0,1,0,0,0,1,0
4,0.184211,0.072812,0.040260,0.000000,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39995,0.447368,0.076390,0.021757,0.000000,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0
39996,0.144737,0.103127,0.048394,0.032258,0,0,0,0,1,0,...,0,1,0,0,0,0,1,0,0,1
39997,0.460526,0.074619,0.034567,0.000000,1,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,1
39998,0.197368,0.082309,0.069540,0.000000,0,0,0,0,1,0,...,0,1,0,0,0,0,1,0,0,0


In [15]:
cols_after_clean = get_col_names(df_binary_dropped)
cols_after_clean

Index(['age', 'balance', 'duration', 'num_of_contacts', 'job_admin',
       'job_blue-collar', 'job_entrepreneur', 'job_housemaid',
       'job_management', 'job_retired', 'job_self-employed', 'job_services',
       'job_student', 'job_technician', 'job_unemployed', 'job_unknown',
       'marital_divorced', 'marital_married', 'marital_single',
       'education_primary', 'education_secondary', 'education_tertiary',
       'education_unknown', 'has_credit_default_yes', 'has_housing_loan_yes',
       'has_personal_loan_yes', 'contact_mode_cellular',
       'contact_mode_telephone', 'contact_mode_unknown', 'y_yes'],
      dtype='object')

### L1-based feature selection using LinearSVC
ref. : https://scikit-learn.org/stable/modules/feature_selection.html#:~:text=L1%2Dbased%20feature%20selection,-Linear%20models%20penalized&text=With%20SVMs%20and%20logistic%2Dregression,parameter%2C%20the%20fewer%20features%20selected.

In [16]:
X, y = get_X_y(df_binary_dropped, 29)

In [17]:
X.shape

(39998, 29)

In [18]:
y.shape

(39998, 1)

In [19]:
## Get feature selection model 

feature_selection_model = get_feature_selection_model(X, y)
X_new = feature_selection_model.transform(X)
df_feature_selected = pd.DataFrame(X_new)
df_feature_selected

  y = column_or_1d(y, warn=True)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,0.053070,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0
1,0.030704,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
2,0.015453,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0
3,0.018707,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
4,0.040260,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
39993,0.021757,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
39994,0.048394,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
39995,0.034567,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
39996,0.069540,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0


In [20]:
count_1, count_0 = y['y_yes'].value_counts()[1], y['y_yes'].value_counts()[0]
count_1, count_0

(2896, 37102)

### Undersampling negative class

In [22]:
# Combine X and y to create new dataframe after feature selection
df_feature_selected.reset_index(drop=True, inplace=True)
y.reset_index(drop=True, inplace=True)
feature_scaled_df = pd.concat([df_feature_selected, y], axis=1)
feature_scaled_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,y_yes
0,0.053070,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0
1,0.030704,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0
2,0.015453,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0
3,0.018707,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0
4,0.040260,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
39993,0.021757,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0
39994,0.048394,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1
39995,0.034567,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
39996,0.069540,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0


In [23]:
nm1 = NearMiss(version=1)
X_resampled, y_resampled = nm1.fit_resample(df_feature_selected, y)

In [24]:
X_resampled.shape, y_resampled.shape

((5792, 11), (5792, 1))

## ML Model Selection

### Logistic Regression



In [25]:
## Combine re_samples dataframes to create new dataframe for test-train splitting

df_transformed = pd.concat([X_resampled, y_resampled], axis=1)
df_transformed

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,y_yes
0,0.049614,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0
1,0.024197,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0
2,0.049614,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0
3,0.049614,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0
4,0.024197,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
5787,0.069134,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1
5788,0.120171,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1
5789,0.078284,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1
5790,0.048394,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1


In [26]:
X_train, X_test, y_train, y_test = get_test_train_split(df_transformed, 11)
##Train logistic regression model
y_train_np = np.array(y_train)
logistic_model = LogisticRegression(random_state=0).fit(X_train, y_train_np.ravel())

#### K-folds cross validation with 5 folds - GridSearchCV

In [27]:
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
## Defining default params to optimize
space = dict()
space['solver'] = ['newton-cg', 'lbfgs', 'liblinear']
space['penalty'] = ['none', 'l1', 'l2', 'elasticnet']
space['C'] = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100]

In [28]:
search = GridSearchCV(logistic_model, space, scoring='accuracy', n_jobs=-1, cv=cv)
result = search.fit(X_train, y_train_np.ravel())
# summarize result
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

Best Score: 0.8420822882031539
Best Hyperparameters: {'C': 1e-05, 'penalty': 'none', 'solver': 'newton-cg'}


720 fits failed out of a total of 1440.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
120 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Cantt Computer\.conda\envs\Happy_Customers_App\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Cantt Computer\.conda\envs\Happy_Customers_App\lib\site-packages\sklearn\linear_model\_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\Cantt Computer\.conda\envs\Happy_Customers_App\lib\site-packages\sklearn\linear_model\_logistic.py", line 71, in _check_solver
    raise ValueError("penalty='none'

#### Test Model

In [29]:
logistic_model_post_cv = LogisticRegression(random_state=0, penalty=None, solver='newton-cg').fit(X_train, y_train_np.ravel())
y_pred=logistic_model_post_cv.predict(X_test)


In [30]:
print('Logistic Regression Model accuracy score: {0:0.4f}'.format(accuracy_score(y_test, y_pred) * 100))

Logistic Regression Model accuracy score: 84.4613


#### F1 Score calculation

In [31]:
score = calculate_f1_score(y_test, y_pred)
print('Logistic Regression Model F1 score: {0:0.3f}'.format(score))

Logistic Regression Model F1 score: 0.844


### SVC - Support Vector Machines

In [56]:
#Create a svm Classifier
svc_default_model = svm.SVC() ## Using RBF by-default

#### GridSearch CV - Folds = 5

In [57]:
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
param_grid = {'C': [0.1, 1, 10, 100, 1000], 
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']} 

##get trained gridsearchcv model with optimal params
trained_grid_model = get_grid_search_cv(svc_default_model, param_grid, X_train, y_train_np.ravel(), cv)


In [58]:
trained_grid_model.best_params_

{'C': 1000, 'gamma': 0.01, 'kernel': 'rbf'}

#### Test Model

In [59]:
y_pred = trained_grid_model.predict(X_test)


In [60]:
score = calculate_f1_score(y_test, y_pred)
print('SVM model F1 score: {0:0.3f}'.format(score))

SVM model F1 score: 0.854


### Decision Trees

In [61]:
# Create Decision Tree classifer object
decision_tree = DecisionTreeClassifier()

#### GridSearch CV with folds=5

In [62]:
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
tree_param_dict = {'criterion':['gini','entropy'],'max_depth':[4,5,6,7,8,9,10,11,12,15,20,30,40,50,70,90,120,150]}

##get trained gridsearchcv model with optimal params
trained_grid_model = get_grid_search_cv(decision_tree, tree_param_dict, X_train, y_train_np.ravel(), cv)

In [63]:
print('Best Parameters Decision Tree:', trained_grid_model.best_params_)

Best Parameters Decision Tree: {'criterion': 'entropy', 'max_depth': 11}


#### Test Model

In [80]:
y_pred = trained_grid_model.predict(X_test)
score = calculate_f1_score(y_test, y_pred)
print('Decision tree model F1 score: {0:0.3f}'.format(score))

Decision tree model F1 score: 0.849


### XGBoost

In [47]:
# Create XGBoost classifer object
XGBClassifier_model = XGBClassifier(learning_rate=0.02, n_estimators=600, objective='binary:logistic',
                    verbosity=0, nthread=1)

#### GridSearchCV with Folds=5

In [48]:
xgboost_grid_params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5]
        }

In [49]:
trained_grid_xg_model = get_grid_search_cv(XGBClassifier_model, xgboost_grid_params, X_train, y_train_np.ravel(), cv)

In [71]:
print('Best Parameters XGBoost:', trained_grid_xg_model.best_params_)

Best Parameters XGBoost: {'colsample_bytree': 0.6, 'gamma': 2, 'max_depth': 5, 'min_child_weight': 5, 'subsample': 0.6}


In [79]:
y_pred = trained_grid_model.predict(X_test)
score = calculate_f1_score(y_test, y_pred)
print('XGBoost model F1 score: {0:0.3f}'.format(score))

XGBoost model F1 score: 0.849


### Random Forest

In [67]:
# Create Random Forest classifer object
random_forest_model = RandomForestClassifier()

#### GridSearchCV Folds=5

In [77]:
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
random_forest_params = {'criterion':['gini','entropy'],'max_depth':[8,9,10,11,12,15,20,30,40,50,70,90,120,150], 
                        'n_estimators':[10, 100]}

##get trained gridsearchcv model with optimal params
trained_grid_rf_model = get_grid_search_cv(random_forest_model, random_forest_params, X_train, y_train_np.ravel(), cv)

In [78]:
print('Best Parameters Random Forest:', trained_grid_rf_model.best_params_)

Best Parameters Random Forest: {'criterion': 'gini', 'max_depth': 9, 'n_estimators': 100}


In [81]:
y_pred = trained_grid_model.predict(X_test)
score = calculate_f1_score(y_test, y_pred)
print('Random Forest F1 score: {0:0.3f}'.format(score))

Random Forest F1 score: 0.849
