# Setup

In [3]:
# import dependencies
%matplotlib inline

#Manipulating dataframes
import pandas as pd
import numpy as np

# Visualisation
import matplotlib.pyplot as plt
import seaborn as sns

# System Options
import math, time, random
import os
import warnings
warnings.filterwarnings("ignore")

# Datetime
import datetime
today = datetime.date.today().strftime('%Y%m%d')

# data visualisation
import missingno
import matplotlib.pyplot as plt
import seaborn as sns
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names, so prefer the new name and fall back
# to the legacy one on older installs.
_WHITEGRID_STYLE = 'seaborn-v0_8-whitegrid'
plt.style.use(_WHITEGRID_STYLE if _WHITEGRID_STYLE in plt.style.available else 'seaborn-whitegrid')

# Saving models
import pickle

# Pre-processing
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.preprocessing import LabelEncoder, label_binarize

# Supervised Learning
from sklearn import linear_model
from sklearn.linear_model import LinearRegression, ElasticNet, Lasso, Ridge, Lars

# machine learning
import catboost
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import ElasticNetCV, RidgeCV, LassoCV, LarsCV
from sklearn import model_selection, tree, preprocessing, metrics, linear_model
from sklearn.linear_model import LinearRegression, LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from catboost import CatBoostClassifier, Pool, cv

#ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Options
import warnings

# Show every column of a DataFrame, but cap the number of printed rows at 20.
pd.options.display.max_columns = None
pd.options.display.max_rows = 20

# Silence all warnings for the rest of the session.
warnings.filterwarnings("ignore")

In [None]:
# Working directory
# Authenticate this Colab session with the user's Google account.
from google.colab import auth
auth.authenticate_user()

# Mount Google Drive into the Colab filesystem.
from google.colab import drive
drive.mount('/content/drive')

# Authorise a gspread client for Google Sheets using the
# application-default credentials.
import gspread
from oauth2client.client import GoogleCredentials
default_credentials = GoogleCredentials.get_application_default()
gc = gspread.authorize(default_credentials)

path = "/content/drive/My Drive/"

# Load the training and test sets.
# NOTE(review): both CSVs are read from a relative 'data/' folder and `path`
# is not used in this cell - confirm that is intended.
df = pd.DataFrame()
train, test = (pd.read_csv(csv_name) for csv_name in ('data/train.csv', 'data/test.csv'))

# Using Catboost

In [None]:
# Separate the data into binary and continuous parts so each can be encoded differently; for the binary part, one-hot encoding will do.
# For df_continuous we can also apply one-hot encoding.
# We use a label encoder (similar to a one-hot encoder), except that every column is given a new name.



In [None]:
# Function that runs the requested algorithm and returns its accuracy metrics

def fit_ml_algo(algo, X_train, y_train, cv):
    """Fit an estimator and report training and cross-validated accuracy.

    Parameters
    ----------
    algo : estimator exposing ``fit`` and, on the fitted result, ``score``.
    X_train : training features.
    y_train : training labels.
    cv : cross-validation setting forwarded to ``cross_val_predict``.

    Returns
    -------
    tuple
        ``(train_pred, acc, acc_cv)``: the cross-validated predictions, the
        accuracy on the training data itself, and the accuracy of the
        cross-validated predictions. Both accuracies are percentages rounded
        to two decimals.
    """
    # Single pass: fit on the full training set and score on that same data.
    fitted = algo.fit(X_train, y_train)
    training_score = fitted.score(X_train, y_train)
    acc = round(training_score * 100, 2)

    # Cross-validated predictions, using every available core.
    train_pred = model_selection.cross_val_predict(
        algo, X_train, y_train, cv=cv, n_jobs=-1
    )

    # Accuracy of those cross-validated predictions against the true labels.
    cv_score = metrics.accuracy_score(y_train, train_pred)
    acc_cv = round(cv_score * 100, 2)

    return train_pred, acc, acc_cv

In [None]:
# Split the dataframe into labels and feature data.
# NOTE(review): `selected_df` is not created in any cell visible here -
# confirm an earlier cell defines it before this one runs.
y_train = selected_df['target_var']  # labels
X_train = selected_df.drop(columns='target_var')  # data

In [None]:
# Logistic regression: training accuracy plus 10-fold cross-validated
# accuracy, with the whole evaluation timed.
start_time = time.time()
log_results = fit_ml_algo(LogisticRegression(), X_train, y_train, 10)
log_time = time.time() - start_time
train_pred_log, acc_log, acc_cv_log = log_results

# Report both accuracies and the elapsed wall-clock time.
print(
    "Accuracy: %s" % acc_log,
    "Accuracy CV 10 Fold: %s" % acc_cv_log,
    "Running Time: %s" % datetime.timedelta(seconds=log_time),
    sep="\n",
)

In [None]:
# K nearest neighbours: training accuracy plus 10-fold cross-validated
# accuracy, with the whole evaluation timed.
start_time = time.time()
knn_results = fit_ml_algo(KNeighborsClassifier(), X_train, y_train, 10)
knn_time = time.time() - start_time
train_pred_knn, acc_knn, acc_cv_knn = knn_results

# Report both accuracies and the elapsed wall-clock time.
print(
    "Accuracy: %s" % acc_knn,
    "Accuracy CV 10 Fold: %s" % acc_cv_knn,
    "Running Time: %s" % datetime.timedelta(seconds=knn_time),
    sep="\n",
)

In [None]:
# Gaussian Naive Bayes: training accuracy plus 10-fold cross-validated
# accuracy, with the whole evaluation timed.
start_time = time.time()
gaussian_results = fit_ml_algo(GaussianNB(), X_train, y_train, 10)
gaussian_time = time.time() - start_time
train_pred_gaussian, acc_gaussian, acc_cv_gaussian = gaussian_results

# Report both accuracies and the elapsed wall-clock time.
print(
    "Accuracy: %s" % acc_gaussian,
    "Accuracy CV 10 Fold: %s" % acc_cv_gaussian,
    "Running Time: %s" % datetime.timedelta(seconds=gaussian_time),
    sep="\n",
)

In [None]:
# Linear SVC: training accuracy plus 10-fold cross-validated accuracy, with
# the whole evaluation timed.
# NOTE(review): no random_state is passed to the estimator, so results may
# vary between runs - consider fixing a seed.
start_time = time.time()
svc_results = fit_ml_algo(LinearSVC(), X_train, y_train, 10)
svc_time = time.time() - start_time
train_pred_svc, acc_svc, acc_cv_svc = svc_results

# Report both accuracies and the elapsed wall-clock time.
print(
    "Accuracy: %s" % acc_svc,
    "Accuracy CV 10 Fold: %s" % acc_cv_svc,
    "Running Time: %s" % datetime.timedelta(seconds=svc_time),
    sep="\n",
)

In [None]:
# Stochastic Gradient Descent: training accuracy plus 10-fold cross-validated
# accuracy, with the whole evaluation timed.
# NOTE(review): no random_state is passed to the estimator, so results may
# vary between runs - consider fixing a seed.
start_time = time.time()
sgd_results = fit_ml_algo(SGDClassifier(), X_train, y_train, 10)
sgd_time = time.time() - start_time
train_pred_sgd, acc_sgd, acc_cv_sgd = sgd_results

# Report both accuracies and the elapsed wall-clock time.
print(
    "Accuracy: %s" % acc_sgd,
    "Accuracy CV 10 Fold: %s" % acc_cv_sgd,
    "Running Time: %s" % datetime.timedelta(seconds=sgd_time),
    sep="\n",
)

In [None]:
# Decision Tree Classifier
# NOTE(review): this cell used to be a byte-for-byte copy of the Stochastic
# Gradient Descent cell, so it only refit SGD and overwrote the SGD results
# with a second, differently-seeded run. DecisionTreeClassifier is imported
# at the top of the notebook but was never evaluated, so this cell now
# evaluates it instead - confirm that this was the intended model.
start_time = time.time()
train_pred_dt, acc_dt, acc_cv_dt = fit_ml_algo(DecisionTreeClassifier(), X_train, y_train, 10)

# Wall-clock time for the fit plus the 10-fold cross-validation.
dt_time = (time.time() - start_time)

print("Accuracy: %s" % acc_dt)
print("Accuracy CV 10 Fold: %s" % acc_cv_dt)
print("Running Time: %s" % datetime.timedelta(seconds=dt_time))

In [None]:
# Gradient Boost trees: training accuracy plus 10-fold cross-validated
# accuracy, with the whole evaluation timed.
# NOTE(review): no random_state is passed to the estimator, so results may
# vary between runs - consider fixing a seed.
start_time = time.time()
gbt_results = fit_ml_algo(GradientBoostingClassifier(), X_train, y_train, 10)
gbt_time = time.time() - start_time
train_pred_gbt, acc_gbt, acc_cv_gbt = gbt_results

# Report both accuracies and the elapsed wall-clock time.
print(
    "Accuracy: %s" % acc_gbt,
    "Accuracy CV 10 Fold: %s" % acc_cv_gbt,
    "Running Time: %s" % datetime.timedelta(seconds=gbt_time),
    sep="\n",
)

In [None]:
# Define the categorical features for the CatBoost model: the positional
# indices of every column whose dtype is not float.
# `np.float` was only an alias for the builtin `float`; it was deprecated in
# NumPy 1.20 and removed in 1.24, so compare against `float` directly.
cat_features = np.where(X_train.dtypes != float)[0]

# Use CatBoost's Pool() to bundle the training data, the labels and the
# categorical feature indices together.
train_pool = Pool(X_train, y_train, cat_features=cat_features)

In [None]:
# CatBoost model definition: 1000 iterations optimising Logloss, with
# Accuracy tracked as an additional metric.
catboost_model = CatBoostClassifier(
    iterations=1000,
    custom_loss=['Accuracy'],
    loss_function='Logloss',
)

In [None]:
# Fit the CatBoost model on the pooled training data, with plotting enabled.
catboost_model.fit(
    train_pool,
    plot=True,
)

In [None]:
# CatBoost accuracy on the training data, as a percentage rounded to two
# decimals.
catboost_train_score = catboost_model.score(X_train, y_train)
acc_catboost = round(catboost_train_score * 100, 2)

In [None]:
# CatBoost cross-validation, timed from start to finish.
start_time = time.time()

# Reuse the parameters of the model defined above for cross-validation.
cv_params = catboost_model.get_params()

# 10 folds, the same fold count passed for the other models.
cv_data = cv(
    pool=train_pool,
    params=cv_params,
    fold_count=10,
    plot=True,
)

# Total wall-clock time taken by the cross-validation.
catboost_time = time.time() - start_time

In [None]:
# Print out the CatBoost results.
# `acc_cv_boost` was printed below but never assigned, so this cell raised a
# NameError. Derive it from the cross-validation results: the best (maximum)
# mean test-fold accuracy over the boosting iterations, as a percentage
# rounded to two decimals. The 'test-Accuracy-mean' column is expected because
# the CV parameters include custom_loss=['Accuracy'].
acc_cv_boost = round(np.max(cv_data['test-Accuracy-mean']) * 100, 2)

print('---CatBoost Metrics---')
print('Accuracy: {}'.format(acc_catboost))
print('Accuracy scores on CV 10 fold: {}'.format(acc_cv_boost))
print('Running Time: {}'.format(datetime.timedelta(seconds=catboost_time)))