In [26]:
# Libraries
import pandas as pd  
import numpy as np  
import matplotlib.pyplot as plt  
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold, RandomizedSearchCV 
from sklearn.linear_model import Perceptron, LogisticRegressionCV, RidgeClassifierCV, SGDClassifier, PassiveAggressiveClassifier
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score,mean_absolute_error, confusion_matrix
from sklearn.metrics import roc_auc_score,roc_curve, auc, classification_report,precision_score,recall_score,log_loss,f1_score
from sklearn.feature_selection import RFE
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from xgboost import XGBClassifier
from scipy.stats import uniform, randint
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, BaggingClassifier, VotingClassifier, AdaBoostClassifier
from bayes_opt import BayesianOptimization
from lightgbm import LGBMClassifier
from imblearn.over_sampling import SMOTE
from sklearn import tree
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, Normalizer, MinMaxScaler, LabelEncoder, OneHotEncoder, MaxAbsScaler, RobustScaler, QuantileTransformer
from sklearn.svm import SVC
from sklearn import tree
import pandas_bokeh
from sklearn.decomposition import PCA
from sklearn.calibration import CalibratedClassifierCV
from numpy import mean, std
import pandas.testing as tm
from scipy import stats
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.inspection import permutation_importance

# Pipelines
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer

#other
from math import sqrt
import inspect

import eli5
#%matplotlib inline



In [27]:
def hasmethod(obj, name):
    return inspect.ismethod(getattr(obj, name, None))

In [28]:
def evaluation(y, y_hat, title = 'Confusion Matrix'):
    cm = confusion_matrix(y, y_hat)
    precision = precision_score(y, y_hat)
    recall = recall_score(y, y_hat)
    accuracy = accuracy_score(y,y_hat)
    f1 = f1_score(y,y_hat)
    print('Recall: ', recall)
    print('Accuracy: ', accuracy)
    print('Precision: ', precision)
    print('F1: ', f1)
    sns.heatmap(cm,  cmap= 'PuBu', annot=True, fmt='g', annot_kws=    {'size':20})
    plt.xlabel('predicted', fontsize=18)
    plt.ylabel('actual', fontsize=18)
    plt.title(title, fontsize=18)
    
    plt.show();

In [29]:

def metrics_summary(y_test,y_pred):
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

    accuracy=accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall =  recall_score(y_test, y_pred) #sensitivity
    specificity = tn / (tn+fp)
    g_mean= sqrt(recall * specificity)
    mse =mean_squared_error(y_test, y_pred, squared=False)
    r2=r2_score(y_test, y_pred)
    ros = roc_auc_score(y_test, y_pred)
    ll = log_loss(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    metrics_collection_dict ={
        'accuracy':['accuracy',accuracy],
        'precision':['precision',precision],
        'recall':['recall',recall],
        'specificity':['specificity',specificity],
        'g_mean':['g_mean',g_mean],
        'mean_square_error':['mean_square_error',mse],
        'r2':['r2',r2],
        'roc_auc_score':['roc_auc_score',ros],
        'log_loss':['log_loss',ll],
        'f1_score':['f1_score',f1]
    } 
    metrics_collection=pd.DataFrame.from_dict(metrics_collection_dict,orient='index',columns=['metric','score'])
    plt.title(model.__class__.__name__)
    ax = sns.barplot(data=metrics_collection.reset_index(), x = 'metric',y="score")
    for item in ax.get_xticklabels():
        item.set_rotation(60)
    plt.show()
    return metrics_collection_dict

In [30]:
#ROC curve

def ROC_Curve_Plot(model,X_test,y_test):
    predProb = model.predict_proba(X_test)
    preds = predProb[:,1]
    fpr, tpr, threshold = roc_curve(y_test, preds,pos_label=1)
    roc_auc = auc(fpr, tpr)
    plt.close()
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
    plt.legend(loc = 'lower right')
    plt.plot([0, 1], [0, 1],'r--')
    #plt.xlim([0, 1])
    #plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()
    return fpr,tpr

In [31]:
# Loading Dataset
DATASET_URL = "https://gist.githubusercontent.com/YHYeoh/ad1a7f7170c72d621d05a70637540152/raw/5a6059c199e2c46d2f3d258f03d93cfea98e2749/marketing_campaign.csv"
data = pd.read_csv(DATASET_URL, sep = ';')

pd.set_option('plotting.backend','pandas_bokeh')

data.fillna(method = "ffill", inplace = True)
data.isnull().values.any()

label_encoder = LabelEncoder()
data["Education"] = label_encoder.fit_transform(data["Education"])

data['enroll_year'] = pd.DatetimeIndex(data.Dt_Customer).year
data['enroll_month'] = pd.DatetimeIndex(data.Dt_Customer).month
data['enroll_day'] = pd.DatetimeIndex(data.Dt_Customer).day

data.drop(["ID", 'Dt_Customer',"Z_CostContact","Z_Revenue"], axis=1, inplace=True)

categorical = ['Marital_Status']
numerical = ['Year_Birth', 'Education', 'Marital_Status', 'Income', 'Kidhome',
       'Teenhome', 'Recency', 'MntWines', 'MntFruits', 'MntMeatProducts',
       'MntFishProducts', 'MntSweetProducts', 'MntGoldProds',
       'NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases',
       'NumStorePurchases', 'NumWebVisitsMonth', 'AcceptedCmp3',
       'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1', 'AcceptedCmp2',
       'Complain', 'enroll_year', 'enroll_month', 'enroll_day']
numerical_no_bool = ['Education','Income', 'Kidhome', 'Teenhome', 'Recency', 'MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth','enroll_day','enroll_month','enroll_year']
ss = Pipeline(steps=[('scalers',StandardScaler())])
ohe = Pipeline(steps=[('ohe', OneHotEncoder(handle_unknown = 'ignore'))])

models_coef = [LogisticRegressionCV(max_iter= 1200), 
          RidgeClassifierCV(),
          SVC(kernel = 'linear',max_iter= -1), 
          Perceptron(),
          PassiveAggressiveClassifier(), 
         ]

preprocess = ColumnTransformer(
                    transformers=[
                        ('cont', ss, numerical_no_bool),
                        ('cat', ohe, categorical),
                        #('le', le, ordinal),
                        ],remainder='passthrough')

y = data.Response
X = data.drop(['Response'], axis=1)
print("Features: " + str(len(X.columns)))
print(X.columns)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size = 0.3)

def feature_importance_coef(classifier, feature_names):
	if (hasattr(classifier,'coef_')):
		importance = classifier.coef_[0]
	elif (hasattr(classifier,'coefs_')):
		importance = classifier.coefs_
	elif (hasattr(classifier,'feature_importances_')):
		importance = classifier.feature_importances_
	else:
		print("Cannot extract feature importance, skipping")
		return
		print("Feature Important coef")
		print(importance)
		#importance = results.importances_mean
      # summarize feature importance
    
	for i,v in enumerate(importance):
		print('Feature: %d, Score: %.5f' % (i,v))
	zipped = zip(feature_names, importance)
	df = pd.DataFrame(zipped, columns=["feature", "value"])
	# Sort the features by the absolute value of their coefficient
	df["abs_value"] = df["value"].apply(lambda x: abs(x))
	df["colors"] = df["value"].apply(lambda x: "green" if x > 0 else "red")
	df = df.sort_values("abs_value", ascending=False)
	# plot feature importance
	fig, ax = plt.subplots(1, 1, figsize=(16, 9))
	sns.barplot(x="feature",
	            y="value",
	            data=df.head(20),
	           palette=df.head(20)["colors"])
	plt.gcf().subplots_adjust(bottom=0.30)
	ax.set_xticklabels(ax.get_xticklabels(), rotation=90, fontsize=14)
	ax.set_title("Top 20 Features for {}".format(classifier.__class__.__name__), fontsize=25)
	ax.set_ylabel("Coef", fontsize=22)
	ax.set_xlabel("Feature Name", fontsize=22)
	plt.show()

def feature_importance(classifier, feature_names):
    importance = classifier.feature_importances_
      # summarize feature importance
	for i,v in enumerate(importance):
        print('Feature: %d, Score: %.5f' % (i,v))
	# plot feature importance
	plt.figure(num=None, figsize=(8, 6), dpi=80, facecolor='w', edgecolor='k')
	plt.bar([x for x in range(len(importance))], importance)
	plt.xticks([x for x in range(len(importance))],[feature_names[x] for x in range(len(importance))],rotation=90)
	plt.title(classifier.__class__.__name__)
	plt.xlabel('Features')
	plt.ylabel('Importance')
	plt.show()


def evaluation(y, y_hat, title):
    cm = confusion_matrix(y, y_hat)
    precision = precision_score(y, y_hat)
    recall = recall_score(y, y_hat)
    accuracy = accuracy_score(y,y_hat)
    f1 = f1_score(y,y_hat)
    print('Recall: ', recall)
    print('Accuracy: ', accuracy)
    print('Precision: ', precision)
    print('F1: ', f1)
    sns.heatmap(cm,  cmap= 'PuBu', annot=True, fmt='g', annot_kws=    {'size':20})
    plt.xlabel('predicted', fontsize=18)
    plt.ylabel('actual', fontsize=18)
    plt.title(title, fontsize=18)
    #plt.show()


def cross_validate(classifier, cv):
    pipeline = Pipeline(steps=[
        ('preprocess', preprocess),
        ('classifier', classifier)
    ])
    train_acc = []
    test_acc = []
    mean = []
    fpr =None
    tpr = None
    
    for train_ind, val_ind in cv.split(X_train, y_train):
        X_t, y_t = X_train.iloc[train_ind], y_train.iloc[train_ind]
        pipeline.fit(X_t, y_t)
        y_hat_t = pipeline.predict(X_t)
        train_acc.append(accuracy_score(y_t, y_hat_t))
        X_val, y_val = X_train.iloc[val_ind], y_train.iloc[val_ind] 
        y_hat_val = pipeline.predict(X_val)
        test_acc.append(accuracy_score(y_val, y_hat_val))

    # ohe_cols = pipeline['preprocess'].transformers_[1][1]['ohe']\
    #                .get_feature_names(categorical)
    ohe_cols = list(pipeline.named_steps['preprocess'].named_transformers_['cat'].named_steps['ohe'].get_feature_names(input_features=categorical))
    # feature_names = numerical.copy()
    # feature_names = [y for x in [feature_names, ohe_cols] for y in x] 
    # print(len(feature_names))
    print("ohe length " + str(len(ohe_cols)))
    numeric_feature_list = list(numerical)
    # test = numeric_feature_list.extend(ohe_cols)
    print(len(numerical))
    for i in ohe_cols:
    	numeric_feature_list.append(i)
    print(len(numeric_feature_list))
    evaluation(y_val, y_hat_val, 'Confusion Matrix | {}'.format(classifier.__class__.__name__))
    metrics_summ = metrics_summary(y_val,y_hat_val)
    if hasmethod(pipeline['classifier'], 'predict_proba'):
        fpr,tpr = ROC_Curve_Plot(pipeline,X_val,y_val)
    print('Mean Training Accuracy: {} | Standard Deviation: {}'.format(np.mean(train_acc),np.std(test_acc)))
    print('Mean Validation Accuracy: {} | Standard Deviation: {}'.format(np.mean(test_acc),np.std(test_acc)))
    print('\n')
    feature_importance_coef(classifier, numeric_feature_list)
    return metrics_summ, fpr,tpr
  

# for model in models_coef:
#   print(model.__class__.__name__)
#   cross_validate(model,KFold())


models_feature_importance = [ 
	#CalibratedClassifierCV() #use to calibrate existing classifier 
	DecisionTreeClassifier(), #no coef 
	KNeighborsClassifier(),#no feat_import, use permutation_importance 
	GaussianNB(), #no feat_import, use permutation_importance 
	LGBMClassifier(),#no coef 
	RandomForestClassifier(), #no coef 
	GradientBoostingClassifier(),#no coef 
	PassiveAggressiveClassifier(), 
	ExtraTreesClassifier(), #no coef 
	XGBClassifier(),
	#BaggingClassifier(), #no feat_import, ensemble classifier , Voting Classifier 
	AdaBoostClassifier(), #no coef 
	#GaussianProcessClassifier(),  #no feat_import, special use case
	MLPClassifier()#no feat_import, special use case
	]


for model in models_feature_importance:
  print(model.__class__.__name__)
  cross_validate(model,KFold())
#another batch for special classifier

# models_permutation = [KNeighborsClassifier(),GaussianNB()]
# enc = OneHotEncoder()
# enc_df = pd.DataFrame(enc.fit_transform(X[["Marital_Status"]]).toarray())
# X = X.join(enc_df)
# X.drop(["Marital_Status"], axis=1, inplace=True)
# columns = X.columns
# X = StandardScaler().fit_transform(X)
# for model in models_permutation:
# 	model = KNeighborsClassifier()
# 	model.fit(X,y)
# 	# perform permutation importance
# 	results = permutation_importance(model, X, y, scoring='accuracy')
# 	# get importance
# 	importance = results.importances_mean
# 	# summarize feature importance
# 	for i,v in enumerate(importance):
# 		print('Feature: %0d, Score: %.5f' % (i,v))
# 	# plot feature importance
# 	zipped = zip(columns, importance)
# 	df = pd.DataFrame(zipped, columns=["feature", "value"])
# 	# Sort the features by the absolute value of their coefficient
# 	df["abs_value"] = df["value"].apply(lambda x: abs(x))
# 	df["colors"] = df["value"].apply(lambda x: "green" if x > 0 else "red")
# 	df = df.sort_values("abs_value", ascending=False)
# 	# plot feature importance
# 	fig, ax = plt.subplots(1, 1, figsize=(16, 9))
# 	sns.barplot(x="feature",
# 	            y="value",
# 	            data=df.head(20),
# 	           palette=df.head(20)["colors"])
# 	plt.gcf().subplots_adjust(bottom=0.30)
# 	ax.set_xticklabels(ax.get_xticklabels(), rotation=90, fontsize=14)
# 	ax.set_title("Top 20 Features for {}".format(model.__class__.__name__), fontsize=25)
# 	ax.set_ylabel("Coef", fontsize=22)
# 	ax.set_xlabel("Feature Name", fontsize=22)
# 	plt.show()

TabError: inconsistent use of tabs and spaces in indentation (<ipython-input-31-f0b2bf3a34b2>, line 90)

In [None]:
model_result = []
for model in models:
  print(model.__class__.__name__)
  performance = cross_validate(model,KFold())
  model_approximation = [model.__class__.__name__,performance]
  model_result.append(model_approximation) 
  #grid_search(model,KFold(),print_feat = True)

In [None]:
#obtain nameList
nameList = []

for model in model_result:
    nameList.append(model[0])

In [None]:
#plot all metric
metric_list = ['accuracy','precision','recall','specificity','g_mean'
                   ,'mean_square_error','r2','roc_auc_score','log_loss','f1_score']

for metric in metric_list:
    resultList = []
    for model in model_result:
        resultList.append(model[1][0][metric][1])
    accDF = pd.DataFrame(list(zip(nameList,resultList)),columns=['trained_model',metric])
    plt.title("Models' "+metric)
    ax = sns.barplot(data=accDF.sort_values(metric,ascending=False),orient='h',palette ="Paired" , y = 'trained_model',x=metric)
    plt.show()



In [None]:
#plot auc_curve for predict_proba supported model
result_table = pd.DataFrame(columns=['classifiers', 'fpr','tpr','auc'])
for model in model_result:
    fpr = model[1][1]
    if(fpr is not None):
        tpr = model[1][2]
        auc = model[1][0]['roc_auc_score'][1]
        result_table = result_table.append({'classifiers':model[0],
                                        'fpr':fpr, 
                                        'tpr':tpr, 
                                        'auc':auc}, ignore_index=True)
        
result_table.set_index('classifiers', inplace=True)


fontP = FontProperties()
fontP.set_size('large')


fig = plt.figure(figsize=(8,6))


for i in result_table.index:
    plt.plot(result_table.loc[i]['fpr'], 
             result_table.loc[i]['tpr'], 
             label="{}, AUC={:.3f}".format(i, result_table.loc[i]['auc']))
    
plt.plot([0,1], [0,1], color='orange', linestyle='--')

plt.xticks(np.arange(0.0, 1.1, step=0.1))
plt.xlabel("False Positive Rate", fontsize=15)

plt.yticks(np.arange(0.0, 1.1, step=0.1))
plt.ylabel("True Positive Rate", fontsize=15)

plt.title('ROC Curve Analysis', fontweight='bold', fontsize=15)
plt.legend( title='Models', bbox_to_anchor=(1.05, 0.85), loc='upper left', prop=fontP)
plt.show()

In [None]:
# Check for outlier in column
pd.set_option('max_columns', None)
print(data.describe())

# Plot scatter plot and box plot
sns.boxplot(x=data['Income'])
fig, ax = plt.subplots(figsize=(16,8))
ax.scatter(data['Response'], data['Income'])
ax.set_xlabel('Response?')
ax.set_ylabel('Income')
plt.show()

In [None]:
#Try to correct the outlier
print(data.dtypes.value_counts())
data = data[(np.abs(stats.zscore(data[['Income']])) < 3)]

In [None]:
# Plot again to check result
sns.boxplot(x=data['Income'])
fig, ax = plt.subplots(figsize=(16,8))
ax.scatter(data['Response'], data['Income'])
ax.set_xlabel('Response?')
ax.set_ylabel('Income')
plt.show()