In [1]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
# Model Building
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from scipy.stats import zscore
%matplotlib inline

# loading library
from sklearn import metrics
from sklearn.metrics import accuracy_score # calculate accuracy measures and confusion matrix
from sklearn.metrics import confusion_matrix # Creating  a confusion matrix,which compares the y_test and y_pred
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import log_loss
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score

import pandas as pd
pd.options.display.float_format = '{:.2f}'.format

# Part 1

## Q1 Data Understanding

In [3]:
#1.A
Normal = pd.read_csv('Normal.csv') #reading the dataset
Type_H = pd.read_csv('Type_H.csv') #reading the dataset
Type_S = pd.read_csv('Type_S.csv') #reading the dataset

In [None]:
#1.B
#Normal Dataframe
print ('Shape of Normal Dataframe:',Normal.shape)
print ('Columns of Normal Dataframe:',Normal.columns)

#Type_H Dataframe
print ('Shape of Type_H Dataframe:',Type_H.shape)
print ('Columns of Type_H Dataframe:',Type_H.columns)

#Type_S Dataframe
print ('Shape of Type_S Dataframe:',Type_S.shape)
print ('Columns of Type_S Dataframe:',Type_S.columns)

In [None]:
#1.C
print('Data Types of Normal Dataframe: ',Normal.dtypes)  # To get the data types of all the columns in dataframe
print('Data Types of Type_H Dataframe: ',Type_H.dtypes)
print('Data Types of Type_S Dataframe: ',Type_S.dtypes)

#1.C Observation
    1. Basis above observation/results we can say that all the three dataframes have same column names
    2. The data types of the column name is also same across all the three dataframes.


In [None]:
#1.D
print('Data Types of Normal Dataframe: ',Normal.dtypes)  # To get the data types of all the columns in dataframe
print('Data Types of Type_H Dataframe: ',Type_H.dtypes)
print('Data Types of Type_S Dataframe: ',Type_S.dtypes)

#Alternative
print('Data Types of Normal Dataframe: ',Normal.info())  # To get the data types of all the columns in dataframe
print('Data Types of Type_H Dataframe: ',Type_H.info())
print('Data Types of Type_S Dataframe: ',Type_S.info())

In [None]:
#1.E
print(Normal['Class'].unique())
print(Type_H['Class'].unique())
print(Type_S['Class'].unique())

print(Normal['Class'].value_counts())
print(Type_H['Class'].value_counts())
print(Type_S['Class'].value_counts())

Basis above results, we can say that Normal Dataframe contains these '['Normal' 'Nrmal']'classes while Type_H contains these'['Type_H' 'type_h']' classes and Type_S contains these'['Type_S' 'tp_s']'classes. The three dataframe consist of three differnt types of people.

Classes entry can be unified later.

## Q2 Data Preparation

In [None]:
#Q2.A
# this will replace "Nrmal" with "Normal"
Normal = Normal.replace(to_replace ="Nrmal",value ="Normal")
# this will replace "type_h" with "Type_H"
Type_H = Type_H.replace(to_replace ="type_h",value ="Type_H")
# this will replace "tp_s" with "Type_S"
Type_S = Type_S.replace(to_replace ="tp_s",value ="Type_S")

print(Normal['Class'].unique())
print(Type_H['Class'].unique())
print(Type_S['Class'].unique())

Now only one type of class exist for each dataframe

In [None]:
#Q2.B
dataframes = [Normal, Type_H, Type_S]

Medical = pd.concat(dataframes) #combine dataframes through Concat method
print('Shape of new dataframe after combining',Medical.shape)

print(Medical['Class'].unique())

In [None]:
#Q2.C
Medical.head(5)

In [None]:
#Q2.D
#Count of missing for each feature
print(Medical.isnull().sum())
#to get percentage of missing
print(Medical.isnull().sum() * 100 / len(Medical))

# to check if any other ?/NAN present 
for i in Medical.columns:
    print(Medical[i].unique())

No null values for Medical Dataframe.

In [None]:
#Q2.E
Medical.describe() # to get Min,Max median, Q1 and Q3

## Q3 Data Analysis

In [None]:
#Q3.A
Medical_1 = Medical[["P_incidence","P_tilt","L_angle","S_slope","P_radius","S_Degree"]] #to remove target variable
# ax = sns.heatmap(Medical_1, cmap="YlGnBu")
grid_kws = {"height_ratios": (.9, .05), "hspace": .3}
f, (ax, cbar_ax) = plt.subplots(2, gridspec_kw=grid_kws)
ax = sns.heatmap(Medical_1, ax=ax, cmap="YlGnBu",
                 cbar_ax=cbar_ax,
                 cbar_kws={"orientation": "horizontal"})

The above graph is difficult to read

In [None]:
# Let's mask the above triangle 
corr = Medical_1.corr()
masking = np.zeros_like(corr)
masking[np.triu_indices_from(masking)] = True
with sns.axes_style("white"):
    f, ax = plt.subplots(figsize=(7, 5))
    ax = sns.heatmap(corr, mask=masking, cmap="YlGnBu",vmax=.5, square=True, annot = True)

In [None]:
#Q3.B
print(Medical.corr())

#### A. Features having stronger correlation with correlation value.

##### P_incidence is highly correlated with all the remaning features except P_radius
    1. P_incidence, S_slope = 0.814960 (very strong)
    2. P_incidence, L_angle = 0.717282 (strong)
    3. P_incidence, S_Degree = 0.638743
    4. P_incidence, P_tilt = 0.629199
    5. S_slope, L_angle = 0.598387


#### B. Features having weaker correlation with correlation value

    1. S_Degree and P_radius = -0.026065
    2. P_radius and L_angle = -0.080344 
    3. P_radius and P_tilt = 0.032668
    4. P_tilt and S_slope = 0.062345

In [None]:
Medical.describe(include = 'all')

In [None]:
#cannot reindex from a duplicate axis : to remove this error IN FUTURE
Medical.reset_index(inplace=True)
Medical[Medical.index.duplicated()]

In [None]:
Medical = Medical.drop(labels= "index" , axis = 1)
Medical.shape

In [None]:
#Q3.C
sns.pairplot(Medical, hue="Class",diag_kind="hist")

 Insights:  
    1. L_angle, S_slope and P_tilt are positively related to P_incidence
    2. L_angle and S_slope are also positively related
    3. S_Degree is somewhat positively skewed
    4. Type_S class genrally have higher value for all the features except P_Radius

In [None]:
#Q3.D
sns.jointplot(data=Medical, x="P_incidence", y="S_slope", kind="reg")
sns.jointplot(data=Medical, x="P_incidence", y="S_slope", hue="Class")

    1. For most of the Type_S class, P_incidence is high compared to Normal and Type_H clas i.e., Type_S has large  values for P_incidence compared to other two.
    2. Also,P_incidence and S_slope is positively correlated for all the three class of people
    3. P_incidence is little negatively skewed for Type_S Class

In [None]:
#Q3.E
sns.set_theme(style="whitegrid")
fig, ax = plt.subplots(figsize=(20,10))
plt.ylim(-20,200,80)
ax = sns.boxplot(data=Medical,palette="Set2")

    1. P_incidence is normally distributed with very few outliers
    2. P_tilt is also normally distributed with few outliers but more than P_incidence. It's value goes in negative for few cases.
    3. L_angle is little positively skewed with one outlier
    4. S_Slope is very little positively skewed with one outlier
    5. P_radius is normally distributed with outliers on both the sides of whiskers. Also, it's value are comparitively higher than other features which hints for scaling.
    6. S_Degree is positively skewed with outliers on the positive side. It's value also goes in negative for few cases.

## Q4. Model Building

In [None]:
Medical.Class.unique()

In [None]:
# to label encode the target variable
from sklearn.preprocessing import LabelEncoder
class_label_encoder = LabelEncoder()

Medical.iloc[:,-1] = class_label_encoder.fit_transform(Medical.iloc[:,-1])

In [None]:
Medical.info()

In [None]:
Medical['Class'].unique()

In [None]:
# Medical['Class'] = Medical.Class.astype('category')
# Medical.info
Medical.groupby(["Class"]).count()

In [None]:
Medical.head()

In [None]:
Medical.var() #to check variance in features
#Insight: Good variance among independent features

In [None]:
#Q4.A
# Create a separate dataframe consisting only of the features i.e independent attributes
X = Medical.drop(labels= ['Class'] , axis = 1)
y = Medical["Class"]

In [None]:
# convert the features into z scores as we do not know what units / scales were used and store them in new dataframe
# It is always adviced to scale numeric attributes in models that calculate distances.

XScaled  = X.apply(zscore)  # convert all attributes to Z scale 

XScaled.describe()

In [None]:
#Q4.B
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [None]:
#Q4.C
NNH = KNeighborsClassifier(n_neighbors= 3)
# Call Nearest Neighbour algorithm
NNH.fit(X_train, y_train)

In [None]:
#Q4.D

In [None]:
# predict the response for train
# predict the response
y_pred_train = NNH.predict(X_train)

# evaluate accuracy
print('Accuracy Score:',accuracy_score(y_train, y_pred_train))
print('Recall Score:',recall_score(y_train, y_pred_train,average="weighted"))
print('Precision Score:',precision_score(y_train, y_pred_train,average="weighted")) #because three categories
print('F1 Score:',f1_score(y_train, y_pred_train,average="weighted"))

cm = confusion_matrix(y_train, y_pred_train)
# Creating a dataframe for a array-formatted Confusion matrix,so it will be easy for plotting.
cm_df = pd.DataFrame(cm,
                     index = ["0", "1","2"], 
                     columns = ["0", "1","2"])
#Plotting the confusion matrix
plt.figure(figsize=(5,4))
sns.heatmap(cm_df, annot=True)
plt.title('Confusion Matrix')
plt.ylabel('Actal Values')
plt.xlabel('Predicted Values')
plt.show()

print ('Classification Report : ')
print (classification_report(y_train, y_pred_train, target_names=["0", "1","2"]))


In [None]:
# predict the response for test 
y_pred = NNH.predict(X_test)

# evaluate accuracy
print('Accuracy Score:',accuracy_score(y_test, y_pred))
print('Recall Score:',recall_score(y_test, y_pred,average="weighted"))
print('Precision Score:',precision_score(y_test, y_pred,average="weighted")) #because three categories
print('F1 Score:',f1_score(y_test, y_pred,average="weighted"))

cm = confusion_matrix(y_test, y_pred)
# Creating a dataframe for a array-formatted Confusion matrix,so it will be easy for plotting.
cm_df = pd.DataFrame(cm,
                     index = ["0", "1","2"], 
                     columns = ["0", "1","2"])
#Plotting the confusion matrix
plt.figure(figsize=(5,4))
sns.heatmap(cm_df, annot=True)
plt.title('Confusion Matrix')
plt.ylabel('Actal Values')
plt.xlabel('Predicted Values')
plt.show()

print ('Classification Report : ')
print (classification_report(y_test, y_pred, target_names=["0", "1","2"]))

# Q5. Performance Improvement

In [None]:
#Q5.A
from sklearn.model_selection import GridSearchCV #this will help us in finding the best hyperparameters

In [None]:
#List Hyperparameters that we want to tune.
leaf_size = list(range(1,50))
n_neighbors = list(range(1,30))
p=[1,2]
weights = ['uniform', 'distance']

#Convert to dictionary
hyperparameters = dict(leaf_size=leaf_size, n_neighbors=n_neighbors, p=p, weights = weights)

#Create new KNN object
knn_2 = KNeighborsClassifier()

#Use GridSearch
clf = GridSearchCV(knn_2, hyperparameters, cv=10,scoring = 'accuracy')

#Fit the model
best_model = clf.fit(X,y)

In [None]:
print(best_model.best_params_)

In [None]:
knn = KNeighborsClassifier(n_neighbors= 8,weights =  'distance',leaf_size=1,p=2)
knn.fit(X_train, y_train) 

In [None]:
# predict the response
y_pred = knn.predict(X_test)
# evaluate accuracy
print('Accuracy Score:',accuracy_score(y_test, y_pred))
print('Recall Score:',recall_score(y_test, y_pred,average="weighted"))
print('Precision Score:',precision_score(y_test, y_pred,average="weighted"))
print('F1 Score:',f1_score(y_test, y_pred,average="weighted"))

cm = confusion_matrix(y_test, y_pred)
# Creating a dataframe for a array-formatted Confusion matrix,so it will be easy for plotting.
cm_df = pd.DataFrame(cm,
                     index = ["0", "1","2"], 
                     columns = ["0", "1","2"])
#Plotting the confusion matrix
plt.figure(figsize=(5,4))
sns.heatmap(cm_df, annot=True)
plt.title('Confusion Matrix')
plt.ylabel('Actal Values')
plt.xlabel('Predicted Values')
plt.show()

print ('Classification Report : ')
print (classification_report(y_test, y_pred, target_names=["0", "1","2"]))

In [None]:
def rightly_identified(actual, predictions):
    a = pd.DataFrame()  # empty dataframe

    for i in np.sort(actual.unique()):
        try:  # because we get a key error if we use value_counts()[i] when the given class i does not exist in the value_count() array
            a.loc[i,'total_instances'] = len(actual[actual == i]) # value count of a given class i
            a.loc[i, 'identified_right'] = pd.Series(predictions[actual == i]).value_counts()[i] # number of instances where a given class i was identified correctly
            a.loc[i, '%'] = round(100*pd.Series(predictions[actual == i]).value_counts()[i]/len(actual[actual == i]), 1)  # percentage of predictions that were correct for a given class
        except:  # if the key error discussed above occurs, assign the percentage = 0
            a.loc[i, '%'] = 0.0
    return a



In [None]:
pred_vs_actual_lr = rightly_identified(y_test, y_pred)
pred_vs_actual_lr

In [None]:
#Q5.B

There is an overall improvement in Precision, accuracy, recall and F1 score for the model by 2% at an overall level. But at an individual class level, "1" class performance improved drastically in terms of all the above metrics. There is a +10% improvement in precsion, recall, f1-score for "1" class.

In [None]:
#Q5.C

Below parameters helped in improving the model performance:
    1. leaf_size: 1 earlier it was 30
    2. p: 2
    3. n_neighbors: 8 earlier it was 3
    4. weights: distance earlier it was uniform

Note: scaling was also tried but it didn't increase any score hence removed.

In [None]:
# Split X and y into training and test set in 80:20 ratio
XScaled  = X.apply(zscore)  # convert all attributes to Z scale 

XScaled.describe()

X_train, X_test, y_train, y_test = train_test_split(XScaled, y, test_size=0.20, random_state=1)

NNH = KNeighborsClassifier(n_neighbors= 5 , weights = 'distance' )

# Call Nearest Neighbour algorithm

NNH.fit(X_train, y_train)

# For every test data point, predict it's label based on 5 nearest neighbours in this model. The majority class will 
# be assigned to the test data point

predicted_labels = NNH.predict(X_test)
NNH.score(X_test, y_test)


# evaluate accuracy
print('Accuracy Score:',accuracy_score(y_test, predicted_labels))
print('Recall Score:',recall_score(y_test, predicted_labels,average="weighted"))
print('Precision Score:',precision_score(y_test, predicted_labels,average="weighted"))
print('F1 Score:',f1_score(y_test, predicted_labels,average="weighted"))

cm = confusion_matrix(y_test,predicted_labels)
# Creating a dataframe for a array-formatted Confusion matrix,so it will be easy for plotting.
cm_df = pd.DataFrame(cm,
                     index = ["0", "1","2"], 
                     columns = ["0", "1","2"])
#Plotting the confusion matrix
plt.figure(figsize=(5,4))
sns.heatmap(cm_df, annot=True)
plt.title('Confusion Matrix')
plt.ylabel('Actal Values')
plt.xlabel('Predicted Values')
plt.show()

print ('Classification Report : ')
print (classification_report(y_test, predicted_labels, target_names=["0", "1","2"]))

pred_vs_actual_lr = rightly_identified(y_test, predicted_labels)
pred_vs_actual_lr

# Part B

## Q1 Data Understanding and Preparation

In [None]:
#Q1.A
Data1 = pd.read_csv('Part2+-+Data1.csv') #reading the dataset
Data2 = pd.read_csv('Part2+-Data2.csv')

In [None]:
#Q1.B
#shape
print('Shape of Data1:', Data1.shape)
print('Shape of Data2:', Data2.shape)

#columns
print('Columns of Data1:', Data1.columns)
print('Columns of Data2:', Data2.columns)

#DataTypes
print('DataTypes of Data1:', Data1.info())
print('DataTypes of Data2:', Data2.info())

In [None]:
#Q1.C
result = pd.merge(Data1, Data2, on=["ID"]) #inner join basis id column

In [None]:
result.tail(5)

In [None]:
result.shape

In [None]:
result.info()

In [None]:
#Q1.D
convert_dict = {'CreditCard' : object,
                'InternetBanking': object,
                'FixedDepositAccount': object,
                'Security': object, 
                'Level': object,
                'HiddenScore': object
               }
  
result = result.astype(convert_dict)
print(result.dtypes)

In [None]:
result.head(5)

## Q2. Data Exploration and Analysis

In [None]:
result.shape

In [None]:
#Q2.A
result['LoanOnCard'].hist()
result.groupby("LoanOnCard").agg({'LoanOnCard': 'count'})

In [None]:
#Insight

    1. The above graph shows that the dataset is higly imbalanced with 90% of the accounts haven't taken loan on the card.
    2. Also, 20 accounts don't have any entry for 'LoanOnCard' feature 

In [None]:
#Q2.B
# result.isnull().sum()
percent_missing = result.isnull().sum() * 100 / len(result)
print(percent_missing)


result['LoanOnCard'].fillna(result['LoanOnCard'].mode()[0], inplace=True)
print(result.isnull().sum())

0.40 % of the data is missing for "LoanOnCard" feature and for the other features there are no nulls.

In [None]:
result.info()

In [None]:
#Q5.C
list_columns= ['HiddenScore','Level','Security','FixedDepositAccount','InternetBanking','CreditCard']
for i in list_columns:
    print(i)
    print(result[i].unique())

Insights:
There are no unexpected values which needs treatment.

In [None]:
if "NAN" in result.values:
        print('Element exists in Dataframe')
        
if "?" in result.values:
        print('Element exists in Dataframe')

In [None]:
for i in result.columns:
    print(i)
    print(result[i].unique())

## Q3. Data Preparation and model building

In [None]:
#Data Exploration

In [None]:
result.var()

In [None]:
sns.countplot(result['FixedDepositAccount'],label="Count")

In [None]:
sns.countplot(result['Security'],label="Count")

In [None]:
#Although above variables have low variance but let's keep it as already the number of variables are low

In [None]:
result.corr()
#Basis this we can remove either Age or Customer Since feature because they are highly correlated

In [None]:
#Q3.A
# Create a separate dataframe consisting only of the features i.e independent attributes
X = result.drop(labels= ['ID','ZipCode','LoanOnCard','CustomerSince'] , axis = 1)
#CustomerSince dropped due to high  multicollinarity with age feature
y = result["LoanOnCard"]

In [None]:
#Q3.B
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [None]:
#Q3.C
#Logistic Regression
from sklearn import metrics

from sklearn.linear_model import LogisticRegression

# Fit the model on train
model = LogisticRegression(solver="liblinear")
model.fit(X_train, y_train)
#predict on test
y_predict = model.predict(X_test)


coef_df = pd.DataFrame(model.coef_)
coef_df['intercept'] = model.intercept_
print(coef_df)

In [None]:
#Q3.D

In [None]:
#Confusion Matrix
cm=metrics.confusion_matrix(y_test, y_predict, labels=[1, 0])

df_cm = pd.DataFrame(cm, index = [i for i in ["1","0"]],
                  columns = [i for i in ["Predict 1","Predict 0"]])
plt.figure(figsize = (7,5))
sns.heatmap(df_cm, annot=True)

In [None]:
#AUC ROC curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
# roc curve for logistic regression model with optimal threshold
from numpy import sqrt
from numpy import argmax

logit_roc_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:,1])
fpr, tpr, thresholds = roc_curve(y_test, model.predict_proba(X_test)[:,1])
# calculate the g-mean for each threshold
gmeans = sqrt(tpr * (1-fpr))
# locate the index of the largest g-mean
ix = argmax(gmeans)
print('Best Threshold=%f, G-Mean=%.3f' % (thresholds[ix], gmeans[ix]))
plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1],'r--',label='No Skill')
plt.scatter(fpr[ix], tpr[ix], marker='o', color='black', label='Best')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()

In [None]:
#G-Mean = sqrt(Sensitivity * Specificity)

In [None]:
#precision_recall_curve
from sklearn.metrics import precision_recall_curve
yhat=model.predict_proba(X_test)[:,1]
# calculate roc curves
precision, recall, thresholds = precision_recall_curve(y_test, yhat)
# convert to f score
fscore = (2 * precision * recall) / (precision + recall)
# locate the index of the largest f score
ix = argmax(fscore)
print('Best Threshold=%f, F-Score=%.3f' % (thresholds[ix], fscore[ix]))
# plot the roc curve for the model
# no_skill = len(y_test[y_test==1]) / len(y_test)
# plt.plot([0,1], [no_skill,no_skill], linestyle='--', label='No Skill')
plt.plot(recall, precision, marker='.', label='Logistic')
plt.scatter(recall[ix], precision[ix], marker='o', color='black', label='Best')
# axis labels
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend()
# show the plot
plt.show()

In [None]:
# predict the response
y_pred = model.predict(X_test)
# evaluate accuracy
print("TEST")
print("Precision: {}".format(precision_score(y_test, y_pred)))
print("Recall: {}".format(recall_score(y_test, y_pred)))
print("F1 Score: {}".format(f1_score(y_test, y_pred)))
print("Accuracy: {}".format(accuracy_score(y_test, y_pred)))
print("AUC-ROC: {}".format(roc_auc_score(y_test, y_pred)))

#### Insights
    1. Model is able to predict very well who will not take loan on card (1100 out of 1113 in test data)
    2. Model looses on accounts who will take loan. It's not able to predict 1's very well. And, hence the recall is also low because False negative is high in number.
    3. F1 is harmonic mean of Precision and recall. It's value is also low because recall is low.
    4. Precision and accuracy of the model is good which might be because of good True positives and low False positives.
    5. The above graphs help on deciding the threshold basis the metric which we consider as important.

### Removal of outliers and applying L2 regularisation on standardised data to improve performance

In [None]:
X.boxplot(figsize = (20,10))

This shows lot of outliers in Mortgage feature which might affect the model performance

In [None]:
##Lot of outliers-Let's remove these to see the impact,
#Implementing the above steps will free the final logistic regression model from extremely misclassified data points.


# Create a separate dataframe consisting only of the features i.e independent attributes
X = result.drop(labels= ['ID','ZipCode','LoanOnCard','CustomerSince'] , axis = 1)
y = result["LoanOnCard"]


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# to scale the data so as to bring everything on same scale

from sklearn.preprocessing import StandardScaler

std_scaler = StandardScaler()
X_train_std = std_scaler.fit_transform(X_train)
X_test_std = std_scaler.transform(X_test)
print(X_train_std.shape)
print(X_test_std.shape)


param_grid = {"C":np.logspace(-3,3,7), "penalty":["l1","l2"],"solver": ["saga","liblinear","warn"]}

classifier = LogisticRegression(random_state=42, n_jobs=-1)
logreg_cv = GridSearchCV(classifier, param_grid, cv=3, scoring='roc_auc').fit(X_train_std, y_train)

In [None]:
print("tuned hpyerparameters :(best parameters) ",logreg_cv.best_params_)
print("roc_auc score :",logreg_cv.best_score_)

In [None]:
params = {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
classifier = LogisticRegression(random_state=42, n_jobs=-1, **params).fit(X_train_std, y_train)
classifier

In [None]:
y_pred = classifier.predict(X_test_std)
y_pred_proba = classifier.predict_proba(X_test_std)[:,1]

In [None]:
np. unique(y_pred)

In [None]:
print("TEST")
print("Precision: {}".format(precision_score(y_test, y_pred)))
print("Recall: {}".format(recall_score(y_test, y_pred)))
print("F1 Score: {}".format(f1_score(y_test, y_pred)))
print("Accuracy: {}".format(accuracy_score(y_test, y_pred)))
print("AUC-ROC: {}".format(roc_auc_score(y_test, y_pred_proba)))

In [None]:
#Confusion Matrix
cm=metrics.confusion_matrix(y_test, y_pred, labels=[1, 0])

df_cm = pd.DataFrame(cm, index = [i for i in ["1","0"]],
                  columns = [i for i in ["Predict 1","Predict 0"]])
plt.figure(figsize = (7,5))
sns.heatmap(df_cm, annot=True)

In [None]:
weight_vector = list(classifier.coef_[0])
weight_vector

In [None]:
dist = np.dot(X_train_std, weight_vector)
y_dist = dist*[-1 if x==0 else 1 for x in list(y_train)]
len(y_dist)

In [None]:
print(y_train.value_counts())

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

sns.kdeplot(y_dist)
plt.xlabel("Distance * Y-class")
plt.grid()
plt.show()

In [None]:
val = np.percentile(y_dist, 5) #remove above and below 5 percentile value
print("Threshold Val: ", val)

y_train[(y_dist < val)].value_counts()

In [None]:
X_train_std_new = X_train_std[(~(y_dist < val))]
y_train_new = y_train[(~(y_dist < val))]
print(X_train_std_new.shape)
print(y_train_new.shape)

In [None]:
y_train_new.value_counts()

In [None]:
#Let's train the same model on new data after removal of outliers
params = {'C': 0.1, 'penalty': 'l2','solver':'liblinear'}
classifier1 = LogisticRegression(random_state=42, n_jobs=-1, **params).fit(X_train_std_new, y_train_new)
classifier1

In [None]:
y_pred = classifier1.predict(X_test_std)
y_pred_proba = classifier1.predict_proba(X_test_std)[:,1]

In [None]:
print("TEST")
print("Precision: {}".format(precision_score(y_test, y_pred)))
print("Recall: {}".format(recall_score(y_test, y_pred)))
print("F1 Score: {}".format(f1_score(y_test, y_pred)))
print("Accuracy: {}".format(accuracy_score(y_test, y_pred)))
print("AUC-ROC: {}".format(roc_auc_score(y_test, y_pred_proba)))

In [None]:
#Confusion Matrix
cm=metrics.confusion_matrix(y_test, y_pred, labels=[1, 0])

df_cm = pd.DataFrame(cm, index = [i for i in ["1","0"]],
                  columns = [i for i in ["Predict 1","Predict 0"]])
plt.figure(figsize = (7,5))
sns.heatmap(df_cm, annot=True)

##### Model performance without L2, OUTLIER treatment and standardisation.

TEST
1. Precision: 0.8586956521739131
2. Recall: 0.5895522388059702
3. F1 Score: 0.6991150442477877
4. Accuracy: 0.9456
5. AUC-ROC: 0.7889517466431285

#### Insights:

By applying l2 regularisation we have also considered the fact that overfitting must be regularised and removal of outliers have helped in increasing the  recall from 0.58 to 0.76 and F1_Score  from 0.69 to 0.75 compared to without any treatment.

Recall is important for this problem as we don't want to miss out on "1" for marketing.

From confusion matrix we can say True positives have increased while false negative has decreased 


In [None]:
#Q3.E
# Shuffle the Dataset.
shuffled_df = result.sample(frac=1,random_state=4)

# Put all the "1" class in a separate dataset.
Loan = result.loc[result['LoanOnCard'] == 1]

#Randomly select 480 observations from the 0 (majority class)
No_Loan = shuffled_df.loc[result['LoanOnCard'] == 0].sample(n=480,random_state=42)

# Concatenate both dataframes again
normalized_df = pd.concat([Loan, No_Loan])

#plot the dataset after the undersampling
plt.figure(figsize=(8, 8))
sns.countplot('LoanOnCard', data=normalized_df)
plt.title('Balanced Classes')
plt.show()

In [None]:
normalized_df.groupby("LoanOnCard").agg({'LoanOnCard': 'count'})

In [None]:
#Q3.F
X = normalized_df.drop(labels= ['ID','ZipCode','LoanOnCard','CustomerSince'] , axis = 1)
y = normalized_df["LoanOnCard"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# to scale the data
from sklearn.preprocessing import StandardScaler
std_scaler = StandardScaler()
X_train_std = std_scaler.fit_transform(X_train)
X_test_std = std_scaler.transform(X_test)
print(X_train_std.shape)
print(X_test_std.shape)

# Training previous model on balanced data
params = {'C': 0.1, 'penalty': 'l2','solver':'liblinear'}
classifier1 = LogisticRegression(random_state=42, n_jobs=-1, **params).fit(X_train_std, y_train)
classifier1


y_pred = classifier1.predict(X_test_std)
y_pred_proba = classifier1.predict_proba(X_test_std)[:,1]

In [None]:
#Q3.G

print("TEST")
print("Precision: {}".format(precision_score(y_test, y_pred)))
print("Recall: {}".format(recall_score(y_test, y_pred)))
print("F1 Score: {}".format(f1_score(y_test, y_pred)))
print("Accuracy: {}".format(accuracy_score(y_test, y_pred)))
print("AUC_ROC: {}".format(roc_auc_score(y_test, y_pred)))

In [None]:
#Confusion Matrix
cm=metrics.confusion_matrix(y_test, y_pred, labels=[1, 0])

df_cm = pd.DataFrame(cm, index = [i for i in ["1","0"]],
                  columns = [i for i in ["Predict 1","Predict 0"]])
plt.figure(figsize = (7,5))
sns.heatmap(df_cm, annot=True)

### Insights

When we fitted the balanced dataset on the same model, it performed well on the below metrics

    1. Precision: 0.73 to 0.81
    2. Recall: 0.76 to 0.88
    3. F1 Score: 0.75 to 0.84
    4. From confusion matrix we can say false negative and false positives have decreased
    
    Note: Please find below previouus model performance

####### this is the perfomance of model on test data after removal of outliers
    1. Precision: 0.7357142857142858
    2. Recall: 0.7686567164179104
    3. F1 Score: 0.7518248175182483
    4. Accuracy: 0.9456
    5. AUC-ROC: 0.9671066709463436

## Q4. Performance Improvement

In [None]:
# Create a separate dataframe consisting only of the features i.e independent attributes
X = result.drop(labels= ['ID','ZipCode','LoanOnCard','CustomerSince'] , axis = 1)
#CustomerSince dropped due to high  multicollinarity
y = result["LoanOnCard"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [None]:
#Q4.A 

#SVM
from sklearn import svm
clf = svm.SVC(gamma=0.025, C=1000) 

In [None]:
clf.fit(X_train , y_train)

In [None]:
print("Accuracy on training set: {:.2f}".format(clf.score(X_train, y_train)))
print("Accuracy on test set: {:.2f}".format(clf.score(X_test, y_test)))

In [None]:
y_pred = clf.predict(X_test)

In [None]:
y_grid = (np.column_stack([y_test, y_pred]))

In [None]:
pd.set_option('display.max_columns', 26)

pd.crosstab(y_pred, y_test)

In [None]:
# predict the response
y_pred = clf.predict(X_test)
# evaluate accuracy
print('Accuracy Score:',accuracy_score(y_test, y_pred))
print('Recall Score:',recall_score(y_test, y_pred))
print('Precision Score:',precision_score(y_test, y_pred))
print('f1 Score:',f1_score(y_test, y_pred))

In [None]:
#Confusion Matrix
cm=metrics.confusion_matrix(y_test, y_pred, labels=[1, 0])

df_cm = pd.DataFrame(cm, index = [i for i in ["1","0"]],
                  columns = [i for i in ["Predict 1","Predict 0"]])
plt.figure(figsize = (7,5))
sns.heatmap(df_cm, annot=True)

In [None]:
#KNN

NNH = KNeighborsClassifier(n_neighbors= 1 , weights = 'distance' )
# Call Nearest Neighbour algorithm

NNH.fit(X_train, y_train)

# For every test data point, predict it's label based on 5 nearest neighbours in this model. The majority class will 
# be assigned to the test data point

predicted_labels = NNH.predict(X_test)
NNH.score(X_test, y_test)

# predict the response
y_pred = NNH.predict(X_test)
# evaluate accuracy
print('Accuracy Score:',accuracy_score(y_test, y_pred))
print('Recall Score:',recall_score(y_test, y_pred))
print('Precision Score:',precision_score(y_test, y_pred))
print('f1 Score:',f1_score(y_test, y_pred))

In [None]:
#Confusion Matrix
cm=metrics.confusion_matrix(y_test, y_pred, labels=[1, 0])

df_cm = pd.DataFrame(cm, index = [i for i in ["1","0"]],
                  columns = [i for i in ["Predict 1","Predict 0"]])
plt.figure(figsize = (7,5))
sns.heatmap(df_cm, annot=True)

In [None]:
#Q4.B

In [None]:
#if we standardize it and then work
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

#Increase C to avoid overfitting
svc = svm.SVC(C=1000)
svc.fit(X_train_scaled, y_train)
y_pred = svc.predict(X_test_scaled)

# evaluate accuracy
print('Accuracy Score:',accuracy_score(y_test, y_pred))
print('Recall Score:',recall_score(y_test, y_pred))
print('Precision Score:',precision_score(y_test, y_pred))
print('F1 Score:',f1_score(y_test, y_pred))

In [None]:
svc

In [None]:
#Trying snother kernel
from sklearn.svm import SVC

# Building a Support Vector Machine on train data
svc_model = SVC(C= .1, kernel='linear', gamma= 1)
svc_model.fit(X_train_scaled, y_train)

prediction = svc_model .predict(X_test_scaled)


print("Confusion Matrix:\n",confusion_matrix(prediction,y_test))

# predict the response
y_pred = svc_model.predict(X_test_scaled)

# evaluate accuracy
print('Accuracy Score:',accuracy_score(y_test, y_pred))
print('Recall Score:',recall_score(y_test, y_pred))
print('Precision Score:',precision_score(y_test, y_pred))
print('F1 Score:',f1_score(y_test, y_pred))

Performance is bad compared to the above model basis f1, recall and accuracy

We can see that the best model from SVM is when C=1000 and Kernel = rbf

In [None]:
#Now let's tune KNN
# KNN - choosing the K value

# creating odd list of K for KNN
myList = list(range(2,20))

# subsetting just the odd ones
neighbors = list(filter(lambda x: x % 2 != 0, myList))

# empty list that will hold accuracy scores
ac_scores = []

# perform accuracy metrics for values from 1,3,5....19
for k in neighbors:
    knn = KNeighborsClassifier(n_neighbors=k,weights='distance')
    knn.fit(X_train, y_train)
    # predict the response
    y_pred = knn.predict(X_test)
    # evaluate accuracy
    scores = accuracy_score(y_test, y_pred)
    ac_scores.append(scores)

# changing to misclassification error
MSE = [1 - x for x in ac_scores]

# determining best k
optimal_k = neighbors[MSE.index(min(MSE))]
print("The optimal number of neighbors is %d" % optimal_k)

In [None]:
# KNN - Model using the best parameters form above

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

KNN = KNeighborsClassifier(n_neighbors=5,weights='distance')
KNN.fit(X_train, y_train)
# predict the response
y_pred = KNN.predict(X_test)
# evaluate accuracy
KNN_Accuracy=accuracy_score(y_test, y_pred)
print("\nAccuracy using KNN : ", KNN_Accuracy)

In [None]:
# predict the response
y_pred = KNN.predict(X_test)
# evaluate accuracy
print('Accuracy Score:',accuracy_score(y_test, y_pred))
print('Recall Score:',recall_score(y_test, y_pred))
print('Precision Score:',precision_score(y_test, y_pred))
print('F1 Score:',f1_score(y_test, y_pred))

In [None]:
KNN

In [None]:
#Let's standardise this and see
from scipy.stats import zscore

In [None]:
X.info()
X = X.astype(float)
print(X.dtypes)

In [None]:
X.head()

In [None]:
XScaled  = X.apply(zscore)

In [None]:
# Split X and y into training and test set in 75:25 ratio

X_train, X_test, y_train, y_test = train_test_split(XScaled, y, test_size=0.25, random_state=1)

In [None]:
#KNN
# KNN - choosing the K value after standardising

# creating odd list of K for KNN
myList = list(range(2,20))

# subsetting just the odd ones
neighbors = list(filter(lambda x: x % 2 != 0, myList))

# empty list that will hold accuracy scores
ac_scores = []

# perform accuracy metrics for values from 1,3,5....19
for k in neighbors:
    knn = KNeighborsClassifier(n_neighbors=k,weights='distance')
    knn.fit(X_train, y_train)
    # predict the response
    y_pred = knn.predict(X_test)
    # evaluate accuracy
    scores = accuracy_score(y_test, y_pred)
    ac_scores.append(scores)

# changing to misclassification error
MSE = [1 - x for x in ac_scores]

# determining best k
optimal_k = neighbors[MSE.index(min(MSE))]
print("The optimal number of neighbors is %d" % optimal_k)

In [None]:
NNH = KNeighborsClassifier(n_neighbors= 3 , weights = 'distance' )
# Call Nearest Neighbour algorithm

NNH.fit(X_train, y_train)

# For every test data point, predict it's label based on 5 nearest neighbours in this model. The majority class will 
# be assigned to the test data point

predicted_labels = NNH.predict(X_test)
NNH.score(X_test, y_test)

# predict the response
y_pred = NNH.predict(X_test)
# evaluate accuracy
print('Accuracy Score:',accuracy_score(y_test, y_pred))
print('Recall Score:',recall_score(y_test, y_pred))
print('Precision Score:',precision_score(y_test, y_pred))
print('F1 Score:',f1_score(y_test, y_pred))

### Insights 

Finalising the SVM model (SVC(C=1000, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)) built on scaled data. Scaling has drastically improved the performance

Accuracy Score: 0.9784
Recall Score: 0.917910447761194
Precision Score: 0.8848920863309353
F1 Score: 0.9010989010989011

In [None]:
#Q4.C
#if we standardize it and then work
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

# predict the response
y_pred = svc.predict(X_test_scaled)
# evaluate accuracy
print('Accuracy Score:',accuracy_score(y_test, y_pred))
print('Recall Score:',recall_score(y_test, y_pred))
print('Precision Score:',precision_score(y_test, y_pred))
print('f1 Score:',f1_score(y_test, y_pred))

In [None]:
#Confusion Matrix
cm=metrics.confusion_matrix(y_test, y_pred, labels=[1, 0])

df_cm = pd.DataFrame(cm, index = [i for i in ["1","0"]],
                  columns = [i for i in ["Predict 1","Predict 0"]])
plt.figure(figsize = (7,5))
sns.heatmap(df_cm, annot=True)

### Q4.D
1. Improvement in terms of recall, accuracy,precision and f1 score has been observed. 
2. Also, true positive has increased and false negative&false positives has further decreased comparitively

Old Base SVM results for comparison

#SVM performance
    1. Accuracy Score: 0.928
    2. Recall Score: 0.47761194029850745
    3. Precision Score: 0.7619047619047619
    4. f1 Score: 0.58
    
Old Base KNN results for comparison 

#KNN performance
    1. Accuracy Score: 0.9016
    2. Recall Score: 0.4701492537313433
    3. Precision Score: 0.5478260869565217
    4. f1 Score: 0.5060240963855422