## 1. Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, roc_curve, classification_report

import warnings
warnings.filterwarnings('ignore')

## 2. Importing Dataset

In [2]:
bank_data = pd.read_csv(r'D:\Assignment ExcelR\Logistic Regression\bank-full.csv', delimiter=';')
bank_data.head()

FileNotFoundError: [Errno 2] No such file or directory: 'D:\\Assignment ExcelR\\Logistic Regression\\bank-full.csv'

## 3. Data Understanding

In [None]:
bank_data.shape

In [None]:
bank_data.columns

In [None]:
bank_data.info()

In [None]:
bank_data.describe()

In [None]:
bank_data.var()

In [None]:
bank_data.skew()

In [None]:
bank_data.kurt()

### Missing Values

In [None]:
bank_data.isnull().sum()

### Duplicated Values

In [None]:
bank_data[bank_data.duplicated()].shape

### Let's find how many discrete and continuous feature are their in our dataset by seperating them in variables

In [None]:
discrete_feature = [feature for feature in bank_data.columns if len(bank_data[feature].unique())<20 and feature]
print('Discrete Variables Count: {}'.format(len(discrete_feature)))

In [None]:
continuous_feature = [feature for feature in bank_data.columns if bank_data[feature].dtype!='O' and feature not in discrete_feature]
print('Continuous Feature Count {}'.format(len(continuous_feature)))

## 4. Exploratory Data Analysis

In [None]:
bank_data.corr()

In [None]:
fig = plt.figure(figsize= (9,6))
sns.heatmap(bank_data.corr(), annot=True)
plt.xticks(rotation=45)
plt.show()

### Lets analyze the continuous values by creating histograms to understand the distribution of the numerical features

In [None]:
for feature in continuous_feature:
    bank_data1 = bank_data.copy()
    bank_data1[feature].hist(bins=15)
    plt.ylabel('Count')
    plt.title(feature)
    plt.show()

### Log transformation

In [None]:
for feature in continuous_feature:
    bank_data2 = bank_data.copy()
    if 0 in bank_data2[feature].unique():
        pass
    else:
        bank_data2[feature] = np.log(bank_data2[feature])
        bank_data2[feature].hist(bins=15)
        plt.ylabel('Count')
        plt.title(feature)
        plt.show()

### Outliers Detection

In [None]:
outlier = bank_data.copy() 
fig, axes = plt.subplots(7,1,figsize=(10,8), sharex=False, sharey=False)
sns.boxplot(x='age',data=outlier,palette='crest',ax=axes[0])
sns.boxplot(x='balance',data=outlier,palette='crest',ax=axes[1])
sns.boxplot(x='day',data=outlier,palette='crest',ax=axes[2])
sns.boxplot(x='duration',data=outlier,palette='crest',ax=axes[3])
sns.boxplot(x='campaign',data=outlier,palette='crest',ax=axes[4])
sns.boxplot(x='pdays',data=outlier,palette='crest',ax=axes[5])
sns.boxplot(x='previous',data=outlier,palette='crest',ax=axes[6])
plt.tight_layout(pad=2.0)

#### There are lot of outliers present in the dataframe but we can't drop them because they are present in a very large quantity and can be important for model building
### After Log-Transformation

In [None]:
for feature in continuous_feature:
    bank_data3 = bank_data.copy()
    bank_data3[feature] = np.log(bank_data3[feature])
    bank_data3.boxplot(column=feature)
    plt.ylabel(feature)
    plt.title(feature)
    plt.show()

## 5. Data Preprocessing

In [None]:
bank_data[['job','marital','education','default','housing','loan','contact','poutcome','month','y']] = bank_data[
    ['job','marital','education','default','housing','loan','contact','poutcome','month','y']].astype('category')
bank_data_new = bank_data

In [None]:
bank_data.info()

### Label Encoding

In [None]:
bank_data_new['month'] = bank_data_new['month'].cat.codes
bank_data_new['job'] = bank_data_new['job'].cat.codes
bank_data_new['marital'] = bank_data_new['marital'].cat.codes
bank_data_new['education'] = bank_data_new['education'].cat.codes
bank_data_new['default'] = bank_data_new['default'].cat.codes
bank_data_new['housing'] = bank_data_new['housing'].cat.codes
bank_data_new['loan'] = bank_data_new['loan'].cat.codes
bank_data_new['contact'] = bank_data_new['contact'].cat.codes
bank_data_new['poutcome'] = bank_data_new['poutcome'].cat.codes
bank_data_new['y'] = bank_data_new['y'].cat.codes

## 6. Model Building

In [None]:
x1 = bank_data_new.drop('y', axis=1)
y1 = bank_data_new[['y']]

In [None]:
x1

In [None]:
y1

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x1,y1,test_size=0.20,random_state=12)
print("Shape of X_train : ",x_train.shape)
print("Shape of X_test  : ",x_test.shape)
print("Shape of y_train : ",y_train.shape)
print("Shape of y_test  : ",y_test.shape)

In [None]:
logistic_model = LogisticRegression()
logistic_model.fit(x_train,y_train)

In [None]:
logistic_model.coef_

In [None]:
logistic_model.intercept_

#### MinMaxScaler

In [None]:
scalar = MinMaxScaler(feature_range= (0,1))
scalar.fit(bank_data_new)
scaled_x = scalar.transform(bank_data_new)

In [None]:
scaled_x

In [None]:
classifier1 = LogisticRegression()
classifier1.fit(scaled_x,y1)

In [None]:
classifier1.coef_

In [None]:
proba1 = classifier1.predict_proba(scaled_x)
proba1

In [None]:
y_pred1 = classifier1.predict(scaled_x)
y_pred1

## 7. Model Testing || 8. Model Evaluation

### Train Data

In [None]:
y_pred_train1 = logistic_model.predict(x_train)

In [None]:
print(confusion_matrix(y_train, y_pred_train1))

In [None]:
print(classification_report(y_train,y_pred_train1))

In [None]:
accuracy_score(y_train,y_pred_train1)

In [None]:
fpr, tpr, thresholds = roc_curve(y_train,logistic_model.predict_proba (x_train)[:,1])

auc = roc_auc_score(y_train,logistic_model.predict_proba (x_train)[:,1])
print('AUC score : {:.2f}%'.format(auc*100))

plt.plot(fpr, tpr, color='red', label='logit model ( area  = %0.2f)'%auc)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate or [1 - True Negative Rate]')
plt.ylabel('True Positive Rate')
plt.show()

In [None]:
classification_report1 = classification_report(y_train,y_pred_train1)
print(classification_report1)

### Test Data

In [None]:
y_pred_test1 = logistic_model.predict(x_test)

In [None]:
print(confusion_matrix(y_test,y_pred_test1))

In [None]:
print(classification_report(y_test,y_pred_test1))

In [None]:
accuracy_score(y_test,y_pred_test1)

In [None]:
fpr, tpr, thresholds = roc_curve(y_test,logistic_model.predict_proba (x_test)[:,1])

auc = roc_auc_score(y_test,logistic_model.predict_proba (x_test)[:,1])
print('AUC score : {:.2f}%'.format(auc*100))

plt.plot(fpr, tpr, color='red', label='logit model ( area  = %0.2f)'%auc)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate or [1 - True Negative Rate]')
plt.ylabel('True Positive Rate')
plt.show()

In [None]:
classification_report2 = classification_report(y_test,y_pred_test1)
print(classification_report2)

### Compare the train-set and test-set accuracy

#### Check for overfitting and underfitting (the scores on training and test set)

In [None]:
print('Training set score : {:.2f}%'.format(logistic_model.score(x_train, y_train)*100))
print('Test set score     : {:.2f}%'.format(logistic_model.score(x_test, y_test)*100))

## 9. Model Deployement

In [None]:
from pickle import dump

In [None]:
dump(logistic_model,open('Bank_Prediction.pkl','wb'))

In [None]:
from pickle import load

In [None]:
loaded_logistic_model = load(open('Bank_Prediction.pkl','rb'))

In [None]:
y_pred = loaded_logistic_model.predict(x_test)