### Implementation of Logistic Regression, Decision Trees, XGBoost and Random Forest Regressor for a Classification Problem

##### 1. Import the necessary libraries as required 
##### 2. Read the CSV file into a dataframe
##### 3. Group the 'basic.*' categories of 'education' into a single 'Basic' category
##### 4. Create dummy variables
##### 5. Split data into train and test data
##### 6. Instantiate, fit Logistic Regressor and predict the test data
##### 7. Display confusion matrix and classification report for Logistic Regressor
##### 8. Instantiate, fit DecisionTrees and predict the test data
##### 9. Display confusion matrix and classification report for DecisionTrees
##### 10. Instantiate, fit XGBoost and predict the test data
##### 11. Display confusion matrix and classification report for XGBoost
##### 12. Instantiate, fit Random Forest and predict the test data
##### 13. Display confusion matrix and classification report for Random Forest 



In [17]:
import warnings
warnings.filterwarnings('ignore')
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestRegressor
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import NearMiss
from imblearn.over_sampling import RandomOverSampler
from sklearn.tree import DecisionTreeClassifier 
import xgboost
from xgboost import XGBClassifier
# !pip install xgboost

# Load the bank-marketing data set. NOTE: the path is specific to the author's machine.
bank = pd.read_csv("C:/Users/amrut/Documents/UTA documents/Courses/Summer 2020/Data Science/HW/HW3/bank-additional/bank-additional/bank-additional-full.csv")

# Collapse the three 'basic.*' education levels into one 'Basic' category in a
# single pass, so the encoded feature space has fewer columns.
basic_levels = ['basic.9y', 'basic.6y', 'basic.4y']
bank['education'] = np.where(bank['education'].isin(basic_levels), 'Basic', bank['education'])

# Categorical columns to one-hot encode. drop_first=True drops one level per
# column so the dummies are not perfectly collinear.
cat_vars = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome']
dummy_frames = [pd.get_dummies(bank[var], prefix=var, drop_first=True) for var in cat_vars]

# Append all dummy columns at once; every frame shares bank's index, so this
# matches joining them one by one without rebuilding the frame per column.
bank = pd.concat([bank] + dummy_frames, axis=1)

# Keep everything except the raw categorical columns that were just encoded.
bank_vars = bank.columns.values.tolist()
to_keep = [col for col in bank_vars if col not in cat_vars]
bank_final = bank[to_keep]

# X holds the features (every column except 'y'); y is the target, kept as a
# one-column DataFrame because later steps index it by the column name "y".
X = bank_final.drop(columns='y')
y = bank_final[['y']]

# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
print("------------Logistic Regression------------------")

# Train a logistic regression with default settings on the training split.
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

# Predict the held-out split and compare the predictions with the true labels.
y_pred_1 = logreg.predict(X_test)
confusion_matrix_1 = confusion_matrix(y_test, y_pred_1)
report_1 = classification_report(y_test, y_pred_1)

print(f"\nConfusion Matrix:\n {confusion_matrix_1}")
print(f"\nClassification Report: \n {report_1}")

print("\n------------Decision Trees------------------")

# Train a decision tree on the training split; the seed fixes tie-breaking
# between equally good splits so runs are repeatable.
tree = DecisionTreeClassifier(random_state=0)
tree.fit(X_train, y_train)

# Predict the held-out split and compare the predictions with the true labels.
y_pred_2 = tree.predict(X_test)
confusion_matrix_2 = confusion_matrix(y_test, y_pred_2)
report_2 = classification_report(y_test, y_pred_2)

print(f"\nConfusion Matrix:\n {confusion_matrix_2}")
print(f"\nClassification Report: \n {report_2}")

print("\n------------XGBoost------------------")

# Current XGBoost releases expect integer class ids (0..n_classes-1) and reject
# string labels, so encode the 'no'/'yes' target as 0/1 for fitting only.
y_train_encoded = (np.ravel(y_train) == "yes").astype(int)

classifier = XGBClassifier()                       # initialize XGBClassifier
classifier.fit(X_train, y_train_encoded)           # fit XGBClassifier on the encoded target

# Decode the predictions back to the original labels so the confusion matrix
# and report use 'no'/'yes' like the other models.
y_pred_3 = np.where(classifier.predict(X_test) == 1, "yes", "no")

confusion_matrix_3 = confusion_matrix(y_test, y_pred_3)
print("\nConfusion Matrix:\n {}".format(confusion_matrix_3))
print("\nClassification Report: \n {}".format(classification_report(y_test, y_pred_3)))

print("\n------------Random Forest Regressor------------------")

# Balance the classes in the TRAINING split only. The test split must keep its
# original rows and class balance so the metrics below describe unseen data.
smk = SMOTETomek(random_state=42)          # instantiated but not applied; random oversampling is used
os = RandomOverSampler(random_state=42)    # name kept from the original script
X_train_res, y_train_res = os.fit_resample(X_train, y_train)

# The regressor needs a numeric target: 'no' -> 0, 'yes' -> 1.
y_train_res["y"] = y_train_res["y"].map({"no": 0, "yes": 1})

# Standardize the features: fit the scaler on the resampled training data and
# reuse it on the real test split. Labels are never scaled, and the shared
# X_train / X_test / y_train / y_test variables are left untouched.
sc = StandardScaler()
X_train_scaled = sc.fit_transform(X_train_res)
X_test_scaled = sc.transform(X_test)

# Train and predict in the same (scaled) feature space. The seed makes the
# forest reproducible between runs.
regressor = RandomForestRegressor(random_state=0)
regressor.fit(X_train_scaled, y_train_res["y"])

# The regressor outputs a score in [0, 1]; scores above the threshold count as
# the positive class (1), everything else as the negative class (0).
threshold = 0.1
y_pred_4 = (regressor.predict(X_test_scaled) > threshold).astype(int)

# Encode the true test labels the same way as the training target.
y_test_array = (np.ravel(y_test) == "yes").astype(int)

confusion_matrix_4 = confusion_matrix(y_test_array, y_pred_4)
print("\nConfusion Matrix:\n {}".format(confusion_matrix_4))
print("\nClassification Report: \n {}".format(classification_report(y_test_array, y_pred_4)))

------------Logistic Regression------------------

Confusion Matrix:
 [[10700   269]
 [  829   559]]

Classification Report: 
               precision    recall  f1-score   support

          no       0.93      0.98      0.95     10969
         yes       0.68      0.40      0.50      1388

    accuracy                           0.91     12357
   macro avg       0.80      0.69      0.73     12357
weighted avg       0.90      0.91      0.90     12357


------------Decision Trees------------------

Confusion Matrix:
 [[10280   689]
 [  650   738]]

Classification Report: 
               precision    recall  f1-score   support

          no       0.94      0.94      0.94     10969
         yes       0.52      0.53      0.52      1388

    accuracy                           0.89     12357
   macro avg       0.73      0.73      0.73     12357
weighted avg       0.89      0.89      0.89     12357


------------XGBoost------------------

Confusion Matrix:
 [[10558   411]
 [  637   751]]

Class