## Instituto Politécnico de Coimbra
## Instituto Superior de Engenharia de Coimbra
## Mestrado em Engenharia Informática - Machine Learning
## Elaborado por:

# André Proença 2016018783

# Isabel Castro 2018013160

## DATA SET ORIGINAL
## [https://archive.ics.uci.edu/ml/datasets/Bank+Marketing](https://archive.ics.uci.edu/ml/datasets/Bank+Marketing)

In [248]:
import seaborn as sea
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import string
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import random

## Used functions

In [249]:
def randcolor(number):
    lista = []
    for i in range(number):
        color = "%06x" % random.randint(0, 0xFFFFFF)
        lista.append(color)
    return lista

In [250]:
def plotFigureBoxPlot(columnName,dataset):
    plt.figure(figsize=(6,9))
    sea.boxplot(x=columnName, data=dataset, color='green')
    plt.title("Boxplot of {}" .format(columnName),size=20,color="red")
    plt.xlabel("{}".format(columnName),size=15)

In [251]:
def numericAnalysis(columnName, dataframe):
    item = dataframe[columnName]

    print("Mean:\t", item.mean())
    print("Mode:\t", item.mode())
    print("Median:\t", item.median())
    print("Variance:\t", item.var())
    print("Std deviation:\t", item.std())
    print("Percentils (25, 50, 75):\t", item.quantile([0, 0.25, 0.5, 0.75, 1]))

In [252]:
def plotBarChart(dataset, columnName):
    sea.set(style='whitegrid', palette="bright", font_scale=1.1, rc={"figure.figsize": [15, 10]})
    if(dataset[columnName].dtype != 'object'):
        sea.histplot(x=columnName, data=dataset, bins=np.arange(0, 100, 5), kde=True)
    else:
        sea.histplot(x=columnName, data=dataset, bins=np.arange(0, 100, 5))
    plt.title(string.capwords(columnName) + " " + "distribution")

In [253]:
def plotPieChart(data, labels, title, color=None):
    
    
    fig1, ax1 = plt.subplots()
    if (color == None):
        plt.style.use('seaborn-pastel')
    ax1.pie(data,
            labels=labels,
            autopct="%.1f%%",
            startangle=90,
            colors=color,
            pctdistance=0.85)

    fig = plt.gcf()
    ax1.axis('equal')
    plt.tight_layout()
    plt.title(title)

In [254]:
def plotBarChartByAgeRange(dataset, columnName, label, title):
    ageRange = list(range(15, 95, 5))

    plt.figure(figsize=(18, 25))
    plt.subplot(3, 2, 1)
    dataset.groupby(pd.cut(dataset.age, ageRange))[columnName].mean().plot.bar()
    plt.ylabel(label)
    plt.title(title)


## Dataset reading

In [255]:
fullDataset = pd.read_csv('bank-full.csv', sep=';')

### Resampling Imbalanced Dataset

In [269]:
X = fullDataset.drop('y',axis=1)
Y = fullDataset['y']

xData,xdataa,yData,ydataa = train_test_split(X,Y,train_size=0.22,stratify=Y)

yData=pd.DataFrame(yData,columns=['y'])
dataset = xData

## Features Analysis

### Find the numeric outliers so that we can drop them when we get there

### Age

### Balance

In [258]:
# VAMOS DAR DROP A TODAS AS LINHAS COM BALANCE NEGATIVO E VAMOS TAMBEM DAR DROP A LINHAS COM BALANCE MUITO ALTOS
dataset.drop(dataset[(dataset['balance']>40000)|(dataset['balance']<0)].index,inplace=True,axis=0)

# Remover outliers da duration apos analise do boxplot

In [259]:
#Remove data where duration is bigger than 2500
dataset.drop(dataset[dataset['duration']>2500].index,inplace=True,axis=0)

### Campaign

# Remover outliers da campaign apos analise do boxplot

In [260]:
#Remove data where campaign is bigger than 35
dataset.drop(dataset[dataset['campaign']>35].index,axis=0,inplace=True)

### Pdays

##Remover outlier pdays

In [261]:
#Remove all pdays data
dataset.drop("pdays",inplace=True,axis=1)

### Previous

# Remover outliers previous apos analise do boxplot

In [262]:
#Remove all data where previous is bigger than 30
dataset.drop(dataset[dataset['previous']>30].index,axis=0,inplace=True)

### PoutCome

## Stratified sampling


In [273]:

# Divide dataset between features e target


# Test and train must have similar loans value

dataset['binary'] = dataset['loan'].astype(str) + dataset['default'].astype(str)

x_train_set, x_test_set , y_train, y_test  = train_test_split(dataset,yData,train_size=0.8, random_state=0, stratify=dataset[['binary']])

# Remove binary column because it was only an aux for the split
x_train_set = x_train_set.drop("binary",axis=1)
x_test_set = x_test_set.drop("binary",axis=1)

## Enconding categorical data

### Pipeline

In [274]:
num_cat = ['age','balance','day','campaign','previous','duration']
ordinal_cat = ['month','contact','poutcome']
one_hot_cat = ['job','marital','education','default','housing','loan']

num_pipe = Pipeline([('scaler',StandardScaler())])

categorical_transformer = Pipeline([('onehot',OneHotEncoder(handle_unknown='ignore', sparse=False, drop='if_binary'))])

cat_pipe_ordinal = Pipeline([('ordinal_encoder',OrdinalEncoder())])

preprocessor = ColumnTransformer([('num_enc',num_pipe,num_cat),('cat_enc',categorical_transformer,one_hot_cat),('ord_enc',cat_pipe_ordinal,ordinal_cat)])

x_train_set = preprocessor.fit_transform(x_train_set)
x_test_set = preprocessor.fit_transform(x_test_set)

### Without pipeline

In [None]:
#Binary Encoding

list_bin = ['default','housing','loan']
for i in list_bin:
    dataset[i] = LabelBinarizer().fit_transform(dataset[i].values)

#Ordinal Encoding

list_ordinal=['month','contact','poutcome']
for i in list_ordinal:
    dataset[i]=LabelEncoder().fit_transform(dataset[i].values)

# Dummy Encoding

dataset = pd.get_dummies(dataset, columns= ['job','marital','education'])

# Reset Index and check dataset again

dataset = dataset.reset_index()
dataset.drop('index', axis=1, inplace=True)

dataset['binary'] = dataset['loan'].astype(str) + dataset['default'].astype(str)

x_train_set_np, x_test_set_np , y_train_set_np, y_test_set_np = train_test_split(dataset,datasetTarget,test_size=0.2, random_state=0, stratify=dataset[['binary']])

# Binary columns no longer needed
x_train_set_np = x_train_set_np.drop("binary",axis=1)
x_test_set_np = x_test_set_np.drop("binary",axis=1)


#Scaling

ss= StandardScaler()

x_train_set_np_org = ss.fit_transform(x_train_set_np)
x_test_set_np_org = ss.fit_transform(x_test_set_np)

x_train_set_np_sc = ss.fit_transform(x_train_set_np[['age','balance','day','duration','campaign','previous']])
x_test_set_np_sc = ss.transform(x_test_set_np[['age','balance','day','duration','campaign','previous']])


#### META 2 ####

In [277]:
from lazypredict.Supervised import LazyClassifier

clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
models,predictions = clf.fit(x_train_set,x_test_set,y_train,y_test)
models

100%|██████████| 29/29 [00:22<00:00,  1.27it/s]


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ExtraTreeClassifier,0.8,0.52,,0.8,0.05
QuadraticDiscriminantAnalysis,0.74,0.51,,0.76,0.05
DecisionTreeClassifier,0.77,0.51,,0.78,0.11
ExtraTreesClassifier,0.88,0.5,,0.83,0.85
LabelSpreading,0.79,0.5,,0.79,5.63
BaggingClassifier,0.87,0.5,,0.82,0.64
LabelPropagation,0.79,0.5,,0.79,4.97
RidgeClassifier,0.88,0.5,,0.83,0.08
RidgeClassifierCV,0.88,0.5,,0.83,0.07
SGDClassifier,0.88,0.5,,0.83,0.11


##SUPERVISED##