## Instituto Politécnico de Coimbra
## Instituto Superior de Engenharia de Coimbra
## Mestrado em Engenharia Informática - Machine Learning
## Elaborado por:

# André Proença 2016018783

# Isabel Castro 2018013160

## DATA SET ORIGINAL
## [https://archive.ics.uci.edu/ml/datasets/Bank+Marketing](https://archive.ics.uci.edu/ml/datasets/Bank+Marketing)

In [None]:
import seaborn as sea
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import string
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import random

## Used functions

In [None]:
def randcolor(number):
    """Return ``number`` random colours as six-digit lowercase hex strings.

    Each colour comes from one ``random.randint`` draw over 0..0xFFFFFF and is
    zero-padded to six digits. The strings carry no leading ``#``.
    """
    return ["%06x" % random.randint(0, 0xFFFFFF) for _ in range(number)]

In [None]:
def plotFigureBoxPlot(columnName,dataset):
    """Draw a green seaborn box plot of one column on a new 6x9 figure.

    Args:
        columnName: Name of the column in ``dataset`` to plot on the x axis.
        dataset: Data container handed to ``seaborn.boxplot`` as ``data``.
    """
    plt.figure(figsize=(6, 9))
    sea.boxplot(data=dataset, x=columnName, color='green')
    # Axis label and title are set after plotting so they override the defaults.
    plt.xlabel("{}".format(columnName), size=15)
    plt.title("Boxplot of {}" .format(columnName), size=20, color="red")

In [None]:
def numericAnalysis(columnName, dataframe):
    """Print basic descriptive statistics for one column of a DataFrame.

    Prints the mean, mode, median, variance, standard deviation and the
    0/25/50/75/100 percentiles. The mode and the percentiles are printed as
    pandas Series.

    Args:
        columnName: Name of the column to analyse.
        dataframe: DataFrame that contains ``columnName``.
    """
    item = dataframe[columnName]

    print("Mean:\t", item.mean())
    print("Mode:\t", item.mode())
    print("Median:\t", item.median())
    print("Variance:\t", item.var())
    print("Std deviation:\t", item.std())
    # The label lists exactly the quantiles requested below (min and max included).
    print("Percentiles (0, 25, 50, 75, 100):\t", item.quantile([0, 0.25, 0.5, 0.75, 1]))

In [None]:
def plotBarChart(dataset, columnName):
    """Plot the distribution of one column as a seaborn histogram.

    Columns whose dtype is not ``object`` also get a KDE curve; ``object``
    columns are drawn without one. Bin edges run from 0 to 95 in steps of 5.

    Args:
        dataset: Data container handed to ``seaborn.histplot`` as ``data``.
        columnName: Name of the column to plot; also used in the title.
    """
    sea.set(style='whitegrid', palette="bright", font_scale=1.1, rc={"figure.figsize": [15, 10]})
    # Only non-object columns get the density overlay.
    withDensity = dataset[columnName].dtype != 'object'
    sea.histplot(x=columnName, data=dataset, bins=np.arange(0, 100, 5), kde=withDensity)
    plt.title(string.capwords(columnName) + " " + "distribution")

In [None]:
def plotPieChart(data, labels, title, color=None):
    """Draw a pie chart with percentage annotations on a new figure.

    Args:
        data: Wedge sizes passed to ``Axes.pie``.
        labels: One label per wedge.
        title: Title placed on the chart.
        color: Optional sequence of wedge colours. When omitted, a pastel
            matplotlib style sheet is activated and its colours are used.
    """
    fig1, ax1 = plt.subplots()
    # Identity test: `color == None` is evaluated element-wise for array-like
    # colours and then fails when used as a condition.
    if color is None:
        # matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*';
        # fall back to the old name on releases that do not know the new one.
        pastelStyle = 'seaborn-v0_8-pastel'
        if pastelStyle not in plt.style.available:
            pastelStyle = 'seaborn-pastel'
        plt.style.use(pastelStyle)
    ax1.pie(data,
            labels=labels,
            autopct="%.1f%%",
            startangle=90,
            colors=color,
            pctdistance=0.85)

    # Equal aspect ratio keeps the pie circular.
    ax1.axis('equal')
    plt.tight_layout()
    plt.title(title)

In [None]:
def plotBarChartByAgeRange(dataset, columnName, label, title):
    """Bar-plot the mean of a column per five-year age bin.

    Ages are binned with edges 15, 20, ..., 90 via ``pd.cut`` on
    ``dataset.age``; the chart is drawn in the first cell of a 3x2 subplot
    grid on a new 18x25 figure.

    Args:
        dataset: DataFrame with an ``age`` column and ``columnName``.
        columnName: Column whose per-bin mean is plotted.
        label: Text for the y axis.
        title: Title of the chart.
    """
    plt.figure(figsize=(18, 25))
    plt.subplot(3, 2, 1)

    ageBins = pd.cut(dataset.age, list(range(15, 95, 5)))
    meanPerBin = dataset.groupby(ageBins)[columnName].mean()
    meanPerBin.plot.bar()

    plt.title(title)
    plt.ylabel(label)


## Dataset reading

In [None]:
# Load the raw data; the file uses ';' as its field separator.
fullDataset = pd.read_csv(filepath_or_buffer='bank-full.csv', sep=';')

### Resampling Imbalanced Dataset

In [None]:
# Separate the predictors from the target column 'y'.
X = fullDataset.drop('y', axis=1)
Y = fullDataset['y']

# Keep a 22% subsample that preserves the class proportions of Y.
# random_state pins the draw so reruns of the notebook work on the same rows,
# matching the seeded splits used further down.
xData, xdataa, yData, ydataa = train_test_split(
    X, Y, train_size=0.22, stratify=Y, random_state=0
)

yData = pd.DataFrame(yData, columns=['y'])
# `dataset` is the same object as xData, so the in-place filters applied in
# the following cells modify xData as well.
dataset = xData

In [None]:
# Drop every row with a negative balance, and also rows with a very high balance (above 40000)
dataset.drop(
    index=dataset.loc[(dataset['balance'] < 0) | (dataset['balance'] > 40000)].index,
    inplace=True,
)

In [None]:
# Remove the rows whose duration is greater than 2500
dataset.drop(index=dataset.loc[dataset['duration'] > 2500].index, inplace=True)

In [None]:
# Remove the rows whose campaign value is greater than 35
dataset.drop(index=dataset.loc[dataset['campaign'] > 35].index, inplace=True)

In [None]:
# Remove the whole pdays column
dataset.drop(columns="pdays", inplace=True)

In [None]:
# Remove the rows whose previous value is greater than 30
dataset.drop(index=dataset.loc[dataset['previous'] > 30].index, inplace=True)

## Stratified sampling


In [None]:

# Divide the data into train and test partitions (features and target).

# Train and test must have similar loan/default proportions, so the split is
# stratified on the combination of both columns.
dataset['binary'] = dataset['loan'].astype(str) + dataset['default'].astype(str)

# Rows were removed from `dataset` by the filtering cells but not from `yData`;
# select the target rows by index so both inputs have the same length and order.
x_train_set, x_test_set, y_train, y_test = train_test_split(
    dataset,
    yData.loc[dataset.index],
    train_size=0.8,
    random_state=0,
    stratify=dataset[['binary']],
)

# Remove the binary column because it was only an auxiliary key for the split
x_train_set = x_train_set.drop("binary", axis=1)
x_test_set = x_test_set.drop("binary", axis=1)

## Encoding categorical data

### Pipeline

In [None]:
# Column groups, one per kind of preprocessing.
num_cat = ['age','balance','day','campaign','previous','duration']
ordinal_cat = ['month','contact','poutcome']
one_hot_cat = ['job','marital','education','default','housing','loan']

# Numeric columns are standardised.
num_pipe = Pipeline([('scaler',StandardScaler())])

# NOTE(review): `sparse` was renamed to `sparse_output` in scikit-learn 1.2;
# switch the keyword once the minimum supported version allows it.
categorical_transformer = Pipeline([('onehot',OneHotEncoder(handle_unknown='ignore', sparse=False, drop='if_binary'))])

cat_pipe_ordinal = Pipeline([('ordinal_encoder',OrdinalEncoder())])

preprocessor = ColumnTransformer([('num_enc',num_pipe,num_cat),('cat_enc',categorical_transformer,one_hot_cat),('ord_enc',cat_pipe_ordinal,ordinal_cat)])

# Learn scaling statistics and category mappings from the training data only,
# then apply the fitted transformation to the test data. Re-fitting on the
# test set would leak test information and could encode categories differently.
x_train_set = preprocessor.fit_transform(x_train_set)
x_test_set = preprocessor.transform(x_test_set)

### Without pipeline

In [None]:
# Binary encoding: two-valued columns become a single 0/1 column.
list_bin = ['default','housing','loan']
for i in list_bin:
    # LabelBinarizer returns a column vector; flatten it before assignment.
    dataset[i] = LabelBinarizer().fit_transform(dataset[i].values).ravel()

# Ordinal encoding: each category is replaced by an integer code.
list_ordinal=['month','contact','poutcome']
for i in list_ordinal:
    dataset[i]=LabelEncoder().fit_transform(dataset[i].values)

# Dummy encoding
dataset = pd.get_dummies(dataset, columns= ['job','marital','education'])

# `datasetTarget` is not assigned in any cell shown above; build it from yData,
# keeping only the rows still present in `dataset`, before the index is reset.
datasetTarget = yData.loc[dataset.index].reset_index(drop=True)

# Reset the index so features and target are both numbered 0..n-1.
dataset = dataset.reset_index(drop=True)

# Auxiliary stratification key combining the encoded loan and default flags.
dataset['binary'] = dataset['loan'].astype(str) + dataset['default'].astype(str)

x_train_set_np, x_test_set_np , y_train_set_np, y_test_set_np = train_test_split(dataset,datasetTarget,test_size=0.2, random_state=0, stratify=dataset[['binary']])

# Binary column no longer needed
x_train_set_np = x_train_set_np.drop("binary",axis=1)
x_test_set_np = x_test_set_np.drop("binary",axis=1)


# Scaling

ss= StandardScaler()

# Fit on the training partition only and reuse those statistics for the test
# partition, so no test information leaks into the scaling.
x_train_set_np_org = ss.fit_transform(x_train_set_np)
x_test_set_np_org = ss.transform(x_test_set_np)

# Same scaler object, re-fitted on the numeric columns only.
x_train_set_np_sc = ss.fit_transform(x_train_set_np[['age','balance','day','duration','campaign','previous']])
x_test_set_np_sc = ss.transform(x_test_set_np[['age','balance','day','duration','campaign','previous']])


## Unsupervised Learning Clustering


### KMeans

In [None]:
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans

# Record the within-cluster sum of squares (inertia) for k = 1..numOfClusters.
numOfClusters = 10
sse = [0] * numOfClusters
for i in range(0,numOfClusters):
    # `xTrainScaled` is not assigned in any cell shown above; the preprocessed
    # (scaled and encoded) training matrix in this notebook is `x_train_set`.
    kMeans = KMeans(n_clusters=i + 1, random_state=0).fit(x_train_set)
    sse[i] = kMeans.inertia_

sseNp = np.array(sse)
sseNp.sum(), sseNp

In [None]:
#Imprime o valor de precisão do algoritmo de teste
def testAccuracy(pred):
    count=0
    for i in range (len(pred)):
        if ytest[i]==pred[i]:
            count=count+1
    print("Accuracy:",count/len(pred)) 

In [None]:
# Pipeline with K-neighbours as the classifier
def pipelineKNN(function):
    """Fit a two-step pipeline and print its accuracy on the test data.

    The first step is the estimator returned by ``function()``; the second is
    a ``KNeighborsClassifier`` with 21 neighbours and a leaf size of 100.

    Args:
        function: Zero-argument callable that returns a fresh estimator for
            the first pipeline step (for example
            ``lambda: KMeans(n_clusters=2)``).
    """
    # Imported here because none of the cells shown imports this class.
    from sklearn.neighbors import KNeighborsClassifier

    pipeline = Pipeline([
        ("cluster", function()),
        ("KNeighbors", KNeighborsClassifier(n_neighbors=21, leaf_size=100)),
    ])
    # NOTE(review): xtrain, ytrain, xtest and ytest are read from notebook
    # globals that are not assigned in the cells shown here — confirm.
    pipeline.fit(xtrain, ytrain)
    print("Accuracy Pipeline:", pipeline.score(xtest, ytest))

In [None]:
# Five-cluster model fitted on the training data and applied to the test data.
kMeans = KMeans(n_clusters=5, random_state=0)
kMeans.fit(x_train_set)
yKmeans = kMeans.predict(x_test_set)
# The score was previously computed mid-cell and discarded; print it instead.
print("Silhouette score:", silhouette_score(x_test_set, yKmeans))


# Two-cluster model, seeded like the model above so reruns are reproducible.
km = KMeans(n_clusters=2, random_state=0)
km.fit(x_train_set)
pred = km.predict(x_test_set)
print("K-mean Scores\n-------------")
testAccuracy(pred)
pipelineKNN(lambda: KMeans(n_clusters=2, random_state=0))
    
    
    

In [None]:
from sklearn.metrics.cluster import adjusted_rand_score, normalized_mutual_info_score, homogeneity_score, completeness_score

# Cluster id assigned to each training sample by the most recent kMeans fit.
groups = kMeans.labels_

# `y` is not assigned in any cell shown above. The cluster labels belong to the
# training rows, so the matching ground truth is y_train, flattened to 1-D.
y = np.ravel(y_train)

print('Homogeneity: ', homogeneity_score(y, groups))
print('Completeness: ', completeness_score(y, groups))
print('ARI: ', adjusted_rand_score(y, groups))
print('NMI: ', normalized_mutual_info_score(y, groups))

### Hierarchical clustering - Agglomerative

In [None]:
def plot_dendrogram(model, **kwargs):
    """Build a linkage matrix from a fitted model and draw its dendrogram.

    Args:
        model: Fitted clustering model exposing ``children_``, ``distances_``
            and ``labels_``.
        **kwargs: Forwarded unchanged to ``dendrogram``.
    """
    merges = model.children_
    leafTotal = len(model.labels_)

    # Number of original samples gathered under each merge node.
    sizes = np.zeros(merges.shape[0])
    for row, pair in enumerate(merges):
        # Ids below leafTotal are single samples; larger ids refer to an
        # earlier merge, whose size has already been stored.
        sizes[row] = sum(
            1 if node < leafTotal else sizes[node - leafTotal]
            for node in pair
        )

    # Columns: the two merged children, the merge distance, the node size.
    columns = [merges, model.distances_, sizes]
    linkage_matrix = np.column_stack(columns).astype(float)

    dendrogram(linkage_matrix, **kwargs)

In [None]:
# Prints the accuracy of a set of predictions against the training labels
def trainAccuracy(pred, y_true=None):
    """Print the fraction of predictions that equal the training labels.

    Labels are compared position by position, so pandas objects with a
    non-default index are handled the same way as plain arrays.

    Args:
        pred: Predicted labels, one per training sample.
        y_true: Reference labels in the same order as ``pred``. When omitted,
            the notebook-level ``ytrain`` variable is used.

    Raises:
        ValueError: If there are fewer reference labels than predictions.
    """
    if y_true is None:
        # NOTE(review): `ytrain` is not assigned in the cells shown in this
        # notebook; confirm where it comes from or pass `y_true` explicitly.
        y_true = ytrain
    predicted = np.asarray(pred).ravel()
    expected = np.asarray(y_true).ravel()
    if expected.size < predicted.size:
        raise ValueError("fewer reference labels than predictions")
    if predicted.size == 0:
        # Nothing to score; avoid dividing by zero.
        print("Accuracy:", float("nan"))
        return
    hits = int(np.count_nonzero(expected[:predicted.size] == predicted))
    print("Accuracy:", hits / predicted.size)

In [None]:
from sklearn.cluster import AgglomerativeClustering

# No cluster count is fixed; merging is governed by distance_threshold=0.
aggClustering = AgglomerativeClustering(n_clusters=None, distance_threshold=0)
aggClustering.fit(x_train_set)
aggClustering.labels_

In [None]:
# Adapted from https://scikit-learn.org/stable/auto_examples/cluster/plot_agglomerative_dendrogram.html#sphx-glr-auto-examples-cluster-plot-agglomerative-dendrogram-py
# `dendrogram` is the name plot_dendrogram calls to do the drawing.
from scipy.cluster.hierarchy import dendrogram
plt.title("Hierarchical Clustering Dendrogram")
# truncate_mode="level" with p=2 is forwarded to dendrogram and limits how
# much of the tree is drawn (see the scipy dendrogram documentation).
plot_dendrogram(aggClustering, truncate_mode="level", p=2)
plt.xlabel("Number of points in node (or index of point if no parenthesis).")
plt.show()

In [None]:
# Two-cluster agglomerative model; fit_predict returns the cluster id of each
# training sample.
agg = AgglomerativeClustering(n_clusters=2)
pred = agg.fit_predict(X=x_train_set)
# The empty first item produces the blank line before the heading.
print("", "Agglomerative Clustering Scores\n-------------", sep="\n")
trainAccuracy(pred)

### MeanShift

In [None]:
from sklearn.cluster import MeanShift

# Fit MeanShift on the training data and assign the test samples to clusters.
ms = MeanShift(bandwidth=2).fit(x_train_set)
c = ms.predict(x_test_set)
# Display MeanShift's own results (training labels and test assignments);
# the cell previously ended by showing the agglomerative model's labels.
ms.labels_, c

### Birch

In [None]:
# Imported here because none of the cells shown imports Birch.
from sklearn.cluster import Birch

# Two-cluster Birch model fitted on the training data, applied to the test data.
b = Birch(n_clusters=2)
b.fit(x_train_set)
pred = b.predict(x_test_set)
print()
print("Birch Scores\n-------------")
testAccuracy(pred)
pipelineKNN(lambda: Birch(n_clusters=2))

### Spectral

In [None]:
# Imported here because none of the cells shown imports SpectralClustering.
from sklearn.cluster import SpectralClustering

# Five-cluster spectral model; fit_predict returns the cluster id of each
# training sample.
s = SpectralClustering(n_clusters=5)
pred = s.fit_predict(x_train_set)
print()
print("Spectral Clustering Propagation Scores\n-------------")
trainAccuracy(pred)