In [None]:
import seaborn as sns
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import string

## Helper functions

In [None]:
def plotFigureBoxPlot(columnName, title, dataframe):
    """Draw a seaborn box plot of one column and give it a title.

    Args:
        columnName: Name of the column in ``dataframe`` to plot.
        title: Text passed to ``plt.title`` for the current axes.
        dataframe: DataFrame that contains ``columnName``.
    """
    column_values = dataframe[columnName]
    # The values are laid out along the x axis.
    sns.boxplot(x=column_values)
    plt.title(title)

In [None]:
def features_analysis(columnName, dataframe):
    """Print summary statistics for one column of ``dataframe``.

    For a numeric column this prints measures of central tendency (mean,
    mode, median) and of variability (variance, standard deviation,
    quantiles, skewness, kurtosis). For a non-numeric column only the mode
    is printed, because the other measures are not defined for it.

    Args:
        columnName: Name of the column to summarise.
        dataframe: DataFrame that contains ``columnName``.

    Returns:
        None. All results are written to standard output.
    """
    item = dataframe[columnName]

    print("1. Quantitive measurement on Central tendency:")
    if not pd.api.types.is_numeric_dtype(item):
        # mean(), median(), var(), ... raise TypeError on text columns such
        # as "job" or "marital"; the mode is the only measure that applies.
        print("mode:\t", item.mode())
        print("\n(non-numeric column: mean, median and variability measures skipped)")
        return

    print("mean:\t", item.mean())
    print("mode:\t", item.mode())
    print("median:\t", item.median())

    print("\n2. Quantitive measurement on Variability:")
    print("variance:\t", item.var())
    print("std deviation:\t", item.std())
    # The label lists every quantile that is actually requested (min and max
    # included), not only the three quartiles.
    print("percentiles (0, 25, 50, 75, 100):\t", item.quantile([0, 0.25, 0.5, 0.75, 1]))
    print("skrewness:\t", item.skew())
    print("kurtosis:\t", item.kurtosis())

In [None]:
def plotBarChart(dataset, columnName, bins=None):
    """Plot the distribution of one column as a seaborn histogram.

    Args:
        dataset: DataFrame that contains ``columnName``.
        columnName: Column to plot; its capitalised name is used in the title.
        bins: Optional bin specification forwarded to ``sns.histplot``.
            When omitted, the edges ``np.arange(0, 100, 5)`` are used (0, 5,
            ..., 95). Numeric values outside those edges fall into no bin, so
            pass wider edges or ``"auto"`` for columns with a larger range.
    """
    if bins is None:
        # Historical default of this helper, kept so existing calls look the same.
        bins = np.arange(0, 100, 5)
    sns.set(style='whitegrid', palette="bright", font_scale=1.1, rc={"figure.figsize": [8, 5]})
    sns.histplot(x=columnName, data=dataset, bins=bins)
    plt.title(string.capwords(columnName) + " " + "distribution")


In [None]:
def plotPieChart(data, labels, title, color=None):
    """Draw a pie chart with percentage labels on a new figure.

    Args:
        data: Wedge sizes, one per label.
        labels: Wedge labels, in the same order as ``data``.
        title: Title set on the chart's axes.
        color: Optional sequence of wedge colors. When omitted, the pastel
            seaborn style sheet is activated (it stays active for later
            figures) and the chart uses the resulting color cycle.
    """
    if color is None:
        # Matplotlib 3.6 renamed its bundled seaborn styles to
        # "seaborn-v0_8-*" and newer releases reject the old names, so try
        # the new spelling first and fall back to the old one.
        for style_name in ("seaborn-v0_8-pastel", "seaborn-pastel"):
            if style_name in plt.style.available:
                # Applied before the axes exist: the color cycle is taken
                # from the settings in effect when the axes are created.
                plt.style.use(style_name)
                break

    _, ax1 = plt.subplots()
    ax1.pie(data,
            labels=labels,
            autopct="%.1f%%",
            startangle=90,
            colors=color,
            pctdistance=0.85)

    # Equal aspect ratio so the pie is drawn as a circle.
    ax1.axis('equal')
    # Set the title before tight_layout so the layout leaves room for it.
    ax1.set_title(title)
    plt.tight_layout()

## Dataset reading

In [None]:
# Load the semicolon-separated CSV into the DataFrame used by every cell below.
dataset = pd.read_csv('bank-full.csv', sep=';')

In [None]:
# Column names, dtypes and non-null counts.
dataset.info()

In [None]:
# Check for missing values: number of non-null entries per column.
dataset.count()

In [None]:
# Number of null values in each column.
dataset.isnull().sum()

In [None]:
# Number of NaN values in each column. pandas documents isnull() as an alias
# of isna(), so this should match the cell above.
dataset.isna().sum()

In [None]:
# Number of rows that duplicate an earlier row.
print(dataset.duplicated().sum())

In [None]:
# Summary statistics produced by pandas' describe().
dataset.describe()

In [None]:
# Frequency of each distinct value of the "y" column.
dataset.y.value_counts()

## Features Analysis

### Age

In [None]:
features_analysis("age", dataset)

In [None]:
plotBarChart(dataset, "age")

In [None]:
sns.boxplot(x=dataset["age"])
plt.title("Age quantiles")

### Job

In [None]:
features_analysis("job", dataset)

In [None]:
jobCounts = dataset["job"].value_counts()
jobCounts

In [None]:
jobCountsPercentage = dataset["job"].value_counts(normalize=True) * 100
jobCountsPercentage

In [None]:
plotBarChart(dataset, "job")

plotPieChart(jobCounts.tolist(), jobCounts.keys(), "job")

### Marital status

In [None]:
features_analysis("marital", dataset)

In [None]:
maritalCounts = dataset["marital"].value_counts()
maritalCounts

In [None]:
maritalCountsPercentage = dataset["marital"].value_counts(normalize=True) * 100
maritalCountsPercentage

In [None]:
plotBarChart(dataset, "marital")

plotPieChart(maritalCounts.tolist(), maritalCounts.keys(), "marital")

### Education

In [None]:
educationCounts = dataset["education"].value_counts()
educationCounts

In [None]:
educationCountsPercentage = dataset["education"].value_counts(normalize=True) * 100
educationCountsPercentage

In [None]:
plotBarChart(dataset, "education")

plotPieChart(educationCounts.tolist(), educationCounts.keys(), "education")

### Default

In [None]:
defaultCounts = dataset["default"].value_counts()
defaultCounts

In [None]:
defaultCountsPercentage = dataset["default"].value_counts(normalize=True) * 100
defaultCountsPercentage

In [None]:
plotBarChart(dataset, "default")

plotPieChart(defaultCounts.tolist(), defaultCounts.keys(), "default")

### Housing

In [None]:
defaultCounts = dataset["default"].value_counts()
defaultCounts

In [None]:
defaultCountsPercentage = dataset["default"].value_counts(normalize=True) * 100
defaultCountsPercentage

In [None]:
plotBarChart(dataset, "default")

plotPieChart(defaultCounts.tolist(), defaultCounts.keys(), "default")

### Loan

In [None]:
loanCounts = dataset["loan"].value_counts()
loanCounts

In [None]:
loanCountsPercentage = dataset["loan"].value_counts(normalize=True) * 100
loanCountsPercentage

In [None]:
plotBarChart(dataset, "loan")

plotPieChart(loanCounts.tolist(), loanCounts.keys(), "loan")

### Contact

In [None]:
contactCounts = dataset["contact"].value_counts()
contactCounts

In [None]:
contactCountsPercentage = dataset["contact"].value_counts(normalize=True) * 100
contactCountsPercentage

In [None]:
plotBarChart(dataset, "contact")

plotPieChart(contactCounts.tolist(), contactCounts.keys(), "contact")

### Month

In [None]:
monthCounts = dataset["month"].value_counts()
monthCounts

In [None]:
monthCountsPercentage = dataset["month"].value_counts(normalize=True) * 100
monthCountsPercentage

In [None]:
plotBarChart(dataset, "month")

plotPieChart(monthCounts.tolist(), monthCounts.keys(), "month")

### Day of week

In [None]:
dayOfWeekCounts = dataset["dayOfWeek"].value_counts()
dayOfWeekCounts

In [None]:
dayOfWeekCountsPercentage = dataset["dayOfWeek"].value_counts(normalize=True) * 100
dayOfWeekCountsPercentage

In [None]:
plotBarChart(dataset, "dayOfWeek")

plotPieChart(dayOfWeekCounts.tolist(), dayOfWeekCounts.keys(), "dayOfWeek")

### Last contact duration

In [None]:
durationCounts = dataset["duration"].value_counts()
durationCounts

In [None]:
durationCountsPercentage = dataset["duration"].value_counts(normalize=True) * 100
durationCountsPercentage

In [None]:
plotBarChart(dataset, "duration")

plotPieChart(durationCounts.tolist(), durationCounts.keys(), "duration")

### Y - has the client subscribed to a term deposit?

In [None]:
yCounts = dataset["y"].value_counts()
yCounts

In [None]:
yCountsPercentage = dataset["y"].value_counts(normalize=True) * 100
yCountsPercentage

In [None]:
plotBarChart(dataset, "y")

plotPieChart(yCounts.tolist(), yCounts.keys(), "y")

## Pipeline

### Reducing dataset

### Correlation analysis