### Dataset description
link: https://archive.ics.uci.edu/dataset/222/bank+marketing

*The data is related with direct marketing campaigns of a Portuguese banking institution. \
The marketing campaigns were based on phone calls.* 

### Notebook overview

- Data analysis
- Data preparation
- Dimensionality reduction techniques

_______

### Imports and auxiliary functions

In [3]:
# imports

import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.manifold import TSNE
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings("ignore")

In [5]:
# constants

# colour palette used by the plotting helpers; traces pick entries by index
PRETTY_COLOURS = ('#2934ff', '#c404b1', '#00129c', '#6300bf')
# background colour passed as plot_bgcolor by the plotting helpers
BG_COLOUR = '#efebf0'

# functions

def calc_tagert_feat_perc_of_no(df, in_feature, feature_cat):
    '''
    Calculates the percentage of the negative class ("no" value) of the
    target variable "y" among the rows that belong to one category of a feature.
    Example: percent of "no" in the target for feature "job", category "unemployed".

    Returns the percentage rounded to 2 decimal places.
    '''
    in_category = df[in_feature] == feature_cat
    is_negative = df.y == 'no'

    n_in_category = df.loc[in_category, in_feature].count()
    n_negative = df.loc[in_category & is_negative, in_feature].count()

    share_of_no = n_negative / n_in_category
    return round(share_of_no * 100, 2)

def calc_perc_of_cat_in_feat(df, feature_name, cat_name):
    '''
    Calculates the percentage of rows in `df` whose `feature_name` value
    equals `cat_name`.

    Parameters:
        df: DataFrame that contains the feature column.
        feature_name: name of the column to inspect.
        cat_name: category value to count.

    Returns the percentage (0-100) rounded to 2 decimal places.
    '''
    # Build the mask from the frame that was passed in (not from a global),
    # so the function works for any DataFrame.
    matches = (df[feature_name] == cat_name).sum()

    return round(matches / len(df[feature_name]) * 100, 2)

# helper functions



def plot_num_feature(feature_name, df=None):
    '''
    Plots three side-by-side views of a numerical feature: a histogram,
    a boxplot, and violin plots split by the target variable "y"
    ("no" vs "yes").

    Parameters:
        feature_name: name of the numerical column to plot.
        df: DataFrame holding the feature and the target column "y".
            Defaults to the notebook-level `data` frame, so existing
            single-argument calls keep working.
    '''
    if df is None:
        df = data

    feature_name_cap = feature_name.capitalize()
    is_no = df.y == 'no'
    is_yes = df.y == 'yes'

    fig = make_subplots(rows=1, cols=3,
                        subplot_titles=[f'{feature_name_cap} histogram',
                                        f'{feature_name_cap} boxplot',
                                        f'Target variable / {feature_name_cap}'])

    fig.add_trace(go.Histogram(x=df[feature_name],
                               marker_color=PRETTY_COLOURS[0]),
                  row=1, col=1)

    fig.add_trace(go.Box(y=df[feature_name],
                         marker_color=PRETTY_COLOURS[1], name=''),
                  row=1, col=2)

    # one violin per target class, drawn in the same subplot
    fig.add_trace(go.Violin(x=df.y[is_no],
                            y=df[feature_name][is_no],
                            line_color=PRETTY_COLOURS[2]), row=1, col=3)

    fig.add_trace(go.Violin(x=df.y[is_yes],
                            y=df[feature_name][is_yes],
                            line_color=PRETTY_COLOURS[3]), row=1, col=3)

    fig.update_xaxes(title_text=feature_name_cap, row=1, col=1)
    fig.update_xaxes(title_text=feature_name_cap, row=1, col=2)
    fig.update_xaxes(title_text='Subscribed a term deposit', row=1, col=3)

    fig.update_yaxes(title_text='Count', title_standoff=0, row=1, col=1)
    # the boxplot's y-axis shows the feature's values, not a count
    fig.update_yaxes(title_text=feature_name_cap, title_standoff=0, row=1, col=2)
    fig.update_yaxes(title_text=feature_name_cap, title_standoff=0, row=1, col=3)

    fig.update_layout(plot_bgcolor=BG_COLOUR,
                      showlegend=False)

    fig.show()


def plot_feature_target_hist(df, feature_name):
    '''
    Plots a histogram of a feature, coloured by the target variable "y".
    '''
    # the first two palette entries are offered as the colour sequence
    target_palette = PRETTY_COLOURS[:2]

    histogram = px.histogram(df[feature_name], color=df['y'], color_discrete_sequence=target_palette)
    histogram.update_layout(plot_bgcolor=BG_COLOUR)

    histogram.show()

### Exploratory Data Analysis

#### Overview

In [15]:
# read the semicolon-separated CSV and drop exact duplicate rows in one step
data = pd.read_csv('../datasets/bank-additional-full.csv', sep=';').drop_duplicates()

data.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [13]:
# dtype and non-null count for every column
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 41176 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41176 non-null  int64  
 1   job             41176 non-null  object 
 2   marital         41176 non-null  object 
 3   education       41176 non-null  object 
 4   default         41176 non-null  object 
 5   housing         41176 non-null  object 
 6   loan            41176 non-null  object 
 7   contact         41176 non-null  object 
 8   month           41176 non-null  object 
 9   day_of_week     41176 non-null  object 
 10  duration        41176 non-null  int64  
 11  campaign        41176 non-null  int64  
 12  pdays           41176 non-null  int64  
 13  previous        41176 non-null  int64  
 14  poutcome        41176 non-null  object 
 15  emp.var.rate    41176 non-null  float64
 16  cons.price.idx  41176 non-null  float64
 17  cons.conf.idx   41176 non-null  floa

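At first glance there are no missing values: every column reports 41176 non-null entries. Note, however, that some categorical features contain the label `unknown` (e.g. `default` in the preview above), which may stand in for missing data.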

In [18]:
# describe numerical features
# (count, mean, std, min, quartiles and max for each numeric column)

data.describe(include='number')

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
count,41176.0,41176.0,41176.0,41176.0,41176.0,41176.0,41176.0,41176.0,41176.0,41176.0
mean,40.0238,258.315815,2.567879,962.46481,0.173013,0.081922,93.57572,-40.502863,3.621293,5167.03487
std,10.42068,259.305321,2.770318,186.937102,0.494964,1.570883,0.578839,4.62786,1.734437,72.251364
min,17.0,0.0,1.0,0.0,0.0,-3.4,92.201,-50.8,0.634,4963.6
25%,32.0,102.0,1.0,999.0,0.0,-1.8,93.075,-42.7,1.344,5099.1
50%,38.0,180.0,2.0,999.0,0.0,1.1,93.749,-41.8,4.857,5191.0
75%,47.0,319.0,3.0,999.0,0.0,1.4,93.994,-36.4,4.961,5228.1
max,98.0,4918.0,56.0,999.0,7.0,1.4,94.767,-26.9,5.045,5228.1


In [19]:
# describe categorical features
# (count, number of unique values, most frequent value and its frequency)

data.describe(include='object')

Unnamed: 0,job,marital,education,default,housing,loan,contact,month,day_of_week,poutcome,y
count,41176,41176,41176,41176,41176,41176,41176,41176,41176,41176,41176
unique,12,4,8,3,3,3,2,10,5,3,2
top,admin.,married,university.degree,no,yes,no,cellular,may,thu,nonexistent,no
freq,10419,24921,12164,32577,21571,33938,26135,13767,8618,35551,36537


#### Target variable analysis

`y`: has the client subscribed to a term deposit? (binary: "yes", "no")