## Student Name: Brian Mortimer
## Student ID: 20258763

In [3]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn

from sklearn import manifold #needed for multidimensional scaling (MDS) and t-SNE
from sklearn import cluster #needed for k-Means clustering
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, FunctionTransformer, OneHotEncoder #needed for data preparation

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn import set_config

In [7]:
# Read the raw records from bank.csv (path is relative to the notebook's
# working directory) into the reference DataFrame.
original_df = pd.read_csv(filepath_or_buffer="bank.csv")

In [10]:
# Preview the first five rows of the raw data.
original_df.head(n=5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,subscribed
0,32.0,technician,single,tertiary,no,392,yes,no,cellular,1,apr,957,2,131,2,failure,no
1,39.0,technician,divorced,secondary,no,688,yes,yes,cellular,1,apr,233,2,133,1,failure,no
2,59.0,retired,married,secondary,no,1035,yes,yes,cellular,1,apr,126,2,239,1,failure,no
3,47.0,blue-collar,married,secondary,no,398,yes,yes,cellular,1,apr,274,1,238,2,failure,no
4,54.0,retired,married,secondary,no,1004,yes,no,cellular,1,apr,479,1,307,1,failure,no


In [18]:
# Print each column's dtype and non-null count (shows which columns have gaps).
original_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   age         1988 non-null   float64
 1   job         1990 non-null   object 
 2   marital     2000 non-null   object 
 3   education   1896 non-null   object 
 4   default     2000 non-null   object 
 5   balance     2000 non-null   int64  
 6   housing     2000 non-null   object 
 7   loan        2000 non-null   object 
 8   contact     1809 non-null   object 
 9   day         2000 non-null   int64  
 10  month       2000 non-null   object 
 11  duration    2000 non-null   int64  
 12  campaign    2000 non-null   int64  
 13  pdays       2000 non-null   int64  
 14  previous    2000 non-null   int64  
 15  poutcome    1546 non-null   object 
 16  subscribed  2000 non-null   object 
dtypes: float64(1), int64(6), object(10)
memory usage: 265.8+ KB


# Task 1: Data Preparation Pipeline

In [None]:
# Work on an independent deep copy so the raw frame stays unmodified.
df = original_df.copy(deep=True)

In [21]:
# Handle every column that original_df.info() reports as incomplete:
#   * job, education, contact, poutcome (categorical) -> explicit "unknown" level,
#     so a missing value is not silently encoded like the dropped one-hot baseline.
#   * age (numeric) -> column median, so no NaN reaches the scaler or the
#     clustering steps (which reject NaN input).
# Reassigning (instead of inplace=True) keeps the cell safe to re-run.
missing_value_fill = {
    "job": "unknown",
    "education": "unknown",
    "contact": "unknown",
    "poutcome": "unknown",
    "age": df["age"].median(),
}
df = df.fillna(missing_value_fill)

In [None]:
# Encode the cyclical calendar features (month, day) as sine/cosine pairs.

# Zero-based position of each month within the year.
month_map = {
    name: position
    for position, name in enumerate(
        ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
         'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
    )
}
months_in_year = 12

# Number of days in each month (February is fixed at 28).
months_days = dict(zip(month_map, [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]))

# Angle of each observation on the yearly / monthly circle. These are kept as
# stand-alone Series so no helper columns have to be added and dropped again.
month_angle = 2 * np.pi * df['month'].map(month_map) / months_in_year
day_angle = 2 * np.pi * df['day'] / df['month'].map(months_days)

# Append the four encoded columns; the original 'month' and 'day' columns stay.
df = df.assign(
    month_sin=np.sin(month_angle),
    month_cos=np.cos(month_angle),
    day_sin=np.sin(day_angle),
    day_cos=np.cos(day_angle),
)

In [24]:
# Preview the first five rows, now including the sine/cosine columns.
df.head(n=5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,...,duration,campaign,pdays,previous,poutcome,subscribed,month_sin,month_cos,day_sin,day_cos
0,32.0,technician,single,tertiary,no,392,yes,no,cellular,1,...,957,2,131,2,failure,no,1.0,6.123234000000001e-17,0.207912,0.978148
1,39.0,technician,divorced,secondary,no,688,yes,yes,cellular,1,...,233,2,133,1,failure,no,1.0,6.123234000000001e-17,0.207912,0.978148
2,59.0,retired,married,secondary,no,1035,yes,yes,cellular,1,...,126,2,239,1,failure,no,1.0,6.123234000000001e-17,0.207912,0.978148
3,47.0,blue-collar,married,secondary,no,398,yes,yes,cellular,1,...,274,1,238,2,failure,no,1.0,6.123234000000001e-17,0.207912,0.978148
4,54.0,retired,married,secondary,no,1004,yes,no,cellular,1,...,479,1,307,1,failure,no,1.0,6.123234000000001e-17,0.207912,0.978148


In [None]:
# Build the preprocessing pipeline: one-hot encode the categorical columns and
# min-max scale the numerical ones.
set_config(display='diagram')  # show estimators as diagrams in notebook output

# Column groups fed to the transformers.
cat_cols = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'poutcome']
num_cols = ['age', 'balance', 'duration', 'campaign', 'pdays', 'previous', "month_sin", "month_cos", "day_sin", "day_cos"]
# Raw calendar columns. They are deliberately not passed to any transformer,
# because their sine/cosine encodings are already part of num_cols.
cyclical_cols = ['day', 'month']

# Categorical branch. OneHotEncoder learns the category set during fit, so
# transform() always yields the same columns (pd.get_dummies wrapped in a
# FunctionTransformer is stateless and can produce different columns for a
# different frame). drop='first' removes one level per feature as before.
cat_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(drop='first'))
])

# Numerical branch: rescale every numeric feature to the [0, 1] range.
num_transformer = Pipeline(steps=[
    ('scaler', MinMaxScaler())
])

# Route each column group to its branch.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', cat_transformer, cat_cols),
        ('num', num_transformer, num_cols)
    ],
    remainder='drop',      # columns not listed above are excluded from the output
    sparse_threshold=0,    # always return a dense array, even though the encoder emits sparse
)

# Full pipeline. No extra scaler follows the preprocessor: the one-hot columns
# are already 0/1 and the numeric columns are already min-max scaled, so a
# second MinMaxScaler would not change any value.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)
])

In [15]:
# Learn the preprocessing parameters from df, then apply them to the same frame.
# fit() returns the fitted pipeline itself, so the two calls can be chained.
X_transformed = pipeline.fit(df).transform(df)

# Task 2: k-Means

# Task 3: DBSCAN

# Conclusion