In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
"""
About this file
Attribute Information:
Input variables (Feature):

Numeric:

1-Id
2-Age
3-Balance: The bank balance of the customer.
4-Day_of_Month: The day of the month on which communication took place
5-Duration: States the duration of the call (in sec) regarding the campaign. This is an important
feature as this attribute highly affects the target label (e.g., if duration=0 then y='no'). Thus, this input will be included for benchmark purposes and will be discarded if the intention is to have a realistic predictive model.
6-Num_contacts_in_campaign: Number of contacts performed during the campaign with the client for subscription
7-Days_since_prev_campaign_contact: Gap (in number of days) between two contacts
8-Num_contacts_prev_campaign: Number of contacts performed for promoting the campaign beforehand

Categorical:
9-Job_type
10-Marital: States the marital status of the customer
11-Education
12-Default: Is the customer a defaulter or not?
{'yes', 'no', 'unknown'}
13-Housing_loan: Has a housing loan or not?
14-Personal_loan: Has a personal loan or not?
15-Communication_type: Mode of communication during the campaign
16-Month: Month in which contact with the customer took place for the campaign
17-Prev_campaign_outcome: The outcome of promotional contact with the client beforehand for attending/interest in the campaign

Output variable (Label):
18-Term_deposit_subscribed: Categorical target label to be predicted {'0', '1'}
"""

"\nAbout this file\nAttribute Information:\nInput variables (Feature):\n\nNumeric:\n\n1-Id\n2-Age\n3-Balance: The bank balance of the customer.\n4-Day_of_Month: The day of the month on which communication took place\n5-Duration: States the duration of the call (in sec) regarding the campaign. This is an important\nfeature as this attribute highly affects the target label (e.g., if duration=0 then y='no'). Thus, this input will be included for benchmark purposes and will be discarded if the intention is to have a realistic predictive model.\n6-Num_contacts_in_campaign: Number of contacts performed during the campaign with the client for subscription\n7-Days_since_prev_campaign_contact: Gap (in number of days) between two contacts\n8-Num_contacts_prev_campaign: Number of contacts performed for promoting the campaign beforehand\n\nCategorical:\n9-Job_type\n10-Marital: States the marital status of the customer\n11-Education\n12-Default: Is the customer a defaulter or not?\n{'yes', 'no', 'u

In [3]:
# Load the training workbook into a DataFrame and print its column summary.
# Reading an .xlsx file through pandas needs the optional 'openpyxl' package.
path1 = 'Train.xlsx'
train_data = pd.read_excel(io=path1)
train_data.info()

ImportError: Missing optional dependency 'openpyxl'.  Use pip or conda to install openpyxl.

In [None]:
# Collect rows that exactly repeat an earlier row (the first occurrence is not flagged)
# and report the shape of that subset.
is_repeat = train_data.duplicated(keep='first')
duplicate = train_data.loc[is_repeat]
print("DUPLICATE ROWS IN TRAINING DATASET:", duplicate.shape)

In [None]:
# Count the missing values in every column.
train_data.isna().sum()

In [None]:
# Remove the 'id' column: it is only a row identifier and does not contribute
# to the target variable, so it is dropped before any further processing.
train_data.drop(columns='id', inplace=True)

In [None]:
# Remove 'days_since_prev_campaign_contact': about 80% of its values are null,
# then display the resulting frame.
train_data.drop(columns=['days_since_prev_campaign_contact'], inplace=True)
train_data

In [None]:
# Re-count the missing values per column after the column drops.
train_data.isna().sum()

In [None]:
# Missing-value counts restricted to the object-dtype columns.
obj = train_data.select_dtypes(include=['object'])
obj.isnull().sum()

In [None]:
# Most frequent value(s) of 'marital'; mode() returns a Series, which is displayed.
marital_column = train_data['marital']
x = marital_column.mode()
x

In [None]:
# FILLING NULL VALUES WITH MODE IN marital
# `x` is the mode Series computed in the previous cell; its first entry is used.
# The filled column is assigned back rather than calling fillna(inplace=True) on
# the `train_data['marital']` selection: that chained in-place form raises a
# FutureWarning in pandas 2.x and leaves train_data unchanged under Copy-on-Write.
train_data['marital'] = train_data['marital'].fillna(x.iloc[0])

In [None]:
# Most frequent value(s) of 'personal_loan'; mode() returns a Series, which is displayed.
loan_column = train_data['personal_loan']
x = loan_column.mode()
x

In [None]:
# FILLING NULL VALUES WITH MODE IN personal_loan
# `x` is the mode Series computed in the previous cell; its first entry is used.
# The filled column is assigned back rather than calling fillna(inplace=True) on
# the `train_data['personal_loan']` selection: that chained in-place form raises a
# FutureWarning in pandas 2.x and leaves train_data unchanged under Copy-on-Write.
train_data['personal_loan'] = train_data['personal_loan'].fillna(x.iloc[0])

In [None]:
# Missing-value counts restricted to the float64 columns.
float_cols = train_data.select_dtypes(include=['float64'])
float_cols.isnull().sum()

In [None]:
# Box plots of four numeric features, one panel per feature.
fig, axes = plt.subplots(1, 4, figsize=(28, 7), sharey=True)
boxplot_specs = [
    ('customer_age', 'r'),
    ('balance', 'b'),
    ('num_contacts_prev_campaign', 'y'),
    ('last_contact_duration', 'cyan'),
]
for panel, (column, colour) in zip(axes, boxplot_specs):
    # Pass the Series as x= explicitly. A positional Series is read as `x` by
    # seaborn <= 0.11 but as `data` by seaborn >= 0.12, where the boxes become
    # vertical and sharey=True forces four unrelated value scales onto one axis.
    sns.boxplot(x=train_data[column], ax=panel, color=colour)

In [None]:
# Outlier removal with an IQR-style fence on the numeric columns.
# NOTE: despite their names, Q1 and Q3 hold the 10th and 90th percentiles (not the
# quartiles), so IQR is the 10-90 percentile spread and the fences are wider than
# the classic 1.5 * IQR rule.
# The quantiles and comparisons are restricted to numeric columns: since pandas 2.0
# DataFrame.quantile defaults to numeric_only=False and fails on the object-dtype
# columns that are still present in train_data at this point.
numeric_data = train_data.select_dtypes(include='number')
Q1 = numeric_data.quantile(0.10)
Q3 = numeric_data.quantile(0.90)
IQR = Q3 - Q1

# A row is flagged when any numeric value lies outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
# Comparisons against NaN are False, so missing values never flag a row.
is_outlier = ((numeric_data < (Q1 - 1.5 * IQR)) | (numeric_data > (Q3 + 1.5 * IQR))).any(axis=1)

# .copy() makes the filtered frame independent of the original so that later
# column assignments do not emit SettingWithCopyWarning.
# NOTE(review): every numeric column is screened, including a numeric target
# label if one is present -- confirm that this is intended.
train_data = train_data[~is_outlier].copy()

In [None]:
# Box plots of the same four numeric features, drawn again after the outlier filter.
fig, axes = plt.subplots(1, 4, figsize=(28, 7), sharey=True)
boxplot_specs = [
    ('customer_age', 'r'),
    ('balance', 'b'),
    ('num_contacts_prev_campaign', 'y'),
    ('last_contact_duration', 'cyan'),
]
for panel, (column, colour) in zip(axes, boxplot_specs):
    # Pass the Series as x= explicitly. A positional Series is read as `x` by
    # seaborn <= 0.11 but as `data` by seaborn >= 0.12, where the boxes become
    # vertical and sharey=True forces four unrelated value scales onto one axis.
    sns.boxplot(x=train_data[column], ax=panel, color=colour)

In [None]:
# Median of 'customer_age' (used as the fill value for its missing entries).
age_column = train_data['customer_age']
x = age_column.median()
x

In [None]:
# FILLING NULL VALUES IN customer_age COLUMN WITH MEDIAN
# `x` is the median computed in the previous cell.
# The filled column is assigned back rather than calling fillna(inplace=True) on
# the `train_data['customer_age']` selection: that chained in-place form raises a
# FutureWarning in pandas 2.x and leaves train_data unchanged under Copy-on-Write.
train_data['customer_age'] = train_data['customer_age'].fillna(x)

In [None]:
# Median of 'balance' (used as the fill value for its missing entries).
balance_column = train_data['balance']
x = balance_column.median()
x

In [None]:
# FILLING NULL VALUES IN balance COLUMN WITH MEDIAN
# `x` is the median computed in the previous cell.
# The filled column is assigned back rather than calling fillna(inplace=True) on
# the `train_data['balance']` selection: that chained in-place form raises a
# FutureWarning in pandas 2.x and leaves train_data unchanged under Copy-on-Write.
train_data['balance'] = train_data['balance'].fillna(x)

In [None]:
# Median of 'last_contact_duration' (used as the fill value for its missing entries).
duration_column = train_data['last_contact_duration']
x = duration_column.median()
x

In [None]:
# FILLING NULL VALUES IN last_contact_duration COLUMN WITH MEDIAN
# `x` is the median computed in the previous cell.
# The filled column is assigned back rather than calling fillna(inplace=True) on
# the `train_data['last_contact_duration']` selection: that chained in-place form
# raises a FutureWarning in pandas 2.x and leaves train_data unchanged under
# Copy-on-Write.
train_data['last_contact_duration'] = train_data['last_contact_duration'].fillna(x)

In [None]:
# Median of 'num_contacts_in_campaign' (used as the fill value for its missing entries).
contacts_column = train_data['num_contacts_in_campaign']
x = contacts_column.median()
x

In [None]:
# FILLING NULL VALUES IN num_contacts_in_campaign COLUMN WITH MEDIAN AS MEAN VALUE CAN BE INFLUENCED BY OUTLIER, BUT MEDIAN WILL NOT
# `x` is the median computed in the previous cell.
# The filled column is assigned back rather than calling fillna(inplace=True) on
# the `train_data['num_contacts_in_campaign']` selection: that chained in-place form
# raises a FutureWarning in pandas 2.x and leaves train_data unchanged under
# Copy-on-Write.
train_data['num_contacts_in_campaign'] = train_data['num_contacts_in_campaign'].fillna(x)

In [None]:
# Count the missing values per column after the fills above.
train_data.isna().sum()

In [None]:
# Create the mid_month column (1 when day_of_month is greater than 15, otherwise 0)
# and delete the day_of_month column it was derived from.
# The flag is computed with a vectorised comparison instead of a per-row lambda;
# a missing day compares False and therefore maps to 0, exactly as the lambda did.
# assign/drop build a new frame, so no column is written into a filtered slice.
train_data = (
    train_data
    .assign(mid_month=(train_data['day_of_month'] > 15).astype('int64'))
    .drop(columns='day_of_month')
)

train_data

In [None]:
# Frequency of each value in 'communication_type'.
communication_column = train_data["communication_type"]
communication_column.value_counts()

In [None]:

# Remove 'communication_type': "cellular" dominates the column (64%) and the
# feature is assumed not to be correlated with what is being studied.
train_data.drop(columns='communication_type', inplace=True)

In [None]:
# Count the missing values per column once more after the column changes.
train_data.isna().sum()

In [None]:
# Missing-value counts for the object-dtype columns that remain.
obj = train_data.select_dtypes(include=['object'])
obj.isnull().sum()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Names of the object-dtype columns that will be label-encoded, plus the encoder
# instance used for them.
obj_cols = train_data.select_dtypes(include=['object']).columns
le = LabelEncoder()

In [None]:
# Replace every object-dtype column with integer codes.
# `le` is re-fitted on each column, so its classes_ are overwritten on the next
# iteration. The label -> code mapping is therefore recorded per column right
# after fitting, so the encoding can be decoded or re-applied to other data later.
label_mappings = {}
for col in obj_cols:
    train_data[col] = le.fit_transform(train_data[col])
    label_mappings[col] = {label: code for code, label in enumerate(le.classes_)}

train_data.head(10)

In [None]:
# Persist the cleaned frame twice, as CSV and as an Excel workbook, without the index.
train_data.to_csv(path_or_buf='train-clean.csv', index=False)
train_data.to_excel(excel_writer='train-clean.xlsx', index=False)

In [None]:
# Scores to compare; the values are hard-coded here rather than computed in this notebook.
algorithms = ['Neural Networks', 'Decision Tree', 'KNN']
accuracy_scores = [0.9164726824592633, 0.893057647542106, 0.8940161577433932]
precision_scores = [0.82, 0.88, 0.75]

# Accuracy as bars and precision as a marked line, drawn on a single Axes.
score_fig, score_ax = plt.subplots(figsize=(8, 6))
score_ax.bar(algorithms, accuracy_scores, label='Accuracy')
score_ax.plot(algorithms, precision_scores, marker='o', label='Precision')

# Axis labels, title and legend.
score_ax.set_xlabel('Algorithms')
score_ax.set_ylabel('Scores')
score_ax.set_title('Performance Comparison of Different Algorithms')
score_ax.legend()

# Render the figure.
plt.show()