# Data cleansing and transformation done on the data.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
# First of all, we import all the packages that are required in this porject. 
import numpy as np # linear algebra
import matplotlib.pyplot as plt
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
from scipy import stats
import datetime

In [2]:
data = pd.read_csv("bank-full.csv")

In [17]:
data.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


# I - Missing Values

In [18]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  y          45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB


In [3]:
data["age"] = data["age"].transform(lambda x: x.fillna(x.mean()))
data["balance"] = data["balance"].transform(lambda x: x.fillna(x.mean()))
data["day"] = data["day"].transform(lambda x: x.fillna(x.mean()))
data["duration"] = data["duration"].transform(lambda x: x.fillna(x.mean()))
data["campaign"] = data["campaign"].transform(lambda x: x.fillna(x.mean()))
data["pdays"] = data["pdays"].transform(lambda x: x.fillna(x.mean()))
data["previous"] = data["previous"].transform(lambda x: x.fillna(x.mean()))

In [4]:
data["job"] = data["job"].transform(lambda x: x.fillna(x.mode()[0]))
data["marital"] = data["marital"].transform(lambda x: x.fillna(x.mode()[0]))
data["education"] = data["education"].transform(lambda x: x.fillna(x.mode()[0]))
data["default"] = data["default"].transform(lambda x: x.fillna(x.mode()[0]))
data["housing"] = data["housing"].transform(lambda x: x.fillna(x.mode()[0]))
data["loan"] = data["loan"].transform(lambda x: x.fillna(x.mode()[0]))
data["contact"] = data["contact"].transform(lambda x: x.fillna(x.mode()[0]))
data["month"] = data["month"].transform(lambda x: x.fillna(x.mode()[0]))
data["poutcome"] = data["poutcome"].transform(lambda x: x.fillna(x.mode()[0]))

In [21]:
data.isnull().sum()

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

# II - Outliers

# Identifying Outliers:

## 1 - IQR Method

In [3]:
Q1 = data.quantile(0.25)  # To find the 25th percentile and 75th percentile.
Q3 = data.quantile(0.75)

IQR = Q3 - Q1  # Inter Quantile Range (75th perentile - 25th percentile)

lower = (Q1 - 1.5 * IQR)  # Finding lower and upper bounds for all values. All values outside these bounds are outliers
upper = (Q3 + 1.5 * IQR)

In [6]:
 outliers = (
    (data.select_dtypes(include=["float64", "int64"]) < lower)
    | (data.select_dtypes(include=["float64", "int64"]) > upper)
).sum() / len(data) * 100

outliers

age          1.077171
balance     10.459844
day          0.000000
duration     7.155338
campaign     6.777112
pdays       18.263255
previous    18.263255
dtype: float64

## 2 - Z-Score

In [33]:
threshold=2
z_scores = np.abs(stats.zscore(data.select_dtypes(include=["float64", "int64"])))
outliers = np.where(z_scores > threshold)
outliers[0]

array([   34,    37,    43, ..., 45208, 45208, 45210], dtype=int64)

# Handling Outliers:

## 1 - Removing Outliers:

In [9]:
def find_outliers_iqr(df):
    # Calculate the first quartile (Q1)
    q1 = np.percentile(df, 25)
    
    # Calculate the third quartile (Q3)
    q3 = np.percentile(df, 75)
    
    # Calculate the IQR (Interquartile Range)
    iqr = q3 - q1
    
    # Define the lower and upper bounds for outliers
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    
    # Identify outliers
    outliers = [value for value in df if value < lower_bound or value > upper_bound]
    
    return outliers

outliers = find_outliers_iqr(data['age'])


print("Outliers:", outliers)


Outliers: [83, 75, 75, 83, 75, 72, 71, 76, 83, 71, 85, 72, 90, 71, 82, 73, 74, 71, 75, 78, 85, 80, 71, 78, 73, 75, 94, 72, 83, 79, 78, 83, 73, 76, 83, 77, 73, 74, 73, 77, 71, 71, 80, 82, 75, 77, 78, 78, 86, 76, 72, 74, 77, 74, 79, 71, 74, 78, 73, 73, 95, 74, 71, 78, 71, 75, 75, 82, 74, 81, 76, 72, 75, 72, 71, 72, 77, 73, 85, 83, 78, 73, 79, 71, 71, 80, 72, 78, 79, 71, 82, 81, 79, 71, 79, 79, 73, 75, 72, 71, 75, 78, 75, 80, 80, 74, 73, 71, 72, 72, 77, 79, 73, 71, 80, 71, 83, 71, 73, 78, 72, 81, 81, 71, 74, 75, 82, 80, 76, 74, 74, 77, 71, 71, 77, 76, 72, 74, 74, 72, 73, 82, 77, 71, 79, 89, 76, 81, 74, 73, 71, 71, 77, 72, 84, 86, 72, 72, 76, 73, 74, 82, 71, 83, 72, 73, 73, 74, 72, 78, 86, 76, 79, 71, 81, 77, 82, 72, 73, 73, 74, 71, 73, 76, 71, 71, 83, 71, 80, 79, 77, 74, 84, 95, 77, 78, 79, 83, 83, 73, 77, 72, 74, 83, 76, 73, 81, 80, 75, 74, 75, 77, 83, 72, 81, 73, 74, 80, 72, 76, 72, 77, 74, 87, 76, 92, 78, 82, 78, 76, 73, 77, 76, 73, 82, 81, 80, 71, 76, 74, 75, 72, 80, 84, 87, 72, 76, 7

In [10]:
def remove_outliers(data):
    outliers = find_outliers_iqr(data)
    cleaned_data = [value for value in data if value not in outliers]
    return cleaned_data


In [11]:
remove_outliers(data['age'])

[58,
 44,
 33,
 47,
 33,
 35,
 28,
 42,
 58,
 43,
 41,
 29,
 53,
 58,
 57,
 51,
 45,
 57,
 60,
 33,
 28,
 56,
 32,
 25,
 40,
 44,
 39,
 52,
 46,
 36,
 57,
 49,
 60,
 59,
 51,
 57,
 25,
 53,
 36,
 37,
 44,
 50,
 60,
 54,
 58,
 36,
 58,
 44,
 55,
 29,
 54,
 48,
 32,
 42,
 24,
 38,
 38,
 47,
 40,
 46,
 32,
 53,
 57,
 33,
 49,
 51,
 60,
 59,
 55,
 35,
 57,
 31,
 54,
 55,
 43,
 53,
 44,
 55,
 49,
 55,
 45,
 47,
 42,
 59,
 46,
 51,
 56,
 41,
 46,
 57,
 42,
 30,
 60,
 60,
 57,
 36,
 55,
 60,
 39,
 46,
 44,
 53,
 52,
 59,
 27,
 44,
 47,
 34,
 59,
 45,
 29,
 46,
 56,
 36,
 59,
 44,
 41,
 33,
 59,
 57,
 56,
 51,
 34,
 43,
 52,
 33,
 29,
 34,
 31,
 55,
 55,
 32,
 38,
 55,
 28,
 23,
 32,
 43,
 32,
 46,
 53,
 34,
 57,
 37,
 59,
 33,
 56,
 48,
 43,
 54,
 51,
 26,
 40,
 39,
 50,
 41,
 51,
 60,
 52,
 48,
 48,
 39,
 47,
 40,
 45,
 26,
 52,
 54,
 54,
 50,
 35,
 44,
 53,
 35,
 60,
 53,
 48,
 34,
 54,
 51,
 31,
 35,
 35,
 38,
 36,
 58,
 40,
 54,
 34,
 31,
 51,
 33,
 55,
 42,
 34,
 33,
 38,
 50,
 43,
 61,


In [12]:
remove_outliers(data['balance'])

[2143,
 29,
 2,
 1506,
 1,
 231,
 447,
 2,
 121,
 593,
 270,
 390,
 6,
 71,
 162,
 229,
 13,
 52,
 60,
 0,
 723,
 779,
 23,
 50,
 0,
 -372,
 255,
 113,
 -246,
 265,
 839,
 378,
 39,
 0,
 63,
 -7,
 -3,
 506,
 0,
 2586,
 49,
 104,
 529,
 96,
 -171,
 -364,
 0,
 0,
 0,
 1291,
 -244,
 0,
 -76,
 -103,
 243,
 424,
 306,
 24,
 179,
 0,
 989,
 249,
 790,
 154,
 100,
 59,
 1205,
 25,
 282,
 23,
 1937,
 384,
 582,
 91,
 0,
 1,
 206,
 164,
 690,
 2343,
 137,
 173,
 45,
 1270,
 16,
 486,
 50,
 152,
 290,
 54,
 -37,
 101,
 383,
 81,
 0,
 229,
 -674,
 90,
 128,
 179,
 0,
 54,
 151,
 61,
 30,
 523,
 31,
 79,
 -34,
 448,
 81,
 144,
 351,
 -67,
 262,
 0,
 56,
 26,
 3,
 41,
 7,
 105,
 818,
 -16,
 0,
 2476,
 1185,
 217,
 1685,
 802,
 0,
 94,
 0,
 0,
 517,
 265,
 947,
 3,
 42,
 37,
 57,
 22,
 8,
 293,
 3,
 348,
 -19,
 0,
 -4,
 18,
 139,
 0,
 1883,
 216,
 782,
 904,
 1705,
 47,
 176,
 1225,
 86,
 82,
 271,
 1378,
 184,
 0,
 0,
 1357,
 19,
 434,
 92,
 1151,
 41,
 51,
 214,
 1161,
 37,
 787,
 59,
 253,
 211,


In [13]:
remove_outliers(data['day'])

[5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,


In [14]:
remove_outliers(data['duration'])

[261,
 151,
 76,
 92,
 198,
 139,
 217,
 380,
 50,
 55,
 222,
 137,
 517,
 71,
 174,
 353,
 98,
 38,
 219,
 54,
 262,
 164,
 160,
 342,
 181,
 172,
 296,
 127,
 255,
 348,
 225,
 230,
 208,
 226,
 336,
 242,
 365,
 577,
 137,
 160,
 180,
 22,
 616,
 242,
 355,
 225,
 160,
 363,
 266,
 253,
 179,
 145,
 174,
 104,
 13,
 185,
 138,
 164,
 391,
 357,
 91,
 528,
 273,
 158,
 177,
 258,
 172,
 154,
 291,
 181,
 176,
 211,
 349,
 272,
 208,
 193,
 212,
 20,
 246,
 529,
 188,
 180,
 48,
 213,
 583,
 221,
 173,
 426,
 287,
 101,
 203,
 197,
 257,
 124,
 229,
 55,
 400,
 197,
 190,
 21,
 514,
 194,
 144,
 212,
 286,
 107,
 247,
 518,
 364,
 178,
 98,
 439,
 79,
 120,
 127,
 175,
 262,
 61,
 78,
 143,
 579,
 345,
 185,
 100,
 125,
 193,
 136,
 73,
 528,
 541,
 163,
 301,
 46,
 204,
 98,
 71,
 157,
 243,
 186,
 579,
 163,
 610,
 85,
 114,
 114,
 57,
 238,
 93,
 128,
 107,
 181,
 303,
 558,
 270,
 228,
 99,
 240,
 233,
 250,
 252,
 138,
 130,
 412,
 179,
 19,
 458,
 313,
 416,
 146,
 167,
 315,
 1

In [15]:
remove_outliers(data['campaign'])

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 2,
 2,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 3,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 2,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 3,
 2,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 2,
 2,
 2,
 2,
 1,
 1,
 2,
 2,
 1,
 1,
 3,
 2,
 1,
 1,
 2,
 5,
 1,
 1,
 1,
 1,
 2,
 3,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 2,
 1,
 1,
 2,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 2,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 2,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 1,
 1,
 1,
 2,
 3,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 2,
 3,
 1,
 2,
 1,
 1,
 2,
 2,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 3,
 2,
 2,
 1,
 1,
 2,


In [16]:
remove_outliers(data['pdays'])

[-1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,


In [17]:
remove_outliers(data['previous'])

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


## 2 - Transformations

In [26]:
transformed_age = np.log1p(data[[data.select_dtypes(include=["float64", "int64"]).columns[0]]])
transformed_age

Unnamed: 0,age
0,4.077537
1,3.806662
2,3.526361
3,3.871201
4,3.526361
...,...
45206,3.951244
45207,4.276666
45208,4.290459
45209,4.060443


In [27]:
transformed_age = np.log1p(data[[data.select_dtypes(include=["float64", "int64"]).columns[1]]])
transformed_age

Unnamed: 0,balance
0,7.670429
1,3.401197
2,1.098612
3,7.317876
4,0.693147
...,...
45206,6.716595
45207,7.455877
45208,8.651025
45209,6.505784


In [28]:
transformed_age = np.log1p(data[[data.select_dtypes(include=["float64", "int64"]).columns[2]]])
transformed_age

Unnamed: 0,day
0,1.791759
1,1.791759
2,1.791759
3,1.791759
4,1.791759
...,...
45206,2.890372
45207,2.890372
45208,2.890372
45209,2.890372


In [29]:
transformed_age = np.log1p(data[[data.select_dtypes(include=["float64", "int64"]).columns[3]]])
transformed_age

Unnamed: 0,duration
0,5.568345
1,5.023881
2,4.343805
3,4.532599
4,5.293305
...,...
45206,6.885510
45207,6.124683
45208,7.028201
45209,6.232448


In [30]:
transformed_age = np.log1p(data[[data.select_dtypes(include=["float64", "int64"]).columns[4]]])
transformed_age

Unnamed: 0,campaign
0,0.693147
1,0.693147
2,0.693147
3,0.693147
4,0.693147
...,...
45206,1.386294
45207,1.098612
45208,1.791759
45209,1.609438


In [31]:
transformed_age = np.log1p(data[[data.select_dtypes(include=["float64", "int64"]).columns[5]]])
transformed_age

Unnamed: 0,pdays
0,-inf
1,-inf
2,-inf
3,-inf
4,-inf
...,...
45206,-inf
45207,-inf
45208,5.220356
45209,-inf


In [32]:
transformed_age = np.log1p(data[[data.select_dtypes(include=["float64", "int64"]).columns[6]]])
transformed_age

Unnamed: 0,previous
0,0.000000
1,0.000000
2,0.000000
3,0.000000
4,0.000000
...,...
45206,0.000000
45207,0.000000
45208,1.386294
45209,0.000000


## 3 - Imputation

In [10]:
def impute_outliers_with_median(data, threshold=3):
    z_scores = np.abs(stats.zscore(data))
    outliers_indices = np.where(z_scores > threshold)
    
    # Impute outliers with the median value
    data_copy = np.copy(data)
    median_value = np.median(data)
    data_copy[outliers_indices] = median_value
    
    return data_copy

In [11]:
outliers

age          1.077171
balance     10.459844
day          0.000000
duration     7.155338
campaign     6.777112
pdays       18.263255
previous    18.263255
dtype: float64

In [13]:
data['age'] = impute_outliers_with_median(data['age'], threshold=2)
data['balance'] = impute_outliers_with_median(data['balance'], threshold=2)
data['day'] = impute_outliers_with_median(data['day'], threshold=2)
data['duration'] = impute_outliers_with_median(data['duration'], threshold=2)
data['campaign'] = impute_outliers_with_median(data['campaign'], threshold=2)
data['pdays'] = impute_outliers_with_median(data['pdays'], threshold=2)
data['previous'] = impute_outliers_with_median(data['previous'], threshold=2)