In [6]:
import pandas as pd
 # Sample banking data with missing values
data = {
"Customer ID": [101, 102, 103, 104, 105],
"Name": ["Alice", "Bob", "Charlie", None, "Eve"],
"Account Balance": [3000, None, 4000, 500, 1000],
"Account Type": ["Savings", "Checking", None, "Checking",
"Savings"]
 }
 # Creating the DataFrame
bank_df = pd.DataFrame(data)
# Checking for missing values
print(bank_df.isnull().sum())

Customer ID        0
Name               1
Account Balance    1
Account Type       1
dtype: int64


In [7]:
# Dropping rows with any missing values
bank_df_dropped = bank_df.dropna()
print(bank_df_dropped)

   Customer ID   Name  Account Balance Account Type
0          101  Alice           3000.0      Savings
4          105    Eve           1000.0      Savings


In [8]:
# Filling missing account balances with 0
bank_df["Account Balance"].fillna(0, inplace=True)
# Filling missing names with "Unknown"
bank_df["Name"].fillna("Unknown", inplace=True)
print(bank_df)

   Customer ID     Name  Account Balance Account Type
0          101    Alice           3000.0      Savings
1          102      Bob              0.0     Checking
2          103  Charlie           4000.0         None
3          104  Unknown            500.0     Checking
4          105      Eve           1000.0      Savings


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  bank_df["Account Balance"].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  bank_df["Name"].fillna("Unknown", inplace=True)


In [9]:
 # Forward fill for Account Type
bank_df["Account Type"].fillna(method="ffill", inplace=True)
print(bank_df)

   Customer ID     Name  Account Balance Account Type
0          101    Alice           3000.0      Savings
1          102      Bob              0.0     Checking
2          103  Charlie           4000.0     Checking
3          104  Unknown            500.0     Checking
4          105      Eve           1000.0      Savings


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  bank_df["Account Type"].fillna(method="ffill", inplace=True)
  bank_df["Account Type"].fillna(method="ffill", inplace=True)


Data Transformation

In [10]:
 # Renaming Customer ID to ID
bank_df.rename(columns={"Customer ID": "ID"}, inplace=True)
print(bank_df)

    ID     Name  Account Balance Account Type
0  101    Alice           3000.0      Savings
1  102      Bob              0.0     Checking
2  103  Charlie           4000.0     Checking
3  104  Unknown            500.0     Checking
4  105      Eve           1000.0      Savings


In [11]:
 # Converting Account Balance to integer
bank_df["Account Balance"]= bank_df["Account Balance"].astype(int)
print(bank_df)

    ID     Name  Account Balance Account Type
0  101    Alice             3000      Savings
1  102      Bob                0     Checking
2  103  Charlie             4000     Checking
3  104  Unknown              500     Checking
4  105      Eve             1000      Savings


In [12]:
 # Adding Interest Earned column assuming a 5% interest rate
bank_df["Interest Earned"] = bank_df["Account Balance"] * 0.05
print(bank_df)

    ID     Name  Account Balance Account Type  Interest Earned
0  101    Alice             3000      Savings            150.0
1  102      Bob                0     Checking              0.0
2  103  Charlie             4000     Checking            200.0
3  104  Unknown              500     Checking             25.0
4  105      Eve             1000      Savings             50.0


In [13]:
# Adding Interest Earned column assuming a 5% interest rate
bank_df["Interest Earned"] = bank_df["Account Balance"] * 0.05
print(bank_df)

    ID     Name  Account Balance Account Type  Interest Earned
0  101    Alice             3000      Savings            150.0
1  102      Bob                0     Checking              0.0
2  103  Charlie             4000     Checking            200.0
3  104  Unknown              500     Checking             25.0
4  105      Eve             1000      Savings             50.0


In [14]:
 # Binning account balances
bins = [0, 1000, 3000, 5000]
labels = ["Low", "Medium", "High"]
bank_df["Balance Category"] = pd.cut(bank_df["Account Balance"],
bins=bins, labels=labels)
print(bank_df)


    ID     Name  Account Balance Account Type  Interest Earned  \
0  101    Alice             3000      Savings            150.0   
1  102      Bob                0     Checking              0.0   
2  103  Charlie             4000     Checking            200.0   
3  104  Unknown              500     Checking             25.0   
4  105      Eve             1000      Savings             50.0   

  Balance Category  
0           Medium  
1              NaN  
2             High  
3              Low  
4              Low  


In [16]:
 # Normalizing Account Balance (0-1 range)
bank_df["Normalized Balance"]= (bank_df["Account Balance"]-bank_df["Account Balance"].min())/(bank_df["Account Balance"].max()- bank_df["Account Balance"].min())
print(bank_df)

    ID     Name  Account Balance Account Type  Interest Earned  \
0  101    Alice             3000      Savings            150.0   
1  102      Bob                0     Checking              0.0   
2  103  Charlie             4000     Checking            200.0   
3  104  Unknown              500     Checking             25.0   
4  105      Eve             1000      Savings             50.0   

  Balance Category  Normalized Balance  
0           Medium               0.750  
1              NaN               0.000  
2             High               1.000  
3              Low               0.125  
4              Low               0.250  


Customer Data Cleaning and Preparation

In [17]:
import kagglehub
path=kagglehub.dataset_download("henriqueyamahata/bank-marketing"
 )
print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Downloading from https://www.kaggle.com/api/v1/datasets/download/henriqueyamahata/bank-marketing?dataset_version_number=1...


100%|██████████| 393k/393k [00:01<00:00, 290kB/s]

Extracting files...
Path to dataset files: C:\Users\Hp\.cache\kagglehub\datasets\henriqueyamahata\bank-marketing\versions\1





In [22]:
import pandas as pd

data=pd.read_csv(path+"/bank-additional-full.csv",sep=";")
df=pd.DataFrame(data)
print(df.head)
print(df.isnull().sum())

<bound method NDFrame.head of        age          job  marital            education  default housing loan  \
0       56    housemaid  married             basic.4y       no      no   no   
1       57     services  married          high.school  unknown      no   no   
2       37     services  married          high.school       no     yes   no   
3       40       admin.  married             basic.6y       no      no   no   
4       56     services  married          high.school       no      no  yes   
...    ...          ...      ...                  ...      ...     ...  ...   
41183   73      retired  married  professional.course       no     yes   no   
41184   46  blue-collar  married  professional.course       no      no   no   
41185   56      retired  married    university.degree       no     yes   no   
41186   44   technician  married  professional.course       no      no   no   
41187   74      retired  married  professional.course       no     yes   no   

         contact mont

In [25]:
print(data.head())

   age        job  marital    education  default housing loan    contact  \
0   56  housemaid  married     basic.4y       no      no   no  telephone   
1   57   services  married  high.school  unknown      no   no  telephone   
2   37   services  married  high.school       no     yes   no  telephone   
3   40     admin.  married     basic.6y       no      no   no  telephone   
4   56   services  married  high.school       no      no  yes  telephone   

  month day_of_week  ...  campaign  pdays  previous     poutcome emp.var.rate  \
0   may         mon  ...         1    999         0  nonexistent          1.1   
1   may         mon  ...         1    999         0  nonexistent          1.1   
2   may         mon  ...         1    999         0  nonexistent          1.1   
3   may         mon  ...         1    999         0  nonexistent          1.1   
4   may         mon  ...         1    999         0  nonexistent          1.1   

   cons.price.idx  cons.conf.idx  euribor3m  nr.employed