In [205]:
# Loading the basic Python libraries

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()
import warnings
warnings.filterwarnings("ignore")

#### Pasting screenshots of the file

![1.JPG](attachment:1.JPG)

In [206]:
# Creating a function to load the files into Pandas df

import pandas as pd

def load_csv(file_path):
    '''This function reads and return a .csv file'''
    return pd.read_csv(file_path)

# Saving the function as data_loader.py file

In [207]:
# Creating dfs using function

geo = load_csv(r"C:\Users\user\Desktop\Git Projects\Banking-Domain\data\raw\Geo_scores.csv")
instance = load_csv(r"C:\Users\user\Desktop\Git Projects\Banking-Domain\data\raw\instance_scores.csv")
lambdawts = load_csv(r"C:\Users\user\Desktop\Git Projects\Banking-Domain\data\raw\Lambda_wts.csv")
qset = load_csv(r"C:\Users\user\Desktop\Git Projects\Banking-Domain\data\raw\Qset_tats.csv")
test_data = load_csv(r"C:\Users\user\Desktop\Git Projects\Banking-Domain\data\raw\test_share.csv")
train_data = load_csv(r"C:\Users\user\Desktop\Git Projects\Banking-Domain\data\raw\train.csv")

In [208]:
# Creating a dictionary of Pandas df and giving names

combined_data = {"geo" : geo, "instance" : instance, "lambdawts" : lambdawts, "qset" : qset, "test_data" : test_data, "train_data" : train_data }

In [209]:
# Creating a function to get value out of df/series using functions/methods/sttributes

def apply_df_method(df, method_name, *args, **kwargs):
    """
    Apply a method of a DataFrame dynamically.

    Parameters:
        df (pd.DataFrame): The DataFrame to operate on.
        method_name (str): The name of the DataFrame method (e.g., "mean", "shape").
        *args: Positional arguments to pass to the method.
        **kwargs: Keyword arguments to pass to the method.

    Returns:
        Result of the method call.
    """
    # Get the method
    if hasattr(df, method_name):
        method = getattr(df, method_name)
        # If it's callable (like df.mean()), call it
        if callable(method):
            return method(*args, **kwargs)
        else:
            return method  # For properties like df.shape or df.columns
    else:
        raise AttributeError(f"'{type(df).__name__}' object has no method or attribute '{method_name}'")

# Saving the function as informer.py

In [210]:
# Checking the shape of the data

for key, value in combined_data.items():
    print(f"The shape of {key} file : ", apply_df_method(value, 'shape'))
    print()
    
# Outcome :
# lamdbawts, test_data and train_data have different number of rows

The shape of geo file :  (1424035, 2)

The shape of instance file :  (1424035, 2)

The shape of lambdawts file :  (1400, 2)

The shape of qset file :  (1424035, 2)

The shape of test_data file :  (56962, 27)

The shape of train_data file :  (227845, 28)



In [211]:
# Checking the columns of the data sets

for key, value in combined_data.items():
    print(f"The columns of {key} file: ", apply_df_method(value, 'columns').tolist())
    print()

# Outcome :
# Only 'id' column-> geo, instance and qset
# Only 'Group' column-> lambdawts
# Both 'id' and 'Group' columns-> test_data and train_data

The columns of geo file:  ['id', 'geo_score']

The columns of instance file:  ['id', 'instance_scores']

The columns of lambdawts file:  ['Group', 'lambda_wt']

The columns of qset file:  ['id', 'qsets_normalized_tat']

The columns of test_data file:  ['id', 'Group', 'Per1', 'Per2', 'Per3', 'Per4', 'Per5', 'Per6', 'Per7', 'Per8', 'Per9', 'Dem1', 'Dem2', 'Dem3', 'Dem4', 'Dem5', 'Dem6', 'Dem7', 'Dem8', 'Dem9', 'Cred1', 'Cred2', 'Cred3', 'Cred4', 'Cred5', 'Cred6', 'Normalised_FNT']

The columns of train_data file:  ['id', 'Group', 'Per1', 'Per2', 'Per3', 'Per4', 'Per5', 'Per6', 'Per7', 'Per8', 'Per9', 'Dem1', 'Dem2', 'Dem3', 'Dem4', 'Dem5', 'Dem6', 'Dem7', 'Dem8', 'Dem9', 'Cred1', 'Cred2', 'Cred3', 'Cred4', 'Cred5', 'Cred6', 'Normalised_FNT', 'Target']



In [None]:
# Checking the number of unique values in a particular column(id/group)

for key, value in combined_data.items():
    if 'id' in value.columns:
        print(f"Number of unique values in 'id' column of {key} file: ", apply_df_method(value['id'], 'nunique'))
        print()
    if 'Group' in value.columns:
        print(f"Number of unique values in 'Group' column of {key} file: ", apply_df_method(value['Group'], 'nunique'))
        print()

# Outcome:
#  test_data and train_data has fewer number of unique 'ids' than rest of the data files 
# Moreover, they have fewer number of unique 'Groups' than lambdawts file

#Action to take:
# To combine test_data and train_data to check if they get align

Number of unique values in 'id' column of geo file:  284807

Number of unique values in 'id' column of instance file:  284807

Number of unique values in 'Group' column of lambdawts file:  1400

Number of unique values in 'id' column of qset file:  284807

Number of unique values in 'id' column of test_data file:  56962

Number of unique values in 'Group' column of test_data file:  915

Number of unique values in 'id' column of train_data file:  227845

Number of unique values in 'Group' column of train_data file:  1301



In [213]:
# Adding a mark of train and test to both of their respective files before merging data

train_data['data'] = 'train'
test_data['data'] = 'test'

In [None]:
# Checking columns of train_data after marking
train_data.columns

Index(['id', 'Group', 'Per1', 'Per2', 'Per3', 'Per4', 'Per5', 'Per6', 'Per7',
       'Per8', 'Per9', 'Dem1', 'Dem2', 'Dem3', 'Dem4', 'Dem5', 'Dem6', 'Dem7',
       'Dem8', 'Dem9', 'Cred1', 'Cred2', 'Cred3', 'Cred4', 'Cred5', 'Cred6',
       'Normalised_FNT', 'Target', 'data'],
      dtype='object')

In [None]:
# Chacking columns of test_data after marking

test_data.columns

#Outcome:
# train_data has 1 extra column named as 'Target' than test_data
#Which is fine as that is our dependent variable

Index(['id', 'Group', 'Per1', 'Per2', 'Per3', 'Per4', 'Per5', 'Per6', 'Per7',
       'Per8', 'Per9', 'Dem1', 'Dem2', 'Dem3', 'Dem4', 'Dem5', 'Dem6', 'Dem7',
       'Dem8', 'Dem9', 'Cred1', 'Cred2', 'Cred3', 'Cred4', 'Cred5', 'Cred6',
       'Normalised_FNT', 'data'],
      dtype='object')

In [214]:
# Checking the last 5 values of newly added column(data) in train_data

train_data.tail()

Unnamed: 0,id,Group,Per1,Per2,Per3,Per4,Per5,Per6,Per7,Per8,...,Dem9,Cred1,Cred2,Cred3,Cred4,Cred5,Cred6,Normalised_FNT,Target,data
227840,97346,Grp232,0.476667,1.013333,0.536667,0.576667,1.406667,1.846667,0.6,1.103333,...,0.63,0.633333,0.996667,0.646667,0.533333,0.68,0.693333,-246.5025,0,train
227841,147361,Grp199,1.363333,0.73,0.06,0.776667,0.883333,0.466667,0.733333,0.59,...,0.356667,0.766667,0.73,0.596667,0.73,0.646667,0.656667,-249.7775,0,train
227842,50989,Grp36,1.06,0.756667,0.906667,0.896667,0.503333,0.396667,0.683333,0.62,...,0.51,0.74,0.873333,0.7,0.696667,0.663333,0.673333,-249.7775,0,train
227843,149780,Grp445,0.433333,1.013333,1.163333,0.94,0.93,0.9,0.813333,0.72,...,0.606667,0.54,0.643333,0.906667,0.54,0.766667,0.71,-242.75,0,train
227844,22175,Grp143,1.006667,0.553333,0.946667,1.206667,0.406667,0.75,0.52,0.756667,...,0.646667,0.636667,0.683333,0.843333,0.58,0.683333,0.676667,-235.0,0,train


In [216]:
# Checking the last 5 values of newly added column(data) in test_data

test_data.tail()

Unnamed: 0,id,Group,Per1,Per2,Per3,Per4,Per5,Per6,Per7,Per8,...,Dem8,Dem9,Cred1,Cred2,Cred3,Cred4,Cred5,Cred6,Normalised_FNT,data
56957,18333,Grp102,0.553333,1.043333,1.096667,0.686667,0.673333,0.34,0.9,0.643333,...,0.576667,0.433333,0.66,0.776667,0.61,0.69,0.75,0.7,-249.505,test
56958,244207,Grp504,1.353333,0.616667,0.276667,0.783333,0.69,0.65,0.473333,0.67,...,0.713333,0.87,0.683333,0.69,0.64,0.883333,0.663333,0.66,-248.7525,test
56959,103277,Grp78,1.083333,0.433333,0.806667,0.49,0.243333,0.316667,0.533333,0.606667,...,0.433333,0.063333,0.753333,0.78,0.603333,0.88,0.643333,0.676667,-231.05,test
56960,273294,Grp134,0.566667,1.153333,0.37,0.616667,0.793333,0.226667,0.91,0.696667,...,0.776667,1.026667,0.626667,0.646667,0.566667,0.616667,0.713333,0.706667,-246.315,test
56961,223337,Grp18,1.426667,0.11,-0.006667,-0.2,0.983333,1.87,0.033333,0.963333,...,0.616667,0.67,0.77,0.893333,0.586667,0.616667,0.683333,0.65,-248.45,test


In [None]:
# Merging all the data row wise

all_data = pd.concat([train_data, test_data], axis=0)

In [218]:
# Checking top 5 rows after merging

all_data.head()

Unnamed: 0,id,Group,Per1,Per2,Per3,Per4,Per5,Per6,Per7,Per8,...,Dem9,Cred1,Cred2,Cred3,Cred4,Cred5,Cred6,Normalised_FNT,Target,data
0,112751,Grp169,1.07,0.58,0.48,0.766667,1.233333,1.993333,0.34,1.01,...,0.726667,0.606667,1.01,0.933333,0.603333,0.686667,0.673333,-245.75,0.0,train
1,18495,Grp161,0.473333,1.206667,0.883333,1.43,0.726667,0.626667,0.81,0.783333,...,0.743333,0.68,0.69,0.56,0.67,0.553333,0.653333,-248.0,0.0,train
2,23915,Grp261,1.13,0.143333,0.946667,0.123333,0.08,0.836667,0.056667,0.756667,...,0.82,0.6,0.383333,0.763333,0.67,0.686667,0.673333,-233.125,0.0,train
3,50806,Grp198,0.636667,1.09,0.75,0.94,0.743333,0.346667,0.956667,0.633333,...,0.9,0.68,0.846667,0.423333,0.52,0.846667,0.76,-249.7775,0.0,train
4,184244,Grp228,0.56,1.013333,0.593333,0.416667,0.773333,0.46,0.853333,0.796667,...,0.486667,0.693333,0.526667,0.52,0.716667,0.706667,0.673333,-247.5775,0.0,train


In [None]:
# Checking last 5 rows after merging

all_data.tail()

# Outcome:
# Merging seems good as checked by values in 'data' column
# 'Target' column has null values which is expected as there was no 'Target' column in test_data

Unnamed: 0,id,Group,Per1,Per2,Per3,Per4,Per5,Per6,Per7,Per8,...,Dem9,Cred1,Cred2,Cred3,Cred4,Cred5,Cred6,Normalised_FNT,Target,data
56957,18333,Grp102,0.553333,1.043333,1.096667,0.686667,0.673333,0.34,0.9,0.643333,...,0.433333,0.66,0.776667,0.61,0.69,0.75,0.7,-249.505,,test
56958,244207,Grp504,1.353333,0.616667,0.276667,0.783333,0.69,0.65,0.473333,0.67,...,0.87,0.683333,0.69,0.64,0.883333,0.663333,0.66,-248.7525,,test
56959,103277,Grp78,1.083333,0.433333,0.806667,0.49,0.243333,0.316667,0.533333,0.606667,...,0.063333,0.753333,0.78,0.603333,0.88,0.643333,0.676667,-231.05,,test
56960,273294,Grp134,0.566667,1.153333,0.37,0.616667,0.793333,0.226667,0.91,0.696667,...,1.026667,0.626667,0.646667,0.566667,0.616667,0.713333,0.706667,-246.315,,test
56961,223337,Grp18,1.426667,0.11,-0.006667,-0.2,0.983333,1.87,0.033333,0.963333,...,0.67,0.77,0.893333,0.586667,0.616667,0.683333,0.65,-248.45,,test


In [None]:
# Checking columns after merging

all_data.columns

Index(['id', 'Group', 'Per1', 'Per2', 'Per3', 'Per4', 'Per5', 'Per6', 'Per7',
       'Per8', 'Per9', 'Dem1', 'Dem2', 'Dem3', 'Dem4', 'Dem5', 'Dem6', 'Dem7',
       'Dem8', 'Dem9', 'Cred1', 'Cred2', 'Cred3', 'Cred4', 'Cred5', 'Cred6',
       'Normalised_FNT', 'Target', 'data'],
      dtype='object')

In [222]:
# Checking unique values of 'id' and 'Group' columns in merged data

print(f"Number of unique values in 'id' column of all_data file: ", apply_df_method(all_data['id'], 'nunique'))
print()
print(f"Number of unique values in 'Group' column of all_data file: ", apply_df_method(all_data['Group'], 'nunique'))

# Outcome :
# Merged data seems consistent with other data files

Number of unique values in 'id' column of all_data file:  284807

Number of unique values in 'Group' column of all_data file:  1400


In [231]:
# From now on we will be working with the merged data and other data. 
# Hence removing the train_data and test_data from combined_data dictionary.

# Called it twice to remove 2 last inserted items.
# Commenting it out to not to make accidental run
#combined_data.popitem()

In [239]:
# Adding all_data to the dictionary

combined_data['all_data'] = all_data

In [None]:
# Checking the dictionary

combined_data['all_data'].tail()

dict_keys(['geo', 'instance', 'lambdawts', 'qset', 'all_data'])

In [249]:
#Checking the null values

for key, value in combined_data.items():
    print(f"The null values in {key} file: ")
    print(apply_df_method(value, 'isnull').sum())
    print("----------------------------------")


The null values in geo file: 
id               0
geo_score    71543
dtype: int64
----------------------------------
The null values in instance file: 
id                 0
instance_scores    0
dtype: int64
----------------------------------
The null values in lambdawts file: 
Group        0
lambda_wt    0
dtype: int64
----------------------------------
The null values in qset file: 
id                           0
qsets_normalized_tat    103201
dtype: int64
----------------------------------
The null values in all_data file: 
id                    0
Group                 0
Per1                  0
Per2                  0
Per3                  0
Per4                  0
Per5                  0
Per6                  0
Per7                  0
Per8                  0
Per9                  0
Dem1                  0
Dem2                  0
Dem3                  0
Dem4                  0
Dem5                  0
Dem6                  0
Dem7                  0
Dem8                  0
Dem9         

In [122]:
# Describing the data

print(geo.describe())
print()
print(qset.describe())


                 id     geo_score
count  1.424035e+06  1.352492e+06
mean   1.424030e+05 -9.279168e-06
std    8.221673e+04  7.827199e+00
min    0.000000e+00 -1.093900e+02
25%    7.120100e+04 -5.860000e+00
50%    1.424030e+05  1.800000e-01
75%    2.136050e+05  5.860000e+00
max    2.848060e+05  4.581000e+01

                 id  qsets_normalized_tat
count  1.424035e+06          1.320834e+06
mean   1.424030e+05          1.094006e-05
std    8.221673e+04          7.731794e+00
min    0.000000e+00         -1.404400e+02
25%    7.120100e+04         -5.860000e+00
50%    1.424030e+05          2.000000e-02
75%    2.136050e+05          5.860000e+00
max    2.848060e+05          6.110000e+01


In [123]:
# Imputing median values to the null values

geo["geo_score"] = geo["geo_score"].fillna(geo["geo_score"].median())
qset["qsets_normalized_tat"] = qset["qsets_normalized_tat"].fillna(qset["qsets_normalized_tat"].median())

In [124]:
#Checking the null values
print(geo.isnull().sum())
print()
print(instance.isnull().sum())
print()
print(lambdawts.isnull().sum())
print()
print(qset.isnull().sum())
print()
print(all_data.isnull().sum())

id           0
geo_score    0
dtype: int64

id                 0
instance_scores    0
dtype: int64

Group        0
lambda_wt    0
dtype: int64

id                      0
qsets_normalized_tat    0
dtype: int64

id                    0
Group                 0
Per1                  0
Per2                  0
Per3                  0
Per4                  0
Per5                  0
Per6                  0
Per7                  0
Per8                  0
Per9                  0
Dem1                  0
Dem2                  0
Dem3                  0
Dem4                  0
Dem5                  0
Dem6                  0
Dem7                  0
Dem8                  0
Dem9                  0
Cred1                 0
Cred2                 0
Cred3                 0
Cred4                 0
Cred5                 0
Cred6                 0
Normalised_FNT        0
Target            56962
data                  0
dtype: int64


In [125]:
geo.shape

(1424035, 2)

In [126]:
geo["id"].nunique()

284807

In [127]:
geo = geo.groupby('id').mean()

In [128]:
geo.shape

(284807, 1)

In [129]:
geo

Unnamed: 0_level_0,geo_score
id,Unnamed: 1_level_1
0,-0.620
1,1.106
2,0.070
3,0.180
4,0.540
...,...
284802,2.710
284803,0.956
284804,0.060
284805,-0.960


In [130]:
qset = qset.groupby('id').mean()

In [131]:
qset.shape

(284807, 1)

In [132]:
instance.shape

(1424035, 2)

In [133]:
instance = instance.groupby('id').mean()

In [134]:
instance.shape

(284807, 1)

In [135]:
lambdawts.shape

(1400, 2)