# Getting the Credit-score data from Kaggle

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

!kaggle datasets download -d parisrohan/credit-score-classification -p /content

!unzip /content/credit-score-classification.zip -d /content

train_data = pd.read_csv('/content/train.csv')
test_data = pd.read_csv('/content/test.csv')

# Training and testing sets
print("Training data shape:", train_data.shape)
print("Testing data shape:", test_data.shape)

Training data shape: (100000, 28)
Testing data shape: (50000, 27)


# Exploratory Data Analysis

Study each attribute and its characteristics:

*   Name
*   Type (categorical, int/float, bounded/unbounded, text, structured, etc.)
*   % of missing values
*   Noisiness and type of noise (stochastic, outliers, rounding errors, etc.)
*   Possibly useful for the task?
*   Type of distribution (Gaussian, uniform, logarithmic, etc)

Quick look into the columns

In [None]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 28 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   ID                        100000 non-null  object 
 1   Customer_ID               100000 non-null  object 
 2   Month                     100000 non-null  object 
 3   Name                      90015 non-null   object 
 4   Age                       100000 non-null  object 
 5   SSN                       100000 non-null  object 
 6   Occupation                100000 non-null  object 
 7   Annual_Income             100000 non-null  object 
 8   Monthly_Inhand_Salary     84998 non-null   float64
 9   Num_Bank_Accounts         100000 non-null  int64  
 10  Num_Credit_Card           100000 non-null  int64  
 11  Interest_Rate             100000 non-null  int64  
 12  Num_of_Loan               100000 non-null  object 
 13  Type_of_Loan              88592 non-null   ob

Checking for duplicates

In [7]:
train_data.duplicated().sum()

0

Quick look into first few rows

In [10]:
train_data.head()

Unnamed: 0,ID,Customer_ID,Month,Name,Age,SSN,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Type_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,Changed_Credit_Limit,Num_Credit_Inquiries,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
0,0x1602,CUS_0xd40,January,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,4,3,4,"Auto Loan, Credit-Builder Loan, Personal Loan,...",3,7.0,11.27,4.0,_,809.98,26.82262,22 Years and 1 Months,No,49.574949,80.41529543900253,High_spent_Small_value_payments,312.49408867943663,Good
1,0x1603,CUS_0xd40,February,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,,3,4,3,4,"Auto Loan, Credit-Builder Loan, Personal Loan,...",-1,,11.27,4.0,Good,809.98,31.94496,,No,49.574949,118.28022162236736,Low_spent_Large_value_payments,284.62916249607184,Good
2,0x1604,CUS_0xd40,March,Aaron Maashoh,-500,821-00-0265,Scientist,19114.12,,3,4,3,4,"Auto Loan, Credit-Builder Loan, Personal Loan,...",3,7.0,_,4.0,Good,809.98,28.609352,22 Years and 3 Months,No,49.574949,81.699521264648,Low_spent_Medium_value_payments,331.2098628537912,Good
3,0x1605,CUS_0xd40,April,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,,3,4,3,4,"Auto Loan, Credit-Builder Loan, Personal Loan,...",5,4.0,6.27,4.0,Good,809.98,31.377862,22 Years and 4 Months,No,49.574949,199.4580743910713,Low_spent_Small_value_payments,223.45130972736783,Good
4,0x1606,CUS_0xd40,May,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,4,3,4,"Auto Loan, Credit-Builder Loan, Personal Loan,...",6,,11.27,4.0,Good,809.98,24.797347,22 Years and 5 Months,No,49.574949,41.420153086217326,High_spent_Medium_value_payments,341.48923103222177,Good


Changing the types of columns

In [11]:
num_cols = ["Age", "Annual_Income", "Monthly_Inhand_Salary", "Num_Bank_Accounts", "Num_Credit_Card","Interest_Rate","Num_of_Loan", "Delay_from_due_date","Num_of_Delayed_Payment","Changed_Credit_Limit", "Num_Credit_Inquiries","Outstanding_Debt","Credit_Utilization_Ratio","Total_EMI_per_month","Amount_invested_monthly",	"Monthly_Balance","Credit_History_Age"]
cat_cols = ["Occupation","Credit_Mix", "Spending_Level","Payment_Value","Payment_of_Min_Amount"]
target_col = ["Credit_Score"]

In [13]:
class DataCleaner:
    def __init__(self, dataframe, numeric_columns):
        self.dataframe = dataframe
        self.numeric_columns = numeric_columns

    def clean_numeric_columns(self):

        for col in self.numeric_columns:

            if self.dataframe[col].dtype == 'object':
                self.dataframe[col] = self.dataframe[col].str.extract(r'(\d+)')
                self.dataframe[col] = pd.to_numeric(self.dataframe[col], errors='coerce')
                self.dataframe[col] = self.dataframe[col].fillna(self.dataframe[col].median())

            else:
                self.dataframe[col] = self.dataframe[col].fillna(self.dataframe[col].median())

        return self.dataframe



cleaner = DataCleaner(train_data, num_cols)
df = cleaner.clean_numeric_columns()

% of missing values

In [15]:
df.isnull().sum()/float(len(df))*100

Unnamed: 0,0
ID,0.0
Customer_ID,0.0
Month,0.0
Name,9.985
Age,0.0
SSN,0.0
Occupation,0.0
Annual_Income,0.0
Monthly_Inhand_Salary,0.0
Num_Bank_Accounts,0.0


Looking into Null-values in Name

In [16]:
df['Name'].nunique()

10139

In [22]:
df[df['Name'].isnull()].head()

Unnamed: 0,ID,Customer_ID,Month,Name,Age,SSN,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Type_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,Changed_Credit_Limit,Num_Credit_Inquiries,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score


 - Replacing Null values in Name with the name available for same customer id in another row

In [21]:
# Create a mapping of Customer ID to Name where Name is not null
id_name_mapping = df.loc[df['Name'].notnull(), ['Customer_ID', 'Name']].set_index('Customer_ID')['Name'].to_dict()

# Function to fill missing names using the mapping
def fill_missing_name(row):
    if pd.isnull(row['Name']):
        return id_name_mapping.get(row['Customer_ID'])
    return row['Name']

# Apply the function to fill missing names
df['Name'] = df.apply(fill_missing_name, axis=1)

#Check for any remaining missing names after applying the function
print(df['Name'].isnull().sum())


0


Looking into null-values for Type of Loan

In [23]:
df['Type_of_Loan'].nunique()

6260

In [30]:
df[df['Type_of_Loan'].isnull()].head(10)

Unnamed: 0,ID,Customer_ID,Month,Name,Age,SSN,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Type_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,Changed_Credit_Limit,Num_Credit_Inquiries,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
32,0x1632,CUS_0x1cdb,January,Deepaa,21,615-06-7821,Developer,35547,2853.309167,7,5,5,0,,5,14.0,2.0,4.0,Standard,943,39.797764,30.0,Yes,0.0,276.0,!@9#%8,288.0,Standard
33,0x1633,CUS_0x1cdb,February,Deepaa,21,615-06-7821,Developer,35547,3093.745,7,5,5,0,,9,14.0,2.0,4.0,Standard,943,27.02036,30.0,NM,0.0,74.0,High_spent_Medium_value_payments,460.0,Standard
34,0x1634,CUS_0x1cdb,March,Deepaa,21,615-06-7821,Developer,35547,2853.309167,7,5,5,100,,5,12.0,2.0,4.0,Standard,943,23.462303,30.0,Yes,0.0,173.0,Low_spent_Medium_value_payments,392.0,Standard
35,0x1635,CUS_0x1cdb,April,Deepaa,21,615-06-7821,Developer,35547,2853.309167,7,5,5,0,,1,15.0,2.0,4.0,_,943,28.924954,30.0,Yes,0.0,96.0,High_spent_Medium_value_payments,438.0,Standard
36,0x1636,CUS_0x1cdb,May,Deepaa,21,615-06-7821,Developer,35547,2853.309167,7,5,5,0,,9,17.0,2.0,4.0,_,943,41.776187,31.0,Yes,0.0,62.0,High_spent_Small_value_payments,482.0,Standard
37,0x1637,CUS_0x1cdb,June,Deepaa,21,615-06-7821,Developer,35547,3093.745,7,5,5,0,,5,15.0,2.0,4.0,Standard,943,29.217556,31.0,Yes,0.0,37.0,High_spent_Medium_value_payments,497.0,Standard
38,0x1638,CUS_0x1cdb,July,Deepaa,21,615-06-7821,Developer,35547,2853.309167,7,5,5,0,,10,15.0,2.0,4.0,Standard,943,26.263823,31.0,Yes,0.0,181.0,Low_spent_Small_value_payments,394.0,Standard
39,0x1639,CUS_0x1cdb,August,Deepaa,21,615-06-7821,Developer,35547,2853.309167,7,5,5,100,,1,15.0,2.0,4.0,Standard,943,25.862922,31.0,Yes,0.0,181.0,High_spent_Small_value_payments,364.0,Standard
40,0x163e,CUS_0x95ee,January,Np,31,612-70-8987,Lawyer,73928,3093.745,4,1288,8,0,,12,10.0,10.0,2.0,Good,548,39.962685,18.0,No,15015.0,98.0,High_spent_Large_value_payments,740.0,Good
41,0x163f,CUS_0x95ee,February,Np,31,612-70-8987,_______,73928,5988.705,4,5,8,0,,8,7.0,10.0,2.0,Good,548,42.769864,32.0,NM,0.0,172.0,Low_spent_Medium_value_payments,705.0,Good


In [20]:
df[df['Customer_ID'] == 'CUS_0x2dbc']

Unnamed: 0,ID,Customer_ID,Month,Name,Age,SSN,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Type_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,Changed_Credit_Limit,Num_Credit_Inquiries,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
16,0x161a,CUS_0x2dbc,January,Langep,34,486-85-3974,_______,143162,12187.22,1,5,8,3,"Auto Loan, Auto Loan, and Not Specified",5,8.0,7.0,3.0,Good,1303,28.616735,17.0,No,246.992319,168.0,!@9#%8,1043.0,Good
17,0x161b,CUS_0x2dbc,February,,34,486-85-3974,Engineer,143162,12187.22,1,5,8,3,"Auto Loan, Auto Loan, and Not Specified",13,6.0,7.0,3.0,Good,1303,41.702573,17.0,No,246.992319,232.0,High_spent_Small_value_payments,998.0,Good
18,0x161c,CUS_0x2dbc,March,Langep,34,486-85-3974,_______,143162,3093.745,1,5,8,3,"Auto Loan, Auto Loan, and Not Specified",8,7.0,11.0,6.0,Good,1303,26.519815,17.0,No,246.992319,10000.0,High_spent_Small_value_payments,715.0,Good
19,0x161d,CUS_0x2dbc,April,Langep,34,486-85-3974,Engineer,143162,12187.22,1,5,8,3,"Auto Loan, Auto Loan, and Not Specified",8,5.0,9.0,3.0,_,1303,39.501648,18.0,No,246.992319,825.0,Low_spent_Medium_value_payments,426.0,Good
20,0x161e,CUS_0x2dbc,May,Langep,34,486-85-3974,_______,143162,12187.22,1,5,8,3,"Auto Loan, Auto Loan, and Not Specified",10,5.0,7.0,3.0,Good,1303,31.37615,18.0,No,246.992319,430.0,Low_spent_Large_value_payments,810.0,Good
21,0x161f,CUS_0x2dbc,June,Langep,34,486-85-3974,Engineer,143162,12187.22,1,5,8,967,"Auto Loan, Auto Loan, and Not Specified",8,6.0,7.0,3.0,Good,1303,39.783993,18.0,No,246.992319,257.0,High_spent_Medium_value_payments,963.0,Good
22,0x1620,CUS_0x2dbc,July,,34,486-85-3974,Engineer,143162,12187.22,1,5,8,3,"Auto Loan, Auto Loan, and Not Specified",8,6.0,7.0,3.0,Good,1303,38.068624,18.0,No,246.992319,263.0,High_spent_Small_value_payments,968.0,Standard
23,0x1621,CUS_0x2dbc,August,Langep,34,486-85-3974,Engineer,143162,12187.22,1,5,8,3,"Auto Loan, Auto Loan, and Not Specified",8,6.0,7.0,3.0,Good,1303,38.374753,18.0,No,246.992319,10000.0,High_spent_Small_value_payments,895.0,Standard


Display some basic statistics for Numerical Data

In [5]:
train_data.describe()

Unnamed: 0,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Delay_from_due_date,Num_Credit_Inquiries,Credit_Utilization_Ratio,Total_EMI_per_month
count,84998.0,100000.0,100000.0,100000.0,100000.0,98035.0,100000.0,100000.0
mean,4194.17085,17.09128,22.47443,72.46604,21.06878,27.754251,32.285173,1403.118217
std,3183.686167,117.404834,129.05741,466.422621,14.860104,193.177339,5.116875,8306.04127
min,303.645417,-1.0,0.0,1.0,-5.0,0.0,20.0,0.0
25%,1625.568229,3.0,4.0,8.0,10.0,3.0,28.052567,30.30666
50%,3093.745,6.0,5.0,13.0,18.0,6.0,32.305784,69.249473
75%,5957.448333,7.0,7.0,20.0,28.0,9.0,36.496663,161.224249
max,15204.633333,1798.0,1499.0,5797.0,67.0,2597.0,50.0,82331.0


In [None]:
# Display some basic statistics
print(train_data.describe())

# Check for missing values
print(train_data.isnull().sum())

# Check data types of each column
print(train_data.dtypes)

# Explore categorical features
for col in train_data.select_dtypes(include=['object']):
    print(f"\nColumn: {col}")
    print(train_data[col].value_counts())

# Explore numerical features (histograms, box plots)
import matplotlib.pyplot as plt
import seaborn as sns

for col in train_data.select_dtypes(include=['int64', 'float64']):
    plt.figure(figsize=(8, 6))
    sns.histplot(train_data[col], kde=True)
    plt.title(f'Distribution of {col}')
    plt.show()

    plt.figure(figsize=(8, 6))
    sns.boxplot(y=train_data[col])
    plt.title(f'Boxplot of {col}')
    plt.show()

# Explore relationships between features and the target variable
sns.pairplot(train_data, hue='Credit_Score')
plt.show()

# Correlation matrix
correlation_matrix = train_data.corr()
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()
