# Data Preprocessing for the "Give Me Some Credit" Kaggle dataset

In [30]:
# Import libraries
import pandas as pd
import numpy as np
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

In [31]:
# Read the training CSV from the working directory and print its schema summary
df = pd.read_csv(filepath_or_buffer="cs-training.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 12 columns):
 #   Column                                Non-Null Count   Dtype  
---  ------                                --------------   -----  
 0   Unnamed: 0                            150000 non-null  int64  
 1   SeriousDlqin2yrs                      150000 non-null  int64  
 2   RevolvingUtilizationOfUnsecuredLines  150000 non-null  float64
 3   age                                   150000 non-null  int64  
 4   NumberOfTime30-59DaysPastDueNotWorse  150000 non-null  int64  
 5   DebtRatio                             150000 non-null  float64
 6   MonthlyIncome                         120269 non-null  float64
 7   NumberOfOpenCreditLinesAndLoans       150000 non-null  int64  
 8   NumberOfTimes90DaysLate               150000 non-null  int64  
 9   NumberRealEstateLoansOrLines          150000 non-null  int64  
 10  NumberOfTime60-89DaysPastDueNotWorse  150000 non-null  int64  
 11  NumberOfDependents                    146076 non-null  float64

In [32]:
# Preview the first 10 rows of the freshly loaded frame
df.head(10)

Unnamed: 0.1,Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,2,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,3,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,4,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,5,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0
5,6,0,0.213179,74,0,0.375607,3500.0,3,0,1,0,1.0
6,7,0,0.305682,57,0,5710.0,,8,0,3,0,0.0
7,8,0,0.754464,39,0,0.20994,3500.0,8,0,0,0,0.0
8,9,0,0.116951,27,0,46.0,,2,0,0,0,
9,10,0,0.189169,57,0,0.606291,23684.0,9,0,4,0,2.0


In [33]:
# Drop the "Unnamed: 0" column (apparently a 1-based row counter that was
# loaded from the CSV as an ordinary column).
# Rebinding `df` instead of using inplace=True, together with errors="ignore",
# keeps this cell idempotent: re-running it once the column is already gone
# no longer raises a KeyError.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

In [34]:
# Count the NaN entries in every column and print the tally
missing_counts = df.isna().sum()
print("Missing values per column:")
print(missing_counts)

Missing values per column:
SeriousDlqin2yrs                            0
RevolvingUtilizationOfUnsecuredLines        0
age                                         0
NumberOfTime30-59DaysPastDueNotWorse        0
DebtRatio                                   0
MonthlyIncome                           29731
NumberOfOpenCreditLinesAndLoans             0
NumberOfTimes90DaysLate                     0
NumberRealEstateLoansOrLines                0
NumberOfTime60-89DaysPastDueNotWorse        0
NumberOfDependents                       3924
dtype: int64


In [35]:
# Re-inspect column dtypes and non-null counts now that "Unnamed: 0" is gone
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 11 columns):
 #   Column                                Non-Null Count   Dtype  
---  ------                                --------------   -----  
 0   SeriousDlqin2yrs                      150000 non-null  int64  
 1   RevolvingUtilizationOfUnsecuredLines  150000 non-null  float64
 2   age                                   150000 non-null  int64  
 3   NumberOfTime30-59DaysPastDueNotWorse  150000 non-null  int64  
 4   DebtRatio                             150000 non-null  float64
 5   MonthlyIncome                         120269 non-null  float64
 6   NumberOfOpenCreditLinesAndLoans       150000 non-null  int64  
 7   NumberOfTimes90DaysLate               150000 non-null  int64  
 8   NumberRealEstateLoansOrLines          150000 non-null  int64  
 9   NumberOfTime60-89DaysPastDueNotWorse  150000 non-null  int64  
 10  NumberOfDependents                    146076 non-null  float64
dtypes: float64(4), int64(7)

In [36]:
# Fill the gaps in the two columns that contain NaNs.
# MonthlyIncome (29731 missing): replace each gap with the column median,
# i.e. the middle value of the observed incomes.
imputer_income = SimpleImputer(strategy='median')
imputer_income.fit(df[['MonthlyIncome']])
df['MonthlyIncome'] = imputer_income.transform(df[['MonthlyIncome']])

# NumberOfDependents (3924 missing): replace each gap with the mode,
# i.e. the most frequent observed value.
imputer_dependents = SimpleImputer(strategy='most_frequent')
imputer_dependents.fit(df[['NumberOfDependents']])
df['NumberOfDependents'] = imputer_dependents.transform(df[['NumberOfDependents']])

In [37]:
# Re-count NaN entries per column to confirm the imputation left no gaps
remaining_missing = df.isna().sum()
print("Missing values per column:")
print(remaining_missing)

Missing values per column:
SeriousDlqin2yrs                        0
RevolvingUtilizationOfUnsecuredLines    0
age                                     0
NumberOfTime30-59DaysPastDueNotWorse    0
DebtRatio                               0
MonthlyIncome                           0
NumberOfOpenCreditLinesAndLoans         0
NumberOfTimes90DaysLate                 0
NumberRealEstateLoansOrLines            0
NumberOfTime60-89DaysPastDueNotWorse    0
NumberOfDependents                      0
dtype: int64


In [38]:
# Preview the first 10 rows after imputation
df.head(10)

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0
5,0,0.213179,74,0,0.375607,3500.0,3,0,1,0,1.0
6,0,0.305682,57,0,5710.0,5400.0,8,0,3,0,0.0
7,0,0.754464,39,0,0.20994,3500.0,8,0,0,0,0.0
8,0,0.116951,27,0,46.0,5400.0,2,0,0,0,0.0
9,0,0.189169,57,0,0.606291,23684.0,9,0,4,0,2.0


In [39]:
# Remove unrealistic records (outliers).
# A row is kept only if the borrower is an adult (age >= 18) AND the revolving
# credit utilization is at most 1.5 (150%); a row is dropped as soon as either
# check fails.
is_plausible = (df['age'] >= 18) & (df['RevolvingUtilizationOfUnsecuredLines'] <= 1.5)
# reset_index gives the filtered frame a fresh 0..n-1 index and its own copy,
# so later positional joins cannot silently misalign on the gaps left by the
# removed rows, and later column assignments do not act on a slice of the
# unfiltered frame.
df = df[is_plausible].reset_index(drop=True)

In [40]:
# Share of each class (0 / 1) in the target column, to reveal class imbalance
class_share = df['SeriousDlqin2yrs'].value_counts(normalize=True)
print("\nClass distribution (target variable):")
print(class_share)


Class distribution (target variable):
SeriousDlqin2yrs
0    0.933935
1    0.066065
Name: proportion, dtype: float64


In [41]:
# Persist the cleaned but still unscaled data (for EDA and feature engineering);
# the index is not written to the file
df.to_csv(path_or_buf="cleaned_credit_data_raw.csv", index=False)

In [43]:
# Standardize every predictor; the target column SeriousDlqin2yrs is set aside unscaled.
target = df['SeriousDlqin2yrs']
features = df.drop(columns=['SeriousDlqin2yrs'])
scaler = StandardScaler()
scaler.fit(features)
# Only the column names are passed on, so the rebuilt frame gets a fresh
# 0..n-1 index rather than the index of `features`.
# NOTE(review): the scaler is fit on the full dataset; if a train/test split
# happens downstream, consider fitting on the training portion only.
features_scaled = pd.DataFrame(scaler.transform(features), columns=features.columns)

In [44]:
# Put the unscaled target back in front of the scaled predictors.
# The target's index is reset first so both pieces line up row by row on 0..n-1.
label_column = target.reset_index(drop=True)
df_cleaned = pd.concat([label_column, features_scaled], axis=1)

In [45]:
# Inspect dtypes and non-null counts of the recombined (target + scaled features) frame
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 149399 entries, 0 to 149398
Data columns (total 11 columns):
 #   Column                                Non-Null Count   Dtype  
---  ------                                --------------   -----  
 0   SeriousDlqin2yrs                      149399 non-null  int64  
 1   RevolvingUtilizationOfUnsecuredLines  149399 non-null  float64
 2   age                                   149399 non-null  float64
 3   NumberOfTime30-59DaysPastDueNotWorse  149399 non-null  float64
 4   DebtRatio                             149399 non-null  float64
 5   MonthlyIncome                         149399 non-null  float64
 6   NumberOfOpenCreditLinesAndLoans       149399 non-null  float64
 7   NumberOfTimes90DaysLate               149399 non-null  float64
 8   NumberRealEstateLoansOrLines          149399 non-null  float64
 9   NumberOfTime60-89DaysPastDueNotWorse  149399 non-null  float64
 10  NumberOfDependents                    149399 non-null  float64
dtypes: float64(10), int64(1)

In [46]:
# Preview the first 10 rows of the scaled dataset
df_cleaned.head(10)

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,1.274394,-0.495397,0.37603,-0.172596,0.209234,0.881107,-0.06324,4.407022,-0.057627,1.141014
1,0,1.817873,-0.833868,-0.100083,-0.17293,-0.295742,-0.867216,-0.06324,-0.901744,-0.057627,0.237536
2,0,0.967278,-0.969256,0.137974,-0.172948,-0.261509,-1.255732,0.176209,-0.901744,-0.057627,-0.665941
3,0,-0.240088,-1.51081,-0.100083,-0.172972,-0.241527,-0.672958,-0.06324,-0.901744,-0.057627,-0.665941
4,0,1.675871,-0.22462,0.137974,-0.172977,4.427801,-0.284442,-0.06324,-0.01695,-0.057627,-0.665941
5,0,-0.298785,1.467734,-0.100083,-0.172805,-0.226037,-1.061474,-0.06324,-0.01695,-0.057627,0.237536
6,0,-0.035605,0.316933,-0.100083,2.62557,-0.078881,-0.090184,-0.06324,1.752639,-0.057627,-0.665941
7,0,1.241212,-0.901562,-0.100083,-0.172887,-0.226037,-0.090184,-0.06324,-0.901744,-0.057627,-0.665941
8,0,-0.572561,-1.713892,-0.100083,-0.150444,-0.078881,-1.255732,-0.06324,-0.901744,-0.057627,-0.665941
9,0,-0.367094,0.316933,-0.100083,-0.172692,1.337221,0.104074,-0.06324,2.637433,-0.057627,1.141014


In [47]:
# Preview the last 10 rows of the scaled dataset
df_cleaned.tail(10)

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
149389,0,-0.747341,-0.427703,-0.100083,-0.172691,-0.161366,-0.284442,-0.06324,-0.01695,-0.057627,1.141014
149390,0,-0.609089,0.452322,-0.100083,-0.172755,0.301865,0.298332,-0.06324,0.867845,-0.057627,-0.665941
149391,0,1.575544,-0.156926,-0.100083,1.852168,-0.078881,0.492591,-0.06324,-0.01695,-0.057627,2.044492
149392,0,1.939782,-2.052363,-0.100083,-0.172989,-0.433604,-1.44999,-0.06324,-0.901744,-0.057627,-0.665941
149393,0,0.192172,-0.156926,-0.100083,-0.172791,-0.233782,-0.284442,-0.06324,-0.901744,-0.057627,-0.665941
149394,0,-0.789574,1.467734,-0.100083,-0.172879,-0.334467,-0.867216,-0.06324,-0.01695,-0.057627,-0.665941
149395,0,-0.052497,-0.563091,-0.100083,-0.172638,-0.06463,-0.867216,-0.06324,-0.01695,-0.057627,1.141014
149396,0,-0.205281,0.384627,-0.100083,1.723758,-0.078881,1.852397,-0.06324,-0.01695,-0.057627,-0.665941
149397,0,-0.905295,-1.51081,-0.100083,-0.172989,-0.054407,-0.867216,-0.06324,-0.901744,-0.057627,-0.665941
149398,0,1.513826,0.790792,-0.100083,-0.172867,0.134727,-0.090184,-0.06324,0.867845,-0.057627,-0.665941


In [48]:
# Persist the standardized dataset (target + scaled features); the index is not written
df_cleaned.to_csv(path_or_buf="cleaned_credit_data_scaled.csv", index=False)