In [2]:
import pandas as pd
import matplotlib
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LassoCV

In [3]:
# Read the cleaned training data into `df`.
# NOTE(review): this is an absolute, machine-specific path — consider a configurable data directory.
df = pd.read_csv(
    filepath_or_buffer='C:/Users/Behzad/Documents/GitHub/Credit_Score_Prediction/Datasets/train_cleaned.csv',
)

In [3]:
# Preview the first five rows of the loaded frame.
df.head(5)

Unnamed: 0.1,Unnamed: 0,ID,Customer_ID,Month,Name,Age,SSN,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,...,Occ_Media_Manager,Occ_Musician,Occ_Scientist,Occ_Teacher,Occ_Unknown,Occ_Writer,income_zscore,Annual_Income_log,Monthly_Inhand_Salary_log,Delay_Zscore
0,0,0x1602,CUS_0xd40,January,Aaron Maashoh,23.0,821-00-0265,19114,1824.843333,3,...,0,0,1,0,0,0,-0.744333,9.858229,7.509797,-1.215926
1,1,0x1603,CUS_0xd40,February,Aaron Maashoh,23.0,821-00-0265,19114,3093.745,3,...,0,0,1,0,0,0,-0.315848,9.858229,8.037461,-1.485103
2,2,0x1604,CUS_0xd40,March,Aaron Maashoh,33.0,821-00-0265,19114,3093.745,3,...,0,0,1,0,0,0,-0.315848,9.858229,8.037461,-1.215926
3,3,0x1605,CUS_0xd40,April,Aaron Maashoh,23.0,821-00-0265,19114,3093.745,3,...,0,0,1,0,0,0,-0.315848,9.858229,8.037461,-1.081337
4,4,0x1606,CUS_0xd40,May,Aaron Maashoh,23.0,821-00-0265,19114,1824.843333,3,...,0,0,1,0,0,0,-0.744333,9.858229,7.509797,-1.014043


In [4]:
# Print the column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 68 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Unnamed: 0                   100000 non-null  int64  
 1   ID                           100000 non-null  object 
 2   Customer_ID                  100000 non-null  object 
 3   Month                        100000 non-null  object 
 4   Name                         100000 non-null  object 
 5   Age                          100000 non-null  float64
 6   SSN                          100000 non-null  object 
 7   Annual_Income                100000 non-null  int64  
 8   Monthly_Inhand_Salary        100000 non-null  float64
 9   Num_Bank_Accounts            100000 non-null  int64  
 10  Num_Credit_Card              100000 non-null  int64  
 11  Interest_Rate                100000 non-null  int64  
 12  Num_of_Loan                  100000 non-null  int64  
 13  

In [None]:
# Column groups used by the encoding / preprocessing cells below.

# Numeric features: these are the columns handed to the StandardScaler step.
num_cols = [
    "Age", "Annual_Income", "Monthly_Inhand_Salary", "Num_Bank_Accounts", "Num_Credit_Card",
    "Interest_Rate", "Num_of_Loan", "Delay_from_due_date", "Num_of_Delayed_Payment",
    "Changed_Credit_Limit", "Num_Credit_Inquiries", "Outstanding_Debt", "Credit_Utilization_Ratio",
    "Credit_History_Age", "Total_EMI_per_month", "Amount_invested_monthly", "Monthly_Balance",

    # Time-based numerical features
    "Month_num", "Month_sin", "Month_cos",

    # Engineered features (statistical transformations)
    "income_zscore", "Annual_Income_log", "Monthly_Inhand_Salary_log", "Delay_Zscore",
]

# Categorical features that are label-encoded column by column.
cat_cols = ["Month", "Credit_Mix", "Payment_of_Min_Amount"]

# Indicator columns for loan types and occupations
# (presumably already 0/1 encoded in the cleaned CSV — TODO confirm for the loan columns).
one_hot_cols = [
    "Auto Loan", "Credit-Builder Loan", "Personal Loan", "Not Specified", "Debt Consolidation Loan",
    "Payday Loan", "Student Loan", "Home Equity Loan", "Mortgage Loan", "No Loan",
    # The comma after "No Loan" is required: without it Python concatenates the adjacent
    # string literals into the single, non-existent name "No LoanOcc_Architect".

    "Occ_Architect", "Occ_Developer", "Occ_Doctor", "Occ_Engineer", "Occ_Entrepreneur", "Occ_Journalist",
    "Occ_Lawyer", "Occ_Manager", "Occ_Mechanic", "Occ_Media_Manager", "Occ_Musician", "Occ_Scientist",
    "Occ_Teacher", "Occ_Unknown", "Occ_Writer",
]

# Target column, kept as a list so it can be passed straight to DataFrame.drop(columns=...).
target_col = ["Credit_Score"]

In [5]:
# Replace the Credit_Score labels with integer class codes.
# The fitted encoder stays available as `label_encoder`.
label_encoder = LabelEncoder()
label_encoder.fit(df["Credit_Score"])
df["Credit_Score"] = label_encoder.transform(df["Credit_Score"])

In [6]:
# Label-encode each categorical column with its own encoder.
# Every fitted encoder is stored in `cat_encoders` (keyed by column name) so the same
# mapping can later be applied to other data or inverted; previously each encoder was
# overwritten on the next loop iteration and the mappings were lost.
cat_encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    cat_encoders[col] = le

In [7]:
# Preprocessing: a single "num" step that standardizes the columns in num_cols.
# No other transformer is registered, so with ColumnTransformer's default
# remainder="drop" only the scaled numeric columns come out of this step.
preprocessor = ColumnTransformer(
    transformers=[("num", StandardScaler(), num_cols)],
)

In [8]:
# Fit the preprocessor on everything except the target, then wrap the result in a
# DataFrame whose column labels are the transformer's output feature names.
X_transformed = preprocessor.fit_transform(df.drop(target_col, axis=1))
df_scaled = pd.DataFrame(
    data=X_transformed,
    columns=preprocessor.get_feature_names_out(),
)


In [9]:
# Feature matrix: every column of the scaled frame that is not object-typed.
X = df_scaled.select_dtypes(exclude="object")

In [10]:
# Target vector: the Credit_Score column of df.
y = df.loc[:, "Credit_Score"]

In [11]:
# Random forest that serves as the estimator inside RFE; the seed is fixed.
model1 = RandomForestClassifier(
    random_state=42,
)

In [12]:
# Recursive Feature Elimination (RFE) around model1, keeping two features.
rfe = RFE(model1, n_features_to_select=2)
# Fit the selector, then reduce X to the retained columns.
X_selected = rfe.fit(X, y).transform(X)

In [13]:
# Column labels that RFE retained, taken from the selector's boolean support mask.
selected_features = X.columns[rfe.get_support()]
print(f"Selected Features: {selected_features.tolist()}")

Selected Features: ['num__Outstanding_Debt', 'num__Credit_Utilization_Ratio']


In [14]:
# Second random forest (same seed), fitted on all features to read importances from.
model2 = RandomForestClassifier(
    random_state=42,
)

In [15]:
# Fit the random forest on the full scaled feature matrix X and the encoded target y.
model2.fit(X, y)

In [16]:
# Importance scores exposed by the fitted forest, one per column of X.
feature_importances = model2.feature_importances_

# Label the scores with the feature names and order them from highest to lowest.
feature_ranking = (
    pd.Series(data=feature_importances, index=X.columns)
    .sort_values(ascending=False)
)

In [17]:
# Print the heading followed by the full importance ranking on the next line.
print("Feature Importances:", feature_ranking, sep="\n")

Feature Importances:
num__Outstanding_Debt             0.107338
num__Interest_Rate                0.087533
num__Delay_from_due_date          0.050210
num__Changed_Credit_Limit         0.048245
num__Delay_Zscore                 0.046839
num__Num_Credit_Card              0.044969
num__Num_Credit_Inquiries         0.044967
num__Num_of_Delayed_Payment       0.044453
num__Credit_History_Age           0.042153
num__Monthly_Balance              0.040169
num__Total_EMI_per_month          0.040002
num__Credit_Utilization_Ratio     0.039961
num__Amount_invested_monthly      0.038932
num__Annual_Income                0.038055
num__Num_Bank_Accounts            0.038019
num__Age                          0.037828
num__Annual_Income_log            0.037820
num__income_zscore                0.030824
num__Monthly_Inhand_Salary_log    0.030751
num__Monthly_Inhand_Salary        0.030487
num__Num_of_Loan                  0.025439
num__Month_num                    0.020850
num__Month_cos                   

In [18]:
# Lasso-based feature selection: LassoCV configured with 5 folds and a fixed seed,
# fitted on the scaled features X against y.
# NOTE(review): y is the label-encoded Credit_Score, so this regression treats the
# class codes as a numeric scale — confirm that the code ordering is meaningful.
lasso = LassoCV(cv=5, random_state=42)  # 5-fold cross-validation
lasso.fit(X, y)

In [19]:
# Fitted Lasso coefficients, labelled with the feature names of X.
lasso_coeffs = pd.Series(data=lasso.coef_, index=X.columns)

# Keep only the features whose coefficient is not zero, largest coefficient first.
selected_features = lasso_coeffs.loc[lasso_coeffs.ne(0)].sort_values(ascending=False)

In [22]:
# Print the Lasso selection: `selected_features` is now the Series of non-zero
# coefficients (the name was previously bound to the RFE column Index).
print(f"Selected Features: {selected_features}")

Selected Features: num__Changed_Credit_Limit         0.143203
num__Num_of_Delayed_Payment       0.124748
num__Num_Bank_Accounts            0.039877
num__Monthly_Inhand_Salary_log    0.039418
num__Interest_Rate                0.028168
num__Num_Credit_Card              0.028027
num__Month_cos                    0.023580
num__Monthly_Balance              0.015961
num__Delay_from_due_date          0.014729
num__Annual_Income                0.011435
num__Month_sin                    0.006901
num__Month_num                    0.001975
num__Credit_Utilization_Ratio     0.000142
num__Amount_invested_monthly     -0.000718
num__Num_of_Loan                 -0.002592
num__Total_EMI_per_month         -0.002680
num__Annual_Income_log           -0.015670
num__Age                         -0.016246
num__Credit_History_Age          -0.020122
num__Num_Credit_Inquiries        -0.035409
num__Monthly_Inhand_Salary       -0.051123
num__Outstanding_Debt            -0.139928
dtype: float64
