In [1]:
import torch
import pandas as pd
import numpy as np 

# EDA

In [2]:
df = pd.read_csv("Data/train.csv")
df.head()

Unnamed: 0,id,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,gender,marital_status,education_level,employment_status,loan_purpose,grade_subgrade,loan_paid_back
0,0,29367.99,0.084,736,2528.42,13.67,Female,Single,High School,Self-employed,Other,C3,1.0
1,1,22108.02,0.166,636,4593.1,12.92,Male,Married,Master's,Employed,Debt consolidation,D3,0.0
2,2,49566.2,0.097,694,17005.15,9.76,Male,Single,High School,Employed,Debt consolidation,C5,1.0
3,3,46858.25,0.065,533,4682.48,16.1,Female,Single,High School,Employed,Debt consolidation,F1,1.0
4,4,25496.7,0.053,665,12184.43,10.21,Male,Married,High School,Employed,Other,D1,1.0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 593994 entries, 0 to 593993
Data columns (total 13 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    593994 non-null  int64  
 1   annual_income         593994 non-null  float64
 2   debt_to_income_ratio  593994 non-null  float64
 3   credit_score          593994 non-null  int64  
 4   loan_amount           593994 non-null  float64
 5   interest_rate         593994 non-null  float64
 6   gender                593994 non-null  object 
 7   marital_status        593994 non-null  object 
 8   education_level       593994 non-null  object 
 9   employment_status     593994 non-null  object 
 10  loan_purpose          593994 non-null  object 
 11  grade_subgrade        593994 non-null  object 
 12  loan_paid_back        593994 non-null  float64
dtypes: float64(5), int64(2), object(6)
memory usage: 58.9+ MB


In [4]:
df["education_level"].value_counts()

education_level
Bachelor's     279606
High School    183592
Master's        93097
Other           26677
PhD             11022
Name: count, dtype: int64

In [5]:
df["employment_status"].value_counts()

employment_status
Employed         450645
Unemployed        62485
Self-employed     52480
Retired           16453
Student           11931
Name: count, dtype: int64

In [6]:
df["loan_purpose"].value_counts()

loan_purpose
Debt consolidation    324695
Other                  63874
Car                    58108
Home                   44118
Education              36641
Business               35303
Medical                22806
Vacation                8449
Name: count, dtype: int64

In [7]:
df["grade_subgrade"].value_counts()

grade_subgrade
C3    58695
C4    55957
C2    54443
C1    53363
C5    53317
D1    37029
D3    36694
D4    35097
D2    34432
D5    32101
B2    15167
B1    14344
B5    13937
B3    13926
B4    13877
E4     8036
E3     7075
E1     6891
E2     6372
E5     6084
F5     5947
F4     5535
F1     5534
F2     5203
F3     5082
A5     2471
A3     2066
A2     2018
A4     1701
A1     1600
Name: count, dtype: int64

In [8]:
len(df["grade_subgrade"])

593994

# Observations : 
- id remove
- Range of data varies -> resonates with Standard Scaler 
- Cat Cols
> - Marital Status -> 0/1 1 if married , 0 if not 
> - Education level -> One Hot (Design Choice : Multiple Categories)
> - Employment Status -> One Hot (Design Choice : Multiple Categories)
> - Loan Purpose -> One Hot (Design Choice : Multiple Categories)
> - GradeSubgrade -> we would make 2 features one is Gradetype where we store the Letter Grade (A,B,C) and a SubGrade Type (1,2,3): As strings as they are category and then use One Hot to classify them as category 

-> HeHe, This is all the basic things you can notice from the Data above. I think we might be able to make more deduction once we enter the realm of statistics.....

# Preprocessing Data with Pipelines

## Preprocessing Functions 

In [9]:
# function to grade/subgrade division and marital Status 
def marital_grade(df:pd.DataFrame)->pd.DataFrame:
    df = df.copy()
    df["marital_status"] = (df["marital_status"] == "Married").astype(int)
    df["grade"] = [g[0] for g in df["grade_subgrade"]]
    df["sub_grade"] = [g[1] for g in df["grade_subgrade"]]
    df.drop(columns=["grade_subgrade"], inplace=True)
    return df

In [10]:
X = df.drop(columns=['loan_paid_back'])
y = df['loan_paid_back']

In [11]:
# test the function : 
df = marital_grade(df)
df.head()

Unnamed: 0,id,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,gender,marital_status,education_level,employment_status,loan_purpose,loan_paid_back,grade,sub_grade
0,0,29367.99,0.084,736,2528.42,13.67,Female,0,High School,Self-employed,Other,1.0,C,3
1,1,22108.02,0.166,636,4593.1,12.92,Male,1,Master's,Employed,Debt consolidation,0.0,D,3
2,2,49566.2,0.097,694,17005.15,9.76,Male,0,High School,Employed,Debt consolidation,1.0,C,5
3,3,46858.25,0.065,533,4682.48,16.1,Female,0,High School,Employed,Debt consolidation,1.0,F,1
4,4,25496.7,0.053,665,12184.43,10.21,Male,1,High School,Employed,Other,1.0,D,1


In [12]:
NUM_FEATURES = df.select_dtypes(include=["int64","Float64"]).columns.to_list()
NUM_FEATURES.remove("loan_paid_back")
NUM_FEATURES.remove("id")
CAT_FEATURES = df.select_dtypes(include=["object"]).columns.to_list()

In [13]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,StandardScaler,FunctionTransformer
import joblib
from sklearn.compose import ColumnTransformer

encoder = OneHotEncoder()
scaler = StandardScaler()
cat_pipeline = Pipeline([
    ('encoder', OneHotEncoder(handle_unknown='ignore',sparse_output=True)),
])

num_pipeline = Pipeline([
    ('scaler', StandardScaler())
])


## Applying those Functions

In [14]:
# Write the preprocessing Pipeline -> basic Pipeline Nothing Fancy Here 
preprocessing_pipeline = ColumnTransformer([
    ('cat', cat_pipeline, CAT_FEATURES),
    ('num', num_pipeline, NUM_FEATURES)
])
if not isinstance(df, pd.DataFrame):
    df = pd.DataFrame(df)
df = preprocessing_pipeline.fit_transform(df)
df = pd.DataFrame(df)

In [15]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,28,29,30,31,32,33,34,35,36,37
0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,-0.705461,-0.535135,0.993849,-1.803484,0.653899,-0.935547
1,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,-0.977248,0.660668,-0.810394,-1.505401,0.280571,1.068894
2,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.050689,-0.345556,0.236067,0.286558,-1.292385,-0.935547
3,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,-0.050687,-0.812211,-2.668764,-1.492497,1.863482,-0.935547
4,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,-0.850388,-0.987206,-0.287163,-0.409421,-1.068388,1.068894


# Observations : 
- Pipeline Works ... 
- Okay , with respect to the fact that we only need the Num Data for model i think it work 
- Now we need to split the data and train the Model
- there 38 features in the dataset

# Splitting the Data

In [16]:
from sklearn.model_selection import train_test_split

X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

In [17]:
print(f"Train: {X_train.shape}, Validation: {X_val.shape}, Test: {X_test.shape}")

Train: (475195, 12), Validation: (59399, 12), Test: (59400, 12)


In [18]:
def preprocess_fn(df:pd.DataFrame)->pd.DataFrame:
    df = df.copy()
    df = marital_grade(df)
    df = preprocessing_pipeline.transform(df)
    return df

In [19]:
X_train_prep = preprocess_fn(X_train)
X_val_prep   = preprocess_fn(X_val)
X_test_prep  = preprocess_fn(X_test)

print("Train:", X_train_prep.shape)
print("Val:", X_val_prep.shape)
print("Test:", X_test_prep.shape)

Train: (475195, 38)
Val: (59399, 38)
Test: (59400, 38)


# Model Training 

# Generate Submission File

In [29]:
# Load the test data
test_df = pd.read_csv("Data/test.csv")
print(f"Test data shape: {test_df.shape}")
test_df.head()

Test data shape: (254569, 12)


Unnamed: 0,id,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,gender,marital_status,education_level,employment_status,loan_purpose,grade_subgrade
0,593994,28781.05,0.049,626,11461.42,14.73,Female,Single,High School,Employed,Other,D5
1,593995,46626.39,0.093,732,15492.25,12.85,Female,Married,Master's,Employed,Other,C1
2,593996,54954.89,0.367,611,3796.41,13.29,Male,Single,Bachelor's,Employed,Debt consolidation,D1
3,593997,25644.63,0.11,671,6574.3,9.57,Female,Single,Bachelor's,Employed,Debt consolidation,C3
4,593998,25169.64,0.081,688,17696.89,12.8,Female,Married,PhD,Employed,Business,C1


In [30]:
# Store the IDs before preprocessing
test_ids = test_df['id'].values
print(f"Number of test samples: {len(test_ids)}")

Number of test samples: 254569


In [31]:
# Preprocess the test data
# Remove the id column before preprocessing
test_features = test_df.drop(columns=['id'])
test_preprocessed = preprocess_fn(test_features)
print(f"Preprocessed test data shape: {test_preprocessed.shape}")

Preprocessed test data shape: (254569, 38)


In [32]:
# Load the best model from GridSearchCV
import joblib

best_model = joblib.load("xgboost_gpu_best_model.pkl")
print("✅ Best model loaded successfully!")

✅ Best model loaded successfully!


In [33]:
# Make predictions on the test data
# For binary classification, we predict class labels (0 or 1)
test_predictions = best_model.predict(test_preprocessed)

print(f"Predictions shape: {test_predictions.shape}")
print(f"Unique predictions: {np.unique(test_predictions, return_counts=True)}")

Predictions shape: (254569,)
Unique predictions: (array([0, 1]), array([ 34937, 219632]))


In [34]:
# Create the submission DataFrame
submission = pd.DataFrame({
    'id': test_ids,
    'loan_paid_back': test_predictions
})

# Verify the submission format
print(f"Submission shape: {submission.shape}")
submission.head(10)

Submission shape: (254569, 2)


Unnamed: 0,id,loan_paid_back
0,593994,1
1,593995,1
2,593996,1
3,593997,1
4,593998,1
5,593999,1
6,594000,1
7,594001,1
8,594002,1
9,594003,0


In [35]:
# Save the submission file
submission.to_csv('submission.csv', index=False)
print("✅ Submission file saved as 'submission.csv'")

# Verify the file was created
import os
if os.path.exists('submission.csv'):
    file_size = os.path.getsize('submission.csv')
    print(f"File size: {file_size:,} bytes")
    print(f"Number of predictions: {len(submission)}")

✅ Submission file saved as 'submission.csv'
File size: 2,291,139 bytes
Number of predictions: 254569


In [36]:
# Quick sanity check: compare with sample submission
sample_submission = pd.read_csv("Data/sample_submission.csv")
print(f"Sample submission shape: {sample_submission.shape}")
print(f"Our submission shape: {submission.shape}")
print(f"\nColumns match: {list(submission.columns) == list(sample_submission.columns)}")
print(f"IDs match: {(submission['id'].values == sample_submission['id'].values).all()}")

Sample submission shape: (254569, 2)
Our submission shape: (254569, 2)

Columns match: True
IDs match: True
