In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

**Uploading Training cvs file**

In [4]:
df=pd.read_csv('GiveMeSomeCredit-training.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,2,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,3,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,4,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,5,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 12 columns):
 #   Column                                Non-Null Count   Dtype  
---  ------                                --------------   -----  
 0   Unnamed: 0                            150000 non-null  int64  
 1   SeriousDlqin2yrs                      150000 non-null  int64  
 2   RevolvingUtilizationOfUnsecuredLines  150000 non-null  float64
 3   age                                   150000 non-null  int64  
 4   NumberOfTime30-59DaysPastDueNotWorse  150000 non-null  int64  
 5   DebtRatio                             150000 non-null  float64
 6   MonthlyIncome                         120269 non-null  float64
 7   NumberOfOpenCreditLinesAndLoans       150000 non-null  int64  
 8   NumberOfTimes90DaysLate               150000 non-null  int64  
 9   NumberRealEstateLoansOrLines          150000 non-null  int64  
 10  NumberOfTime60-89DaysPastDueNotWorse  150000 non-null  int64  
 11  

**Removing Unnamed Column**

In [6]:
df = df.drop('Unnamed: 0', axis=1)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 11 columns):
 #   Column                                Non-Null Count   Dtype  
---  ------                                --------------   -----  
 0   SeriousDlqin2yrs                      150000 non-null  int64  
 1   RevolvingUtilizationOfUnsecuredLines  150000 non-null  float64
 2   age                                   150000 non-null  int64  
 3   NumberOfTime30-59DaysPastDueNotWorse  150000 non-null  int64  
 4   DebtRatio                             150000 non-null  float64
 5   MonthlyIncome                         120269 non-null  float64
 6   NumberOfOpenCreditLinesAndLoans       150000 non-null  int64  
 7   NumberOfTimes90DaysLate               150000 non-null  int64  
 8   NumberRealEstateLoansOrLines          150000 non-null  int64  
 9   NumberOfTime60-89DaysPastDueNotWorse  150000 non-null  int64  
 10  NumberOfDependents                    146076 non-null  float64
dtype

**Checking and filling Null spaces will median**

In [7]:
print(df.isnull().sum())

df["MonthlyIncome"].fillna(df["MonthlyIncome"].median(), inplace=True)
df["NumberOfDependents"].fillna(df["NumberOfDependents"].median(), inplace=True)


SeriousDlqin2yrs                            0
RevolvingUtilizationOfUnsecuredLines        0
age                                         0
NumberOfTime30-59DaysPastDueNotWorse        0
DebtRatio                                   0
MonthlyIncome                           29731
NumberOfOpenCreditLinesAndLoans             0
NumberOfTimes90DaysLate                     0
NumberRealEstateLoansOrLines                0
NumberOfTime60-89DaysPastDueNotWorse        0
NumberOfDependents                       3924
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["MonthlyIncome"].fillna(df["MonthlyIncome"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["NumberOfDependents"].fillna(df["NumberOfDependents"].median(), inplace=True)


In [8]:
print(df.isnull().sum())

SeriousDlqin2yrs                        0
RevolvingUtilizationOfUnsecuredLines    0
age                                     0
NumberOfTime30-59DaysPastDueNotWorse    0
DebtRatio                               0
MonthlyIncome                           0
NumberOfOpenCreditLinesAndLoans         0
NumberOfTimes90DaysLate                 0
NumberRealEstateLoansOrLines            0
NumberOfTime60-89DaysPastDueNotWorse    0
NumberOfDependents                      0
dtype: int64


**Feature Engineering**

In [9]:
df["TotalLatePayments"] = (
    df["NumberOfTime30-59DaysPastDueNotWorse"] +
    df["NumberOfTime60-89DaysPastDueNotWorse"] +
    df["NumberOfTimes90DaysLate"]
)

df["SevereLatePayments"] = (
    df["NumberOfTimes90DaysLate"] +
    df["NumberOfTime60-89DaysPastDueNotWorse"]
)

df["IncomePerDependent"] = df["MonthlyIncome"] / (df["NumberOfDependents"] + 1)


**Defining Features and Target**

In [10]:
X=df.drop('SeriousDlqin2yrs',axis=1)
y=df['SeriousDlqin2yrs']

**Train / Validation Split**

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

**Traing Using Linear Regression**

In [12]:
model=LogisticRegression()
model.fit(X_train,y_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


**Evaluating the Model**

In [13]:
y_pred=model.predict(X_test)
y_prob=model.predict_proba(X_test)[:,1]

print(classification_report(y_test,y_pred))
print("ROC-AUC",roc_auc_score(y_test,y_prob))

              precision    recall  f1-score   support

           0       0.94      1.00      0.97     28044
           1       0.42      0.01      0.03      1956

    accuracy                           0.93     30000
   macro avg       0.68      0.51      0.50     30000
weighted avg       0.90      0.93      0.90     30000

ROC-AUC 0.6776103590063992


**Decision Tree**

In [14]:
dt=DecisionTreeClassifier(max_depth=5, min_samples_split=50, random_state=42)
dt.fit(X_train,y_train)
dt_pred=dt.predict(X_test)
dt_prob=dt.predict_proba(X_test)[:,1]

print(classification_report(y_test,dt_pred))
print("Decision Tree ROC-AUC",roc_auc_score(y_test,dt_prob))

              precision    recall  f1-score   support

           0       0.95      0.99      0.97     28044
           1       0.55      0.21      0.31      1956

    accuracy                           0.94     30000
   macro avg       0.75      0.60      0.64     30000
weighted avg       0.92      0.94      0.92     30000

Decision Tree ROC-AUC 0.8465285999593393


**Random Forest**

In [15]:
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    min_samples_split=50,
    random_state=42,
    n_jobs=-1
)

rf.fit(X_train, y_train)

rf_pred = rf.predict(X_test)
rf_prob = rf.predict_proba(X_test)[:, 1]

print(classification_report(y_test, rf_pred))
print("Random Forest ROC-AUC:", roc_auc_score(y_test, rf_prob))


              precision    recall  f1-score   support

           0       0.95      0.99      0.97     28044
           1       0.60      0.18      0.28      1956

    accuracy                           0.94     30000
   macro avg       0.77      0.59      0.62     30000
weighted avg       0.92      0.94      0.92     30000

Random Forest ROC-AUC: 0.8631879763730906


**Finding Best Parameters of ReandomForest**

In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

param_grid = {
    "n_estimators": [200, 300, 500],
    "max_depth": [8, 10, 12, None],
    "min_samples_split": [10, 30, 50],
    "min_samples_leaf": [1, 5, 10],
    "class_weight": ["balanced"]
}


In [17]:
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_grid,
    n_iter=10,          # cut in half
    scoring="roc_auc",
    cv=3,               # fewer folds
    verbose=1,
    random_state=42,
    n_jobs=-1
)


In [18]:
random_search.fit(X_train, y_train)
best_rf = random_search.best_estimator_

print("Best parameters:")
print(random_search.best_params_)

Fitting 3 folds for each of 10 candidates, totalling 30 fits




Best parameters:
{'n_estimators': 500, 'min_samples_split': 30, 'min_samples_leaf': 10, 'max_depth': 12, 'class_weight': 'balanced'}


**Evalutaing the Tuned Model**

In [19]:
y_pred = best_rf.predict(X_test)
y_prob = best_rf.predict_proba(X_test)[:, 1]

print(classification_report(y_test, y_pred))
print("Tuned RF ROC-AUC:", roc_auc_score(y_test, y_prob))


              precision    recall  f1-score   support

           0       0.98      0.86      0.91     28044
           1       0.26      0.69      0.37      1956

    accuracy                           0.85     30000
   macro avg       0.62      0.78      0.64     30000
weighted avg       0.93      0.85      0.88     30000

Tuned RF ROC-AUC: 0.8629082979157205


**Feature Importance**

In [20]:
importances = pd.Series(
    best_rf.feature_importances_,
    index=X.columns
).sort_values(ascending=False)

print(importances.head(10))

RevolvingUtilizationOfUnsecuredLines    0.251391
TotalLatePayments                       0.212573
SevereLatePayments                      0.116573
NumberOfTime30-59DaysPastDueNotWorse    0.076708
NumberOfTimes90DaysLate                 0.064230
age                                     0.057323
DebtRatio                               0.051138
IncomePerDependent                      0.038265
NumberOfOpenCreditLinesAndLoans         0.036373
MonthlyIncome                           0.035493
dtype: float64


**Implementing On testing.csv**

In [22]:
test = pd.read_csv("GiveMeSomeCredit-testing.csv")
test = test.drop(columns=["Unnamed: 0", "SeriousDlqin2yrs"], errors="ignore")

test["MonthlyIncome"] = test["MonthlyIncome"].fillna(df["MonthlyIncome"].median())
test["NumberOfDependents"] = test["NumberOfDependents"].fillna(df["NumberOfDependents"].median())

# Apply the same feature engineering steps to the test set
test["TotalLatePayments"] = (
    test["NumberOfTime30-59DaysPastDueNotWorse"] +
    test["NumberOfTime60-89DaysPastDueNotWorse"] +
    test["NumberOfTimes90DaysLate"]
)

test["SevereLatePayments"] = (
    test["NumberOfTimes90DaysLate"] +
    test["NumberOfTime60-89DaysPastDueNotWorse"]
)

test["IncomePerDependent"] = test["MonthlyIncome"] / (test["NumberOfDependents"] + 1)

test_probs = best_rf.predict_proba(test)[:, 1]

submission = pd.DataFrame({
    "Id": test.index + 1,
    "ProbabilityOfDefault": test_probs
})

submission.to_csv("credit_predictions.csv", index=False)