In [1]:
import pandas as pd
import numpy as np
import matplotlib
from matplotlib import pylab as plt

df = pd.read_csv('train.csv')
print ("Number of Columns: ", df.shape[1])
print ("Number of Rows: ", df.shape[0])

Number of Columns:  28
Number of Rows:  316970


In [2]:
# Drop the 'Unnamed: 0' column
df = df.drop(columns=['Unnamed: 0'])

In [3]:
# Fill missing values with 'Unknown' for categorical columns
df['emp_title'] = df['emp_title'].fillna('Unknown')
df['title'] = df['title'].fillna('Unknown')

# Fill missing values for emp_length with '< 1 year'
df['emp_length'] = df['emp_length'].fillna('< 1 year')

# Drop rows with missing values for these
df = df.dropna(subset=['revol_util', 'mort_acc', 'pub_rec_bankruptcies'])

In [4]:
print ("Number of Columns: ", df.shape[1])
print ("Number of Rows: ", df.shape[0])

Number of Columns:  27
Number of Rows:  286584


In [5]:
import datetime
from dateutil.relativedelta import relativedelta

target_date = datetime.datetime(2025, 3, 1)

# Month name to number mapping
month_map = {
    'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6,
    'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12
}

# Function to parse dates in MON-YR format and calculate months difference
def calculate_months_from_mar_2025(date_str):
    parts = date_str.split('-')
    month = month_map[parts[0]]
    year = int(parts[1])
    
    # Handle 2-digit years
    if year < 25:
        year += 2000  # Assume 20xx for years less than 25
    elif year < 100:
        year += 1900  # Assume 19xx for years between 25 and 99
    
    date_from = datetime.datetime(year, month, 1)
    diff = relativedelta(target_date, date_from)
    return diff.years * 12 + diff.months

# Calculate months difference from each date to the target date
df['cr_months_from_mar_2025'] = df['earliest_cr_line'].apply(calculate_months_from_mar_2025)

df[['cr_months_from_mar_2025', 'earliest_cr_line']]



Unnamed: 0,cr_months_from_mar_2025,earliest_cr_line
0,256,Nov-2003
1,309,Jun-1999
2,317,Oct-1998
3,303,Dec-1999
4,440,Jul-1988
...,...,...
316964,588,Mar-1976
316965,294,Sep-2000
316966,271,Aug-2002
316967,282,Sep-2001


In [6]:
df['issue_months_from_mar_2025'] = df['issue_d'].apply(calculate_months_from_mar_2025)

df[['issue_months_from_mar_2025', 'issue_d']]

Unnamed: 0,issue_months_from_mar_2025,issue_d
0,137,Oct-2013
1,133,Feb-2014
2,108,Mar-2016
3,136,Nov-2013
4,150,Sep-2012
...,...,...
316964,136,Nov-2013
316965,124,Nov-2014
316966,117,Jun-2015
316967,115,Aug-2015


In [7]:
df['cr_months_from_mar_2025'].describe()

count    286584.000000
mean        320.644534
std          86.886285
min         137.000000
25%         261.000000
50%         304.000000
75%         364.000000
max         974.000000
Name: cr_months_from_mar_2025, dtype: float64

In [8]:
df['issue_months_from_mar_2025'].describe()

count    286584.000000
mean        129.030260
std          12.849471
min          99.000000
25%         119.000000
50%         129.000000
75%         139.000000
max         156.000000
Name: issue_months_from_mar_2025, dtype: float64

In [9]:
# Define target variable
y_target = df['loan_status']
y_target = df['loan_status'].map({'Charged Off': 1, 'Fully Paid': 0})


# Define feature set, excluding 'loan_status' and 'grade' (already in 'sub_grade')
X_features = df.drop(columns=['loan_status', 'grade', 'issue_d', 'earliest_cr_line', 'address', 'emp_title', 'title', 'issue_d', 'earliest_cr_line'])

In [10]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, MinMaxScaler

# Define categorical feature groups
onehot_ftrs = ['term', 'home_ownership', 'verification_status', 
               'purpose', 'initial_list_status', 
               'application_type']

ordinal_ftrs = ['sub_grade', 'emp_length']
ordinal_cats = [['G5', 'G4', 'G3', 'G2', 'G1', 'F5', 'F4', 'F3', 'F2', 'F1', 
                 'E5', 'E4', 'E3', 'E2', 'E1', 'D5', 'D4', 'D3', 'D2', 'D1', 
                 'C5', 'C4', 'C3', 'C2', 'C1', 'B5', 'B4', 'B3', 'B2', 'B1', 
                 'A5', 'A4', 'A3', 'A2', 'A1'], 
                ['< 1 year', '1 year', '2 years', '3 years', '4 years', 
                 '5 years', '6 years', '7 years', '8 years', '9 years', '10+ years']]

minmax_ftrs = ['loan_amnt', 'int_rate', 'installment', 'dti', 
               'open_acc', 'pub_rec', 'revol_util', 'total_acc', 
               'mort_acc', 'pub_rec_bankruptcies', 'cr_months_from_mar_2025', 'issue_months_from_mar_2025']

standard_ftrs = ['annual_inc', 'revol_bal']

print("OneHot Features:", len(onehot_ftrs))
print("Ordinal Features:", len(ordinal_ftrs))
print("MinMax Features:", len(minmax_ftrs))
print("Standard Features:", len(standard_ftrs))

# Define preprocessor correctly
preprocessor = ColumnTransformer(
    transformers=[
        ('ord', OrdinalEncoder(categories=ordinal_cats), ordinal_ftrs),
        ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), onehot_ftrs),
        ('minmax', MinMaxScaler(), minmax_ftrs),
        ('standard', StandardScaler(), standard_ftrs) 
    ])

# Define pipeline correctly
clf = Pipeline(steps=[('preprocessor', preprocessor)])



# Fit and transform the data using the correct pipeline
X_transformed = clf.named_steps['preprocessor'].fit_transform(X_features)

# Get the shape of the transformed X
print("Shape of the transformed X:", X_transformed.shape)


OneHot Features: 6
Ordinal Features: 2
MinMax Features: 12
Standard Features: 2
Shape of the transformed X: (286584, 46)


In [12]:
import numpy as np

# Get feature names for each transformer in ColumnTransformer
ordinal_names = preprocessor.transformers_[0][1].get_feature_names_out(ordinal_ftrs)
onehot_names = preprocessor.transformers_[1][1].get_feature_names_out(onehot_ftrs)
minmax_names = np.array(minmax_ftrs)  
standard_names = np.array(standard_ftrs)  

# Concatenate all feature names
all_feature_names = np.concatenate([ordinal_names, onehot_names, minmax_names, standard_names])

# Print the feature names
print("Transformed Feature Names (X_transformed):")
for i, name in enumerate(all_feature_names, 1):
    print(f"{i}. {name}")


Transformed Feature Names (X_transformed):
1. sub_grade
2. emp_length
3. term_ 36 months
4. term_ 60 months
5. home_ownership_ANY
6. home_ownership_MORTGAGE
7. home_ownership_NONE
8. home_ownership_OTHER
9. home_ownership_OWN
10. home_ownership_RENT
11. verification_status_Not Verified
12. verification_status_Source Verified
13. verification_status_Verified
14. purpose_car
15. purpose_credit_card
16. purpose_debt_consolidation
17. purpose_educational
18. purpose_home_improvement
19. purpose_house
20. purpose_major_purchase
21. purpose_medical
22. purpose_moving
23. purpose_other
24. purpose_renewable_energy
25. purpose_small_business
26. purpose_vacation
27. purpose_wedding
28. initial_list_status_f
29. initial_list_status_w
30. application_type_DIRECT_PAY
31. application_type_INDIVIDUAL
32. application_type_JOINT
33. loan_amnt
34. int_rate
35. installment
36. dti
37. open_acc
38. pub_rec
39. revol_util
40. total_acc
41. mort_acc
42. pub_rec_bankruptcies
43. cr_months_from_mar_2025
44.

In [22]:
import numpy as np
from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

# Define the number of random states
nr_states = 5
test_scores = np.zeros(nr_states)
final_models = []

# Loop through different random states
for i in range(nr_states):
    print('Random state ' + str(i+1))
    
    # First split: Training and remaining dataset (Stratified)
    X_train, X_other, y_train, y_other = train_test_split(
        X_transformed, y_target, train_size=0.6, random_state=42+i, stratify=y_target
    )
    
    # Second split: Validation and test set (Stratified)
    X_val, X_test, y_val, y_test = train_test_split(
        X_other, y_other, train_size=0.5, random_state=42+i, stratify=y_other
    )
    
    # Define hyperparameter grid
    param_grid = {
        'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
        'penalty': ['l1', 'l2'],
        'solver': ['liblinear']  # 'liblinear' supports both l1 and l2 penalties
    }
    
    # Store validation scores
    val_scores = np.zeros(len(ParameterGrid(param_grid)))
    models = []
    
    # Loop through hyperparameter combinations
    for p, params in enumerate(ParameterGrid(param_grid)):
        print('  ', params)
        clf = LogisticRegression(**params, random_state=42+i, class_weight='balanced', max_iter=1000)
        clf.fit(X_train, y_train)
        
        # Predictions and F1 score calculations
        y_train_pred = clf.predict(X_train)
        y_val_pred = clf.predict(X_val)
        
        train_score = f1_score(y_train, y_train_pred, average='weighted')
        val_scores[p] = f1_score(y_val, y_val_pred, average='weighted')
        
        print('  Train F1 Score:', train_score, 'Validation F1 Score:', val_scores[p])
        models.append(clf)
    
    # Select the best model based on validation F1 score
    best_model_idx = np.argmax(val_scores)
    final_models.append(models[best_model_idx])
    print('Best model parameters:', ParameterGrid(param_grid)[best_model_idx])
    print('Corresponding validation F1 score:', np.max(val_scores))
    
    # Evaluate the best model on the test set
    y_test_pred = final_models[-1].predict(X_test)
    test_scores[i] = f1_score(y_test, y_test_pred, average='weighted')
    print('Test F1 score:', test_scores[i])

# Display final test scores
print('Final test F1 scores:', test_scores)


Random state 1
   {'C': 0.001, 'penalty': 'l1', 'solver': 'liblinear'}
  Train F1 Score: 0.6988430509283448 Validation F1 Score: 0.6992948714150761
   {'C': 0.001, 'penalty': 'l2', 'solver': 'liblinear'}
  Train F1 Score: 0.6950163961258269 Validation F1 Score: 0.6958566903294121
   {'C': 0.01, 'penalty': 'l1', 'solver': 'liblinear'}
  Train F1 Score: 0.6942669106857882 Validation F1 Score: 0.6943690384430652
   {'C': 0.01, 'penalty': 'l2', 'solver': 'liblinear'}
  Train F1 Score: 0.6960455713437255 Validation F1 Score: 0.6959842766844593
   {'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'}
  Train F1 Score: 0.7024106767992642 Validation F1 Score: 0.7026361031931873
   {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
  Train F1 Score: 0.700673281109374 Validation F1 Score: 0.7003075299073298
   {'C': 1, 'penalty': 'l1', 'solver': 'liblinear'}
  Train F1 Score: 0.7049653605229952 Validation F1 Score: 0.7057429272880819
   {'C': 1, 'penalty': 'l2', 'solver': 'liblinear'}
  Train F1 Sc

In [12]:
import numpy as np
from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

# Define the number of random states
nr_states = 5
test_scores = np.zeros(nr_states)
final_models = []

# Loop through different random states
for i in range(nr_states):
    print('Random state ' + str(i+1))
    
    # First split: Training and remaining dataset (Stratified)
    X_train, X_other, y_train, y_other = train_test_split(
        X_transformed, y_target, train_size=0.6, random_state=42+i, stratify=y_target
    )
    
    # Second split: Validation and test set (Stratified)
    X_val, X_test, y_val, y_test = train_test_split(
        X_other, y_other, train_size=0.5, random_state=42+i, stratify=y_other
    )
    
    # Define hyperparameter grid
    param_grid = {
        'max_depth': [3, 10, 30, 100],
        'max_features': [0.25, 0.5, 0.75, 1.0]
    }
    
    # Store validation scores
    val_scores = np.zeros(len(ParameterGrid(param_grid)))
    models = []
    
    # Loop through hyperparameter combinations
    for p, params in enumerate(ParameterGrid(param_grid)):
        print('  ', params)
        clf = RandomForestClassifier(**params, random_state=42+i, n_jobs=-1, class_weight='balanced')
        clf.fit(X_train, y_train)
        
        # Predictions and F1 score calculations
        y_train_pred = clf.predict(X_train)
        y_val_pred = clf.predict(X_val)
        
        train_score = f1_score(y_train, y_train_pred, average='weighted')
        val_scores[p] = f1_score(y_val, y_val_pred, average='weighted')
        
        print('  Train F1 Score:', train_score, 'Validation F1 Score:', val_scores[p])
        models.append(clf)
    
    # Select the best model based on validation F1 score
    best_model_idx = np.argmax(val_scores)
    final_models.append(models[best_model_idx])
    print('Best model parameters:', ParameterGrid(param_grid)[best_model_idx])
    print('Corresponding validation F1 score:', np.max(val_scores))
    
    # Evaluate the best model on the test set
    y_test_pred = final_models[-1].predict(X_test)
    test_scores[i] = f1_score(y_test, y_test_pred, average='weighted')
    print('Test F1 score:', test_scores[i])

# Display final test scores
print('Final test F1 scores:', test_scores)


Random state 1
   {'max_depth': 3, 'max_features': 0.25}
  Train F1 Score: 0.7082177176611554 Validation F1 Score: 0.7082273740625324
   {'max_depth': 3, 'max_features': 0.5}
  Train F1 Score: 0.7082177176611554 Validation F1 Score: 0.7082273740625324
   {'max_depth': 3, 'max_features': 0.75}
  Train F1 Score: 0.7082177176611554 Validation F1 Score: 0.7082273740625324
   {'max_depth': 3, 'max_features': 1.0}
  Train F1 Score: 0.7082177176611554 Validation F1 Score: 0.7082273740625324
   {'max_depth': 10, 'max_features': 0.25}
  Train F1 Score: 0.740586742252267 Validation F1 Score: 0.7303793724033548
   {'max_depth': 10, 'max_features': 0.5}
  Train F1 Score: 0.7468425575482024 Validation F1 Score: 0.7356623202830181
   {'max_depth': 10, 'max_features': 0.75}
  Train F1 Score: 0.7460137793139934 Validation F1 Score: 0.7353930522915101
   {'max_depth': 10, 'max_features': 1.0}
  Train F1 Score: 0.7467599357524586 Validation F1 Score: 0.7351490576536244
   {'max_depth': 30, 'max_features

In [15]:
import numpy as np
from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.svm import SVC
from sklearn.metrics import f1_score

# Define the number of random states
nr_states = 5
test_scores = np.zeros(nr_states)
final_models = []

# Loop through different random states
for i in range(nr_states):
    print('Random state ' + str(i+1))

    # Reduce dataset size to 15% while keeping class distribution balanced (Stratified)
    X_reduced, _, y_reduced, _ = train_test_split(
        X_transformed, y_target, train_size=0.15, random_state=42+i, stratify=y_target
    )

    # First split: Training and remaining dataset (Stratified)
    X_train, X_other, y_train, y_other = train_test_split(
        X_reduced, y_reduced, train_size=0.6, random_state=42+i, stratify=y_reduced
    )

    # Second split: Validation and test set (50% of remaining → 3.75% val, 3.75% test) (Stratified)
    X_val, X_test, y_val, y_test = train_test_split(
        X_other, y_other, train_size=0.5, random_state=42+i, stratify=y_other
    )
    
    # Define hyperparameter grid for SVC
    param_grid = {
        'C': [0.1, 1, 10, 100], 
        'kernel': ['linear', 'rbf'], 
        'gamma': ['scale']
    }

    # Store validation scores
    val_scores = np.zeros(len(ParameterGrid(param_grid)))
    models = []
    
    # Loop through hyperparameter combinations
    for p, params in enumerate(ParameterGrid(param_grid)):
        print('  ', params)
        clf = SVC(**params, random_state=42+i, class_weight='balanced')
        clf.fit(X_train, y_train)
        
        # Predictions and F1 score calculations
        y_train_pred = clf.predict(X_train)
        y_val_pred = clf.predict(X_val)
        
        train_score = f1_score(y_train, y_train_pred, average='weighted')
        val_scores[p] = f1_score(y_val, y_val_pred, average='weighted')
        
        print('  Train F1 Score:', train_score, 'Validation F1 Score:', val_scores[p])
        models.append(clf)
    
    # Select the best model based on validation F1 score
    best_model_idx = np.argmax(val_scores)
    final_models.append(models[best_model_idx])
    print('Best model parameters:', ParameterGrid(param_grid)[best_model_idx])
    print('Corresponding validation F1 score:', np.max(val_scores))
    
    # Evaluate the best model on the test set
    y_test_pred = final_models[-1].predict(X_test)
    test_scores[i] = f1_score(y_test, y_test_pred, average='weighted')
    print('Test F1 score:', test_scores[i])

# Display final test scores
print('Final test F1 scores:', test_scores)


Random state 1
   {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}
  Train F1 Score: 0.7082390834804694 Validation F1 Score: 0.7082818180984234
   {'C': 0.1, 'gamma': 'scale', 'kernel': 'rbf'}
  Train F1 Score: 0.7082390834804694 Validation F1 Score: 0.7082818180984234
   {'C': 1, 'gamma': 'scale', 'kernel': 'linear'}
  Train F1 Score: 0.7082390834804694 Validation F1 Score: 0.7082818180984234
   {'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}
  Train F1 Score: 0.7082390834804694 Validation F1 Score: 0.7082818180984234
   {'C': 10, 'gamma': 'scale', 'kernel': 'linear'}
  Train F1 Score: 0.7082390834804694 Validation F1 Score: 0.7082818180984234
   {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
  Train F1 Score: 0.7082390834804694 Validation F1 Score: 0.7082818180984234
   {'C': 100, 'gamma': 'scale', 'kernel': 'linear'}
  Train F1 Score: 0.7084246581375878 Validation F1 Score: 0.7082818180984234
   {'C': 100, 'gamma': 'scale', 'kernel': 'rbf'}
  Train F1 Score: 0.7082390834804694 Validation 

In [24]:
import numpy as np
from sklearn.model_selection import train_test_split, ParameterGrid
from xgboost import XGBClassifier
from sklearn.metrics import f1_score

# Define the number of random states
nr_states = 5
test_scores = np.zeros(nr_states)
final_models = []

# Loop through different random states
for i in range(nr_states):
    print('Random state ' + str(i+1))
    
    # First split: Training and remaining dataset (Stratified)
    X_train, X_other, y_train, y_other = train_test_split(
        X_transformed, y_target, train_size=0.6, random_state=42+i, stratify=y_target
    )
    
    # Second split: Validation and test set (Stratified)
    X_val, X_test, y_val, y_test = train_test_split(
        X_other, y_other, train_size=0.5, random_state=42+i, stratify=y_other
    )
    
    # Define hyperparameter grid
    param_grid = {
        'learning_rate': [0.01,0.1, 0.3],  
        'max_depth': [3, 10],  
        'n_estimators': [50, 200],   
        'min_child_weight': [1, 5], 
        'gamma': [0, 0.1, 1], 
        'reg_lambda': [1, 10]   
}
    
    # Store validation scores
    val_scores = np.zeros(len(ParameterGrid(param_grid)))
    models = []
    
    # Loop through hyperparameter combinations
    for p, params in enumerate(ParameterGrid(param_grid)):
        print('  ', params)
        clf = XGBClassifier(**params, random_state=42+i, eval_metric='logloss', scale_pos_weight=4.95)
        clf.fit(X_train, y_train)
        
        # Predictions and F1 score calculations
        y_train_pred = clf.predict(X_train)
        y_val_pred = clf.predict(X_val)
        
        train_score = f1_score(y_train, y_train_pred, average='weighted')
        val_scores[p] = f1_score(y_val, y_val_pred, average='weighted')
        
        print('  Train F1 Score:', train_score, 'Validation F1 Score:', val_scores[p])
        models.append(clf)
    
    # Select the best model based on validation F1 score
    best_model_idx = np.argmax(val_scores)
    final_models.append(models[best_model_idx])
    print('Best model parameters:', ParameterGrid(param_grid)[best_model_idx])
    print('Corresponding validation F1 score:', np.max(val_scores))
    
    # Evaluate the best model on the test set
    y_test_pred = final_models[-1].predict(X_test)
    test_scores[i] = f1_score(y_test, y_test_pred, average='weighted')
    print('Test F1 score:', test_scores[i])

# Display final test scores
print('Final test F1 scores:', test_scores)


Random state 1
   {'gamma': 0, 'learning_rate': 0.01, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 50, 'reg_lambda': 1}
  Train F1 Score: 0.4669446709494083 Validation F1 Score: 0.46777383170123316
   {'gamma': 0, 'learning_rate': 0.01, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 50, 'reg_lambda': 10}
  Train F1 Score: 0.4664870325177484 Validation F1 Score: 0.4672613893647195
   {'gamma': 0, 'learning_rate': 0.01, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 200, 'reg_lambda': 1}
  Train F1 Score: 0.6074757723055344 Validation F1 Score: 0.6054476160401321
   {'gamma': 0, 'learning_rate': 0.01, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 200, 'reg_lambda': 10}
  Train F1 Score: 0.6072440190062119 Validation F1 Score: 0.6054665547136356
   {'gamma': 0, 'learning_rate': 0.01, 'max_depth': 3, 'min_child_weight': 5, 'n_estimators': 50, 'reg_lambda': 1}
  Train F1 Score: 0.4669446709494083 Validation F1 Score: 0.46777383170123316
   {'gamma': 0, 'le