In [137]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc, f1_score, accuracy_score
from sklearn.metrics import roc_curve
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

import matplotlib.pyplot as plt
import warnings
# NOTE(review): this suppresses every warning category for the rest of the session,
# so any warning emitted by the libraries used below will not be shown.
warnings.filterwarnings("ignore")


In [111]:

# Load the five CSV inputs from data/. The timestamp columns that later cells use
# for time filtering are converted to datetime as part of each load.
loan_train = pd.read_csv("data/loan_outcomes_train.csv").assign(
    application_at=lambda frame: pd.to_datetime(frame['application_at'])
)
loan_predict = pd.read_csv("data/loan_outcomes_predict.csv").assign(
    application_at=lambda frame: pd.to_datetime(frame['application_at'])
)
gps = pd.read_csv("data/gps.csv").assign(
    time_of_fix=lambda frame: pd.to_datetime(frame['time_of_fix'])
)
events = pd.read_csv("data/events.csv").assign(
    timestamp=lambda frame: pd.to_datetime(frame['timestamp'])
)
features = pd.read_csv("data/features.csv")


Train data preparation

In [112]:

# Left join: every training application is kept, with NaN feature values when the
# features table has no row for that user_id.
train_base = pd.merge(loan_train, features, how="left", on="user_id")

# Pair each GPS fix with the training application(s) of the same user. The inner
# join drops fixes whose user_id does not appear in loan_train.
train_application_keys = loan_train[['user_id', 'application_at']]
gps_merged = pd.merge(gps, train_application_keys, how='inner', on='user_id')


In [113]:
# Keep only fixes recorded at or before the application timestamp, so the GPS
# features never look past the moment of application.
fix_not_after_application = gps_merged['time_of_fix'].le(gps_merged['application_at'])
gps_merged = gps_merged.loc[fix_not_after_application]

# One row of GPS statistics per (user_id, application_at).
# std_accuracy is NaN for groups with a single fix (pandas' std defaults to ddof=1).
gps_aggregations = {
    'gps_count': pd.NamedAgg(column='id', aggfunc='count'),
    'avg_accuracy': pd.NamedAgg(column='accuracy', aggfunc='mean'),
    'std_accuracy': pd.NamedAgg(column='accuracy', aggfunc='std'),
    'avg_speed': pd.NamedAgg(column='land_speed', aggfunc='mean'),
    'max_speed': pd.NamedAgg(column='land_speed', aggfunc='max'),
    'providers': pd.NamedAgg(column='location_provider', aggfunc='nunique'),
    'last_gps_time': pd.NamedAgg(column='time_of_fix', aggfunc='max'),
}
gps_features = (
    gps_merged
    .groupby(['user_id', 'application_at'])
    .agg(**gps_aggregations)
    .reset_index()
)

# Applications without any qualifying fix keep NaN in all GPS columns.
train_df = pd.merge(
    train_base,
    gps_features,
    how='left',
    on=['user_id', 'application_at'],
)

# Hours elapsed between the most recent fix and the application.
gps_gap = train_df['application_at'] - train_df['last_gps_time']
train_df['gps_recency_hours'] = gps_gap.dt.total_seconds() / 3600


In [114]:
# Pair every app event with the training application(s) of the same user.
events_merged = pd.merge(
    events,
    loan_train[['user_id', 'application_at']],
    how='inner',
    on='user_id',
)

# Discard events logged after the application timestamp.
event_not_after_application = events_merged['timestamp'].le(events_merged['application_at'])
events_merged = events_merged.loc[event_not_after_application]

# One row of event statistics per (user_id, application_at).
event_aggregations = {
    'total_events': pd.NamedAgg(column='id', aggfunc='count'),
    'unique_screens': pd.NamedAgg(column='screen_name', aggfunc='nunique'),
    'unique_actions': pd.NamedAgg(column='action', aggfunc='nunique'),
    'total_sessions': pd.NamedAgg(column='session_id', aggfunc='nunique'),
    'unique_networks': pd.NamedAgg(column='network_type', aggfunc='nunique'),
    'last_event_time': pd.NamedAgg(column='timestamp', aggfunc='max'),
}
event_features = (
    events_merged
    .groupby(['user_id', 'application_at'])
    .agg(**event_aggregations)
    .reset_index()
)

train_df = pd.merge(
    train_df,
    event_features,
    how='left',
    on=['user_id', 'application_at'],
)

# Hours elapsed between the most recent event and the application.
event_gap = train_df['application_at'] - train_df['last_event_time']
train_df['event_recency_hours'] = event_gap.dt.total_seconds() / 3600

# The raw "latest timestamp" columns were only needed to derive the recency features.
train_df = train_df.drop(columns=['last_gps_time', 'last_event_time'])

print("Final TRAIN DF")


Final TRAIN DF


In [115]:
# Inspect column dtypes and non-null counts of the assembled training frame.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8089 entries, 0 to 8088
Data columns (total 26 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   user_id              8089 non-null   int64         
 1   application_at       8089 non-null   datetime64[ns]
 2   is_repaid            8089 non-null   bool          
 3   feature_1            6872 non-null   float64       
 4   feature_2            7215 non-null   float64       
 5   feature_3            3003 non-null   float64       
 6   feature_4            4738 non-null   float64       
 7   feature_5            8089 non-null   float64       
 8   feature_6            8089 non-null   float64       
 9   feature_7            5813 non-null   float64       
 10  feature_8            2237 non-null   float64       
 11  feature_9            8089 non-null   float64       
 12  feature_10           8089 non-null   float64       
 13  gps_count            6664 non-nul

In [116]:
# Save the assembled training frame as CSV. This runs before the median imputation
# that a later cell applies to train_df, so the file still contains missing values.
train_df.to_csv("data/train_dataframe.csv", index=False)

In [117]:

# Left join: every application to be scored is kept, with NaN feature values when
# the features table has no row for that user_id.
predict_base = pd.merge(loan_predict, features, how="left", on="user_id")

# Pair each GPS fix with the to-be-scored application(s) of the same user. The inner
# join drops fixes whose user_id does not appear in loan_predict.
predict_application_keys = loan_predict[['user_id', 'application_at']]
gps_predict_merged = pd.merge(gps, predict_application_keys, how="inner", on="user_id")


In [118]:

# Keep only fixes recorded at or before the application timestamp.
predict_fix_not_after_application = gps_predict_merged['time_of_fix'].le(
    gps_predict_merged['application_at']
)
gps_predict_merged = gps_predict_merged.loc[predict_fix_not_after_application]

# Same GPS statistics as computed for the training frame, one row per
# (user_id, application_at).
gps_predict_aggregations = {
    'gps_count': pd.NamedAgg(column='id', aggfunc='count'),
    'avg_accuracy': pd.NamedAgg(column='accuracy', aggfunc='mean'),
    'std_accuracy': pd.NamedAgg(column='accuracy', aggfunc='std'),
    'avg_speed': pd.NamedAgg(column='land_speed', aggfunc='mean'),
    'max_speed': pd.NamedAgg(column='land_speed', aggfunc='max'),
    'providers': pd.NamedAgg(column='location_provider', aggfunc='nunique'),
    'last_gps_time': pd.NamedAgg(column='time_of_fix', aggfunc='max'),
}
gps_predict_features = (
    gps_predict_merged
    .groupby(['user_id', 'application_at'])
    .agg(**gps_predict_aggregations)
    .reset_index()
)

# Applications without any qualifying fix keep NaN in all GPS columns.
predict_df = pd.merge(
    predict_base,
    gps_predict_features,
    how='left',
    on=['user_id', 'application_at'],
)

# Hours elapsed between the most recent fix and the application.
predict_gps_gap = predict_df['application_at'] - predict_df['last_gps_time']
predict_df['gps_recency_hours'] = predict_gps_gap.dt.total_seconds() / 3600


In [119]:
# Pair every app event with the to-be-scored application(s) of the same user.
events_predict_merged = pd.merge(
    events,
    loan_predict[['user_id', 'application_at']],
    how='inner',
    on='user_id',
)

# Discard events logged after the application timestamp.
predict_event_not_after_application = events_predict_merged['timestamp'].le(
    events_predict_merged['application_at']
)
events_predict_merged = events_predict_merged.loc[predict_event_not_after_application]

# Same event statistics as computed for the training frame, one row per
# (user_id, application_at).
event_predict_aggregations = {
    'total_events': pd.NamedAgg(column='id', aggfunc='count'),
    'unique_screens': pd.NamedAgg(column='screen_name', aggfunc='nunique'),
    'unique_actions': pd.NamedAgg(column='action', aggfunc='nunique'),
    'total_sessions': pd.NamedAgg(column='session_id', aggfunc='nunique'),
    'unique_networks': pd.NamedAgg(column='network_type', aggfunc='nunique'),
    'last_event_time': pd.NamedAgg(column='timestamp', aggfunc='max'),
}
event_predict_features = (
    events_predict_merged
    .groupby(['user_id', 'application_at'])
    .agg(**event_predict_aggregations)
    .reset_index()
)

predict_df = pd.merge(
    predict_df,
    event_predict_features,
    how='left',
    on=['user_id', 'application_at'],
)

# Hours elapsed between the most recent event and the application.
predict_event_gap = predict_df['application_at'] - predict_df['last_event_time']
predict_df['event_recency_hours'] = predict_event_gap.dt.total_seconds() / 3600


In [120]:
# The raw "latest timestamp" columns were only needed to derive the recency features.
predict_df = predict_df.drop(columns=['last_gps_time', 'last_event_time'])


In [121]:
# Inspect column dtypes and non-null counts of the assembled prediction frame.
predict_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2022 entries, 0 to 2021
Data columns (total 25 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   user_id              2022 non-null   int64         
 1   application_at       2022 non-null   datetime64[ns]
 2   feature_1            1728 non-null   float64       
 3   feature_2            1806 non-null   float64       
 4   feature_3            782 non-null    float64       
 5   feature_4            1222 non-null   float64       
 6   feature_5            2022 non-null   float64       
 7   feature_6            2022 non-null   float64       
 8   feature_7            1446 non-null   float64       
 9   feature_8            581 non-null    float64       
 10  feature_9            2022 non-null   float64       
 11  feature_10           2022 non-null   float64       
 12  gps_count            1659 non-null   float64       
 13  avg_accuracy         1659 non-nul

In [122]:
# Save the assembled prediction frame as CSV. This runs before the median imputation
# that a later cell applies to predict_df, so the file still contains missing values.
predict_df.to_csv("data/predict_dataframe.csv", index=False)

In [124]:
# Summary statistics of the training frame.
train_df.describe()

Unnamed: 0,user_id,application_at,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,avg_speed,max_speed,providers,gps_recency_hours,total_events,unique_screens,unique_actions,total_sessions,unique_networks,event_recency_hours
count,8089.0,8089,6872.0,7215.0,3003.0,4738.0,8089.0,8089.0,5813.0,2237.0,...,6664.0,6664.0,6664.0,6664.0,8084.0,8084.0,8084.0,8084.0,8084.0,8084.0
mean,5065.095191,2023-06-05 14:17:30.527881728,1628.870328,714.810023,112.352647,308.280831,0.108666,-0.222318,5109.482914,19.852928,...,0.115755,0.546539,1.02596,2589.394433,622.688273,29.930727,15.285007,8.439881,1.669099,96.624686
min,2.0,2022-02-12 09:28:15.808258,1.0,1.0,0.0,4.6e-05,0.0,-1.0,-3181.041426,0.0,...,-1.0,-1.0,1.0,8.6e-05,14.0,2.0,2.0,0.0,0.0,3e-06
25%,2547.0,2022-07-06 11:40:13.922177024,305.105589,109.993071,7.0,30.041577,0.0,-0.866025,5090.139722,1.0,...,0.0,0.0,1.0,0.007019,131.0,15.0,11.0,0.0,1.0,0.002235
50%,5063.0,2023-03-22 13:29:48.928602880,687.522101,244.936,63.0,71.074774,0.0,-0.258819,5104.523853,3.0,...,0.0,0.0,1.0,0.021172,343.0,28.0,14.0,1.0,1.0,0.006998
75%,7597.0,2024-05-31 11:50:21.454672896,1641.635152,591.80621,196.0,347.459259,0.0,0.258819,5205.80767,12.0,...,0.028199,0.18585,1.0,118.296552,725.0,39.0,18.0,8.0,2.0,0.019157
max,10111.0,2025-03-11 11:01:08.000959,69405.956522,88816.932708,364.0,3390.004213,37.0,1.0,5374.024262,3284.0,...,10.425,79.085014,3.0,70697.915786,18936.0,139.0,46.0,435.0,4.0,49030.438323
std,2920.765524,,3175.208829,2598.58064,121.065759,497.839505,1.173156,0.655284,197.282723,99.147346,...,0.473475,2.085554,0.173473,8025.946367,994.424245,17.883146,5.648048,19.913123,0.793067,1390.404137


In [127]:
# Percentage of missing values per column of the training frame.
train_df.isna().mean()*100

user_id                 0.000000
application_at          0.000000
is_repaid               0.000000
feature_1              15.045123
feature_2              10.804797
feature_3              62.875510
feature_4              41.426629
feature_5               0.000000
feature_6               0.000000
feature_7              28.136976
feature_8              72.345160
feature_9               0.000000
feature_10              0.000000
gps_count              17.616516
avg_accuracy           17.616516
std_accuracy           40.561256
avg_speed              17.616516
max_speed              17.616516
providers              17.616516
gps_recency_hours      17.616516
total_events            0.061812
unique_screens          0.061812
unique_actions          0.061812
total_sessions          0.061812
unique_networks         0.061812
event_recency_hours     0.061812
dtype: float64

In [129]:
# Columns to impute: every float64/int64 column of train_df except the user_id key.
# is_repaid (bool) and application_at (datetime) are not matched by this dtype filter.
numeric_cols = [
    col
    for col in train_df.select_dtypes(include=['float64', 'int64']).columns
    if col != 'user_id'
]

# NOTE(review): the medians are learned from all of train_df, i.e. before the
# train/validation split done in a later cell, so validation rows influence the fill
# values. Count-like columns (e.g. gps_count) are also filled with the median — confirm
# that is intended rather than 0 for applications with no matching rows.
imputer = SimpleImputer(strategy='median')
imputer.fit(train_df[numeric_cols])

# The prediction frame is filled with the medians learned from the training frame.
train_df[numeric_cols] = imputer.transform(train_df[numeric_cols])
predict_df[numeric_cols] = imputer.transform(predict_df[numeric_cols])

In [132]:
# Missing-value count per column of the training frame after imputation.
train_df.isna().sum()


user_id                0
application_at         0
is_repaid              0
feature_1              0
feature_2              0
feature_3              0
feature_4              0
feature_5              0
feature_6              0
feature_7              0
feature_8              0
feature_9              0
feature_10             0
gps_count              0
avg_accuracy           0
std_accuracy           0
avg_speed              0
max_speed              0
providers              0
gps_recency_hours      0
total_events           0
unique_screens         0
unique_actions         0
total_sessions         0
unique_networks        0
event_recency_hours    0
dtype: int64

In [133]:
# Target as 0/1 integers (is_repaid is stored as bool).
y = train_df['is_repaid'].astype(int)

# Everything except the target, the user key and the application timestamp is a
# model input; column order follows train_df.
non_feature_cols = ('is_repaid', 'user_id', 'application_at')
feature_cols = [col for col in train_df.columns if col not in non_feature_cols]

# Same columns, same order, for the labelled data and the rows to be scored.
X = train_df[feature_cols]
X_predict = predict_df[feature_cols]

In [135]:
# Display the feature matrix (pandas truncates the rendered rows and columns).
X

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,avg_speed,max_speed,providers,gps_recency_hours,total_events,unique_screens,unique_actions,total_sessions,unique_networks,event_recency_hours
0,687.294202,511.519160,208.0,39.886493,0.0,2.588190e-01,4902.948969,3.0,1.0,1.0,...,0.000000,0.000000,1.0,0.005291,295.0,30.0,20.0,13.0,1.0,0.002390
1,332.567895,174.139697,18.0,71.074774,0.0,-9.659258e-01,5251.949206,3.0,0.0,1.0,...,0.000000,0.000000,1.0,174.477750,284.0,22.0,13.0,6.0,1.0,0.037166
2,3051.181654,607.714490,63.0,71.074774,0.0,5.000000e-01,5104.523853,3.0,0.0,1.0,...,0.000000,0.000000,1.0,0.021172,41.0,5.0,7.0,0.0,1.0,0.021543
3,200.000000,54.307692,63.0,201.663889,0.0,1.000000e+00,5205.837996,3.0,0.0,1.0,...,0.000000,0.000000,1.0,0.037499,338.0,19.0,12.0,0.0,1.0,0.036388
4,140.000000,76.333333,90.0,71.074774,0.0,-1.000000e+00,4910.088068,3.0,1.0,1.0,...,0.000000,0.000000,1.0,0.066980,45.0,11.0,10.0,1.0,1.0,0.049540
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8084,2520.117196,932.070922,30.0,914.773657,0.0,7.071068e-01,5154.025318,3.0,3.0,1.0,...,0.000000,0.000000,1.0,0.015568,272.0,31.0,21.0,1.0,1.0,0.003918
8085,952.717391,95.222930,28.0,30.504664,0.0,5.000000e-01,5098.635909,74.0,1.0,0.0,...,0.000000,0.000000,1.0,0.006284,525.0,37.0,17.0,18.0,2.0,0.001358
8086,815.384615,172.350877,237.0,82.636181,0.0,7.071068e-01,5098.560476,3.0,0.0,1.0,...,0.000000,0.000000,1.0,0.062031,1144.0,45.0,16.0,15.0,2.0,0.022046
8087,747.773125,330.502984,80.0,71.074774,0.0,-2.588190e-01,5104.523853,3.0,0.0,0.0,...,0.000000,0.000000,1.0,0.021172,90.0,13.0,16.0,2.0,2.0,0.000170


In [138]:
# Hold out 20% of the labelled rows for validation. The split is stratified on the
# target and uses a fixed seed so it is repeatable.
split_parts = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = split_parts

In [None]:
def evaluate_model(name, model, X_tr, y_tr, X_v, y_v):
    """Fit ``model`` on the training split and print its validation metrics.

    The estimator is fitted in place on ``(X_tr, y_tr)``. Positive-class scores are
    taken from column 1 of ``predict_proba`` and hard labels from ``predict``. Four
    metrics are printed under a header containing ``name``: ROC-AUC, PR-AUC (area
    under the precision-recall curve computed with ``auc``), F1 and accuracy.

    Args:
        name: Label used in the printed header.
        model: Estimator exposing ``fit``, ``predict_proba`` and ``predict``.
        X_tr, y_tr: Training features and labels.
        X_v, y_v: Validation features and labels.

    Returns:
        A tuple ``(roc_auc, (fpr, tpr), model)``: the validation ROC-AUC, the ROC
        curve coordinates, and the same (now fitted) estimator that was passed in.
    """
    model.fit(X_tr, y_tr)

    # Continuous scores drive the ranking metrics; hard labels drive F1 and accuracy.
    class_probabilities = model.predict_proba(X_v)
    positive_scores = class_probabilities[:, 1]
    hard_labels = model.predict(X_v)

    roc_auc = roc_auc_score(y_v, positive_scores)

    pr_precision, pr_recall, _ = precision_recall_curve(y_v, positive_scores)
    pr_area = auc(pr_recall, pr_precision)

    f1_value = f1_score(y_v, hard_labels)
    accuracy_value = accuracy_score(y_v, hard_labels)

    print(f"\n===== {name} =====")
    print(f"ROC-AUC: {roc_auc:.4f}")
    print(f"PR-AUC:  {pr_area:.4f}")
    print(f"F1:      {f1_value:.4f}")
    print(f"Accuracy:{accuracy_value:.4f}")

    # Only the curve coordinates are returned; the thresholds are discarded.
    false_positive_rate, true_positive_rate, _ = roc_curve(y_v, positive_scores)
    return roc_auc, (false_positive_rate, true_positive_rate), model


In [None]:
# All columns of X_train are passed to the scaler.
numeric_cols = X_train.columns

# Standardisation is only wired into the logistic-regression pipeline below; the
# tree ensembles are given the unscaled features.
scaler = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_cols)
    ],
    remainder="passthrough"
)

# Registry of candidate models, keyed by the display name used when reporting metrics.
models = {}

models["Logistic Regression"] = Pipeline([
    ("scale", scaler),
    ("clf", LogisticRegression(max_iter=200))
])

models["Random Forest"] = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    min_samples_split=5,
    random_state=42,
    n_jobs=-1
)

models["Gradient Boosting"] = GradientBoostingClassifier(
    learning_rate=0.05,
    n_estimators=300,
    random_state=42  # seeded like the other models so reruns give the same scores
)

# XGBClassifier is not imported by the notebook's visible import cell, so referencing
# it directly raises NameError on a fresh kernel. Import it here and treat xgboost
# as optional: when the package is missing the model is skipped with a notice
# instead of aborting the whole cell.
try:
    from xgboost import XGBClassifier
except ImportError:
    print("xgboost is not installed; skipping the XGBoost model")
else:
    models["XGBoost"] = XGBClassifier(
        n_estimators=400,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        eval_metric="logloss",
        n_jobs=-1
    )


In [None]:
# results: (name, validation ROC-AUC, fitted model) per candidate, in registry order.
# roc_curves: name -> (fpr, tpr) for each candidate.
results, roc_curves = [], {}

for name, model in models.items():
    roc, curve, fitted = evaluate_model(
        name=name,
        model=model,
        X_tr=X_train,
        y_tr=y_train,
        X_v=X_val,
        y_v=y_val,
    )
    roc_curves[name] = curve
    results.append((name, roc, fitted))
