In [6]:
# Imports
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import joblib
import warnings
warnings.filterwarnings("ignore")

print("Libraries imported!")

Libraries imported!


In [7]:
train = pd.read_csv("train_updated.csv")
test = pd.read_csv("test_updated.csv")

TARGET = "RiskFlag"
IDCOL = "ProfileID"

print("Train shape:", train.shape)
print("Test shape:", test.shape)

train.head()

Train shape: (204277, 18)
Test shape: (51070, 17)


Unnamed: 0,ProfileID,ApplicantYears,AnnualEarnings,RequestedSum,TrustMetric,WorkDuration,ActiveAccounts,OfferRate,RepayPeriod,DebtFactor,QualificationLevel,WorkCategory,RelationshipStatus,OwnsProperty,FamilyObligation,FundUseCase,JointApplicant,RiskFlag
0,DRIRC89L0T,18,137576,209136,846,26,2,10.47,60,0.81,High School,Self-employed,Single,Yes,No,Business,No,0
1,TS0FIUNHNU,47,57194,5970,748,30,2,19.72,36,0.73,High School,Unemployed,Divorced,No,Yes,Education,No,0
2,I0YR284A1V,26,84328,95065,453,7,2,24.25,12,0.45,Master's,Self-employed,Married,No,No,Other,Yes,0
3,WB1T7NQV8A,53,49795,229582,533,107,3,14.44,60,0.17,Bachelor's,Self-employed,Single,Yes,No,Auto,Yes,1
4,J6GU9M4G1Z,49,115450,22072,840,0,4,24.48,12,0.11,Bachelor's,Part-time,Single,No,Yes,Education,Yes,0


In [8]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 204277 entries, 0 to 204276
Data columns (total 18 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   ProfileID           204277 non-null  object 
 1   ApplicantYears      204277 non-null  int64  
 2   AnnualEarnings      204277 non-null  int64  
 3   RequestedSum        204277 non-null  int64  
 4   TrustMetric         204277 non-null  int64  
 5   WorkDuration        204277 non-null  int64  
 6   ActiveAccounts      204277 non-null  int64  
 7   OfferRate           204277 non-null  float64
 8   RepayPeriod         204277 non-null  int64  
 9   DebtFactor          204277 non-null  float64
 10  QualificationLevel  204277 non-null  object 
 11  WorkCategory        204277 non-null  object 
 12  RelationshipStatus  204277 non-null  object 
 13  OwnsProperty        204277 non-null  object 
 14  FamilyObligation    204277 non-null  object 
 15  FundUseCase         204277 non-nul

In [9]:
train.isnull().sum()

Unnamed: 0,0
ProfileID,0
ApplicantYears,0
AnnualEarnings,0
RequestedSum,0
TrustMetric,0
WorkDuration,0
ActiveAccounts,0
OfferRate,0
RepayPeriod,0
DebtFactor,0


In [10]:
X = train.drop(columns=[TARGET])
y = train[TARGET].astype(int)
test_ids = test[IDCOL]

# ---- AUTO DETECT NUMERIC vs CATEGORICAL ----
numeric_cols = []
categorical_cols = []

for col in X.columns:
    if col == IDCOL:
        continue

    converted = pd.to_numeric(X[col], errors="coerce")
    if converted.notnull().mean() >= 0.8:
        numeric_cols.append(col)
    else:
        categorical_cols.append(col)

# detect binary-like columns
binary_cols = []
final_categorical = []

for col in categorical_cols:
    unique_vals = set([str(v).lower() for v in X[col].dropna().unique()])
    if len(unique_vals) <= 3 and (unique_vals <= {"yes", "no"} or unique_vals <= {"0", "1"}):
        binary_cols.append(col)
    else:
        final_categorical.append(col)

categorical_cols = final_categorical

print("Numeric:", numeric_cols)
print("Binary:", binary_cols)
print("Categorical:", categorical_cols)

Numeric: ['ApplicantYears', 'AnnualEarnings', 'RequestedSum', 'TrustMetric', 'WorkDuration', 'ActiveAccounts', 'OfferRate', 'RepayPeriod', 'DebtFactor']
Binary: ['OwnsProperty', 'FamilyObligation', 'JointApplicant']
Categorical: ['QualificationLevel', 'WorkCategory', 'RelationshipStatus', 'FundUseCase']


In [11]:
def map_bin(v):
    if pd.isna(v): return 0
    s = str(v).strip().lower()
    if s in {"yes","1","y","true"}: return 1
    if s in {"no","0","n","false"}: return 0
    try:
        return 1 if float(s) != 0 else 0
    except:
        return 0

X_clean = X.copy()
test_clean = test.copy()

# Clean numeric
for c in numeric_cols:
    X_clean[c] = pd.to_numeric(X_clean[c], errors="coerce").fillna(X_clean[c].median())
    test_clean[c] = pd.to_numeric(test_clean[c], errors="coerce").fillna(X_clean[c].median())

# Clean binary
for c in binary_cols:
    X_clean[c] = X_clean[c].apply(map_bin).astype(int)
    test_clean[c] = test_clean[c].apply(map_bin).astype(int)

# Clean categorical
for c in categorical_cols:
    X_clean[c] = X_clean[c].fillna("Unknown").astype(str)
    test_clean[c] = test_clean[c].fillna("Unknown").astype(str)

feature_cols = numeric_cols + binary_cols + categorical_cols

print("Using feature columns:", feature_cols)

Using feature columns: ['ApplicantYears', 'AnnualEarnings', 'RequestedSum', 'TrustMetric', 'WorkDuration', 'ActiveAccounts', 'OfferRate', 'RepayPeriod', 'DebtFactor', 'OwnsProperty', 'FamilyObligation', 'JointApplicant', 'QualificationLevel', 'WorkCategory', 'RelationshipStatus', 'FundUseCase']


In [12]:
X_used = X_clean[feature_cols]
X_test_used = test_clean[feature_cols]

preprocessor = ColumnTransformer(
    transformers=[
        ("num_scale", StandardScaler(), numeric_cols + binary_cols),
        ("cat_ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False), categorical_cols)
    ]
)

preprocessor.fit(X_used)

X_proc = preprocessor.transform(X_used)
X_test_proc = preprocessor.transform(X_test_used)

print("Processed X shape:", X_proc.shape)
print("Processed test shape:", X_test_proc.shape)

Processed X shape: (204277, 28)
Processed test shape: (51070, 28)


In [13]:
X_train_full, X_val_full, y_train_full, y_val_full = train_test_split(
    X_proc, y, test_size=0.2, stratify=y, random_state=42
)

print("Train size:", X_train_full.shape)
print("Val size:", X_val_full.shape)

Train size: (163421, 28)
Val size: (40856, 28)


In [14]:
X_train_full.shape

(163421, 28)

In [15]:
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV

svm_base = LinearSVC(class_weight="balanced", max_iter=5000)
svm_clf = CalibratedClassifierCV(svm_base, cv=3, method="sigmoid")

svm_clf.fit(X_train_full, y_train_full)

svm_val_prob = svm_clf.predict_proba(X_val_full)[:, 1]
svm_val_pred = (svm_val_prob >= 0.5).astype(int)

def metrics(y_true, y_pred, y_prob):
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred),
        "recall": recall_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred),
        "roc_auc": roc_auc_score(y_true, y_prob)
    }

svm_full_metrics = metrics(y_val_full, svm_val_pred, svm_val_prob)
svm_full_metrics

{'accuracy': 0.8846436263951439,
 'precision': 0.5673758865248227,
 'recall': 0.033677120606188174,
 'f1': 0.06358036956089808,
 'roc_auc': np.float64(0.7465813813758143)}

In [16]:
mlp = MLPClassifier(
    hidden_layer_sizes=(128, 64),
    activation='relu',
    solver='adam',
    max_iter=300,
    early_stopping=True,
    random_state=42
)

mlp.fit(X_train_full, y_train_full)

mlp_val_prob = mlp.predict_proba(X_val_full)[:, 1]
mlp_val_pred = (mlp_val_prob >= 0.5).astype(int)

mlp_full_metrics = metrics(y_val_full, mlp_val_pred, mlp_val_prob)
mlp_full_metrics

{'accuracy': 0.8855981985510084,
 'precision': 0.6037735849056604,
 'recall': 0.04714796884866344,
 'f1': 0.08746583365872707,
 'roc_auc': np.float64(0.750382113302862)}

In [17]:
from sklearn.linear_model import LogisticRegression

# Logistic Regression model
log_reg = LogisticRegression(
    max_iter=1000,
    class_weight='balanced',
    solver='lbfgs',
    random_state=42
)

# Train
log_reg.fit(X_train_full, y_train_full)

# Predict probabilities on validation set
log_reg_val_prob = log_reg.predict_proba(X_val_full)[:, 1]

# Convert probabilities → class predictions
log_reg_val_pred = (log_reg_val_prob >= 0.5).astype(int)

# Evaluate using same metrics function
log_reg_full_metrics = metrics(y_val_full, log_reg_val_pred, log_reg_val_prob)

log_reg_full_metrics

{'accuracy': 0.6712845114548659,
 'precision': 0.2158339335996333,
 'recall': 0.6937486844874763,
 'f1': 0.3292378383777844,
 'roc_auc': np.float64(0.7465504780354989)}

In [18]:
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# Logistic Regression model
lr_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy = 'mean')),
    ('classifier', LogisticRegression(max_iter = 1000, class_weight = 'balanced'))
])

cv = StratifiedKFold(n_splits = 5)
grid_search_lr = GridSearchCV(
    lr_pipeline,
    param_grid = {},
    cv = cv,
    scoring = 'roc_auc',
    n_jobs = -1
)
grid_search_lr.fit(X_train_full, y_train_full)

log_reg_val_prob1 = grid_search_lr.predict_proba(X_val_full)[:, 1]

# Convert probabilities → class predictions
log_reg_val_pred1 = (log_reg_val_prob >= 0.5).astype(int)

# Evaluate using same metrics function
log_reg_full_metrics1 = metrics(y_val_full, log_reg_val_pred1, log_reg_val_prob1)

log_reg_full_metrics1

{'accuracy': 0.6712845114548659,
 'precision': 0.2158339335996333,
 'recall': 0.6937486844874763,
 'f1': 0.3292378383777844,
 'roc_auc': np.float64(0.7465504780354989)}

In [19]:
# --------------------------------------
# Logistic Regression Boosting (MLP-style)
# --------------------------------------

from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression

# Base logistic regression
log_reg_boost_base = LogisticRegression(
    max_iter=1000,
    class_weight='balanced',
    solver='lbfgs',
    random_state=42
)

# AdaBoost with Logistic Regression base model
lr_boost = AdaBoostClassifier(
    estimator=log_reg_boost_base,
    n_estimators=50,
    learning_rate=0.5,
    random_state=42
)

# Train
lr_boost.fit(X_train_full, y_train_full)

# Predict probabilities on validation set
lr_boost_val_prob = lr_boost.predict_proba(X_val_full)[:, 1]

# Convert probabilities → class predictions
lr_boost_val_pred = (lr_boost_val_prob >= 0.5).astype(int)

# Evaluate using same metrics function
lr_boost_full_metrics = metrics(y_val_full, lr_boost_val_pred, lr_boost_val_prob)

lr_boost_full_metrics


{'accuracy': 0.6779175641276679,
 'precision': 0.21789021607837875,
 'recall': 0.6834350663018312,
 'f1': 0.33043301277158704,
 'roc_auc': np.float64(0.6883615373680178)}

In [20]:
!pip install -q lightgbm xgboost catboost
from catboost import CatBoostClassifier
cat = CatBoostClassifier(
        iterations=500,
        learning_rate=0.05,
        depth=6,
        verbose=False
    )


# Train
cat.fit(X_train_full, y_train_full)

# Predict probabilities on validation set
cat_val_prob = cat.predict_proba(X_val_full)[:, 1]

# Convert probabilities → class predictions
cat_val_pred = (cat_val_prob >= 0.5).astype(int)

# Evaluate using same metrics function
cat_full_metrics = metrics(y_val_full, cat_val_pred, cat_val_prob)

cat_full_metrics

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25h

{'accuracy': 0.886087722733503,
 'precision': 0.5846422338568935,
 'recall': 0.07051147126920648,
 'f1': 0.1258452291510143,
 'roc_auc': np.float64(0.7549132448912496)}

In [21]:
from xgboost import XGBClassifier

xg = XGBClassifier(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        eval_metric="logloss",
        n_jobs=-1
    )


# Train
xg.fit(X_train_full, y_train_full)

# Predict probabilities on validation set
xg_val_prob = xg.predict_proba(X_val_full)[:, 1]

# Convert probabilities → class predictions
xg_val_pred = (xg_val_prob >= 0.5).astype(int)

# Evaluate using same metrics function
xg_full_metrics = metrics(y_val_full, xg_val_pred, xg_val_prob)

xg_full_metrics

{'accuracy': 0.8857450558057568,
 'precision': 0.5690515806988353,
 'recall': 0.07198484529572721,
 'f1': 0.12780269058295965,
 'roc_auc': np.float64(0.7517052350672404)}

20% of test size

In [22]:
_, X_small, _, y_small = train_test_split(
    X_proc, y, test_size=0.2, stratify=y, random_state=42
)

X_train_small, X_val_small, y_train_small, y_val_small = train_test_split(
    X_small, y_small, test_size=0.2, stratify=y_small, random_state=42
)

# SVM small
svm_small_base = LinearSVC(class_weight="balanced", max_iter=5000)
svm_small = CalibratedClassifierCV(svm_small_base, cv=3, method="sigmoid")
svm_small.fit(X_train_small, y_train_small)

svm_small_prob = svm_small.predict_proba(X_val_small)[:,1]
svm_small_pred = (svm_small_prob >= 0.5).astype(int)

svm_small_metrics = metrics(y_val_small, svm_small_pred, svm_small_prob)

# NN small
mlp_small = MLPClassifier(
    hidden_layer_sizes=(128,64), activation='relu',
    solver='adam', early_stopping=True, max_iter=300, random_state=42
)
mlp_small.fit(X_train_small, y_train_small)

mlp_small_prob = mlp_small.predict_proba(X_val_small)[:,1]
mlp_small_pred = (mlp_small_prob >= 0.5).astype(int)

mlp_small_metrics = metrics(y_val_small, mlp_small_pred, mlp_small_prob)

svm_small_metrics, mlp_small_metrics

({'accuracy': 0.8847283406754772,
  'precision': 0.5909090909090909,
  'recall': 0.02736842105263158,
  'f1': 0.052313883299798795,
  'roc_auc': np.float64(0.7460693203515576)},
 {'accuracy': 0.8846059716103769,
  'precision': 0.5853658536585366,
  'recall': 0.02526315789473684,
  'f1': 0.04843592330978809,
  'roc_auc': np.float64(0.7427056217114373)})

In [23]:
comparison = pd.DataFrame([
    {"Model": "SVM", "Dataset": "Full", "Metrics": svm_full_metrics},
    {"Model": "NN",  "Dataset": "Full", "Metrics": mlp_full_metrics},
    {"Model": "SVM", "Dataset": "20%",  "Metrics": svm_small_metrics},
    {"Model": "NN",  "Dataset": "20%",  "Metrics": mlp_small_metrics},
])

comparison

Unnamed: 0,Model,Dataset,Metrics
0,SVM,Full,"{'accuracy': 0.8846436263951439, 'precision': ..."
1,NN,Full,"{'accuracy': 0.8855981985510084, 'precision': ..."
2,SVM,20%,"{'accuracy': 0.8847283406754772, 'precision': ..."
3,NN,20%,"{'accuracy': 0.8846059716103769, 'precision': ..."


In [24]:
def roc(m): return m["roc_auc"]
best_row = comparison.iloc[comparison["Metrics"].apply(roc).idxmax()]
best_row

Unnamed: 0,1
Model,NN
Dataset,Full
Metrics,"{'accuracy': 0.8855981985510084, 'precision': ..."


In [29]:
best_model = None

if best_row["Model"] == "SVM" and best_row["Dataset"] == "Full":
    best_model = svm_clf
elif best_row["Model"] == "NN" and best_row["Dataset"] == "Full":
    best_model = mlp
elif best_row["Model"] == "SVM":
    best_model = svm_small
else:
    best_model = mlp_small

test_probs = best_model.predict_proba(X_test_proc)[:,1]
test_preds = (test_probs >= 0.5).astype(int)

submission = pd.DataFrame({
    IDCOL: test_ids,
    TARGET: test_preds
})

submission.to_csv("submission.csv", index=False)
submission.head()

Unnamed: 0,ProfileID,RiskFlag
0,CKV34LU7V7,0
1,62KTYNH93J,0
2,JGFUSOIUH7,0
3,4538THBHOX,0
4,DXLNA06JHR,0


Files for other models

In [30]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import pandas as pd

# use only faster models
models = {
    'Logistic Regression': LogisticRegression(max_iter=500),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42, n_jobs=-1),
    'Naive Bayes': GaussianNB(),
    'XGBoost': XGBClassifier(
        tree_method='hist',          # ✅ use CPU mode instead of gpu_hist
        predictor='cpu_predictor',   # ✅ ensure CPU prediction
        use_label_encoder=False,
        eval_metric='logloss',
        random_state=42
    ),
    'CatBoost': CatBoostClassifier(
        iterations=500,
        learning_rate=0.05,
        depth=6,
        verbose=False
    )
}

results = []
for name, model in models.items():
    model.fit(X_train_full, y_train_full)
    y_pred = model.predict(X_val_full)
    y_prob = model.predict_proba(X_val_full)[:, 1]

    acc = accuracy_score(y_val_full, y_pred)
    prec = precision_score(y_val_full, y_pred)
    rec = recall_score(y_val_full, y_pred)
    f1 = f1_score(y_val_full, y_pred)
    roc = roc_auc_score(y_val_full, y_prob)

    tn, fp, fn, tp = confusion_matrix(y_val_full, y_pred).ravel()
    specificity = tn / (tn + fp)
    sensitivity = tp / (tp + fn)
    results.append([name, acc, prec, rec, f1, roc, specificity, sensitivity])

results_df = pd.DataFrame(results, columns=[
    'Model', 'Accuracy', 'Precision', 'Recall', 'F1-Score',
    'ROC-AUC', 'Specificity', 'Sensitivity'
])

print("Model Evaluation Results:\n")
display(results_df.sort_values(by='Accuracy', ascending=False))

Model Evaluation Results:



Unnamed: 0,Model,Accuracy,Precision,Recall,F1-Score,ROC-AUC,Specificity,Sensitivity
5,CatBoost,0.886088,0.584642,0.070511,0.125845,0.754913,0.993408,0.070511
2,Random Forest,0.885549,0.638376,0.036413,0.068897,0.728878,0.997286,0.036413
0,Logistic Regression,0.88479,0.574324,0.035782,0.067367,0.746417,0.99651,0.035782
3,Naive Bayes,0.884619,0.556575,0.038308,0.071682,0.740961,0.995984,0.038308
4,XGBoost,0.884228,0.512947,0.087561,0.149586,0.739271,0.98906,0.087561
1,Decision Tree,0.803456,0.204008,0.237845,0.219631,0.557864,0.877884,0.237845


In [31]:
# Dictionary of models (same as you used earlier)
models = {
    'LogisticRegression': LogisticRegression(max_iter=500),
    'DecisionTree': DecisionTreeClassifier(random_state=42),
    'RandomForest': RandomForestClassifier(random_state=42, n_jobs=-1),
    'NaiveBayes': GaussianNB(),
    'XGBoost': XGBClassifier(
        tree_method='hist',
        predictor='cpu_predictor',
        use_label_encoder=False,
        eval_metric='logloss',
        random_state=42
    ),
    'CatBoost': CatBoostClassifier(
        iterations=500,
        learning_rate=0.05,
        depth=6,
        verbose=False
    )
}

print("\nGenerating submissions for all models...\n")

for name, model in models.items():
    print(f"Training {name}...")

    # Train model
    model.fit(X_train_full, y_train_full)

    # Predict probabilities on test data
    test_probs = model.predict_proba(X_test_proc)[:, 1]

    # Convert to class labels
    test_preds = (test_probs >= 0.5).astype(int)

    # Create submission df
    submission = pd.DataFrame({
        IDCOL: test_ids,
        TARGET: test_preds
    })

    # Save CSV file
    filename = f"submission_{name}.csv"
    submission.to_csv(filename, index=False)

    print(f"Saved: {filename}")
    print(submission.head(), "\n")

print("✔ All submissions generated!")



Generating submissions for all models...

Training LogisticRegression...
Saved: submission_LogisticRegression.csv
    ProfileID  RiskFlag
0  CKV34LU7V7         0
1  62KTYNH93J         0
2  JGFUSOIUH7         0
3  4538THBHOX         0
4  DXLNA06JHR         0 

Training DecisionTree...
Saved: submission_DecisionTree.csv
    ProfileID  RiskFlag
0  CKV34LU7V7         0
1  62KTYNH93J         0
2  JGFUSOIUH7         0
3  4538THBHOX         0
4  DXLNA06JHR         1 

Training RandomForest...
Saved: submission_RandomForest.csv
    ProfileID  RiskFlag
0  CKV34LU7V7         0
1  62KTYNH93J         0
2  JGFUSOIUH7         0
3  4538THBHOX         0
4  DXLNA06JHR         0 

Training NaiveBayes...
Saved: submission_NaiveBayes.csv
    ProfileID  RiskFlag
0  CKV34LU7V7         0
1  62KTYNH93J         0
2  JGFUSOIUH7         0
3  4538THBHOX         0
4  DXLNA06JHR         0 

Training XGBoost...
Saved: submission_XGBoost.csv
    ProfileID  RiskFlag
0  CKV34LU7V7         0
1  62KTYNH93J         0
2  