# Import

In [17]:
import pandas as pd
from sklearn.metrics import average_precision_score, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Data Preprocessing

In [18]:
train = pd.read_csv('train.csv')
loan_activities = pd.read_csv('loan_activities.csv')
test = pd.read_csv('test.csv')

In [19]:
train['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,847042
1,10857


In [20]:
non_fraud = train[train.label == 0]
fraud = train[train.label == 1]
print(non_fraud.shape)
print(fraud.shape)

(847042, 19)
(10857, 19)


In [21]:
train.groupby('label').mean()

Unnamed: 0_level_0,user_id,pc0,pc1,pc2,pc3,pc4,pc5,pc6,pc7,pc8,pc9,pc10,pc11,pc12,pc13,pc14,pc15,pc16
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0,1851083.0,0.746346,3.706804,0.558243,0.402963,0.181864,0.325565,0.979006,-0.239828,0.932846,0.258683,1.345989,-0.278029,-0.354848,-0.347332,0.037133,-0.552359,-0.437668
1,1863080.0,0.787787,3.884498,-0.459439,-0.485601,-0.355211,-0.481173,-0.409667,-0.661239,-0.417288,-0.325537,0.556876,-0.652158,-0.681832,-0.664929,-0.475042,-0.778923,-0.698378


In [22]:
non_fraud_sample = non_fraud.sample(n=10857)
new_dataset = pd.concat([non_fraud_sample, fraud], axis=0)
new_dataset['label'].value_counts()
new_dataset.groupby('label').mean()

Unnamed: 0_level_0,user_id,pc0,pc1,pc2,pc3,pc4,pc5,pc6,pc7,pc8,pc9,pc10,pc11,pc12,pc13,pc14,pc15,pc16
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0,1856158.0,0.747628,3.738049,0.578482,0.419986,0.181315,0.347076,1.000453,-0.237514,0.953339,0.259038,1.3734,-0.275079,-0.352802,-0.345543,0.041,-0.548827,-0.43298
1,1863080.0,0.787787,3.884498,-0.459439,-0.485601,-0.355211,-0.481173,-0.409667,-0.661239,-0.417288,-0.325537,0.556876,-0.652158,-0.681832,-0.664929,-0.475042,-0.778923,-0.698378


In [23]:
train_merged = pd.merge(new_dataset, loan_activities, on='user_id', how='left')
train_merged['loan_count'] = train_merged.groupby('user_id')['loan_type'].transform('count')
train_merged['loan_recent'] = train_merged.groupby('user_id')['ts'].transform('max')
train_merged = train_merged.fillna(0)

X = train_merged.drop(columns=['user_id', 'label', 'loan_type', 'ts', 'reference_contact'])
y = train_merged['label']

scaler = StandardScaler()
X = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, test_size=0.2, random_state=42)

# Modelling

In [24]:
model = LogisticRegression()
model.fit(X_train, y_train)

In [25]:
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, y_train)
print('Accuracy on Training data : ', training_data_accuracy)

X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, y_test)
print('Accuracy score on Test Data : ', test_data_accuracy)

Accuracy on Training data :  0.7611105094046716
Accuracy score on Test Data :  0.7587003610108303


In [26]:
log_reg = LogisticRegression(class_weight='balanced', random_state=42, max_iter=1000)
log_reg.fit(X_train, y_train)

In [27]:
y_scores = log_reg.predict_proba(X_test)[:, 1]

ap_score = average_precision_score(y_test, y_scores)
print(f"Average Precision (AP): {ap_score}")

y_pred = log_reg.predict(X_test)
print(classification_report(y_test, y_pred))

Average Precision (AP): 0.7060397353025358
              precision    recall  f1-score   support

           0       0.85      0.72      0.78      4190
           1       0.66      0.81      0.72      2735

    accuracy                           0.76      6925
   macro avg       0.75      0.77      0.75      6925
weighted avg       0.77      0.76      0.76      6925



In [28]:
test_merged = pd.merge(test, loan_activities, on='user_id', how='left')
test_merged['loan_count'] = test_merged.groupby('user_id')['loan_type'].transform('count')
test_merged['loan_recent'] = test_merged.groupby('user_id')['ts'].transform('max')
test_merged = test_merged.fillna(0)

In [29]:
X_test_final = test_merged.drop(columns=['user_id', 'loan_type', 'ts', 'reference_contact'])
X_test_final = scaler.transform(X_test_final)
test_predictions = log_reg.predict(X_test_final)

# Submission

In [30]:
submission = pd.DataFrame({'user_id': test_merged['user_id'], 'label': test_predictions})
submission.to_csv('submission.csv', index=False)