task_4 modelling

import libraries

In [6]:

import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler, MinMaxScaler
from xverse.transformer import WOE
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import GridSearchCV

In [2]:
def load_data(file_path):
    return pd.read_csv(file_path)

# Load data
file_path = '../data/data2.csv'
data = load_data(file_path)


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95662 entries, 0 to 95661
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   TransactionId         95662 non-null  int64  
 1   BatchId               95662 non-null  int64  
 2   AccountId             95662 non-null  int64  
 3   SubscriptionId        95662 non-null  int64  
 4   CustomerId            95662 non-null  int64  
 5   CurrencyCode          95662 non-null  object 
 6   CountryCode           95662 non-null  int64  
 7   ProviderId            95662 non-null  int64  
 8   ProductId             95662 non-null  int64  
 9   ProductCategory       95662 non-null  object 
 10  ChannelId             95662 non-null  int64  
 11  Amount                95662 non-null  float64
 12  Value                 95662 non-null  int64  
 13  TransactionStartTime  95662 non-null  object 
 14  PricingStrategy       95662 non-null  int64  
 15  FraudResult        

In [5]:
# Encode Categorical Variables: One-Hot Encoding
def one_hot_encode(data, categorical_features):
    data = pd.get_dummies(data, columns=categorical_features)
    return data

categorical_features = data.select_dtypes(include=['object']).columns.tolist()
data_one_hot_encoded = one_hot_encode(data, categorical_features)
print("One-Hot Encoded Data:")
print(data_one_hot_encoded.head())

One-Hot Encoded Data:
   TransactionId  BatchId  AccountId  SubscriptionId  CustomerId  CountryCode  \
0          76871    36123       3957             887        4406          256   
1          73770    15642       4841            3829        4406          256   
2          26203    53941       4229             222        4683          256   
3            380   102363        648            2185         988          256   
4          28195    38780       4841            3829         988          256   

   ProviderId  ProductId  ChannelId   Amount  ...  \
0           6         10          3   1000.0  ...   
1           4          6          2    -20.0  ...   
2           6          1          3    500.0  ...   
3           1         21          3  20000.0  ...   
4           4          6          2   -644.0  ...   

   TransactionStartTime_2019-02-13T09:50:20Z  \
0                                      False   
1                                      False   
2                           

In [7]:
# Encode Categorical Variables: Label Encoding
# Encode Categorical Variables: Label Encoding
def label_encode(data, categorical_features):
    le = LabelEncoder()
    for feature in categorical_features:
        data[feature] = le.fit_transform(data[feature].astype(str))
    return data

data_label_encoded = label_encode(data, categorical_features)
print("Label Encoded Data:")
print(data_label_encoded.head())

Label Encoded Data:
   TransactionId  BatchId  AccountId  SubscriptionId  CustomerId  \
0          76871    36123       3957             887        4406   
1          73770    15642       4841            3829        4406   
2          26203    53941       4229             222        4683   
3            380   102363        648            2185         988   
4          28195    38780       4841            3829         988   

   CurrencyCode  CountryCode  ProviderId  ProductId  ProductCategory  \
0             0          256           6         10                0   
1             0          256           4          6                2   
2             0          256           6          1                0   
3             0          256           1         21                8   
4             0          256           4          6                2   

   ChannelId   Amount  Value  TransactionStartTime  PricingStrategy  \
0          3   1000.0   1000                     0                2

In [8]:
def handle_missing_values(data, strategy='mean'):
    numerical_features = data.select_dtypes(include=[np.number]).columns.tolist()
    imputer = SimpleImputer(strategy=strategy)
    data[numerical_features] = imputer.fit_transform(data[numerical_features])
    return data

# Handle Missing Values
data = handle_missing_values(data, strategy='mean')

In [9]:
# Define proxy variable
data['RiskCategory'] = data['FraudResult']

In [10]:
# Select observable features
features = ['Amount', 'Value']
X = data[features]
y = data['RiskCategory']
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model training

In [11]:
# Logistic Regression
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

# Random Forest
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

Hyperparameter Tuning

In [18]:
# Grid Search for Random Forest
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='roc_auc')
grid_search.fit(X_train, y_train)
best_rf = grid_search.best_estimator_
best_rf

model evaluation

In [13]:
# Evaluation
models = {'Logistic Regression': log_reg, 'Random Forest': best_rf}
for name, model in models.items():
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    print(f"{name} - Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1 Score: {f1}, ROC-AUC: {roc_auc}")


Logistic Regression - Accuracy: 0.9984320284325511, Precision: 0.8, Recall: 0.2222222222222222, F1 Score: 0.34782608695652173, ROC-AUC: 0.6110587468654181
Random Forest - Accuracy: 0.9998432028432551, Precision: 1.0, Recall: 0.9166666666666666, F1 Score: 0.9565217391304348, ROC-AUC: 0.9583333333333333


Risk Probability and Credit Score

In [17]:
# Assuming the model outputs a probability
probabilities = best_rf.predict_proba(X_test)[:, 1]

# Develop a mapping function for credit score
def risk_to_credit_score(prob):
    return 800 - prob * 700  # Simple linear transformation, can be more complex

credit_scores = np.array([risk_to_credit_score(p) for p in probabilities])
credit_scores

array([799.8263484, 800.       , 800.       , ..., 800.       ,
       800.       , 800.       ])

Predict Optimal Loan Amount and Duration

In [16]:
# Placeholder model, to be refined based on business logic and further analysis
def predict_loan_amount_and_duration(prob):
    if prob < 0.2:
        return 10000, 36  # Amount, Duration (months)
    elif prob < 0.5:
        return 5000, 24
    else:
        return 1000, 12

loan_predictions = np.array([predict_loan_amount_and_duration(p) for p in probabilities])
loan_predictions

array([[10000,    36],
       [10000,    36],
       [10000,    36],
       ...,
       [10000,    36],
       [10000,    36],
       [10000,    36]])