In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder

# Feature Engineering
## Preprocessing

In [None]:
df = pd.read_csv('/content/data.csv')

In [None]:
df.columns

Index(['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId',
       'CurrencyCode', 'CountryCode', 'ProviderId', 'ProductId',
       'ProductCategory', 'ChannelId', 'Amount', 'Value',
       'TransactionStartTime', 'PricingStrategy', 'FraudResult'],
      dtype='object')

In [None]:
df.isnull().sum()

TransactionId           0
BatchId                 0
AccountId               0
SubscriptionId          0
CustomerId              0
CurrencyCode            0
CountryCode             0
ProviderId              0
ProductId               0
ProductCategory         0
ChannelId               0
Amount                  0
Value                   0
TransactionStartTime    0
PricingStrategy         0
FraudResult             0
dtype: int64

### Create Aggregate Features

In [None]:
# Calculate aggregate features
aggregate_features = df.groupby('AccountId').agg({
    'Amount': ['sum', 'mean', 'std', 'count'],
}).reset_index()

In [None]:
# Rename columns
aggregate_features.columns = [
    'AccountId', 'TotalTransactionAmount', 'AverageTransactionAmount',
    'StdDevTransactionAmount', 'TransactionCount'
]

In [None]:
aggregate_features

Unnamed: 0,AccountId,TotalTransactionAmount,AverageTransactionAmount,StdDevTransactionAmount,TransactionCount
0,AccountId_1,70000.0,2.333333e+04,5773.502692,3
1,AccountId_10,-3330347.0,-3.451137e+03,3567.343270,965
2,AccountId_100,180000.0,9.000000e+04,14142.135624,2
3,AccountId_1000,97000.0,9.700000e+03,16686.987612,10
4,AccountId_1002,2000000.0,2.000000e+06,,1
...,...,...,...,...,...
3628,AccountId_99,20173.0,1.551769e+03,1107.255929,13
3629,AccountId_990,25000.0,8.333333e+03,2886.751346,3
3630,AccountId_992,22000.0,3.142857e+03,1772.810521,7
3631,AccountId_994,1000.0,1.000000e+03,,1


In [None]:
# Merge aggregate features back into the main dataframe
df = df.merge(aggregate_features, on='AccountId', how='left')

### Extract Features

In [None]:
# Convert TransactionStartTime to datetime if not already done
df['TransactionStartTime'] = pd.to_datetime(df['TransactionStartTime'])

# Extract time-based features
df['TransactionHour'] = df['TransactionStartTime'].dt.hour
df['TransactionDay'] = df['TransactionStartTime'].dt.day
df['TransactionMonth'] = df['TransactionStartTime'].dt.month
df['TransactionYear'] = df['TransactionStartTime'].dt.year

### Encode Categorical Variables
We will convert categorical variables into numerical formats using both One-Hot Encoding and Label Encoding.

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Label Encoding for high cardinality categorical features
label_encoders = {}
for col in ['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId', 'CurrencyCode']:
    le = LabelEncoder()
    df[col + '_LabelEnc'] = le.fit_transform(df[col])
    label_encoders[col] = le  # Save the encoder for future use
    df.drop(col, axis=1, inplace=True)

In [None]:
# One-Hot Encoding for remaining categorical features
df = pd.get_dummies(df, columns=['ProductCategory', 'ChannelId', 'ProviderId', 'CountryCode', 'ProductId'])

In [None]:
df.dtypes[10:]

TransactionDay                        int32
TransactionMonth                      int32
TransactionYear                       int32
TransactionId_LabelEnc                int64
BatchId_LabelEnc                      int64
AccountId_LabelEnc                    int64
SubscriptionId_LabelEnc               int64
CustomerId_LabelEnc                   int64
CurrencyCode_LabelEnc                 int64
ProductCategory_airtime                bool
ProductCategory_data_bundles           bool
ProductCategory_financial_services     bool
ProductCategory_movies                 bool
ProductCategory_other                  bool
ProductCategory_ticket                 bool
ProductCategory_transport              bool
ProductCategory_tv                     bool
ProductCategory_utility_bill           bool
ChannelId_ChannelId_1                  bool
ChannelId_ChannelId_2                  bool
ChannelId_ChannelId_3                  bool
ChannelId_ChannelId_5                  bool
ProviderId_ProviderId_1         

### Handle Missing Values
We will handle missing values through imputation.*italicised text*

In [None]:
# Check for missing values
missing_values = df.isnull().sum()

In [None]:
missing_values[:30]

Amount                                  0
Value                                   0
TransactionStartTime                    0
PricingStrategy                         0
FraudResult                             0
TotalTransactionAmount                  0
AverageTransactionAmount                0
StdDevTransactionAmount               812
TransactionCount                        0
TransactionHour                         0
TransactionDay                          0
TransactionMonth                        0
TransactionYear                         0
TransactionId_LabelEnc                  0
BatchId_LabelEnc                        0
AccountId_LabelEnc                      0
SubscriptionId_LabelEnc                 0
CustomerId_LabelEnc                     0
CurrencyCode_LabelEnc                   0
ProductCategory_airtime                 0
ProductCategory_data_bundles            0
ProductCategory_financial_services      0
ProductCategory_movies                  0
ProductCategory_other             

In [None]:
# Impute missing values for numerical features
df['StdDevTransactionAmount'].fillna(df['StdDevTransactionAmount'].mean(), inplace=True)

### Normalize/Standardize Numerical Features
We will normalize or standardize numerical features to bring them onto a similar scale.

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [None]:
# Select numerical features
numerical_features = ['TotalTransactionAmount', 'AverageTransactionAmount',
                      'StdDevTransactionAmount', 'TransactionCount',
                      'TransactionHour', 'TransactionDay',
                      'TransactionMonth', 'TransactionYear']

# Standardize numerical features
scaler = StandardScaler()
df[numerical_features] = scaler.fit_transform(df[numerical_features])

# Default Estimator and WoE Binning
Constructing an RFMS Score
RFMS stands for Recency, Frequency, Monetary, and Stability. These are key features used to evaluate user behavior in credit scoring.

- Recency (R): How recently the last transaction occurred.
- Frequency (F): How frequently transactions occur.
- Monetary (M): The monetary value of the transactions.
- Stability (S): Variability in transaction amounts.

We'll create an RFMS score for each user and then classify users into good (high RFMS score) and bad (low RFMS score) categories.

Calculate RFMS Components:
1. Recency: Calculate days since last transaction for each user.
2. Frequency: Count the number of transactions per user.
Monetary: Calculate the average transaction amount per user.
3. Stability: Calculate the standard deviation of transaction amounts per user.

In [None]:
# Calculate Recency (days since last transaction)
df['DaysSinceLastTransaction'] = (pd.Timestamp.now(tz='UTC') - df['TransactionStartTime']).dt.days
recency = df.groupby('AccountId_LabelEnc')['DaysSinceLastTransaction'].min().reset_index()
recency.columns = ['AccountId_LabelEnc', 'Recency']

In [None]:
# Calculate Frequency (number of transactions)
frequency = df.groupby('AccountId_LabelEnc')['TransactionId_LabelEnc'].count().reset_index()
frequency.columns = ['AccountId_LabelEnc', 'Frequency']

In [None]:
# Calculate Monetary (average transaction amount)
monetary = df.groupby('AccountId_LabelEnc')['Amount'].mean().reset_index()
monetary.columns = ['AccountId_LabelEnc', 'Monetary']

In [None]:
# Calculate Stability (standard deviation of transaction amounts)
stability = df.groupby('AccountId_LabelEnc')['Amount'].std().reset_index()
stability.columns = ['AccountId_LabelEnc', 'Stability']

In [None]:
# Merge RFMS components
rfms = recency.merge(frequency, on='AccountId_LabelEnc').merge(monetary, on='AccountId_LabelEnc').merge(stability, on='AccountId_LabelEnc')

In [None]:
# Standardize RFMS components
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
rfms[['Recency', 'Frequency', 'Monetary', 'Stability']] = scaler.fit_transform(rfms[['Recency', 'Frequency', 'Monetary', 'Stability']])

In [None]:
# Calculate RFMS score as a weighted sum (example weights, adjust as needed)
rfms['RFMS_Score'] = 0.25 * rfms['Recency'] + 0.25 * rfms['Frequency'] + 0.25 * rfms['Monetary'] + 0.25 * rfms['Stability']

In [None]:
rfms

Unnamed: 0,AccountId_LabelEnc,Recency,Frequency,Monetary,Stability,RFMS_Score
0,0,0.139483,-0.044856,0.021247,-0.093281,0.005648
1,1,-1.123842,1.804636,-0.136723,-0.112593,0.107870
2,2,0.250953,-0.046778,0.414437,-0.020026,0.149646
3,3,1.811531,-0.031398,-0.059160,0.002251,0.430806
4,4,-0.343553,-0.048701,11.679318,,
...,...,...,...,...,...,...
3628,3628,-1.086686,-0.025630,-0.107217,-0.134127,-0.338415
3629,3629,0.585363,-0.044856,-0.067220,-0.118550,0.088684
3630,3630,1.068399,-0.037166,-0.097833,-0.128301,0.201275
3631,3631,0.771146,-0.048701,-0.110471,,


In [None]:
rfms['RFMS_Score']

0       0.005648
1       0.107870
2       0.149646
3       0.430806
4            NaN
          ...   
3628   -0.338415
3629    0.088684
3630    0.201275
3631         NaN
3632    0.139538
Name: RFMS_Score, Length: 3633, dtype: float64

## Assign Good and Bad Labels
Set a threshold to classify users as good or bad based on their RFMS score.

In [None]:
# Set threshold (example threshold, adjust as needed)
threshold = rfms['RFMS_Score'].median()

In [None]:
threshold

-0.1701205961502279

In [None]:
# Assign labels
rfms['Label'] = np.where(rfms['RFMS_Score'] >= threshold, 'good', 'bad')

In [None]:
rfms.tail(5)

Unnamed: 0,AccountId_LabelEnc,Recency,Frequency,Monetary,Stability,RFMS_Score,Label
3628,3628,-1.086686,-0.02563,-0.107217,-0.134127,-0.338415,bad
3629,3629,0.585363,-0.044856,-0.06722,-0.11855,0.088684,good
3630,3630,1.068399,-0.037166,-0.097833,-0.128301,0.201275,good
3631,3631,0.771146,-0.048701,-0.110471,,,bad
3632,3632,0.845459,-0.037166,-0.109629,-0.140511,0.139538,good


## Perform Weight of Evidence (WoE) Binning
Weight of Evidence (WoE) is used to transform categorical variables into continuous ones while preserving the predictive power of the variables. We'll use the RFMS_Score and the Label to perform WoE binning.

In [None]:
# Function to calculate WoE and IV
def calculate_woe_iv(df, feature, target):
    eps = 0.00001  # a small number to prevent division by zero
    df = df.copy()
    df['good'] = np.where(df[target] == 'good', 1, 0)
    df['bad'] = np.where(df[target] == 'bad', 1, 0)
    grouped = df.groupby(feature).agg({'good': 'sum', 'bad': 'sum'})
    grouped['total'] = grouped['good'] + grouped['bad']
    grouped['percent_good'] = grouped['good'] / grouped['good'].sum()
    grouped['percent_bad'] = grouped['bad'] / grouped['bad'].sum()
    grouped['WoE'] = np.log((grouped['percent_good'] + eps) / (grouped['percent_bad'] + eps))
    grouped['IV'] = (grouped['percent_good'] - grouped['percent_bad']) * grouped['WoE']
    grouped = grouped.replace([np.inf, -np.inf], 0)  # replace infinite values with 0
    grouped['IV'] = grouped['IV'].sum()
    return grouped[['WoE', 'IV']]

In [None]:
# Calculate WoE and IV for RFMS_Score
woe_iv = calculate_woe_iv(rfms, 'RFMS_Score', 'Label')
woe_iv

Unnamed: 0_level_0,WoE,IV
RFMS_Score,Unnamed: 1_level_1,Unnamed: 2_level_1
-0.460737,-4.275582,9.194365
-0.379063,-4.275582,9.194365
-0.356484,-4.275582,9.194365
-0.356228,-4.275582,9.194365
-0.356004,-4.275582,9.194365
...,...,...
4.931027,4.274883,9.194365
6.198730,4.274883,9.194365
6.382426,4.274883,9.194365
14.492379,4.274883,9.194365


# Modelling

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
rfms

Unnamed: 0,AccountId_LabelEnc,Recency,Frequency,Monetary,Stability,RFMS_Score,Label
0,0,0.139483,-0.044856,0.021247,-0.093281,0.005648,good
1,1,-1.123842,1.804636,-0.136723,-0.112593,0.107870,good
2,2,0.250953,-0.046778,0.414437,-0.020026,0.149646,good
3,3,1.811531,-0.031398,-0.059160,0.002251,0.430806,good
4,4,-0.343553,-0.048701,11.679318,,,bad
...,...,...,...,...,...,...,...
3628,3628,-1.086686,-0.025630,-0.107217,-0.134127,-0.338415,bad
3629,3629,0.585363,-0.044856,-0.067220,-0.118550,0.088684,good
3630,3630,1.068399,-0.037166,-0.097833,-0.128301,0.201275,good
3631,3631,0.771146,-0.048701,-0.110471,,,bad


In [None]:
# Define features and target
features = rfms.drop(columns=['AccountId_LabelEnc', 'Label'])
target = rfms['Label']

In [None]:
features.isnull().sum()

Recency         0
Frequency       0
Monetary        0
Stability     812
RFMS_Score    812
dtype: int64

In [None]:
features['Stability'].fillna(features['Stability'].mean(), inplace=True)
features['RFMS_Score'].fillna(features['RFMS_Score'].mean(), inplace=True)

In [None]:
features.isnull().sum()

Recency       0
Frequency     0
Monetary      0
Stability     0
RFMS_Score    0
dtype: int64

### Split the Data

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=42)

### Choose Models

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [None]:
# Initialize models
logistic_model = LogisticRegression(random_state=42)
random_forest_model = RandomForestClassifier(random_state=42)
gbm_model = GradientBoostingClassifier(random_state=42)

In [None]:
# Train models
logistic_model.fit(X_train, y_train)

In [None]:
random_forest_model.fit(X_train, y_train)

In [None]:
gbm_model.fit(X_train, y_train)

## Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Logistic Regression hyperparameter tuning
logistic_params = {'C': [0.1, 1, 10, 100]}
logistic_grid = GridSearchCV(logistic_model, logistic_params, cv=5, scoring='roc_auc')
logistic_grid.fit(X_train, y_train)

In [None]:
# Random Forest hyperparameter tuning
rf_params = {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20, 30]}
rf_grid = GridSearchCV(random_forest_model, rf_params, cv=5, scoring='roc_auc')
rf_grid.fit(X_train, y_train)

In [None]:
# GBM hyperparameter tuning
gbm_params = {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [3, 4, 5]}
gbm_grid = GridSearchCV(gbm_model, gbm_params, cv=5, scoring='roc_auc')
gbm_grid.fit(X_train, y_train)

## Model Evaluation

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, auc

In [None]:
def evaluate_model(model, X_test, y_test, threshold=0.5):  # Add threshold parameter
    y_prob = model.predict_proba(X_test)[:, 1]
    y_pred = (y_prob >= threshold).astype(int)  # Apply threshold

    # Map numerical predictions to string labels
    y_pred_mapped = ['good' if p == 1 else 'bad' for p in y_pred]

    accuracy = accuracy_score(y_test, y_pred_mapped)
    precision = precision_score(y_test, y_pred_mapped, pos_label='good')
    recall = recall_score(y_test, y_pred_mapped, pos_label='good')
    f1 = f1_score(y_test, y_pred_mapped, pos_label='good')
    roc_auc = roc_auc_score(y_test, y_prob)

    return accuracy, precision, recall, f1, roc_auc

In [None]:
# Evaluate Logistic Regression
logistic_metrics = evaluate_model(logistic_grid.best_estimator_, X_test, y_test)
print(f"Logistic Regression: Accuracy: {logistic_metrics[0]}, Precision: {logistic_metrics[1]}, Recall: {logistic_metrics[2]}, F1 Score: {logistic_metrics[3]}, ROC-AUC: {logistic_metrics[4]}")

Logistic Regression: Accuracy: 0.8825688073394495, Precision: 0.8571428571428571, Recall: 0.8411214953271028, F1 Score: 0.8490566037735849, ROC-AUC: 0.9565533500861169


In [None]:
# Try different thresholds
logistic_metrics_05 = evaluate_model(logistic_grid.best_estimator_, X_test, y_test, threshold=0.5)
logistic_metrics_04 = evaluate_model(logistic_grid.best_estimator_, X_test, y_test, threshold=0.4)
logistic_metrics_03 = evaluate_model(logistic_grid.best_estimator_, X_test, y_test, threshold=0.3)

# Print the results for comparison
print("Logistic Regression (threshold 0.5):", logistic_metrics_05)
print("Logistic Regression (threshold 0.4):", logistic_metrics_04)
print("Logistic Regression (threshold 0.3):", logistic_metrics_03)

Logistic Regression (threshold 0.5): (0.8825688073394495, 0.8571428571428571, 0.8411214953271028, 0.8490566037735849, 0.9565533500861169)
Logistic Regression (threshold 0.4): (0.8724770642201835, 0.8203991130820399, 0.8644859813084113, 0.8418657565415245, 0.9565533500861169)
Logistic Regression (threshold 0.3): (0.8550458715596331, 0.7710843373493976, 0.897196261682243, 0.8293736501079914, 0.9565533500861169)


In [None]:
# Assuming y_test is a pandas Series
print(y_test.value_counts())

Label
bad     662
good    428
Name: count, dtype: int64


In [None]:
# Evaluate Random Forest
rf_metrics = evaluate_model(rf_grid.best_estimator_, X_test, y_test)
print(f"Random Forest: Accuracy: {rf_metrics[0]}, Precision: {rf_metrics[1]}, Recall: {rf_metrics[2]}, F1 Score: {rf_metrics[3]}, ROC-AUC: {rf_metrics[4]}")

Random Forest: Accuracy: 1.0, Precision: 1.0, Recall: 1.0, F1 Score: 1.0, ROC-AUC: 1.0


In [None]:
# Evaluate GBM
gbm_metrics = evaluate_model(gbm_grid.best_estimator_, X_test, y_test)
print(f"Gradient Boosting Machines: Accuracy: {gbm_metrics[0]}, Precision: {gbm_metrics[1]}, Recall: {gbm_metrics[2]}, F1 Score: {gbm_metrics[3]}, ROC-AUC: {gbm_metrics[4]}")

Gradient Boosting Machines: Accuracy: 1.0, Precision: 1.0, Recall: 1.0, F1 Score: 1.0, ROC-AUC: 1.0
