In [4]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
# Load the dataset
df = pd.read_excel('PSP_Jan_Feb_2019.xlsx')
df['tmsp'] = pd.to_datetime(df['tmsp'], errors='coerce')
# Remove rows with NaT in 'tmsp'
df = df.dropna(subset=['tmsp'])

# Feature Engineering: Retry Identification
df = df.sort_values(by='tmsp')
df['is_retry'] = (df['amount'].shift() == df['amount']) & \
                 (df['country'].shift() == df['country']) & \
                 ((df['tmsp'] - df['tmsp'].shift()).dt.seconds <= 60)
# Time-based features
df['hour'] = df['tmsp'].dt.hour
df['day_of_week'] = df['tmsp'].dt.dayofweek
df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)
# Aggregated success rate per PSP
psp_success_rate = df.groupby('PSP')['success'].mean().to_dict()
df['psp_success_rate'] = df['PSP'].map(psp_success_rate)

# Encode categorical features: 'country', 'card', 'PSP'
label_encoders = {}
for column in ['country', 'card', 'PSP']:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le  # Store encoder for possible inverse_transform later
# Double-check the DataFrame columns after preprocessing
print("DataFrame columns after preprocessing:")
print(df.columns)
# Ensure all necessary columns are present in the DataFrame
required_columns = ['amount', '3D_secured', 'is_retry', 'hour', 'day_of_week', 'psp_success_rate', 'country', 'card', 'PSP']
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    print(f"Missing columns: {missing_columns}")
else:
    print("All required columns are present.")
    
# Define features and target variable
X = df[required_columns]
y = df['success']
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Feature scaling for Logistic Regression: Only scale numeric columns (amount, psp_success_rate, hour, day_of_week)
scaler = StandardScaler()
X_train[['amount', 'psp_success_rate', 'hour', 'day_of_week']] = scaler.fit_transform(X_train[['amount', 'psp_success_rate', 'hour', 'day_of_week']])
X_test[['amount', 'psp_success_rate', 'hour', 'day_of_week']] = scaler.transform(X_test[['amount', 'psp_success_rate', 'hour', 'day_of_week']])
# Baseline Model: Logistic Regression
log_reg = LogisticRegression(random_state=42)
log_reg.fit(X_train, y_train)
# Evaluate the Logistic Regression model
y_pred_log_reg = log_reg.predict(X_test)
print("Logistic Regression Model Performance:")
print(classification_report(y_test, y_pred_log_reg))
print(confusion_matrix(y_test, y_pred_log_reg))

# Advanced Model: Random Forest Classifier
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Evaluate the Random Forest model
y_pred_rf = rf_model.predict(X_test)
print("Random Forest Model Performance:")
print(classification_report(y_test, y_pred_rf))
print(confusion_matrix(y_test, y_pred_rf))

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

# Perform Grid Search with Cross-Validation
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Best hyperparameters from Grid Search
print(f"Best Parameters from Grid Search: {grid_search.best_params_}")

# Best model from Grid Search
best_rf_model = grid_search.best_estimator_

# Evaluate the best Random Forest model
y_pred_best_rf = best_rf_model.predict(X_test)
print("Best Random Forest Model Performance after Grid Search:")
print(classification_report(y_test, y_pred_best_rf))
print(confusion_matrix(y_test, y_pred_best_rf))

DataFrame columns after preprocessing:
Index(['Unnamed: 0', 'tmsp', 'country', 'amount', 'success', 'PSP',
       '3D_secured', 'card', 'is_retry', 'hour', 'day_of_week', 'is_weekend',
       'psp_success_rate'],
      dtype='object')
All required columns are present.
Logistic Regression Model Performance:
              precision    recall  f1-score   support

           0       0.81      1.00      0.90      8163
           1       0.58      0.02      0.05      1919

    accuracy                           0.81     10082
   macro avg       0.70      0.51      0.47     10082
weighted avg       0.77      0.81      0.73     10082

[[8129   34]
 [1872   47]]
Random Forest Model Performance:
              precision    recall  f1-score   support

           0       0.82      0.93      0.87      8163
           1       0.32      0.14      0.20      1919

    accuracy                           0.78     10082
   macro avg       0.57      0.54      0.53     10082
weighted avg       0.73      0.78