In [1]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the raw PSP transaction workbook into a DataFrame
source_file = 'PSP_Jan_Feb_2019.xlsx'
df = pd.read_excel(source_file)

import numpy as np
# Summarise the freshly loaded data: schema / non-null counts first, then numeric statistics.
print("Initial dataset information:")
# DataFrame.info() writes its report to stdout itself and returns None, so wrapping it
# in print() only appends a stray "None" line after the report.
df.info()
print(df.describe())

# Handle missing values: drop every row that has at least one missing field.
# (Reassignment instead of inplace=True keeps the step chainable and re-runnable.)
df = df.dropna()

# Remove duplicates.
# 'Unnamed: 0' appears to be the row counter saved with the export (unique per row),
# so comparing complete rows could never find a duplicate. Compare the actual
# transaction fields instead; the counter column itself is kept in the frame.
transaction_columns = [col for col in df.columns if col != 'Unnamed: 0']
df = df.drop_duplicates(subset=transaction_columns)

# Check for erroneous timestamps: anything that cannot be parsed becomes NaT ...
df = df.assign(tmsp=pd.to_datetime(df['tmsp'], errors='coerce'))
# ... and rows whose timestamp could not be parsed are removed.
df = df.dropna(subset=['tmsp'])

# Detect and handle outliers in the 'amount' column with the Interquartile Range (IQR) rule:
# anything more than 1.5 * IQR below the first quartile or above the third quartile is dropped.
Q1 = df['amount'].quantile(0.25)
Q3 = df['amount'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Keep only rows whose amount lies inside [lower_bound, upper_bound] (both ends inclusive).
# The explicit .copy() makes the result an independent frame, so later column assignments
# on df do not raise SettingWithCopyWarning for writing into a boolean-mask selection.
df = df[df['amount'].between(lower_bound, upper_bound)].copy()

# Summarise the data again after cleaning so the effect of the steps above is visible.
print("Cleaned dataset information:")
# DataFrame.info() prints its own report and returns None; calling it bare avoids the
# extra "None" line that print(df.info()) would emit.
df.info()
print(df.describe())

Initial dataset information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50410 entries, 0 to 50409
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   Unnamed: 0  50410 non-null  int64         
 1   tmsp        50410 non-null  datetime64[ns]
 2   country     50410 non-null  object        
 3   amount      50410 non-null  int64         
 4   success     50410 non-null  int64         
 5   PSP         50410 non-null  object        
 6   3D_secured  50410 non-null  int64         
 7   card        50410 non-null  object        
dtypes: datetime64[ns](1), int64(4), object(3)
memory usage: 3.1+ MB
None
         Unnamed: 0        amount       success    3D_secured
count  50410.000000  50410.000000  50410.000000  50410.000000
mean   25204.500000    202.395715      0.202896      0.238266
std    14552.257872     96.274730      0.402160      0.426027
min        0.000000      6.000000      0.000000      0.000000

In [4]:
# Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
# Show which columns are available before any feature engineering
print("Column names in the DataFrame:", df.columns, sep="\n")

# Defensive re-parse of the timestamp column (harmless if it is already datetime):
# unparseable values become NaT, and those rows are then discarded.
parsed_tmsp = pd.to_datetime(df['tmsp'], errors='coerce')
df['tmsp'] = parsed_tmsp
df = df.dropna(subset=['tmsp'])

# Feature Engineering: Retry Identification
# A transaction is flagged as a retry when the previous transaction with the same
# amount and the same country happened at most 60 seconds earlier.
df = df.sort_values(by='tmsp')
# The time gap is measured inside each (country, amount) group, so an unrelated
# transaction that happens to fall between two attempts does not hide the retry.
# total_seconds() is required here: `.dt.seconds` is only the seconds *component*
# of a timedelta (0..86399) and ignores whole days, so a gap of "1 day 30 s" would
# wrongly look like 30 s.
seconds_since_previous = (
    df.groupby(['country', 'amount'])['tmsp'].diff().dt.total_seconds()
)
# The first transaction of each group has no predecessor (NaN gap) -> compares False.
df['is_retry'] = seconds_since_previous <= 60

# Time-based features
df['hour'] = df['tmsp'].dt.hour
df['day_of_week'] = df['tmsp'].dt.dayofweek  # Monday=0 ... Sunday=6
df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)

# Aggregated success rate per PSP
# NOTE(review): this mean is taken over every row of df. If df is split into
# train/test afterwards, the test outcomes are already baked into this feature;
# for modelling, derive the rate from the training rows only.
psp_success_rate = df.groupby('PSP')['success'].mean().to_dict()
df['psp_success_rate'] = df['PSP'].map(psp_success_rate)

# Replace the string categories in 'country', 'card' and 'PSP' with integer codes.
# One fitted encoder per column is kept in `label_encoders` so the codes can be
# mapped back to the original labels later (inverse_transform).
categorical_columns = ('country', 'card', 'PSP')
label_encoders = {name: LabelEncoder() for name in categorical_columns}
for name, encoder in label_encoders.items():
    df[name] = encoder.fit_transform(df[name])

# Double-check the DataFrame columns after preprocessing
print("DataFrame columns after preprocessing:", df.columns, sep="\n")
# Columns the model is trained on. They must all exist in df at this point.
# NOTE(review): 'is_weekend' is engineered above but not listed here — confirm
# whether leaving it out is intentional.
required_columns = ['amount', '3D_secured', 'is_retry', 'hour', 'day_of_week', 'psp_success_rate', 'country', 'card', 'PSP']
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    # Fail fast with the explicit list: merely printing and carrying on would only
    # surface later as a less readable KeyError from the column selection below.
    raise KeyError(f"Missing columns: {missing_columns}")
print("All required columns are present.")

# Define features and target variable
X = df[required_columns]
y = df['success']
# Split the dataset into training and testing sets (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Work on independent copies so the assignments below never write into a view of X.
X_train = X_train.copy()
X_test = X_test.copy()

# 'psp_success_rate' in X was derived from the target of *all* rows, i.e. it already
# contains the outcomes of the test rows (target leakage). Recompute it from the
# training rows only and apply that same mapping to both splits.
train_psp_rate = y_train.groupby(X_train['PSP']).mean()
X_train['psp_success_rate'] = X_train['PSP'].map(train_psp_rate)
# A PSP that never occurs in the training split falls back to the overall training rate.
X_test['psp_success_rate'] = X_test['PSP'].map(train_psp_rate).fillna(y_train.mean())

# Feature scaling for Logistic Regression: only the numeric columns are standardised.
# The scaler is fitted on the training split and merely applied to the test split,
# so no test-set statistics influence the transformation.
scaled_columns = ['amount', 'psp_success_rate', 'hour', 'day_of_week']
scaler = StandardScaler()
X_train[scaled_columns] = scaler.fit_transform(X_train[scaled_columns])
X_test[scaled_columns] = scaler.transform(X_test[scaled_columns])


Column names in the DataFrame:
Index(['Unnamed: 0', 'tmsp', 'country', 'amount', 'success', 'PSP',
       '3D_secured', 'card', 'is_retry', 'hour', 'day_of_week', 'is_weekend',
       'psp_success_rate'],
      dtype='object')
DataFrame columns after preprocessing:
Index(['Unnamed: 0', 'tmsp', 'country', 'amount', 'success', 'PSP',
       '3D_secured', 'card', 'is_retry', 'hour', 'day_of_week', 'is_weekend',
       'psp_success_rate'],
      dtype='object')
All required columns are present.
