# UPI Fraud Detection - Model Training

This notebook focuses on building and training machine learning models to detect fraudulent UPI transactions.

## 1. Import Required Libraries

In [None]:
# Data manipulation and analysis
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import os
import sys

# Add project root to path for imports
sys.path.append('..')

# Import preprocessing functions
from preprocessing.data_processor import preprocess_data, engineer_features

# Feature processing
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.feature_selection import SelectKBest, chi2, RFE, SelectFromModel
from sklearn.impute import SimpleImputer

# Model building and evaluation
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc, precision_recall_curve

# Set plot style
# NOTE(review): sns.set(...) runs after plt.style.use('ggplot') and probably replaces
# most of the ggplot look — confirm which of the two styles is actually wanted.
plt.style.use('ggplot')
sns.set(style="whitegrid")

# For reproducibility
# Seeds NumPy's global random generator; the train/test split and the estimators
# further down are additionally given their own random_state=42.
np.random.seed(42)

## 2. Load and Explore the Dataset

In [None]:
# Load the dataset
# The path is relative to the directory the notebook is run from.
# NOTE(review): the '-checkpoint' suffix suggests this may be a Jupyter autosave copy
# of the data file — confirm it is the intended source.
df = pd.read_csv('../attached_assets/Upi_fraud_dataset-checkpoint.csv')

# Display the first few rows
df.head()

In [None]:
# Check the shape of the dataset
# df.shape is a (number of rows, number of columns) tuple.
print(f"Dataset shape: {df.shape}")

In [None]:
# Check data types and information
# Prints every column with its dtype and non-null count, as loaded from the CSV.
df.info()

In [None]:
# Report each column that has at least one missing value, with its share of all rows
missing_values = df.isnull().sum()
print("Missing values in each column:")
# Filter first so the loop only visits columns that actually contain gaps
for col, count in missing_values[missing_values > 0].items():
    print(f"{col}: {count} ({count/len(df)*100:.2f}%)")

In [None]:
# Class balance of the target: percentage of rows for each FraudFlag value
fraud_distribution = df['FraudFlag'].value_counts(normalize=True).mul(100)
print("Distribution of fraud transactions:")
print(fraud_distribution)

## 3. Data Preprocessing

In [None]:
# Convert Timestamp to datetime
df['Timestamp'] = pd.to_datetime(df['Timestamp'])

# Extract TransactionFrequency numeric value
# Example format: '5/day', '3/day'
# pd.to_numeric lets missing entries through as NaN, whereas astype(int) raises on
# them. This cell runs before the missing-value imputation, so the gaps must survive
# until then. Text that is not a number still raises, so bad data is not hidden.
df['TransactionFrequencyValue'] = pd.to_numeric(
    df['TransactionFrequency'].str.split('/').str[0]
)

In [None]:
# Handle any missing values if present
# Each filled column is assigned back to df. Calling fillna(inplace=True) on df[col]
# works on a temporary selection: pandas 2.x warns about it, and with Copy-on-Write
# enabled df itself is left unchanged.

# Fill missing numerical values with median
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
for col in numerical_cols:
    if df[col].isnull().any():
        df[col] = df[col].fillna(df[col].median())

# Fill missing categorical values with mode
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    if df[col].isnull().any():
        # mode() returns a Series; its first entry is used as the fill value
        df[col] = df[col].fillna(df[col].mode()[0])

# Convert boolean columns to integers
bool_cols = ['UnusualLocation', 'UnusualAmount', 'NewDevice', 'FraudFlag']
for col in bool_cols:
    df[col] = df[col].astype(int)

In [None]:
# Check the data after preprocessing
# Same summary as before, now reflecting the converted Timestamp, the new
# TransactionFrequencyValue column and the flags cast to integers.
df.info()

## 4. Feature Engineering

In [None]:
# Apply the feature engineering function
# engineer_features is imported from preprocessing.data_processor at the top of the notebook.
# NOTE(review): the listing below only works if engineer_features returns a new frame;
# if it adds columns to df in place, new_columns will come out empty — confirm.
df_engineered = engineer_features(df)

# Display the new features
# "New" means present in df_engineered but not in df.
new_columns = [col for col in df_engineered.columns if col not in df.columns]
print("Newly created features:")
print(new_columns)
df_engineered[new_columns].head()

## 5. Data Preparation for Modeling

In [None]:
# Choose the model inputs: every engineered column except the identifier-like
# fields, the raw Timestamp / TransactionFrequency columns and the label itself
exclude_cols = ['TransactionID', 'UserID', 'DeviceID', 'Timestamp', 'IPAddress', 'PhoneNumber', 'TransactionFrequency']
target_col = 'FraudFlag'

# A boolean mask over the column index keeps the remaining columns in their original order
features = df_engineered.columns[
    ~df_engineered.columns.isin(exclude_cols + [target_col])
].tolist()
print(f"Number of features: {len(features)}")
print(f"Features: {features}")

In [None]:
# Separate features and target
X = df_engineered[features]
y = df_engineered[target_col]

# Split the data into training and testing sets
# 20% of the rows are held out for testing. stratify=y keeps the class proportions
# of the target the same in both parts, and random_state=42 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [None]:
# Identify categorical and numerical features
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
# 'number' matches every numeric width (int32, float32, ... as well as int64/float64),
# so a numeric feature is not silently left out of the model because of its dtype.
# Timedeltas are excluded explicitly because pandas counts them as numeric.
numerical_cols = X.select_dtypes(include='number', exclude=['timedelta']).columns.tolist()

# Flags that are passed through unscaled. Only those present in X are kept, so a
# missing engineered flag produces a warning here, not a KeyError when the
# columns are extracted later.
expected_boolean_cols = ['UnusualLocation', 'UnusualAmount', 'NewDevice', 'IsWeekend', 'IsNightTime', 'HighRiskIP']
boolean_cols = [col for col in expected_boolean_cols if col in X.columns]
missing_boolean_cols = [col for col in expected_boolean_cols if col not in X.columns]
if missing_boolean_cols:
    print(f"Warning: expected boolean columns not found in X: {missing_boolean_cols}")

# Remove boolean columns from numerical_cols
numerical_cols = [col for col in numerical_cols if col not in boolean_cols]

# Any column that landed in none of the three groups would be dropped without notice
assigned_cols = set(categorical_cols) | set(numerical_cols) | set(boolean_cols)
unassigned_cols = [col for col in X.columns if col not in assigned_cols]
if unassigned_cols:
    print(f"Warning: columns not used by any preprocessing step: {unassigned_cols}")

print(f"Categorical columns: {categorical_cols}")
print(f"Numerical columns: {numerical_cols}")
print(f"Boolean columns: {boolean_cols}")

In [None]:
# Preprocess the data
# Each transformer is fitted on the training split only and then applied to both
# splits, so no statistics from the test rows reach the fitted objects.

# 1. One-hot encoding for categorical features
if categorical_cols:
    # handle_unknown='ignore' encodes categories unseen during fit as all zeros
    ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    cat_features_train = ohe.fit_transform(X_train[categorical_cols])
    cat_feature_names = ohe.get_feature_names_out(categorical_cols)
    cat_features_test = ohe.transform(X_test[categorical_cols])
else:
    # Bind the name even when there is nothing to encode: the cell that pickles the
    # preprocessing objects reads `ohe` and would otherwise raise NameError (or pick
    # up a stale encoder from an earlier run). Consumers must handle None.
    ohe = None
    cat_features_train = np.empty((X_train.shape[0], 0))
    cat_features_test = np.empty((X_test.shape[0], 0))
    cat_feature_names = []

# 2. Scaling numerical features
if numerical_cols:
    scaler = StandardScaler()
    num_features_train = scaler.fit_transform(X_train[numerical_cols])
    num_features_test = scaler.transform(X_test[numerical_cols])
else:
    # Same reasoning as for `ohe` above
    scaler = None
    num_features_train = np.empty((X_train.shape[0], 0))
    num_features_test = np.empty((X_test.shape[0], 0))

# 3. Extract boolean features
if boolean_cols:
    bool_features_train = X_train[boolean_cols].values
    bool_features_test = X_test[boolean_cols].values
else:
    bool_features_train = np.empty((X_train.shape[0], 0))
    bool_features_test = np.empty((X_test.shape[0], 0))

# 4. Combine all features
# Column order is numerical, then one-hot, then boolean; processed_feature_names
# below must follow the same order.
X_train_processed = np.hstack((num_features_train, cat_features_train, bool_features_train))
X_test_processed = np.hstack((num_features_test, cat_features_test, bool_features_test))

# Create feature names for the processed data
processed_feature_names = numerical_cols + list(cat_feature_names) + boolean_cols

print(f"Processed training data shape: {X_train_processed.shape}")
print(f"Processed test data shape: {X_test_processed.shape}")

In [None]:
# Bundle the fitted transformers and the column lists so the app can repeat
# exactly the same preprocessing on new transactions
preprocessing_objects = dict(
    ohe=ohe,
    scaler=scaler,
    categorical_cols=categorical_cols,
    numerical_cols=numerical_cols,
    boolean_cols=boolean_cols,
    final_features=processed_feature_names,
)

# Make sure the output folder exists before writing into it
os.makedirs('../models', exist_ok=True)

# Persist the bundle as a pickle file
with open('../models/preprocessing_objects.pkl', mode='wb') as artifact_file:
    pickle.dump(preprocessing_objects, artifact_file)

## 6. Model Building and Evaluation

In [None]:
# Function to evaluate models
def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    Parameters
    ----------
    model : estimator with ``fit`` and ``predict``; ``predict_proba`` or
        ``decision_function`` is used for the ROC curve when available.
    X_train, y_train : training features and labels passed to ``model.fit``.
    X_test, y_test : held-out features and labels used for every metric.

    Returns
    -------
    (model, results) : the fitted model and a dict with the keys ``accuracy``,
        ``precision``, ``recall``, ``f1`` (all for the positive class, looked up
        in the classification report under '1' or '1.0') and ``roc_auc``.

    Raises
    ------
    KeyError : if the classification report has neither a '1' nor a '1.0' entry.
    """
    # Train the model
    model.fit(X_train, y_train)

    # Hard class predictions for the classification report
    y_pred = model.predict(X_test)

    # Continuous scores for the ROC curve: positive-class probability if offered,
    # otherwise the decision margin; hard labels are only the last resort because
    # they collapse the curve to a single operating point.
    if hasattr(model, 'predict_proba'):
        y_score = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, 'decision_function'):
        y_score = model.decision_function(X_test)
    else:
        y_score = y_pred

    # Generate classification report
    report = classification_report(y_test, y_pred, output_dict=True)

    # Compute the ROC curve once and integrate it
    fpr, tpr, _ = roc_curve(y_test, y_score)
    roc_auc = auc(fpr, tpr)

    # The report keys are the string form of the labels: '1' for integer labels,
    # '1.0' for float labels
    positive = report['1'] if '1' in report else report['1.0']

    # Compile results
    results = {
        'accuracy': report['accuracy'],
        'precision': positive['precision'],
        'recall': positive['recall'],
        'f1': positive['f1-score'],
        'roc_auc': roc_auc
    }

    return model, results

In [None]:
# Initialize models
# The scikit-learn models rebalance the classes with class_weight='balanced'.
# XGBClassifier takes no class_weight argument, so it gets the equivalent through
# scale_pos_weight = (negatives / positives) in the training labels; without it the
# comparison would pit one unweighted model against four weighted ones.
n_positive = int((y_train == 1).sum())
n_negative = int((y_train == 0).sum())
xgb_pos_weight = n_negative / n_positive if n_positive else 1.0

models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42),
    'Decision Tree': DecisionTreeClassifier(class_weight='balanced', random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42),
    # probability=True makes SVC expose predict_proba, which the ROC plots rely on
    'SVM': SVC(probability=True, class_weight='balanced', random_state=42),
    # use_label_encoder is deprecated in XGBoost and is no longer passed
    'XGBoost': xgb.XGBClassifier(scale_pos_weight=xgb_pos_weight, eval_metric='logloss', random_state=42)
}

# Train and evaluate each model
results = {}
trained_models = {}

for name, model in models.items():
    print(f"Training {name}...")
    trained_model, model_results = evaluate_model(model, X_train_processed, X_test_processed, y_train, y_test)
    results[name] = model_results
    trained_models[name] = trained_model
    print(f"{name} - Accuracy: {model_results['accuracy']:.4f}, Precision: {model_results['precision']:.4f}, Recall: {model_results['recall']:.4f}, F1: {model_results['f1']:.4f}, ROC-AUC: {model_results['roc_auc']:.4f}\n")

In [None]:
# Visualize model performance: one bar chart per metric on a 2x3 grid,
# with the models along the x axis
metrics = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
# Transpose so that each row is a model and each column a metric
df_results = pd.DataFrame(results).T

fig = plt.figure(figsize=(15, 10))
for position, metric in enumerate(metrics, start=1):
    # add_subplot also makes the new axes the current one for the pyplot call below
    ax = fig.add_subplot(2, 3, position)
    sns.barplot(x=df_results.index, y=df_results[metric], ax=ax)
    ax.set_title(f'Model Comparison - {metric.upper()}')
    plt.xticks(rotation=45)
    ax.set_xlabel('Model')
    ax.set_ylabel(metric)
plt.tight_layout()
plt.show()

In [None]:
# Pick the winner by F1 score (a balance of precision and recall) and print its scorecard
best_model_name = df_results['f1'].idxmax()
best_model = trained_models[best_model_name]
print(f"Best model based on F1 score: {best_model_name}")

# Row of metrics belonging to the winning model
best_scores = df_results.loc[best_model_name]
scorecard = [
    ("F1 score", 'f1'),
    ("Accuracy", 'accuracy'),
    ("Precision", 'precision'),
    ("Recall", 'recall'),
    ("ROC-AUC", 'roc_auc'),
]
for label, key in scorecard:
    print(f"{label}: {best_scores[key]:.4f}")

In [None]:
# Confusion matrix for the best model, drawn as an annotated heatmap
# (rows are the actual classes, columns the predicted ones)
y_pred = best_model.predict(X_test_processed)
cm = confusion_matrix(y_test, y_pred)

class_labels = ['Not Fraud', 'Fraud']
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    cbar=False,
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_title(f'Confusion Matrix - {best_model_name}')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()

In [None]:
# ROC Curve for all models, overlaid on one set of axes
fig, ax = plt.subplots(figsize=(10, 8))

for name, model in trained_models.items():
    # Models that cannot produce probabilities are left out of the plot
    if not hasattr(model, 'predict_proba'):
        continue
    y_proba = model.predict_proba(X_test_processed)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_proba)
    auc_score = auc(fpr, tpr)
    ax.plot(fpr, tpr, label=f'{name} (AUC = {auc_score:.4f})')

# Diagonal reference line: the curve of a classifier that guesses at random
ax.plot([0, 1], [0, 1], 'k--')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curves')
ax.legend(loc='lower right')
plt.show()

In [None]:
# Feature Importance for the best model (if available)
def _plot_top_features(values, ranking, kind, xlabel, top_n=15, zero_line=False):
    """Bar-plot the ``top_n`` features whose ``ranking`` score is largest.

    Parameters
    ----------
    values : 1-D array of the numbers to draw, one per processed feature.
    ranking : 1-D array used to order the features (largest first); it equals
        ``values`` for importances and ``abs(values)`` for signed coefficients.
    kind : word inserted in the title, e.g. 'Importances' or 'Coefficients'.
    xlabel : label of the x axis.
    top_n : number of features to show.
    zero_line : draw a dashed vertical line at 0, useful for signed values.

    Feature names are read from the notebook-level ``processed_feature_names`` and
    the title uses the notebook-level ``best_model_name``.
    """
    top_indices = np.argsort(ranking)[::-1][:top_n]
    top_features = [processed_feature_names[i] for i in top_indices]

    plt.figure(figsize=(10, 8))
    sns.barplot(x=values[top_indices], y=top_features)
    plt.title(f'Top {top_n} Feature {kind} - {best_model_name}')
    plt.xlabel(xlabel)
    if zero_line:
        plt.axvline(x=0, color='k', linestyle='--')
    plt.tight_layout()
    plt.show()


if hasattr(best_model, 'feature_importances_'):
    # For tree-based models
    importances = best_model.feature_importances_
    _plot_top_features(importances, importances, 'Importances', 'Importance')

elif hasattr(best_model, 'coef_'):
    # For linear models
    # coef_ is 2-D (one row per class) for classifiers; take the single row
    coefficients = best_model.coef_[0] if len(best_model.coef_.shape) > 1 else best_model.coef_
    # Rank by magnitude but plot the signed value so the direction of the effect stays visible
    _plot_top_features(coefficients, np.abs(coefficients), 'Coefficients', 'Coefficient Value', zero_line=True)

## 7. Save the Best Model

In [None]:
# Save the best model for use in the Streamlit app
# Writes into ../models, which the cell that saves the preprocessing objects creates;
# run that cell first on a fresh checkout.
# The file is a pickle: only load it from a source you trust.
with open('../models/best_model.pkl', 'wb') as f:
    pickle.dump(best_model, f)

print(f"Best model ({best_model_name}) saved to ../models/best_model.pkl")

In [None]:
# Save model performance results for reference
# df_results has one row per model and one column per metric; the model names are
# written as the first (index) column of the CSV.
df_results.to_csv('../models/model_results.csv')
print("Model performance results saved to ../models/model_results.csv")