In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.impute import SimpleImputer
from collections import Counter

# Load the processed data (adjust the path as necessary)
df = pd.read_csv('processed_data.csv')

# Step 1: Drop identifier, categorical-code and timestamp columns so that only
# the remaining columns are used as model inputs.
columns_to_drop = ['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId',
                   'CustomerId', 'CurrencyCode', 'ProviderId', 'ProductId',
                   'ProductCategory', 'ChannelId', 'TransactionStartTime']  # Dropping date column

df_cleaned = df.drop(columns=columns_to_drop)

# Step 2: Define features (X) and target (y)
y = df_cleaned['User_Label']  # Target variable
X = df_cleaned.drop(columns=['User_Label'])  # Features

# Step 3: Split the data into training and testing sets using stratification.
# The split happens BEFORE imputation so that no statistic computed from the
# held-out rows can influence the training features (avoids data leakage).
# The row assignment depends only on y, the row count and random_state, so it
# is the same partition as splitting an already-imputed matrix.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Step 4: Handle missing values. Column means are learned from the training
# rows only and then re-used, unchanged, to fill gaps in the test rows.
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train_raw)
X_test = imputer.transform(X_test_raw)

# Full feature matrix imputed with the training-set means; kept under its
# original name for any later cell that still expects `X_imputed`.
X_imputed = imputer.transform(X)

# Step 5: Check the class distribution in the training set
class_counts = Counter(y_train)
print("Training set class distribution:\n", class_counts)

# Both classifiers and the binary metrics below need at least two classes in
# the training labels; otherwise only report the problem and skip training.
if len(class_counts) < 2:
    print("Warning: Training data contains only one class.")
    print("Consider applying class balancing techniques or gathering more data.")
else:
    # Step 6: Train the models
    # Logistic Regression (max_iter raised above the library default to give
    # the solver more iterations to converge)
    log_reg = LogisticRegression(max_iter=1000)
    log_reg.fit(X_train, y_train)

    # Random Forest, seeded with the same value used for the train/test split
    # so that repeated runs report the same metrics.
    random_forest = RandomForestClassifier(random_state=42)
    random_forest.fit(X_train, y_train)

    # Step 7: Evaluate the models
    # Accuracy / precision / recall / F1 are computed from hard class
    # predictions. ROC AUC is a ranking metric, so it is computed from the
    # predicted probability of the positive class (second column of
    # predict_proba) instead of from thresholded 0/1 labels, which would
    # collapse the ROC curve to a single operating point.

    # Evaluate Logistic Regression
    y_pred_log_reg = log_reg.predict(X_test)
    y_proba_log_reg = log_reg.predict_proba(X_test)[:, 1]
    accuracy_log_reg = accuracy_score(y_test, y_pred_log_reg)
    precision_log_reg = precision_score(y_test, y_pred_log_reg)
    recall_log_reg = recall_score(y_test, y_pred_log_reg)
    f1_log_reg = f1_score(y_test, y_pred_log_reg)
    roc_auc_log_reg = roc_auc_score(y_test, y_proba_log_reg)

    # Evaluate Random Forest
    y_pred_rf = random_forest.predict(X_test)
    y_proba_rf = random_forest.predict_proba(X_test)[:, 1]
    accuracy_rf = accuracy_score(y_test, y_pred_rf)
    precision_rf = precision_score(y_test, y_pred_rf)
    recall_rf = recall_score(y_test, y_pred_rf)
    f1_rf = f1_score(y_test, y_pred_rf)
    roc_auc_rf = roc_auc_score(y_test, y_proba_rf)

    # Print results
    print("Logistic Regression Metrics:")
    print(f"Accuracy: {accuracy_log_reg:.4f}, Precision: {precision_log_reg:.4f}, Recall: {recall_log_reg:.4f}, F1 Score: {f1_log_reg:.4f}, ROC AUC: {roc_auc_log_reg:.4f}")
    print("\nRandom Forest Metrics:")
    print(f"Accuracy: {accuracy_rf:.4f}, Precision: {precision_rf:.4f}, Recall: {recall_rf:.4f}, F1 Score: {f1_rf:.4f}, ROC AUC: {roc_auc_rf:.4f}")


Training set class distribution:
 Counter({0: 76529})
Consider applying class balancing techniques or gathering more data.


