In [1]:
import os
import warnings
from collections import deque
from pathlib import Path

import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# px requires nbformat
import plotly.express as px
import seaborn as sns
from datasets import load_dataset
from matplotlib.patches import Patch
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    roc_auc_score,
)
from sortedcontainers import SortedList

warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Folder that holds the input CSV files. Set the FRAUD_DATA_DIR environment
# variable to point at another location; when it is unset the original
# hard-coded folder is used, so existing setups behave exactly as before.
DATA_DIR = Path(os.environ.get('FRAUD_DATA_DIR', '/Users/cherylsusan/Desktop/data'))

# Load the training data into a DataFrame.
train_df = pd.read_csv(DATA_DIR / 'train_data.csv')

In [3]:
# Print a structural summary of the loaded frame (index range, column names
# and dtypes) as a quick sanity check before any modelling.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1990281 entries, 0 to 1990280
Data columns (total 34 columns):
 #   Column                          Dtype  
---  ------                          -----  
 0   ssn                             object 
 1   gender                          object 
 2   state                           object 
 3   zip                             int64  
 4   city_pop                        int64  
 5   job                             object 
 6   category                        object 
 7   amt                             float64
 8   is_fraud                        int64  
 9   merchant                        object 
 10  hour                            int64  
 11  day_of_week                     object 
 12  month                           object 
 13  trans_quarter                   object 
 14  year                            int64  
 15  age                             int64  
 16  city_state                      object 
 17  prev_trans_fraud           

In [4]:
from imblearn.combine import SMOTETomek
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [5]:

# Separate the target column from the predictor columns.
y = train_df['is_fraud']
X = train_df.drop(columns=['is_fraud'])

# Partition the predictors by dtype: object columns are treated as
# categorical, int64/float64 columns as numerical.
categorical_features = list(X.select_dtypes(include=['object']).columns)
numerical_features = list(X.select_dtypes(include=['int64', 'float64']).columns)

# Report how imbalanced the target is before any resampling takes place.
fraud_pct = round(y.mean() * 100, 2)
print("Class distribution before resampling:")
print(y.value_counts())
print(f"Percentage of fraud cases: {fraud_pct}%")
print(f"Original dataset shape: {train_df.shape}")



Class distribution before resampling:
is_fraud
0    1982985
1       7296
Name: count, dtype: int64
Percentage of fraud cases: 0.37%
Original dataset shape: (1990281, 34)


In [None]:
# NOTE(review): this entire cell is commented out, yet it is the only place in
# the visible notebook that assigns `encoder`, `encoded_feature_names`,
# `scaler`, `X_processed` and `all_feature_names`. The SMOTE-Tomek cell that
# follows reads `X_processed` and `encoded_feature_names`, so on a fresh kernel
# it fails with NameError unless this cell is re-enabled.
# NOTE(review): `sparse_output=False` yields a dense one-hot array; with object
# columns such as `ssn` and `merchant` on ~2M rows this is presumably very
# memory-hungry -- confirm the memory budget (or the feature list) before
# re-enabling.
# # Create preprocessing for categorical data
# # We'll use OneHotEncoder for categorical features
# encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
# X_cat = encoder.fit_transform(X[categorical_features])
# # Get the new column names after one-hot encoding
# encoded_feature_names = encoder.get_feature_names_out(categorical_features)

# # Standardize numerical features
# scaler = StandardScaler()
# X_num = scaler.fit_transform(X[numerical_features])

# # Combine the preprocessed data
# X_processed = np.concatenate([X_num, X_cat], axis=1)
# # Create a list of all feature names after preprocessing
# all_feature_names = numerical_features + encoded_feature_names.tolist()




: 

In [None]:
# Apply SMOTE-Tomek resampling
# NOTE(review): `X_processed` and `encoded_feature_names` are only assigned in
# the commented-out preprocessing cell above; that cell has to be re-enabled
# before this one can run on a fresh kernel.
# NOTE(review): the full data set (`X_processed`, `y`) is resampled here, while
# later cells read `X_train_resampled` / `X_test_processed`, which this cell
# does not create -- confirm where the train/test split is meant to happen.
smote_tomek = SMOTETomek(random_state=42)
X_resampled, y_resampled = smote_tomek.fit_resample(X_processed, y)

# Print class distribution after resampling
print("\nClass distribution after SMOTE-Tomek:")
print(pd.Series(y_resampled).value_counts())
print(f"Percentage of fraud cases: {round(pd.Series(y_resampled).mean() * 100, 2)}%")
print(f"Resampled data shape: {X_resampled.shape}")

# Split the resampled matrix back into its numerical and one-hot parts.
# This relies on the column layout being [numerical | one-hot], which is how
# the commented-out preprocessing cell concatenates them.
n_numerical = len(numerical_features)
X_num_resampled = X_resampled[:, :n_numerical]
X_cat_resampled = X_resampled[:, n_numerical:]

# Create DataFrames for each part.
# NOTE(review): if X_processed came from the preprocessing cell, the numerical
# columns here hold standardized values, not the original units.
num_df = pd.DataFrame(X_num_resampled, columns=numerical_features)
cat_df = pd.DataFrame(X_cat_resampled, columns=encoded_feature_names)

# Collapse every one-hot group back into a single categorical column.
# The columns are collected in a dict and turned into a frame once, instead of
# growing an empty DataFrame column by column.
decoded_columns = {}
for feature in categorical_features:
    prefix = f"{feature}_"
    # All one-hot columns that belong to this feature.
    # NOTE(review): matching by prefix is ambiguous if one feature name plus
    # "_" is itself a prefix of another feature's columns -- verify the names.
    cols = [col for col in cat_df.columns if col.startswith(prefix)]
    # For each row take the column with the highest value (exactly 1 for real
    # rows; synthetic rows may hold fractional values). The prefix is removed
    # by slicing on its length: str.replace would strip *every* occurrence of
    # the prefix text inside the category label (and treats it as a regex on
    # older pandas versions), which can corrupt the recovered label.
    decoded_columns[feature] = cat_df[cols].idxmax(axis=1).str[len(prefix):]
inverse_cat_df = pd.DataFrame(decoded_columns, index=cat_df.index)

# Combine numerical and reconstructed categorical features
resampled_df = pd.concat([num_df, inverse_cat_df], axis=1)

# Add the target variable. Assign positionally (as a plain array) so the result
# never depends on how y_resampled happens to be indexed.
resampled_df['is_fraud'] = np.asarray(y_resampled)

print(f"Final resampled DataFrame shape: {resampled_df.shape}")

# If you need to save the resampled dataset
# resampled_df.to_csv('train_df_resampled.csv', index=False)

# The resampled_df can now be used directly for model training

In [None]:
# Step 3: univariate feature selection with SelectKBest (f_classif scores).
# NOTE(review): X_train_resampled, y_train_resampled, X_test_processed and
# feature_names are not assigned in any cell visible in this notebook --
# confirm they are created before this cell runs.

print("\nStep 3: Feature Selection with SelectKBest")

# Keep at most 30 features; use every column when fewer are available.
k = min(30, X_train_resampled.shape[1])
selector = SelectKBest(score_func=f_classif, k=k)

# Fit the selector on the training matrix, then apply the same column subset
# to the test matrix.
X_train_selected = selector.fit_transform(X_train_resampled, y_train_resampled)
X_test_selected = selector.transform(X_test_processed)

# Translate the boolean support mask back into feature names for reporting.
mask = selector.get_support()
selected_features = [
    name for position, name in enumerate(feature_names) if mask[position]
]
print(f"Selected {k} features:")
for rank, name in enumerate(selected_features, start=1):
    print(f"{rank}. {name}")


In [None]:
# Step 4: Train Random Forest Classifier
# (This heading was bare text inside the code cell, which is a SyntaxError;
# it is now a comment.)
print("\nStep 4: Training Random Forest Classifier")
rf_model = RandomForestClassifier(
    n_estimators=100,          # number of trees in the forest
    max_depth=15,              # cap on the depth of each tree
    min_samples_split=10,      # minimum samples needed to split an internal node
    min_samples_leaf=4,        # minimum samples required at a leaf
    max_features='sqrt',       # features considered at each split
    random_state=42,           # fixed seed for reproducible forests
    n_jobs=-1,                 # use all available processors
    class_weight='balanced'    # weight classes inversely to their frequency
)

# Fit on the selected training features; as the last expression of the cell
# the fitted estimator is also displayed.
rf_model.fit(X_train_selected, y_train_resampled)

In [None]:
# Step 5: Model Evaluation
# NOTE(review): `y_test` is not assigned in any cell visible in this notebook --
# confirm it is created before this cell runs.
print("\nStep 5: Model Evaluation")

# Scores for the test matrix: column 1 of predict_proba and the hard labels.
y_pred_proba = rf_model.predict_proba(X_test_selected)[:, 1]
y_pred = rf_model.predict(X_test_selected)

# Headline metrics.
accuracy = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred_proba)
print(f"Accuracy: {round(accuracy * 100, 2)}%")
print(f"ROC AUC Score: {round(auc, 4)}")

# Per-class precision / recall / F1.
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Confusion matrix rendered as an annotated heatmap.
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
heatmap_ax = sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=False)
heatmap_ax.set_title("Confusion Matrix")
heatmap_ax.set_ylabel("Actual Label")
heatmap_ax.set_xlabel("Predicted Label")
plt.show()

# Feature importance analysis: rank the selected features by importance,
# highest first.
print("\nTop 20 Most Important Features:")
feature_importances = rf_model.feature_importances_
indices = np.argsort(feature_importances)[::-1]

n_top = min(20, len(selected_features))   # number of features to report
top_indices = indices[:20]                # positions of the highest-ranked features

for position in range(n_top):
    feature_idx = indices[position]
    print(f"{selected_features[feature_idx]}: {feature_importances[feature_idx]:.4f}")

# Bar chart of the same ranking.
bar_positions = range(n_top)
plt.figure(figsize=(12, 8))
plt.title("Feature Importances")
plt.bar(bar_positions, feature_importances[top_indices], align="center")
plt.xticks(
    bar_positions,
    [selected_features[j] for j in top_indices],
    rotation=90,
)
plt.tight_layout()
plt.show()