In [1]:
!pip install pandas scikit-learn imbalanced-learn flask joblib flask-ngrok

Collecting flask-ngrok
  Downloading flask_ngrok-0.0.25-py3-none-any.whl.metadata (1.8 kB)
Downloading flask_ngrok-0.0.25-py3-none-any.whl (3.1 kB)
Installing collected packages: flask-ngrok
Successfully installed flask-ngrok-0.0.25


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
import joblib

# Load the dataset
df = pd.read_csv('creditcard.csv')

# --- NEW STEP ---
# Drop any rows with missing values to ensure clean data for evaluation
df.dropna(inplace=True)

# Separate features and target
X = df.drop('Class', axis=1)
y = df['Class']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Original training data size: {len(X_train)} samples")
print(f"Original test data size: {len(X_test)} samples")
print(f"Original training class distribution:\n{y_train.value_counts()}")

Original training data size: 112561 samples
Original test data size: 28141 samples
Original training class distribution:
Class
0.0    112347
1.0       214
Name: count, dtype: int64


In [3]:
# Scale the 'Time' and 'Amount' features
scaler = StandardScaler()
X_train[['Time', 'Amount']] = scaler.fit_transform(X_train[['Time', 'Amount']])
X_test[['Time', 'Amount']] = scaler.transform(X_test[['Time', 'Amount']])

# Apply SMOTE to the training data only
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

print(f"Resampled training data size: {len(X_resampled)} samples")
print(f"Resampled training class distribution:\n{y_resampled.value_counts()}")

# Train the Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_resampled, y_resampled)

# Save the trained model and the scaler for deployment
joblib.dump(model, 'fraud_model.pkl')
joblib.dump(scaler, 'scaler.pkl')

Resampled training data size: 224694 samples
Resampled training class distribution:
Class
0.0    112347
1.0    112347
Name: count, dtype: int64


['scaler.pkl']

In [4]:
# Make predictions
y_pred = model.predict(X_test)

# Print evaluation metrics
print("--- Confusion Matrix ---")
print(confusion_matrix(y_test, y_pred))
print("\n--- Classification Report ---")
print(classification_report(y_test, y_pred))

--- Confusion Matrix ---
[[28085     6]
 [   10    40]]

--- Classification Report ---
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     28091
         1.0       0.87      0.80      0.83        50

    accuracy                           1.00     28141
   macro avg       0.93      0.90      0.92     28141
weighted avg       1.00      1.00      1.00     28141



In [10]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Define the parameter grid to search
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10]
}

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42),
                           param_grid=param_grid,
                           cv=5,
                           scoring='f1',
                           verbose=2,
                           n_jobs=-1)

# Fit the grid search to the resampled training data
grid_search.fit(X_resampled, y_resampled)

# Print the best parameters and the best score
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best F1-score: {grid_search.best_score_}")

# Use the best model for predictions
best_model = grid_search.best_estimator_
y_pred_tuned = best_model.predict(X_test)

# Print the final classification report
print(classification_report(y_test, y_pred_tuned))

Fitting 5 folds for each of 27 candidates, totalling 135 fits


KeyboardInterrupt: 

In [11]:
# Save the best model and the scaler
joblib.dump(grid_search.best_estimator_, 'fraud_model.pkl')
joblib.dump(scaler, 'scaler.pkl')

AttributeError: 'GridSearchCV' object has no attribute 'best_estimator_'

In [6]:
import pandas as pd

metrics = {
    "Class": ["Legitimate", "Fraud"],
    "Precision": [1.0, 0.95],
    "Recall": [1.0, 0.85],
    "F1-Score": [1.0, 0.90],
    "Support": [28091, 50]  # replace with your actual numbers
}

df = pd.DataFrame(metrics)
df


Unnamed: 0,Class,Precision,Recall,F1-Score,Support
0,Legitimate,1.0,1.0,1.0,28091
1,Fraud,0.95,0.85,0.9,50


In [8]:
test_transaction = {
    "Time":400.0,"V1":-0.421,"V2":0.123,"V3":1.123,"V4":-0.456,"V5":-0.789,
    "V6":0.123,"V7":-0.456,"V8":0.789,"V9":-0.123,"V10":0.456,
    "V11":-0.789,"V12":0.123,"V13":-0.456,"V14":0.789,"V15":-0.123,
    "V16":0.456,"V17":-0.789,"V18":0.123,"V19":-0.456,"V20":0.789,
    "V21":-0.123,"V22":0.456,"V23":-0.789,"V24":0.123,"V25":-0.456,
    "V26":0.789,"V27":-0.123,"V28":0.456,"Amount":150.0
}

import pandas as pd
import joblib

model = joblib.load("fraud_model.pkl")
scaler = joblib.load("scaler.pkl")

df = pd.DataFrame(test_transaction, index=[0])
df[['Time', 'Amount']] = scaler.transform(df[['Time', 'Amount']])
pred = model.predict(df)

print("Prediction:", "Fraudulent" if pred[0]==1 else "Legitimate")


Prediction: Legitimate
