In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from imblearn.over_sampling import SMOTE

# 🔹 Load dataset
df = pd.read_csv("cyber_fraud_dataset.csv")

# 🔹 Convert target variable to binary (Yes → 1, No → 0)
df["Fraud Detected"] = df["Fraud Detected"].map({"Yes": 1, "No": 0})

# 🔹 Define categorical columns
categorical_cols = [
    "Fraud Type", "Multiple Login Attempts", "Unusual IP Address", "Previous Fraud History", 
    "Compromised Credentials Used", "Malicious Link Clicked", "Blacklisted Entity Involved", 
    "Dark Web Involvement", "Transaction Time", "Unusual Location Detected", "Account Type", 
    "Social Engineering Involvement", "Previous Suspicious Activity", "Data Breach Exposure"
]

# 🔹 Encode categorical variables
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])  
    label_encoders[col] = le  # Store encoders for later use

# 🔹 Define features (X) and target (y)
X = df.drop(columns=["Fraud Detected"])
y = df["Fraud Detected"]

# 🔹 Handle class imbalance using SMOTE
smote = SMOTE(sampling_strategy=1.0, random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# 🔹 Scale numerical features
scaler = StandardScaler()
X_resampled = scaler.fit_transform(X_resampled)

# 🔹 Split data
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# 🔹 Train Random Forest Classifier
rf = RandomForestClassifier(
    n_estimators=500, max_depth=20, min_samples_split=2, min_samples_leaf=3,
    max_features='sqrt', class_weight="balanced", random_state=42, n_jobs=-1
)
rf.fit(X_train, y_train)

# 🔹 Make predictions
y_pred = rf.predict(X_test)

# 🔹 Evaluate Model the accuracy, precision and recall are to be computed but
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"✅ Accuracy: {accuracy:.4f}")
print(f"✅ Precision: {precision:.4f}")
print(f"✅ Recall: {recall:.4f}")  
print(f"✅ F1-Score: {f1:.4f}\n")
print("📊 Classification Report:\n", classification_report(y_test, y_pred))

# 🚀 Function to Predict Custom Input
def predict_custom_input(custom_data):
    # Convert to DataFrame
    custom_df = pd.DataFrame([custom_data])

    # 🔹 Encode categorical columns safely
    for col in categorical_cols:
        if col in label_encoders:  # Ensure encoder exists
            if custom_df[col][0] in label_encoders[col].classes_:
                custom_df[col] = label_encoders[col].transform(custom_df[col])
            else:
                print(f"⚠️ Warning: Unseen category '{custom_df[col][0]}' in column '{col}', assigning default.")
                custom_df[col] = label_encoders[col].transform([label_encoders[col].classes_[0]])  # Assign first known category

 

✅ Accuracy: 0.7163
✅ Precision: 0.7705
✅ Recall: 0.6104
✅ F1-Score: 0.6812

📊 Classification Report:
               precision    recall  f1-score   support

           0       0.68      0.82      0.74       703
           1       0.77      0.61      0.68       693

    accuracy                           0.72      1396
   macro avg       0.73      0.72      0.71      1396
weighted avg       0.73      0.72      0.71      1396



In [None]:
def predict_custom_input(custom_data):
    # Convert to DataFrame
    custom_df = pd.DataFrame([custom_data])

    # Encode categorical columns
    for col in categorical_cols:
        if col in label_encoders:
            le = label_encoders[col]
            val = custom_df[col][0]
            if val in le.classes_:
                custom_df[col] = le.transform([val])
            else:
                print(f"⚠️ Unseen category '{val}' in '{col}', defaulting to first known class.")
                default_val = le.transform([le.classes_[0]])
                custom_df[col] = default_val

    # Add missing numerical columns with default values
    for col in numerical_cols:
        if col not in custom_df.columns:
            custom_df[col] = 0

    # Align column order
    custom_df = custom_df[X.columns]

    # Convert all to float (not int)
    custom_df = custom_df.astype(float)

    # Scale the features
    scaled_df = scaler.transform(custom_df)

    # Debug output
    print("🧾 Final input to model:")
    print(scaled_df)

    # Predict
    prediction = rf.predict(scaled_df)[0]
    fraud_label = "Yes" if prediction == 1 else "No"
    return fraud_label


🔍 Prediction: Fraud Detected → No


In [4]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
import pickle

# Load your dataset
df = pd.read_csv("cyber_fraud_dataset.csv")  # Replace with your actual CSV file path

# Clean column names (remove extra spaces)
df.columns = df.columns.str.strip()

# Define target and features
target_column = "Fraud Detected"
categorical_cols = [
    "Fraud Type", "Multiple Login Attempts", "Unusual IP Address", "Previous Fraud History",
    "Compromised Credentials Used", "Malicious Link Clicked", "Blacklisted Entity Involved",
    "Dark Web Involvement", "Transaction Time", "Unusual Location Detected", "Account Type",
    "Social Engineering Involvement", "Previous Suspicious Activity", "Data Breach Exposure"
]
numerical_cols = [
    "Transaction Amount ($)", "Device Risk Score", "Login Frequency (Per Day)",
    "Transaction Location Risk", "Account Age (Days)"
]

# Encode target
# Encode target
df[target_column] = df[target_column].astype(str).str.strip().map({"No": 0, "Yes": 1})

# Drop any rows where target mapping failed
df = df.dropna(subset=[target_column])


# Encode categorical features
label_encoders = {}
for col in categorical_cols:
    df[col] = df[col].astype(str).str.strip()  # Strip extra spaces
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Separate features and label
X = df.drop(target_column, axis=1)
y = df[target_column]

# Scale numerical features
scaler = StandardScaler()
X[numerical_cols] = scaler.fit_transform(X[numerical_cols])

# Train the model
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X, y)

# Save model and preprocessing objects
with open("model.pkl", "wb") as f:
    pickle.dump(rf, f)

with open("scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

with open("label_encoders.pkl", "wb") as f:
    pickle.dump(label_encoders, f)

with open("X_columns.pkl", "wb") as f:
    pickle.dump(X, f)

print("✅ Model and preprocessing files saved!")


✅ Model and preprocessing files saved!
