In [None]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score
from imblearn.over_sampling import SMOTE
import pickle


In [None]:
# Step 2: Load Dataset
df = pd.read_csv("creditcard.csv")  # Download from Kaggle and keep in the same folder

# Only the last expression of a cell is rendered, so a bare `df.head()` in the
# middle of the cell is evaluated and silently discarded. Print the preview
# explicitly so the first rows actually show up.
print(df.head())

# Step 3: Basic Info
df.info()  # writes the column summary to stdout
df['Class'].value_counts()  # 0 = legit, 1 = fraud


In [None]:
# Check imbalance: bar chart of how many rows fall into each target class
fig, ax = plt.subplots()
sns.countplot(data=df, x='Class', ax=ax)
ax.set_title("Class Distribution")
plt.show()


In [None]:
# Step 4: Preprocessing
#
# Build the modelling frame as a new object instead of editing `df` in place.
# Adding and dropping columns on `df` itself made this cell fail on a second run
# (KeyError: 'Amount' had already been removed); deriving a new frame keeps the
# cell re-runnable and leaves the loaded data intact.

# Scale only the 'Amount' column.
# NOTE(review): the scaler is fitted on every row before the train/test split,
# so test rows contribute to the mean/std used for scaling. Fit it on the
# training split only if that matters for the analysis.
scaler = StandardScaler()
scaled_amount = scaler.fit_transform(df[['Amount']]).ravel()  # flatten (n, 1) to (n,)

# Drop unneeded columns and append the scaled amount as the last column
df_model = df.drop(columns=['Time', 'Amount']).assign(scaled_amount=scaled_amount)

# Define features and target
X = df_model.drop(columns='Class')
y = df_model['Class']


In [None]:
# Step 5: Train-Test Split
# Rows without a label can be used neither for fitting nor for scoring, so they
# are removed before splitting rather than patched up on the test side later.
has_label = y.notna()

# Stratify on the target: the classes are imbalanced, and a plain random split
# can leave the two partitions with noticeably different shares of the rare class.
X_train, X_test, y_train, y_test = train_test_split(
    X[has_label],
    y[has_label],
    test_size=0.2,
    random_state=42,
    stratify=y[has_label],
)


In [None]:
# Step 6: Handle Imbalanced Data with SMOTE
# Resample the training split only.
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X_train, y_train)

# Class counts before and after resampling, for a quick sanity check
counts_before = y_train.value_counts()
counts_after = y_res.value_counts()
print("Before SMOTE:", counts_before)
print("After SMOTE:", counts_after)


In [None]:
# Step 7: Model Training
# Fit a 100-tree random forest on the resampled training data; `fit` returns
# the estimator itself, so construction and training can be chained.
model = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_res, y_res)
model  # last expression: show the fitted estimator as the cell output


In [None]:
# Keep only the test rows whose label is present; the same boolean mask is
# applied to features and labels so the two stay aligned.
labeled_rows = y_test.notna()
X_test_clean = X_test.loc[labeled_rows]
y_test_clean = y_test.loc[labeled_rows]


In [None]:
# Predict on cleaned data
y_pred = model.predict(X_test_clean)  # hard class labels for the threshold-based metrics

# ROC-AUC is a ranking metric and needs a continuous score. Passing the hard
# labels collapses the ROC curve to a single operating point, so it is computed
# from the predicted probability of the positive class instead.
# predict_proba columns follow model.classes_ (sorted), so column 1 is the larger label.
y_score = model.predict_proba(X_test_clean)[:, 1]

# Evaluation
print("Accuracy:", accuracy_score(y_test_clean, y_pred))
print("ROC-AUC:", roc_auc_score(y_test_clean, y_score))
print("Classification Report:\n", classification_report(y_test_clean, y_pred))

# Confusion Matrix (rows = actual class, columns = predicted class)
cm = confusion_matrix(y_test_clean, y_pred)
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()


In [None]:


# Persist the trained `model` object to model.pkl in the working directory
with open("model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)
