In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, matthews_corrcoef, confusion_matrix, classification_report


In [2]:
df = pd.read_csv('creditcard.csv')  
print(df.shape)
print(df['Class'].value_counts(normalize=True))


(284807, 31)
Class
0    0.998273
1    0.001727
Name: proportion, dtype: float64


In [3]:
scaler = StandardScaler()
df['Amount'] = scaler.fit_transform(df[['Amount']])


In [4]:
X = df.drop('Class', axis=1)
y = df['Class']


In [5]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

print("After SMOTE:", np.bincount(y_resampled))




After SMOTE: [284315 284315]


In [6]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)


In [None]:
import pandas as pd
import time
from sklearn.linear_model import SGDClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, matthews_corrcoef

X_train_sample = X_train.copy()
y_train_sample = y_train.copy()

new_models = {
    "SGD (SVM Approximation)": SGDClassifier(loss='hinge', max_iter=1000, tol=1e-3, random_state=42),
    "XGBoost": XGBClassifier(n_estimators=50, use_label_encoder=False, eval_metric='logloss', random_state=42, n_jobs=-1)
}

task4_results = []

for name, model in new_models.items():
    print(f"Training {name}...")
    start_time = time.time()

    model.fit(X_train_sample, y_train_sample)
    duration = time.time() - start_time

    print(f"Finished training {name} in {duration:.2f}s")

    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, zero_division=0)
    mcc = matthews_corrcoef(y_test, y_pred)

    task4_results.append({
        'Model': name,
        'Accuracy': accuracy,
        'Precision': precision,
        'MCC': mcc,
        'Training Time (s)': duration
    })

task4_results_df = pd.DataFrame(task4_results)
pd.set_option('display.float_format', '{:.4f}'.format)
print("\nNew Model Performance:")
print(task4_results_df)


Training SGD (SVM Approximation)...
Finished training SGD (SVM Approximation) in 17.46s
Training XGBoost...


Parameters: { "use_label_encoder" } are not used.



Finished training XGBoost in 0.67s

New Model Performance:
                     Model  Accuracy  Precision    MCC  Training Time (s)
0  SGD (SVM Approximation)    0.5026     0.5018 0.0398            17.4562
1                  XGBoost    0.9996     0.9992 0.9992             0.6738
