In [7]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold
from imblearn.over_sampling import SMOTE
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import GridSearchCV
import joblib

# Load the previously saved model.
# NOTE(review): joblib.load unpickles arbitrary objects -- only load files
# that come from a trusted source. The loaded model is not used anywhere in
# the visible part of this script; it is kept for any later code relying on it.
logistic_regression_model = joblib.load(r'C:\Users\yordi\Credit-card-fraud-detection-master\logistic_regression_model.pkl')

# Read Data into a Dataframe
df = pd.read_csv('creditcard.csv')

# Display basic information and check for missing values
print(df.shape)
print(df.isnull().sum().sum())

# Descriptive statistics
print(df.describe())

# Drop high amount transactions.
# `.copy()` makes the filtered frame independent of the original so the
# column edits below do not raise pandas' SettingWithCopyWarning.
df = df[df['Amount'] < 10000].copy()

# Feature Scaling using RobustScaler.
# The scaled column is built as an index-aligned Series and inserted directly
# at position 0, instead of being appended, dropped and re-inserted.
rob_scaler = RobustScaler()
scaled_amount = pd.Series(
    rob_scaler.fit_transform(df['Amount'].values.reshape(-1, 1)).ravel(),
    index=df.index,
    name='scaled_amount',
)
df.drop(['Time', 'Amount'], axis=1, inplace=True)
df.insert(0, 'scaled_amount', scaled_amount)

# Splitting the dataset
X = df.drop('Class', axis=1)
y = df['Class']
sss = StratifiedKFold(n_splits=5, random_state=None, shuffle=False)
# NOTE: every iteration overwrites the previous assignment, so only the split
# of the LAST fold is kept. These "original_*" arrays are used below solely to
# report the class distribution of a stratified split of the full data.
for train_index, test_index in sss.split(X, y):
    original_Xtrain, original_Xtest = X.iloc[train_index], X.iloc[test_index]
    original_ytrain, original_ytest = y.iloc[train_index], y.iloc[test_index]
# Converting to arrays
original_Xtrain = original_Xtrain.values
original_Xtest = original_Xtest.values
original_ytrain = original_ytrain.values
original_ytest = original_ytest.values

# Check class distribution in train and test sets
train_unique_label, train_counts_label = np.unique(original_ytrain, return_counts=True)
test_unique_label, test_counts_label = np.unique(original_ytest, return_counts=True)
print("Label distributions: \n")
print(train_counts_label / len(original_ytrain))
print(test_counts_label / len(original_ytest))

# Undersampling the dataset to balance class distribution.
# The shuffle is seeded so the chosen non-fraud rows are reproducible, and the
# number of non-fraud rows follows the actual number of fraud rows instead of
# a hard-coded count.
df = df.sample(frac=1, random_state=42)
fraud_df = df.loc[df['Class'] == 1]
non_fraud_df = df.loc[df['Class'] == 0][:len(fraud_df)]
normal_distributed_df = pd.concat([fraud_df, non_fraud_df])
new_df = normal_distributed_df.sample(frac=1, random_state=42)

# Outlier detection and removal
def remove_outliers(df, feature, factor=1.5):
    """Drop rows whose ``feature`` value lies outside the fraud-class IQR fence.

    The fence is computed only from rows where ``Class == 1``:
    ``[q25 - factor * IQR, q75 + factor * IQR]``. Rows of *any* class whose
    value falls strictly outside that interval are removed.

    Args:
        df: DataFrame containing a ``Class`` column and the ``feature`` column.
        feature: Name of the column to screen for outliers.
        factor: IQR multiplier that sets the fence width (default 1.5).

    Returns:
        A new DataFrame without the outlier rows; ``df`` itself is not
        modified. If there are no ``Class == 1`` rows, no fence can be
        computed and an unchanged copy is returned.
    """
    fraud = df.loc[df['Class'] == 1, feature].to_numpy()
    if fraud.size == 0:
        # No fraud rows -> percentiles are undefined; nothing to remove.
        return df.copy()

    q25, q75 = np.percentile(fraud, [25, 75])
    cut_off = (q75 - q25) * factor
    lower, upper = q25 - cut_off, q75 + cut_off

    # Filter with a positional boolean mask rather than dropping by index
    # label: label-based drop would also remove non-outlier rows that happen
    # to share an index label with an outlier (duplicated index).
    # NaN values compare False on both sides and are therefore kept.
    is_outlier = (df[feature] > upper) | (df[feature] < lower)
    return df.loc[~is_outlier.to_numpy()]

# Remove extreme values (relative to the fraud-class IQR) for these features,
# one feature after another so each pass sees the already-filtered frame.
for feature in ('V14', 'V12', 'V10'):
    new_df = remove_outliers(new_df, feature)

# Separate features and target
X = new_df.drop('Class', axis=1)
y = new_df['Class']

# Split the data into training and testing sets BEFORE oversampling.
# Running SMOTE on the full data first would create synthetic rows that are
# interpolated from rows later placed in the test set, leaking test
# information into training and inflating every reported metric.
# `stratify=y` keeps the class ratio identical in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Resampling (SMOTE) -- applied to the training part only; the test set keeps
# only real observations.
sm = SMOTE(sampling_strategy='minority', random_state=42)
X_res, y_res = sm.fit_resample(X_train, y_train)
X_train, y_train = X_res, y_res

# Model Evaluation Helper Function
def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training data and print test-set metrics.

    Prints the classification report, accuracy, precision, recall, F1 and
    ROC AUC. The model is fitted in place, so the caller's object is trained
    when this function returns.

    Args:
        model: Estimator exposing ``fit`` and ``predict``; optionally
            ``predict_proba`` or ``decision_function``.
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels (binary; the metrics use their default
            positive label).

    Returns:
        The accuracy of ``model`` on the test set.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # ROC AUC measures ranking quality and needs continuous scores; feeding it
    # hard 0/1 predictions collapses the curve to a single threshold. Prefer
    # the positive-class probability, then the decision function, and only
    # fall back to the hard labels when the model offers neither.
    if hasattr(model, "predict_proba"):
        y_score = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        y_score = model.decision_function(X_test)
    else:
        y_score = y_pred

    accuracy = accuracy_score(y_test, y_pred)
    print(classification_report(y_test, y_pred))
    print("Accuracy: ", accuracy)
    print("Precision: ", precision_score(y_test, y_pred))
    print("Recall: ", recall_score(y_test, y_pred))
    print("F1 Score: ", f1_score(y_test, y_pred))
    print("ROC AUC: ", roc_auc_score(y_test, y_score))
    return accuracy

# Define algorithms
# Base learners for the stacking ensemble; SVC needs probability=True so it
# can provide class probabilities.
estimators = [
    ('lr', LogisticRegression()),
    ('rf', RandomForestClassifier()),
    ('svm', SVC(probability=True))
]
# Candidate models, keyed by the display name used in the printed report.
algorithms = {
    'Logistic Regression': LogisticRegression(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Support Vector Machine': SVC(),
    'Stochastic Gradient Descent': SGDClassifier(),
    'Stacking Classifier': StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())
}

# Dictionary to store accuracy scores
accuracy_scores = {}

# Evaluate each model (evaluate_model fits the estimator in place)
for name, model in algorithms.items():
    print(f"Training and evaluating {name}...")
    accuracy = evaluate_model(model, X_train, X_test, y_train, y_test)
    accuracy_scores[name] = accuracy
    print("------------------------------------")

# Select the best model based on accuracy
best_model_name = max(accuracy_scores, key=accuracy_scores.get)
best_model = algorithms[best_model_name]
print(f"The best model is {best_model_name} with an accuracy of {accuracy_scores[best_model_name]}")

# Example: Using the best model for prediction.
# The estimator was already fitted inside evaluate_model. It is deliberately
# NOT refitted here: for randomised estimators (random forest, SGD, stacking)
# a second fit would produce a different model from the one whose metrics
# were just reported, and that different model would be the one saved.
y_pred = best_model.predict(X_test)

# Save the updated model.
# NOTE(review): the file name says "logistic_regression_model" but the saved
# object is whichever estimator scored best. The path is kept unchanged so
# existing consumers of this file keep working.
joblib.dump(best_model, r'C:\Users\yordi\Credit-card-fraud-detection-master\logistic_regression_model.pkl')

print("Model updated and saved successfully.")


(284807, 31)
0
                Time            V1            V2            V3            V4  \
count  284807.000000  2.848070e+05  2.848070e+05  2.848070e+05  2.848070e+05   
mean    94813.859575  1.168375e-15  3.416908e-16 -1.379537e-15  2.074095e-15   
std     47488.145955  1.958696e+00  1.651309e+00  1.516255e+00  1.415869e+00   
min         0.000000 -5.640751e+01 -7.271573e+01 -4.832559e+01 -5.683171e+00   
25%     54201.500000 -9.203734e-01 -5.985499e-01 -8.903648e-01 -8.486401e-01   
50%     84692.000000  1.810880e-02  6.548556e-02  1.798463e-01 -1.984653e-02   
75%    139320.500000  1.315642e+00  8.037239e-01  1.027196e+00  7.433413e-01   
max    172792.000000  2.454930e+00  2.205773e+01  9.382558e+00  1.687534e+01   

                 V5            V6            V7            V8            V9  \
count  2.848070e+05  2.848070e+05  2.848070e+05  2.848070e+05  2.848070e+05   
mean   9.604066e-16  1.487313e-15 -5.556467e-16  1.213481e-16 -2.406331e-15   
std    1.380247e+00  1.3322