In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Define a function to process data chunk by chunk
def process_data(file_path, chunk_size=10000):
    chunks = []
    for chunk in pd.read_csv(file_path, chunksize=chunk_size):
        # Drop rows with missing values
        chunk.dropna(inplace=True)

        # Encode categorical variables
        for col in chunk.select_dtypes(include=['object']).columns:
            chunk[col] = LabelEncoder().fit_transform(chunk[col])

        chunks.append(chunk)

    return pd.concat(chunks, ignore_index=True)

In [4]:
# Process the training and test data
train_data = process_data('fraudTrain.csv')
test_data = process_data('fraudTest.csv')

In [5]:
train_data, test_data = train_data.align(test_data, join='inner', axis=1)

In [6]:
# Extract the target variable 'is_fraud'
y_train = train_data['is_fraud']
X_train = train_data.drop('is_fraud', axis=1)

y_test = test_data['is_fraud']
X_test = test_data.drop('is_fraud', axis=1)

In [7]:
# Feature scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [8]:
# Initialize models
log_reg = LogisticRegression(max_iter=1000)
decision_tree = DecisionTreeClassifier()
random_forest = RandomForestClassifier()

In [9]:
# Train models
log_reg.fit(X_train, y_train)
decision_tree.fit(X_train, y_train)
random_forest.fit(X_train, y_train)

In [10]:
# Predictions
log_reg_pred = log_reg.predict(X_test)
decision_tree_pred = decision_tree.predict(X_test)
random_forest_pred = random_forest.predict(X_test)

In [11]:
print("Logistic Regression:")
print(classification_report(y_test, log_reg_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, log_reg_pred))
print("Accuracy:", accuracy_score(y_test, log_reg_pred))

Logistic Regression:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    553574
           1       0.00      0.00      0.00      2145

    accuracy                           1.00    555719
   macro avg       0.50      0.50      0.50    555719
weighted avg       0.99      1.00      0.99    555719

Confusion Matrix:
 [[553494     80]
 [  2145      0]]
Accuracy: 0.9959961779244546


In [12]:
print("\nDecision Tree:")
print(classification_report(y_test, decision_tree_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, decision_tree_pred))
print("Accuracy:", accuracy_score(y_test, decision_tree_pred))


Decision Tree:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    553574
           1       0.23      0.37      0.28      2145

    accuracy                           0.99    555719
   macro avg       0.61      0.68      0.64    555719
weighted avg       0.99      0.99      0.99    555719

Confusion Matrix:
 [[550905   2669]
 [  1354    791]]
Accuracy: 0.9927607297932948


In [13]:
print("\nRandom Forest:")
print(classification_report(y_test, random_forest_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, random_forest_pred))
print("Accuracy:", accuracy_score(y_test, random_forest_pred))


Random Forest:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    553574
           1       0.84      0.38      0.52      2145

    accuracy                           1.00    555719
   macro avg       0.92      0.69      0.76    555719
weighted avg       1.00      1.00      1.00    555719

Confusion Matrix:
 [[553416    158]
 [  1329    816]]
Accuracy: 0.9973241872241186
