In [2]:
# Import the required modules
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression

In [3]:
fraud_data = pd.read_csv(
    Path('prepped_data.csv'))
fraud_data.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_timefreq_enc,merchantfreq_enc,categoryfreq_enc,genderfreq_enc,streetfreq_enc,cityfreq_enc,statefreq_enc,jobfreq_enc,dobfreq_enc,amt,zip,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud
0,0,1,109,4931,54641,165,165,2368,287,165,4.97,28654,36.0788,-81.1781,3495,1325376018,36.011293,-82.048315,0
1,1,1,192,9533,54641,256,302,1456,390,256,107.23,99160,48.8878,-118.2105,149,1325376044,49.159047,-118.186462,0
2,2,1,128,7179,45359,37,37,452,37,37,220.11,83252,42.1808,-112.262,4154,1325376051,43.150704,-112.154481,0
3,3,1,200,10221,45359,43,43,967,201,43,45.0,59632,46.2306,-112.1138,1939,1325376076,47.034331,-112.561071,0
4,4,1,120,6065,45359,176,176,2312,176,176,41.96,24433,38.4207,-79.4629,99,1325376186,38.674999,-78.632459,0


In [4]:
fraud_data = fraud_data.drop(columns=['Unnamed: 0'])
fraud_data.head()

Unnamed: 0,trans_date_trans_timefreq_enc,merchantfreq_enc,categoryfreq_enc,genderfreq_enc,streetfreq_enc,cityfreq_enc,statefreq_enc,jobfreq_enc,dobfreq_enc,amt,zip,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud
0,1,109,4931,54641,165,165,2368,287,165,4.97,28654,36.0788,-81.1781,3495,1325376018,36.011293,-82.048315,0
1,1,192,9533,54641,256,302,1456,390,256,107.23,99160,48.8878,-118.2105,149,1325376044,49.159047,-118.186462,0
2,1,128,7179,45359,37,37,452,37,37,220.11,83252,42.1808,-112.262,4154,1325376051,43.150704,-112.154481,0
3,1,200,10221,45359,43,43,967,201,43,45.0,59632,46.2306,-112.1138,1939,1325376076,47.034331,-112.561071,0
4,1,120,6065,45359,176,176,2312,176,176,41.96,24433,38.4207,-79.4629,99,1325376186,38.674999,-78.632459,0


In [5]:
fraud_data['is_fraud'].value_counts()

is_fraud
0    99010
1      990
Name: count, dtype: int64

In [6]:
fraud_data.dtypes

trans_date_trans_timefreq_enc      int64
merchantfreq_enc                   int64
categoryfreq_enc                   int64
genderfreq_enc                     int64
streetfreq_enc                     int64
cityfreq_enc                       int64
statefreq_enc                      int64
jobfreq_enc                        int64
dobfreq_enc                        int64
amt                              float64
zip                                int64
lat                              float64
long                             float64
city_pop                           int64
unix_time                          int64
merch_lat                        float64
merch_long                       float64
is_fraud                           int64
dtype: object

In [7]:
X = fraud_data.drop('is_fraud', axis=1)
y = fraud_data['is_fraud']

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [9]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [10]:
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)

In [11]:
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train_smote)
X_test_pca = pca.transform(X_test_scaled)

In [12]:
logisticreg = LogisticRegression(max_iter=10000)
logisticreg.fit(X_train_pca, y_train_smote)

In [13]:
y_pred = logisticreg.predict(X_test_pca)

In [14]:
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.9371


In [15]:
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

Confusion Matrix:
[[27879  1819]
 [   67   235]]


In [16]:
class_report = classification_report(y_test, y_pred)
print("Classification Report:")
print(class_report)

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.94      0.97     29698
           1       0.11      0.78      0.20       302

    accuracy                           0.94     30000
   macro avg       0.56      0.86      0.58     30000
weighted avg       0.99      0.94      0.96     30000



In [17]:
target_balance = y_train_smote.value_counts()
print(target_balance)

is_fraud
0    69312
1    69312
Name: count, dtype: int64
