# Task 9: Credit Card Fraud Detection

Random Forest & Logistic Regression

## 1. Import Libraries

In [49]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
import joblib


## 2. Load Dataset

In [50]:

df = pd.read_csv("fraud_data.csv")
df.head()


Unnamed: 0,trans_date_trans_time,merchant,category,amt,city,state,lat,long,city_pop,job,dob,trans_num,merch_lat,merch_long,is_fraud
0,04-01-2019 00:58,"""Stokes, Christiansen and Sipes""",grocery_net,14.37,Wales,AK,64.7556,-165.6723,145,"""Administrator, education""",09-11-1939,a3806e984cec6ac0096d8184c64ad3a1,65.654142,-164.722603,1
1,04-01-2019 15:06,Predovic Inc,shopping_net,966.11,Wales,AK,64.7556,-165.6723,145,"""Administrator, education""",09-11-1939,a59185fe1b9ccf21323f581d7477573f,65.468863,-165.473127,1
2,04-01-2019 22:37,Wisozk and Sons,misc_pos,49.61,Wales,AK,64.7556,-165.6723,145,"""Administrator, education""",09-11-1939,86ba3a888b42cd3925881fa34177b4e0,65.347667,-165.914542,1
3,04-01-2019 23:06,Murray-Smitham,grocery_pos,295.26,Wales,AK,64.7556,-165.6723,145,"""Administrator, education""",09-11-1939,3a068fe1d856f0ecedbed33e4b5f4496,64.445035,-166.080207,1
4,04-01-2019 23:59,Friesen Lt,health_fitness,18.17,Wales,AK,64.7556,-165.6723,145,"""Administrator, education""",09-11-1939,891cdd1191028759dc20dc224347a0ff,65.447094,-165.446843,1


## 3. Clean Column Names

In [51]:

df.columns = df.columns.str.strip().str.lower()
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14446 entries, 0 to 14445
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   trans_date_trans_time  14446 non-null  object 
 1   merchant               14446 non-null  object 
 2   category               14446 non-null  object 
 3   amt                    14446 non-null  float64
 4   city                   14446 non-null  object 
 5   state                  14446 non-null  object 
 6   lat                    14446 non-null  float64
 7   long                   14446 non-null  float64
 8   city_pop               14446 non-null  int64  
 9   job                    14446 non-null  object 
 10  dob                    14446 non-null  object 
 11  trans_num              14446 non-null  object 
 12  merch_lat              14446 non-null  float64
 13  merch_long             14446 non-null  float64
 14  is_fraud               14446 non-null  object 
dtypes:

## 4. Identify Target Column

In [52]:

# Automatically select last column as target (modify if needed)
target = df.columns[-1]
df[target].value_counts()


is_fraud
0                         12600
1                          1844
1"2020-12-24 16:56:24"        1
0"2019-01-01 00:00:44"        1
Name: count, dtype: int64

## 5. Remove Non-Numeric Columns

In [53]:

# Remove non-numeric feature columns but keep the target column
# (some datasets have the target as object dtype, so selecting dtypes on the whole
# dataframe can drop it). We build numeric X and then reattach the target.
X = df.drop(columns=[target])
X = X.select_dtypes(include=["number"])
df = pd.concat([X, df[target]], axis=1)
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14446 entries, 0 to 14445
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   amt         14446 non-null  float64
 1   lat         14446 non-null  float64
 2   long        14446 non-null  float64
 3   city_pop    14446 non-null  int64  
 4   merch_lat   14446 non-null  float64
 5   merch_long  14446 non-null  float64
 6   is_fraud    14446 non-null  object 
dtypes: float64(5), int64(1), object(1)
memory usage: 790.1+ KB


## 6. Feature-Target Split

In [54]:
X = df.drop(columns=[target])
y = df[target]

## 7. Train-Test Split

In [55]:

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


## 8. Baseline Model: Logistic Regression

In [56]:

lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

y_pred_lr = lr.predict(X_test)
print(classification_report(y_test, y_pred_lr))


                        precision    recall  f1-score   support

                     0       0.92      0.99      0.96      2504
0"2019-01-01 00:00:44"       0.00      0.00      0.00         1
                     1       0.87      0.48      0.62       385

              accuracy                           0.92      2890
             macro avg       0.60      0.49      0.52      2890
          weighted avg       0.92      0.92      0.91      2890



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


## 9. Random Forest Model

In [57]:

rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)
print(classification_report(y_test, y_pred_rf))


                        precision    recall  f1-score   support

                     0       0.97      0.98      0.98      2504
0"2019-01-01 00:00:44"       0.00      0.00      0.00         1
                     1       0.89      0.78      0.83       385

              accuracy                           0.96      2890
             macro avg       0.62      0.59      0.60      2890
          weighted avg       0.96      0.96      0.96      2890



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


## 10. Evaluation Metrics

In [None]:
# Choose an averaging method appropriate for binary vs multiclass targets
# 'binary' is valid only for binary targets; for multiclass use 'weighted' (or 'macro'/'micro').
if len(np.unique(y_test)) == 2:
    avg = 'binary'
else:
    avg = 'weighted'

precision = precision_score(y_test, y_pred_rf, zero_division=0, average=avg)
recall = recall_score(y_test, y_pred_rf, zero_division=0, average=avg)
f1 = f1_score(y_test, y_pred_rf, zero_division=0, average=avg)
print('average used:', avg)
precision, recall, f1

ValueError: Target is multiclass but average='binary'. Please choose another average setting, one of [None, 'micro', 'macro', 'weighted'].

## 11. Feature Importance

In [None]:

importances = rf.feature_importances_
features = X.columns

pd.Series(importances, index=features).nlargest(10).plot(kind='barh')
plt.title("Top 10 Important Features")
plt.show()


## 12. Save Model

In [None]:

joblib.dump(rf, "random_forest_fraud_model.pkl")


## Final Outcome

Random Forest outperformed Logistic Regression and is suitable for fraud detection tasks with imbalanced data.