# Credit Card Fraud

# Step 1:
### Importing some basic libraries needed for Machine Learning

In [34]:
#importing libraries
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report 

# Step 2: 
### Importing the dataset, the dataset has two different files for training and testing

In [35]:
# Load the pre-split training and testing transaction files into DataFrames.
fraud_train, fraud_test = (
    pd.read_csv(csv_path) for csv_path in ('fraudTrain.csv', 'fraudTest.csv')
)

# Step 3:
### Preprocessing the data

In [36]:
# Preview the first ten rows of each split; the '\n\n' separator reproduces the
# blank line that used to be printed between the two tables.
print(fraud_train.head(10), fraud_test.head(10), sep='\n\n')

   Unnamed: 0 trans_date_trans_time            cc_num  \
0           0   2019-01-01 00:00:18  2703186189652095   
1           1   2019-01-01 00:00:44      630423337322   
2           2   2019-01-01 00:00:51    38859492057661   
3           3   2019-01-01 00:01:16  3534093764340240   
4           4   2019-01-01 00:03:06   375534208663984   
5           5   2019-01-01 00:04:08  4767265376804500   
6           6   2019-01-01 00:04:42    30074693890476   
7           7   2019-01-01 00:05:08  6011360759745864   
8           8   2019-01-01 00:05:18  4922710831011201   
9           9   2019-01-01 00:06:01  2720830304681674   

                             merchant       category     amt      first  \
0          fraud_Rippin, Kub and Mann       misc_net    4.97   Jennifer   
1     fraud_Heller, Gutmann and Zieme    grocery_pos  107.23  Stephanie   
2                fraud_Lind-Buckridge  entertainment  220.11     Edward   
3  fraud_Kutch, Hermiston and Farrell  gas_transport   45.00     Jeremy 

### CHECKING IF THERE ARE ANY NULL VALUES 

In [37]:
# Count the missing values in every column of the training frame.
missing_per_column = fraud_train.isnull().sum()
print(missing_per_column)

Unnamed: 0               0
trans_date_trans_time    0
cc_num                   0
merchant                 0
category                 0
amt                      0
first                    0
last                     0
gender                   0
street                   0
city                     0
state                    0
zip                      0
lat                      0
long                     0
city_pop                 0
job                      0
dob                      0
trans_num                0
unix_time                0
merch_lat                0
merch_long               0
is_fraud                 0
dtype: int64


In [38]:
# Show the training frame's column names as a NumPy array.
train_column_names = fraud_train.columns.to_numpy()
print(train_column_names)

['Unnamed: 0' 'trans_date_trans_time' 'cc_num' 'merchant' 'category' 'amt'
 'first' 'last' 'gender' 'street' 'city' 'state' 'zip' 'lat' 'long'
 'city_pop' 'job' 'dob' 'trans_num' 'unix_time' 'merch_lat' 'merch_long'
 'is_fraud']


In [39]:
# Number of training transactions per merchant, most frequent first.
merchant_counts = fraud_train.loc[:, 'merchant'].value_counts()
merchant_counts

fraud_Kilback LLC                       4403
fraud_Cormier LLC                       3649
fraud_Schumm PLC                        3634
fraud_Kuhn LLC                          3510
fraud_Boyer PLC                         3493
                                        ... 
fraud_Douglas, DuBuque and McKenzie      775
fraud_Treutel-King                       775
fraud_Medhurst, Labadie and Gottlieb     759
fraud_Reichert-Weissnat                  753
fraud_Hahn, Douglas and Schowalter       727
Name: merchant, Length: 693, dtype: int64

In [40]:
# Number of training transactions per spending category, most frequent first.
category_counts = fraud_train.loc[:, 'category'].value_counts()
category_counts

gas_transport     131659
grocery_pos       123638
home              123115
shopping_pos      116672
kids_pets         113035
shopping_net       97543
entertainment      94014
food_dining        91461
personal_care      90758
health_fitness     85879
misc_pos           79655
misc_net           63287
grocery_net        45452
travel             40507
Name: category, dtype: int64

## Step 4:
### Converting categorical to Numerical 

In [41]:
# Label-encode the categorical columns ('merchant', 'category') as integers.
from sklearn.preprocessing import LabelEncoder

# Use one encoder per column, fitted once on the values from BOTH files, and only
# `transform` each split. Calling `fit_transform` separately on train and test
# builds two independent string->integer mappings, so the same merchant/category
# can receive different codes in the two splits whenever their value sets differ.
label_encoders = {}
for column in ['merchant', 'category']:
    encoder = LabelEncoder()
    encoder.fit(pd.concat([fraud_train[column], fraud_test[column]], ignore_index=True))

    fraud_train[column] = encoder.transform(fraud_train[column])
    fraud_test[column] = encoder.transform(fraud_test[column])

    # Keep the fitted encoder so codes can be mapped back with `inverse_transform`.
    label_encoders[column] = encoder

## Step 5:
### Feature Selection

In [42]:
# Model inputs and target, selected identically from both splits.
FEATURE_COLUMNS = ['cc_num', 'merchant', 'category', 'amt']
TARGET_COLUMN = 'is_fraud'

X_train, Y_train = fraud_train[FEATURE_COLUMNS], fraud_train[TARGET_COLUMN]
X_test, Y_test = fraud_test[FEATURE_COLUMNS], fraud_test[TARGET_COLUMN]

### SMOTE is used to handle the imbalanced dataset: it increases the number of minority-class cases by generating synthetic samples, without altering the majority-class cases.
This dataset is heavily imbalanced, i.e. the number of fraud cases is far smaller than the number of non-fraud cases.

In [43]:
# Oversample the minority (fraud) class in the training split only; the test
# split is left untouched.
from imblearn.over_sampling import SMOTE

# NOTE(review): the features include label-encoded columns and the raw card
# number; presumably plain SMOTE treats them as continuous values when creating
# synthetic rows - confirm this is intended (SMOTENC may be a better fit).
smote = SMOTE(random_state=42)
resampled_pair = smote.fit_resample(X_train, Y_train)
X_train_resampled, y_train_resampled = resampled_pair

## Step 6:
### Applying model

In [44]:
# Decision tree model: trained on the SMOTE-resampled training data and
# evaluated on the original (non-resampled) test split.
from sklearn.tree import DecisionTreeClassifier

# Seed the estimator so the fitted tree - and therefore the metrics printed
# below - are reproducible on Restart & Run All. This matches the
# random_state=42 already used for SMOTE and the random forest.
tree_model = DecisionTreeClassifier(random_state=42)
tree_model.fit(X_train_resampled, y_train_resampled)

# Predicted fraud labels for the test features.
y_pred2 = tree_model.predict(X_test)

print("Accuracy Score: ", accuracy_score(Y_test, y_pred2))
print("Confusion Matrix:", confusion_matrix(Y_test, y_pred2))
print("Classification Report:", classification_report(Y_test, y_pred2))

Accuracy Score:  0.9772043064930297
Confusion Matrix: [[541465  12109]
 [   559   1586]]
Classification Report:               precision    recall  f1-score   support

           0       1.00      0.98      0.99    553574
           1       0.12      0.74      0.20      2145

    accuracy                           0.98    555719
   macro avg       0.56      0.86      0.59    555719
weighted avg       1.00      0.98      0.99    555719



### Summary :
1. Accuracy is high, but this is mainly due to the overwhelming number of non-fraud cases.
2. Recall for fraud: 74% (74% of actual fraud cases were correctly identified)
3. Precision for the fraud class: 12% (only 12% of the transactions predicted as fraud were actually fraud)

## Step 7:
### Random forest classifier

In [45]:
from sklearn.ensemble import RandomForestClassifier

# NOTE(review): this forest is fitted on the original X_train / Y_train, whereas
# the decision tree was fitted on the SMOTE-resampled data - confirm the
# difference is intentional before comparing the two models' metrics.
forest_model = RandomForestClassifier(random_state=42, n_estimators=100)
forest_model.fit(X=X_train, y=Y_train)

In [46]:
# Predict fraud labels for the test features with the fitted random forest.
y_pred_forest = forest_model.predict(X_test)
# Bare expression so the notebook displays the predicted label array.
y_pred_forest

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [47]:
# Report the random forest's test-set metrics, one labelled line per metric.
forest_reports = (
    ("Accuracy Score: ", accuracy_score),
    ("Confusion Matrix:", confusion_matrix),
    ("Classification Report:", classification_report),
)
for report_label, metric_fn in forest_reports:
    print(report_label, metric_fn(Y_test, y_pred_forest))

Accuracy Score:  0.997523928460247
Confusion Matrix: [[553018    556]
 [   820   1325]]
Classification Report:               precision    recall  f1-score   support

           0       1.00      1.00      1.00    553574
           1       0.70      0.62      0.66      2145

    accuracy                           1.00    555719
   macro avg       0.85      0.81      0.83    555719
weighted avg       1.00      1.00      1.00    555719



#### Random forest identifies legitimate cases (class 0) almost perfectly.
#### Precision of 0.70 for class 1 means that 70% of the transactions the model flagged as fraud were actually fraud.
#### Recall of 0.62 for class 1 means that, out of all actual fraud cases, the model correctly identified 62% as fraud.

## Overall Summary 
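1. The Decision Tree model (trained on the SMOTE-resampled data) performs well for non-fraud cases but lacks balance when identifying fraud: it reaches the higher fraud recall (0.74) but has very low fraud precision (0.12), resulting in a low F1-score (0.20) for the fraud class.
2. The Random Forest model (trained on the original, non-resampled data) outperforms the Decision Tree model in terms of accuracy and fraud detection balance. It offers higher precision (0.70) and F1-score (0.66) for the fraud class, although its fraud recall is lower (0.62 vs. 0.74), i.e. it misses more fraudulent transactions. Because the two models were trained on different training sets, the comparison reflects the resampling choice as well as the model type.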