# Task-2 : Credit Card Fraud Detection

> Description: Build a model to detect fraudulent credit card transactions. Use a
dataset containing information about credit card transactions, and
experiment with algorithms like Logistic Regression, Decision Trees,
or Random Forests to classify transactions as fraudulent or
legitimate

## Importing the libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as plot

In [2]:
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

## Collecting & Unzipping the data

In [3]:
import zipfile
data=zipfile.ZipFile('archive.zip')
data.extractall()
data.close()

## Loading the data

In [4]:
train_data=pd.read_csv('fraudTrain.csv')
test_data=pd.read_csv('fraudTest.csv')

In [5]:
train_data.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


In [6]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 23 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   Unnamed: 0             1296675 non-null  int64  
 1   trans_date_trans_time  1296675 non-null  object 
 2   cc_num                 1296675 non-null  int64  
 3   merchant               1296675 non-null  object 
 4   category               1296675 non-null  object 
 5   amt                    1296675 non-null  float64
 6   first                  1296675 non-null  object 
 7   last                   1296675 non-null  object 
 8   gender                 1296675 non-null  object 
 9   street                 1296675 non-null  object 
 10  city                   1296675 non-null  object 
 11  state                  1296675 non-null  object 
 12  zip                    1296675 non-null  int64  
 13  lat                    1296675 non-null  float64
 14  long              

In [7]:
train_data.isnull().sum()

Unnamed: 0               0
trans_date_trans_time    0
cc_num                   0
merchant                 0
category                 0
amt                      0
first                    0
last                     0
gender                   0
street                   0
city                     0
state                    0
zip                      0
lat                      0
long                     0
city_pop                 0
job                      0
dob                      0
trans_num                0
unix_time                0
merch_lat                0
merch_long               0
is_fraud                 0
dtype: int64

In [8]:
train_data.describe()

Unnamed: 0.1,Unnamed: 0,cc_num,amt,zip,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud
count,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0
mean,648337.0,4.17192e+17,70.35104,48800.67,38.53762,-90.22634,88824.44,1349244000.0,38.53734,-90.22646,0.005788652
std,374318.0,1.308806e+18,160.316,26893.22,5.075808,13.75908,301956.4,12841280.0,5.109788,13.77109,0.07586269
min,0.0,60416210000.0,1.0,1257.0,20.0271,-165.6723,23.0,1325376000.0,19.02779,-166.6712,0.0
25%,324168.5,180042900000000.0,9.65,26237.0,34.6205,-96.798,743.0,1338751000.0,34.73357,-96.89728,0.0
50%,648337.0,3521417000000000.0,47.52,48174.0,39.3543,-87.4769,2456.0,1349250000.0,39.36568,-87.43839,0.0
75%,972505.5,4642255000000000.0,83.14,72042.0,41.9404,-80.158,20328.0,1359385000.0,41.95716,-80.2368,0.0
max,1296674.0,4.992346e+18,28948.9,99783.0,66.6933,-67.9503,2906700.0,1371817000.0,67.51027,-66.9509,1.0


## Visualization of data

In [9]:
fig = plot.pie(values=train_data['is_fraud'].value_counts(), names=["Genuine","Fraud"] , width=700, height=400, color_discrete_sequence=["skyblue","black"]
             ,title="Fraud vs Genuine transactions")
fig.show()

## Data Cleaning

### Dropping the columns which are noot relevant to predict the fraud transaction

In [10]:
cols = ['Unnamed: 0','cc_num','merchant','trans_num','unix_time','first','last','street','zip']
train_data.drop(columns=cols,inplace=True)
test_data.drop(columns=cols,inplace=True)

In [11]:
train_data.shape,test_data.shape

((1296675, 14), (555719, 14))

### Handling date and time

In [12]:
train_data['trans_date_trans_time']=pd.to_datetime(train_data['trans_date_trans_time'])
train_data['trans_date']=train_data['trans_date_trans_time'].dt.strftime('%Y-%m-%d')
train_data['trans_date']=pd.to_datetime(train_data['trans_date'])
train_data['dob']=pd.to_datetime(train_data['dob'])

In [13]:
test_data['trans_date_trans_time']=pd.to_datetime(test_data['trans_date_trans_time'])
test_data['trans_date']=test_data['trans_date_trans_time'].dt.strftime('%Y-%m-%d')
test_data['trans_date']=pd.to_datetime(test_data['trans_date'])
test_data['dob']=pd.to_datetime(test_data['dob'])

## Preprocessing data

#### Calculating age

In [14]:
test_data['trans_date']

0        2020-06-21
1        2020-06-21
2        2020-06-21
3        2020-06-21
4        2020-06-21
            ...    
555714   2020-12-31
555715   2020-12-31
555716   2020-12-31
555717   2020-12-31
555718   2020-12-31
Name: trans_date, Length: 555719, dtype: datetime64[ns]

In [15]:
test_data['dob']

0        1968-03-19
1        1990-01-17
2        1970-10-21
3        1987-07-25
4        1955-07-06
            ...    
555714   1966-02-13
555715   1999-12-27
555716   1981-11-29
555717   1965-12-15
555718   1993-05-10
Name: dob, Length: 555719, dtype: datetime64[ns]

In [16]:
train_data["age"] = train_data["trans_date"]-train_data["dob"]

In [17]:
train_data['age']=train_data['age']/365.25

In [18]:
test_data["age"] = test_data["trans_date"]-test_data["dob"]

In [19]:
test_data["age"]=test_data["age"]/365.25

In [20]:
train_data['trans_month'] = pd.DatetimeIndex(train_data['trans_date']).month
train_data['trans_year'] = pd.DatetimeIndex(train_data['trans_date']).year

In [21]:
#Calculate distance between merchant and home location
train_data['latitudinal_distance'] = abs(round(train_data['merch_lat']-train_data['lat'],3))
train_data['longitudinal_distance'] = abs(round(train_data['merch_long']-train_data['long'],3))

test_data['latitudinal_distance'] = abs(round(test_data['merch_lat']-test_data['lat'],3))
test_data['longitudinal_distance'] = abs(round(test_data['merch_long']-test_data['long'],3))

In [22]:
#Drop Columns that are not relevant to predicy fraud transaction
drop_columns = ['trans_date_trans_time','city','lat','long','job','dob','merch_lat','merch_long','trans_date','state']
train_data.drop(columns=drop_columns,inplace=True)
test_data.drop(columns=drop_columns,inplace=True)

In [23]:
# Convert categorical column gender into numerical 
train_data.gender=train_data.gender.apply(lambda x: 1 if x=="M" else 0)
test_data.gender=test_data.gender.apply(lambda x: 1 if x=="M" else 0)

In [24]:
#One Hot Encoding of Category column
train_data = pd.get_dummies(train_data, columns=['category'], prefix='category')
test_data = pd.get_dummies(test_data, columns=['category'], prefix='category')

test_data = test_data.reindex(columns=train_data.columns, fill_value=0)

In [25]:
train_data.head()

Unnamed: 0,amt,gender,city_pop,is_fraud,age,trans_month,trans_year,latitudinal_distance,longitudinal_distance,category_entertainment,...,category_grocery_pos,category_health_fitness,category_home,category_kids_pets,category_misc_net,category_misc_pos,category_personal_care,category_shopping_net,category_shopping_pos,category_travel
0,4.97,0,3495,0,30 days 19:32:53.716632443,1,2019,0.068,0.87,False,...,False,False,False,False,True,False,False,False,False,False
1,107.23,0,149,0,40 days 12:44:50.759753593,1,2019,0.271,0.024,False,...,True,False,False,False,False,False,False,False,False,False
2,220.11,1,4154,0,56 days 22:48:02.956878850,1,2019,0.97,0.108,True,...,False,False,False,False,False,False,False,False,False,False
3,45.0,1,1939,0,51 days 23:16:37.946611909,1,2019,0.804,0.447,False,...,False,False,False,False,False,False,False,False,False,False
4,41.96,1,99,0,32 days 18:19:57.535934291,1,2019,0.254,0.83,False,...,False,False,False,False,False,True,False,False,False,False


In [26]:
test_data.head()

Unnamed: 0,amt,gender,city_pop,is_fraud,age,trans_month,trans_year,latitudinal_distance,longitudinal_distance,category_entertainment,...,category_grocery_pos,category_health_fitness,category_home,category_kids_pets,category_misc_net,category_misc_pos,category_personal_care,category_shopping_net,category_shopping_pos,category_travel
0,2.86,1,333497,0,52 days 06:10:35.728952772,0,0,0.02,0.265,False,...,False,False,False,False,False,False,True,False,False,False
1,29.84,0,302,0,30 days 10:13:03.572895277,0,0,0.87,0.476,False,...,False,False,False,False,False,False,True,False,False,False
2,41.28,0,34496,0,49 days 16:00:59.137577002,0,0,0.177,0.66,False,...,False,True,False,False,False,False,False,False,False,False
3,60.05,1,54767,0,32 days 21:48:54.702258727,0,0,0.243,0.064,False,...,False,False,False,False,False,True,False,False,False,False
4,3.19,1,1126,0,64 days 23:03:49.158110883,0,0,0.706,0.868,False,...,False,False,False,False,False,False,False,False,False,True


In [27]:
train_data.to_csv('cleaned_train_data.csv')
test_data.to_csv('cleaned_test_data.csv')

In [28]:
train_data['age']=train_data['age'].dt.days

In [29]:
test_data['age']=test_data['age'].dt.days

## Splitting the data

In [30]:
x_train = train_data.drop('is_fraud', axis=1)
y_train = train_data['is_fraud']
x_test = test_data.drop('is_fraud', axis=1)
y_test = test_data['is_fraud']

## Handling Imbalance data

In [31]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)

x_train, y_train = smote.fit_resample(x_train, y_train)

## Standardize data

In [32]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(x_train)

x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

## Model Building

#### 1. Logistic Regression

In [33]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,classification_report
lr=LogisticRegression(max_iter=1000)
lr.fit(x_train,y_train)
y_pred=lr.predict(x_test)
print(f'Accuracy : {accuracy_score(y_pred,y_test)}')

Accuracy : 0.0038598644278853163


In [34]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00    553574
           1       0.00      1.00      0.01      2145

    accuracy                           0.00    555719
   macro avg       0.00      0.50      0.00    555719
weighted avg       0.00      0.00      0.00    555719



#### 2. Decision Tree

In [35]:
from sklearn.tree import DecisionTreeClassifier

dec = DecisionTreeClassifier(random_state=42)
dec.fit(x_train, y_train)

y_pred = dec.predict(x_test)
print(f'Accuracy : {accuracy_score(y_pred,y_test)}')

Accuracy : 0.9948211236254294


In [36]:
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    553574
           1       0.40      0.72      0.52      2145

    accuracy                           0.99    555719
   macro avg       0.70      0.86      0.76    555719
weighted avg       1.00      0.99      1.00    555719



#### 3. Random Forest

In [37]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(x_train, y_train)

y_pred = clf.predict(x_test)
print(f'Accuracy : {accuracy_score(y_pred,y_test)}')

Accuracy : 0.9965954016328397


In [38]:
report = classification_report(y_test, y_pred)

print(report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    553574
           1       0.54      0.79      0.64      2145

    accuracy                           1.00    555719
   macro avg       0.77      0.90      0.82    555719
weighted avg       1.00      1.00      1.00    555719



## Saving Models

In [40]:
import pickle
pickle.dump(lr,open('logistic_regression_model.pkl','wb'))
pickle.dump(clf,open('random_forest_model.pkl','wb'))
pickle.dump(dec,open('decision_tree_model.pkl','wb'))