# About this dataset

This dataset is fictional and is intended to simulate real-life transaction details. Any similarity to real-life cases is purely coincidental.
It has the following columns.

trans_date_trans_time: The date and time of the transaction.

cc_num: credit card number.

merchant: Merchant who was getting paid.

category: The business category the merchant operates in.

amt: Transaction amount in US dollars.

first: first name of the card holder.

last: last name of the card holder.

gender: Gender of the card holder. Only male and female values are present.

street: Street of the card holder's residence.

city: City of the card holder's residence.

state: State of the card holder's residence.

zip: ZIP code of the card holder's residence.

lat: Latitude of the card holder.

long: Longitude of the card holder.

city_pop: Population of the city.

job: Trade of the card holder.

dob: Date of birth of the card holder.

trans_num: Transaction ID

unix_time: Unix timestamp of the transaction, i.e. the number of seconds elapsed since 1 January 1970 (UTC).

merch_lat: latitude of the merchant

merch_long: longitude of the merchant

is_fraud: Whether the transaction is fraudulent (1) or not (0).

In [1]:
import pandas as pd
import numpy as np


In [2]:
# Load the training and test splits from the local Dataset/ directory.
df_train = pd.read_csv(filepath_or_buffer="Dataset/fraudTrain.csv")
df_test = pd.read_csv(filepath_or_buffer="Dataset/fraudTest.csv")

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/ml_lab/Assignment/fraudTrain.csv'

In [None]:
# Preview the first five rows of the training frame.
df_train.head(5)

In [None]:
# Discard the 'Unnamed: 0' column from both the training and the test frame.
df_train = df_train.drop(columns='Unnamed: 0')
df_test = df_test.drop(columns='Unnamed: 0')


In [None]:
# Print a concise summary (columns, dtypes, non-null counts) of the training frame.
df_train.info()

In [None]:
# Let pandas render up to 200 columns, then summarise every column of the
# training frame regardless of dtype.
pd.options.display.max_columns = 200
df_train.describe(include="all")

In [None]:
# Keep only the rows flagged as fraud and summarise them: number of rows,
# number of distinct cards, amount statistics and counts per category.
fraud = df_train.loc[df_train['is_fraud'].eq(1)]
print(fraud['cc_num'].count(), fraud['cc_num'].nunique(), sep='\n')
print(fraud['amt'].describe(), '\n')
print(fraud['category'].value_counts())

In [None]:
# Checking the nature of data set: balanced or imbalanced?
import matplotlib.pyplot as plt
# Bar chart of the relative frequency of each is_fraud value.
fig = plt.figure(figsize=(8, 5))
class_share = df_train['is_fraud'].value_counts(normalize=True)
class_share.plot.bar(color=['darkorange', 'steelblue'], alpha=0.9, rot=0)
plt.title('Fraud Indicator (0) and (1) in the Dataset')
plt.show()

# Label Encoding of Categorical Variables

In [None]:
# Train dataset
from sklearn.preprocessing import LabelEncoder
# One LabelEncoder per object-typed column, kept in `lencoders` keyed by
# column name; each column is replaced in place by its integer codes.
categorical_cols = df_train.select_dtypes(include=['object']).columns
lencoders = {}
for col in categorical_cols:
    encoder = LabelEncoder()
    lencoders[col] = encoder
    df_train[col] = encoder.fit_transform(df_train[col])

In [None]:
# Test dataset
# Encode the test frame with the encoders that were fitted on the training
# frame (`lencoders`), so a given category string maps to the same integer
# code in both splits. Fitting fresh encoders on the test data would assign
# codes independently and feed the model inconsistent features.
lencoders_t = {}
for col in df_test.select_dtypes(include=['object']).columns:
    encoder = lencoders.get(col)
    if encoder is None:
        # Column was not encoded on the training side; fall back to an
        # encoder fitted on the test values so the column still becomes numeric.
        encoder = LabelEncoder().fit(df_test[col])
    # `lencoders_t` is kept so code that looks encoders up by this name still works.
    lencoders_t[col] = encoder
    code_of = {label: code for code, label in enumerate(encoder.classes_)}
    # Labels never seen during training (e.g. unique identifiers) have no
    # code; mark them with -1 instead of raising as `transform` would.
    df_test[col] = df_test[col].map(code_of).fillna(-1).astype('int64')

# Building model

In [None]:
# Separate the feature columns from the 'is_fraud' target for each split.
X_train, Y_train = df_train.drop(columns='is_fraud'), df_train['is_fraud']
X_test, Y_test = df_test.drop(columns='is_fraud'), df_test['is_fraud']


In [None]:
# Normalize Features
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Learn the per-feature mean and variance from the training data only.
X_train = scaler.fit_transform(X_train)
# Apply the training statistics to the test data. Refitting on the test set
# would standardise the two splits differently and leak test-set information
# into preprocessing.
X_test = scaler.transform(X_test)

In [None]:
# Function for measuring performance

import time
from sklearn.metrics import accuracy_score
from matplotlib import pyplot as plt
def run_model(model, X_train, y_train, X_test, y_test, verbose=True):
    """Fit ``model``, score it on the test split and report the elapsed time.

    Parameters
    ----------
    model
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, y_train
        Training features and labels. ``y_train`` is flattened to 1-D
        before fitting.
    X_test, y_test
        Held-out features and labels used for the accuracy score.
    verbose : bool, default True
        When falsy, ``verbose=0`` is forwarded to ``model.fit``; the
        estimator's ``fit`` must accept that keyword in this case.

    Returns
    -------
    tuple
        ``(model, accuracy, time_taken)`` where ``time_taken`` is the number
        of seconds spent on fitting, predicting and scoring.
    """
    t0 = time.perf_counter()  # monotonic clock, suited to measuring durations

    # np.ravel accepts pandas Series, ndarrays and plain lists alike, whereas
    # the .ravel() method is missing on lists and deprecated on Series.
    y_flat = np.ravel(y_train)
    fit_kwargs = {} if verbose else {"verbose": 0}
    model.fit(X_train, y_flat, **fit_kwargs)

    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    time_taken = time.perf_counter() - t0

    print(f"Accuracy = {accuracy}")
    print(f"Time taken = {time_taken}")

    return model, accuracy, time_taken

## Model-1: Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with an elastic-net penalty (l1_ratio 0.5, saga solver),
# fitted and scored through run_model.
params_lr = dict(penalty='elasticnet', l1_ratio=0.5, solver='saga')

model_lr = LogisticRegression(**params_lr)
model_lr, accuracy_lr, tt_lr = run_model(
    model=model_lr,
    X_train=X_train,
    y_train=Y_train,
    X_test=X_test,
    y_test=Y_test,
)

## Model-2: Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Decision tree with depth capped at 12 and max_features set to "sqrt",
# fitted and scored through run_model.
params_dt = dict(max_depth=12, max_features="sqrt")

model_dt = DecisionTreeClassifier(**params_dt)
model_dt, accuracy_dt, tt_dt = run_model(
    model=model_dt,
    X_train=X_train,
    y_train=Y_train,
    X_test=X_test,
    y_test=Y_test,
)

## Model-3: Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees (depth capped at 16, fixed random_state),
# fitted and scored through run_model.
params_rf = dict(
    max_depth=16,
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=100,
    random_state=12345,
)

model_rf = RandomForestClassifier(**params_rf)
model_rf, accuracy_rf, tt_rf = run_model(
    model=model_rf,
    X_train=X_train,
    y_train=Y_train,
    X_test=X_test,
    y_test=Y_test,
)

## Model-4: Support Vector Machine