In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt

import torch
from sklearn.model_selection import train_test_split

pd.set_option('display.max_columns', None)
sns.set(rc = {'figure.figsize':(15,8)})

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
SEED = 42

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("Selected device is",DEVICE)

## Loading and preprocessing the data and feature extraction

In [None]:
# Load the Kaggle train/test CSVs and stack them into a single frame so that
# the feature engineering below is applied to both splits at once.
train_file_path = '/kaggle/input/fraud-detection/fraudTrain.csv'
test_file_path = '/kaggle/input/fraud-detection/fraudTest.csv'
train_data = pd.read_csv(train_file_path) # reading the train data
test_data = pd.read_csv(test_file_path) # reading the test data

# Tag every row with the file it came from so the splits can be told apart after concatenation.
train_data['split_label'] = 'train'
test_data['split_label'] = 'test'

# NOTE: each file keeps its own row labels, so the combined index contains repeated labels.
data = pd.concat([train_data,test_data], axis = 0)
data.shape


# Feature extraction

# Parse the two timestamp columns once so that the .dt accessor can be used on them later.
data['trans_date_trans_time'] = pd.to_datetime(data['trans_date_trans_time'])
data['trans_date_trans_time'].head(3)

data['dob'] = pd.to_datetime(data['dob'])
data['dob'].head(3)

# Normalise every string (object) column: lower-case and strip surrounding whitespace.
cat_cols = data.select_dtypes(include = 'object').columns # selecting the categorical columns

for col in cat_cols:

  data[col] = data[col].str.lower().str.strip() # cleaning the categorical columns with strings methods

data[cat_cols].nunique().sort_values()

# Columns that are not used in the analysis below.
drop_cols = ['Unnamed: 0','street','merchant','zip','first','last','trans_num','job'] # list of columns to be dropped
data = data.drop(columns = drop_cols)
list(data.columns) # printing the remaining columns after dropping

# Hour of day of the transaction, taken from the parsed timestamp via the .dt accessor
data['trans_hour'] = data['trans_date_trans_time'].dt.hour  # extracting the hour component using the dt accessor

data['trans_hour'].unique() # distinct hours present in the data

# Month number of the transaction
data['trans_month'] = data['trans_date_trans_time'].dt.month # extracting the month number component using the dt accessor

data['trans_month'].unique() # distinct months present in the data

# Name of the weekday of the transaction (a day name, not a weekend flag)
data['trans_dayofweek'] = data['trans_date_trans_time'].dt.day_name() # extracting the day name component using the dt accessor

data['trans_dayofweek'].unique() # distinct day names present in the data
# Summary statistics of the number of transactions per credit card
data.groupby(['cc_num'])['cc_num'].count().sort_values(ascending = False).describe().astype(int)

# Order the rows chronologically within each card so that shift(1) below
# refers to the previous transaction of the same card.
data = data.sort_values(by = ['cc_num','unix_time'], ascending = True)

# unix_time of the card's previous transaction
data['unix_time_prev_trans'] = data.groupby(by = ['cc_num'])['unix_time'].shift(1)
# A card's first transaction has no predecessor: pretend one happened 86400 time units earlier
# (one day, assuming unix_time is in seconds).
# The result is assigned back: fillna(..., inplace=True) on data[...] is chained assignment
# and is not guaranteed to modify `data`.
data['unix_time_prev_trans'] = data['unix_time_prev_trans'].fillna(data['unix_time'] - 86400)
# Delay since the previous transaction, floor-divided by 60
data['timedelta_last_trans'] = (data['unix_time'] - data['unix_time_prev_trans'])//60

# Absolute latitude / longitude difference between the customer and the merchant
data['lat_dist_cust_merch'] = (data['lat'] -data['merch_lat']).abs()
data['lat_dist_cust_merch'].head(3)

data['long_dist_cust_merch'] = (data['long'] -data['merch_long']).abs()
data['long_dist_cust_merch'].head(3)

# Coordinates of the previous merchant of the same card
data['prev_merch_lat'] = data.groupby(by = ['cc_num'])['merch_lat'].shift(1)

data['prev_merch_long'] = data.groupby(by = ['cc_num'])['merch_long'].shift(1)

# First transaction of a card: use the current merchant, which makes the distance below 0
data['prev_merch_lat'] = data['prev_merch_lat'].fillna(data['merch_lat'])

data['prev_merch_long'] = data['prev_merch_long'].fillna(data['merch_long'])

# Absolute latitude / longitude difference between the current and the previous merchant
data['lat_dist_prev_merch'] = (data['merch_lat'] - data['prev_merch_lat']).abs()

data['lat_dist_prev_merch'].head(3)

data['long_dist_prev_merch'] = (data['merch_long'] -data['prev_merch_long']).abs()

data['long_dist_prev_merch'].head(3)

data['dob'].head()


## Variable Analysis and Visualization
# Share of rows that come from the train file vs. the test file
split_share = data['split_label'].value_counts(normalize = True)
split_share.plot.bar();

In [None]:
# Absolute count and percentage share of each target class
class_count = data['is_fraud'].value_counts().rename('count')
class_share = (data['is_fraud'].value_counts(normalize = True)*100).rename('distribution')
tem = pd.concat([class_count, class_share], axis = 1)
# Relabel by value instead of by position, so the labels cannot end up swapped
# if the ordering returned by value_counts ever differs.
tem.index = tem.index.map({0: 'genuine', 1: 'fraud'})
tem['distribution'].plot(kind = 'bar', figsize = [10,7]);
tem

In [None]:
# Customer age in whole years at the time of the transaction.
# Casting a timedelta column to 'timedelta64[Y]' is rejected by pandas >= 2.0,
# so the elapsed days are converted using the mean Gregorian year length instead.
DAYS_PER_YEAR = 365.2425
data['cust_age'] = np.floor((data['trans_date_trans_time'] - data['dob']).dt.days / DAYS_PER_YEAR)

data['cust_age'].head() # lets look at the newly arrived age column

In [None]:
all_trans = data.copy() # work on a copy so the plotting helpers do not touch the base dataset

all_trans['class'] = all_trans['is_fraud'].map({1:'Fraud',0:'Non_Fraud'}) # readable class label: 1 -> 'Fraud', 0 -> 'Non_Fraud'

# Build the masks from all_trans itself rather than from `data`, so the subsets
# do not depend on the two frames staying row-for-row identical.
normal = all_trans[all_trans['is_fraud'] == 0] # normal transactions only

fraud = all_trans[all_trans['is_fraud'] == 1] # fraud transactions only

In [None]:
# Pie chart of the target: relative share of each is_fraud value.
# The labels are positional and follow the order returned by value_counts.
plot_var = all_trans['is_fraud'].value_counts(normalize = True)

plt.figure(figsize = [7,7])
plt.pie(
    plot_var,
    labels = ['non_fraud','fraud'],
    explode = [0.2, 0],
    autopct = '%1.1f%%',
    shadow = True,
)
plt.title('Distribution of the Target');

conclusion :

The classes are highly imbalanced: about 99.5 % of the transactions are normal, and only about 0.5 % are fraudulent.

In [None]:
def stats_by_class(variable):
  """Return min, max, mean and median of `variable` for each transaction class.

  Reads the notebook-level frame `all_trans` and groups it by its 'class' column.

  Parameters
  ----------
  variable : str
      Name of a numeric column of `all_trans`.

  Returns
  -------
  pandas.DataFrame
      One row per statistic ('min', 'max', 'mean', 'median') and one column per
      class, rounded to 2 decimals.
  """
  # String aliases instead of NumPy callables: passing np.min / np.mean to agg is
  # deprecated in pandas and produced version-dependent row labels.
  stat_grid = all_trans.groupby('class')[variable].agg(['min', 'max', 'mean', 'median'])
  return stat_grid.transpose().round(2)

In [None]:
def stats_by_var(variable):
  """Percentage share of every value of `variable` among normal and among fraud transactions.

  Uses the notebook-level frames `normal` and `fraud`. The returned frame has the
  rows 'normal' and 'fraud' and one column per distinct value of `variable`;
  shares are percentages rounded to 2 decimals.
  """
  shares = [
      (frame[variable].value_counts(normalize = True)*100).round(2).rename(label)
      for label, frame in (('normal', normal), ('fraud', fraud))
  ]
  return pd.concat(shares, axis = 1).transpose()

In [None]:
def plot_box (data, x, y, title , width = 10, height = 7):
  """Draw a seaborn boxplot of column `y` grouped by column `x` of `data`.

  A new matplotlib figure of size (width, height) is opened for the plot and
  `title` is used as its title. Nothing is returned.
  """
  figure_size = [width, height]
  plt.figure(figsize = figure_size)
  sns.boxplot(x = x, y = y, data = data)
  plt.title(title)

In [None]:
def normalize_count_by_class(variable, width = 20, height = 7):
  """Plot and tabulate the percentage frequency of `variable` within each class.

  Uses the notebook-level frames `normal` and `fraud`. A grouped bar chart with
  one bar per value and class is drawn on a new figure of size (width, height).

  Returns a frame with the columns [variable, 'normal', 'fraud', 'diff in %'],
  where 'diff in %' is the fraud share minus the normal share, sorted ascending
  by that difference.
  """

  def _percent_share(frame):
    # Percentage of each value of `variable` inside the frame's class,
    # indexed by the value, with the columns 'class' and 'value'.
    share = frame.groupby('class')[variable].value_counts(normalize = True)*100
    return share.rename('value').reset_index().set_index(variable)

  plt.figure(figsize = [width,height])

  share_normal = _percent_share(normal)
  share_fraud = _percent_share(fraud)

  # Long format (one row per value and class) for the grouped bar chart
  long_table = pd.concat([share_normal[['class','value']],
                          share_fraud[['class','value']]], axis = 0).reset_index()

  sns.barplot(data = long_table, x = variable, y = 'value', hue = 'class')
  plt.title('\nNormalized frequency of the varible < '+variable+' > on both classes\n')
  plt.xticks(rotation = 30);

  # Wide format (one row per value) for the returned summary
  summary_table = pd.concat([share_normal['value'], share_fraud['value']], axis = 1).reset_index()
  summary_table.columns = [variable, 'normal', 'fraud']
  summary_table['diff in %'] = summary_table['fraud'] - summary_table['normal']
  summary_table = summary_table.sort_values(by = 'diff in %', ascending = True)

  print('\nNormalized frequency of < '+variable+' > on both classes and the percentage diffrence\n')

  return summary_table

In [None]:
# Use matplotlib's 'ggplot' style sheet for the figures created from here on

plt.style.use('ggplot')

In [None]:
# Transaction amount: boxplot of 'amt' per class, followed by per-class summary statistics
plot_box(all_trans,'class','amt','Distribution of Amount vs Class'); # boxplot helper defined above

stats_by_class('amt') # min / max / mean / median of 'amt' per class

Inferences :

The mean amount of the fraud transactions is high (about $530), whereas the mean of the normal transactions is about $67.
The boxplot shows that the fraud transactions have no outlier amounts; instead, most of them are concentrated around a median of about 390, which is far above the median of the normal transactions.

In [None]:
# Gender counts, one bar chart per class, side by side
plt.figure(figsize=[15,5])

gender_panels = [
    (1, normal, '\nGender Distribution - Normal Transactions\n'),
    (2, fraud, '\nGender Distribition only on Fraud Transactions\n'),
]
for position, frame, panel_title in gender_panels:
    plt.subplot(1,2,position)
    frame['gender'].value_counts().plot(kind = 'bar')
    plt.title(panel_title)

stats_by_var('gender') # percentage share of each gender per class

Inference :

The fraud transactions are distributed similarly among male and female card holders.

In [None]:
plt.figure(figsize = [20,7])

# Share of transactions per customer age within each class.
# (Previously stored under the copy-pasted name `trans_hour_distribution`.)
cust_age_distribution = all_trans.groupby('class')['cust_age'].value_counts(normalize = True).rename('distribution').reset_index()

sns.lineplot(data = cust_age_distribution, x = 'cust_age', y = 'distribution', hue = 'class') # one line per class
plt.title('Distribution of Customer Age vs Class')

plt.xticks(np.arange(10,100,5)); # one tick every 5 years

stats_by_class('cust_age') # min / max / mean / median of the age per class

Inference :
The transactions come mainly from people aged 30 - 50.
The fraud transactions are concentrated among card holders aged 45 - 60.

In [None]:
# Hour of transaction: share of transactions per hour of day within each class
hourly_share = all_trans.groupby('class')['trans_hour'].value_counts(normalize = True)
trans_hour_distribution = hourly_share.rename('distribution').reset_index()

plt.figure(figsize = [12,7])
sns.lineplot(x = 'trans_hour', y = 'distribution', hue = 'class', data = trans_hour_distribution)
plt.xticks(np.arange(0,24,1))

plt.show()

Inferences :

Normal transactions are spread fairly evenly over the day, with a slight increase from the 11th hour that is maintained until the 23rd hour.
Fraud transactions mostly take place between the 21st hour and the 4th hour.
In other words, fraud transactions happen around midnight, when the genuine card holders are asleep and unlikely to notice the transaction notifications.

In [None]:
# Delay between transactions: 'timedelta_last_trans' (time since the card's previous transaction) per class
plot_box(all_trans,'class','timedelta_last_trans','Distribution of Delay vs Class')
stats_by_class('timedelta_last_trans')

Inference :

Successive fraud transactions happen more quickly after one another than the other transactions do.

In [None]:
# Day-wise transaction analysis: normalized frequency of each weekday per class
normalize_count_by_class('trans_dayofweek')

In [None]:
# Latitude distance between customer and merchant, per class

plot_box(all_trans,'class','lat_dist_cust_merch','Distribution of Lat Distance Between Merchant and Customer'); # calling the boxplot function

Inference :

There is no observable difference between the classes in the overall distribution of the latitude distance between the customer and the merchant.

In [None]:
# Longitude distance between customer and merchant, per class
plot_box(all_trans,'class','long_dist_cust_merch','Distribution of Long Distance Between Merchant and Customer'); # calling the boxplot function

inference :

There is no observable difference between the classes in the overall distribution of the longitude distance between the customer and the merchant.

In [None]:
# Latitude distance between the current and the previous merchant of the same card, per class.
# The title previously described the customer-merchant distance, which is a different feature.
plot_box(all_trans,'class','lat_dist_prev_merch','Distribution of Lat Distance Between Current and Previous Merchant');

Inference :

There is no observable difference between the classes in the overall distribution of the latitude distance between the current and the previous merchant.

In [None]:
# Longitude distance between the current and the previous merchant of the same card, per class.
# The title previously described the customer-merchant distance, which is a different feature.
plot_box(all_trans,'class','long_dist_prev_merch','Distribution of Long Distance Between Current and Previous Merchant');

Inference :

There is no observable difference between the classes in the overall distribution of the longitude distance between the current and the previous merchant.

In [None]:
## Category of item/service purchased: normalized frequency of each category per class
normalize_count_by_class('category')

Inference :

The normalized percentages of grocery_pos, shopping_pos, shopping_net and misc_net are higher for fraud transactions than for normal transactions.

In [None]:
# Normalized frequency of each state per class (wider figure for the larger number of bars)
normalize_count_by_class('state', width = 25)

In [None]:
##  Correlation matrix between the numeric variables
# all_trans also holds string and datetime columns; DataFrame.corr raises on those
# in pandas >= 2.0, so the frame is restricted to its numeric columns first.
fig = plt.figure(figsize=(18,9))
sns.heatmap(all_trans.select_dtypes(include='number').corr(),cmap='coolwarm', annot=True)
plt.show()

In [None]:
# Reload the raw CSVs for the modelling part of the notebook.
df=pd.read_csv('../input/fraud-detection/fraudTrain.csv')
df1=pd.read_csv('../input/fraud-detection/fraudTest.csv') # raw test rows, used at the end to attach names to the predictions
df.shape, df1.shape
df = df.drop_duplicates()
df = df.drop('Unnamed: 0', axis=1)

# Parse the transaction timestamp once instead of once per derived column.
trans_time = pd.to_datetime(df['trans_date_trans_time'])

# NOTE(review): age is measured against today's date, so its value depends on when
# the notebook is run. Kept as is so that it stays consistent with the test-set cell.
df['age']=dt.date.today().year-pd.to_datetime(df['dob']).dt.year
df['hour']=trans_time.dt.hour
df['daily']=trans_time.dt.day          # day of the month
df['day']=trans_time.dt.dayofweek      # weekday number as returned by .dt.dayofweek
df['month']=trans_time.dt.month

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode each categorical column into a new '<column>_encoded' column.
# The single encoder object is refitted for every column, so after the loop
# it only holds the classes of the last one ('job').
labelencoder = LabelEncoder()
for column in ['category', 'gender', 'city', 'state', 'job']:
    df[column + '_encoded'] = labelencoder.fit_transform(df[column])

In [None]:
# Model inputs; X additionally carries the target 'is_fraud' as its last column.
input_features = ['category_encoded', 'amt', 'gender_encoded', 'city_encoded', 'state_encoded', 'city_pop', 'job_encoded', 'age', 'hour', 'daily', 'day', 'month']
X = df[input_features + ['is_fraud']]

#### Splitting the training set into a training (90%) and a validation (10%) set

In [None]:
# Stratify on the target so both parts keep the same share of the fraud class.
# Uses the notebook-wide SEED instead of repeating the literal 42.
df_train, df_val = train_test_split(X, test_size=0.1, random_state=SEED, stratify=X['is_fraud'])

### We scale the data

In [None]:
from sklearn import preprocessing

# Fit the scaler on the training part only, then apply the same transformation
# to the training and the validation part.
scaler = preprocessing.StandardScaler()
scaler.fit(df_train[input_features])

for split_frame in (df_train, df_val):
    split_frame[input_features] = scaler.transform(split_frame[input_features])

In [None]:
# Separate the inputs (all columns but the last) from the target (the last column).
X_train, y_train = df_train.iloc[:, :-1], df_train.iloc[:, -1]
X_val, y_val = df_val.iloc[:, :-1], df_val.iloc[:, -1]

In [None]:
from sklearn import metrics
def performance_assessment(predictions_df, output_feature='is_fraud', 
                           prediction_feature='predictions', rounded=True):
    """Compute AUC ROC and average precision for one column of predictions.

    Parameters
    ----------
    predictions_df : pandas.DataFrame
        Frame holding both the true labels and the predictions.
    output_feature : str
        Column with the true binary labels.
    prediction_feature : str
        Column with the predictions. Both metrics rank the rows by this column,
        so a continuous score is more informative here than 0/1 labels.
    rounded : bool
        When True (default) the metrics are rounded to 3 decimals. This flag was
        previously ignored and the result was always rounded.

    Returns
    -------
    pandas.DataFrame
        A single row with the columns 'AUC ROC' and 'Average precision'.
    """
    y_true = predictions_df[output_feature]
    y_score = predictions_df[prediction_feature]

    AUC_ROC = metrics.roc_auc_score(y_true, y_score)
    AP = metrics.average_precision_score(y_true, y_score)

    performances = pd.DataFrame([[AUC_ROC, AP]],
                           columns=['AUC ROC','Average precision'])
    if rounded:
        performances = performances.round(3)

    return performances

## Evaluating the model on the testing set

In [None]:
# Load the held-out test file and derive the same date features as for the training frame.
df_test=pd.read_csv('../input/fraud-detection/fraudTest.csv')
df_test = df_test.drop_duplicates()
df_test = df_test.drop('Unnamed: 0', axis=1)

# Parse the transaction timestamp once instead of once per derived column.
test_trans_time = pd.to_datetime(df_test['trans_date_trans_time'])

# NOTE(review): age is measured against today's date, so its value depends on when
# the notebook is run. Kept as is so that it stays consistent with the training cell.
df_test['age']=dt.date.today().year-pd.to_datetime(df_test['dob']).dt.year
df_test['hour']=test_trans_time.dt.hour
df_test['daily']=test_trans_time.dt.day          # day of the month
df_test['day']=test_trans_time.dt.dayofweek      # weekday number as returned by .dt.dayofweek
df_test['month']=test_trans_time.dt.month

In [None]:
# Encode the test categories with the codes used for the training frame `df`.
# LabelEncoder numbers the classes in sorted order, so fitting on the training column
# reproduces the training codes. Fitting on the test column (as was done before) gives a
# different integer to the same category whenever the two category sets differ.
labelencoder1 = LabelEncoder()
for column in ['category', 'gender', 'city', 'state', 'job']:
    labelencoder1.fit(df[column])
    code_of = {label: code for code, label in enumerate(labelencoder1.classes_)}
    # Categories that never occur in the training data get the sentinel code -1.
    df_test[column + '_encoded'] = df_test[column].map(code_of).fillna(-1).astype(int)

In [None]:
# Keep only the model inputs plus the target as last column (same layout as X).
df_test = df_test[input_features + ['is_fraud']]

In [None]:
# Scale the test inputs with the scaler that was fitted on the training split.
# Refitting a new scaler on the test set (as was done before) standardises the test data
# with different statistics than the model was trained on, and overwrites the
# training scaler.
df_test[input_features]=scaler.transform(df_test[input_features])

In [None]:
# Separate the test inputs (all columns but the last) from the target (the last column).
X_test, y_test = df_test.iloc[:, :-1], df_test.iloc[:, -1]


In [None]:
# Preview the scaled validation frame (first rows only instead of the whole frame)
df_val.head()

In [None]:
from sklearn.ensemble import IsolationForest

# Isolation forest fitted on the scaled training inputs only; the is_fraud column is not passed to fit
anomalyclassifier = IsolationForest(random_state=SEED, n_estimators=40)
anomalyclassifier.fit(df_train[input_features])

In [None]:
# Anomaly score per validation row. score_samples is lower for more abnormal rows,
# so the sign is flipped to make a higher value mean "more anomalous".
predictions_df_IF = df_val.assign(
    predictions_prob=-anomalyclassifier.score_samples(df_val[input_features])
)

In [None]:
# Decision threshold: the 95th percentile of the validation anomaly scores,
# i.e. roughly the 5% highest-scoring validation rows lie above it
threshold = np.percentile(predictions_df_IF['predictions_prob'],95)

In [None]:
# Flag as fraud (1) every row whose anomaly score exceeds the threshold.
# Vectorised comparison instead of a Python-level loop over every row.
predictions_df_IF['predictions'] = (predictions_df_IF['predictions_prob'] > threshold).astype(int)
predictions_df_IF.head()

In [None]:
# AUC ROC and average precision rank the rows by score, so they are evaluated on the
# continuous anomaly score rather than on the thresholded 0/1 labels.
performance_assessment(predictions_df_IF, prediction_feature='predictions_prob')

## Evaluating the Isolation Forest model on the testing set

In [None]:
# Anomaly score per test row; sign flipped so that a higher value means "more anomalous".
predictions_df_IF_test = df_test.assign(
    predictions_prob=-anomalyclassifier.score_samples(df_test[input_features])
)

In [None]:
# Apply the threshold chosen on the validation scores to the test scores.
# Vectorised comparison instead of a Python-level loop over every row.
predictions_df_IF_test['predictions'] = (predictions_df_IF_test['predictions_prob'] > threshold).astype(int)
predictions_df_IF_test.head()

In [None]:
# AUC ROC and average precision rank the rows by score, so they are evaluated on the
# continuous anomaly score rather than on the thresholded 0/1 labels.
performance_assessment(predictions_df_IF_test, prediction_feature='predictions_prob')

In [None]:

# sklearn's own `classification_report` is deliberately not imported: the function
# defined below has the same name and replaced the import immediately.
from sklearn.metrics import confusion_matrix, precision_score, recall_score, accuracy_score
def classification_report(predictions_df, output_feature='is_fraud', 
                           prediction_feature='predictions', rounded=True):
    """Print the confusion matrix, accuracy, precision and recall of binary predictions.

    Parameters
    ----------
    predictions_df : pandas.DataFrame
        Frame holding both the true labels and the predicted labels.
    output_feature : str
        Column with the true binary labels.
    prediction_feature : str
        Column with the predicted binary labels.
    rounded : bool
        Unused; kept so that existing calls keep working.

    Returns
    -------
    None
        The metrics are printed only.
    """
    y_true = predictions_df[output_feature]
    y_pred = predictions_df[prediction_feature]

    cnf_matrix = confusion_matrix(y_true, y_pred)
    print("Confusion Matrix:")
    print(cnf_matrix)

    print("accuracy:", accuracy_score(y_true, y_pred))
    print("Precision:", precision_score(y_true, y_pred))
    print("Recall:", recall_score(y_true, y_pred))

classification_report(predictions_df_IF_test)

In [None]:
# Attach the identifying columns of the raw test rows (df1) to the predicted labels.
# The rows of df1 are selected by the prediction index, so both sides always hold the
# same rows; relying on implicit alignment would leave NaN predictions for any row
# missing on one side.
# NOTE(review): the export contains card holder names and card numbers - confirm that
# writing them to disk is intended.
id_columns = ['first', 'last', 'cc_num', 'trans_num', 'merchant']
test_predictions_df = df1.loc[predictions_df_IF_test.index, id_columns].assign(
    predictions=predictions_df_IF_test['predictions']
)

output_dir = './kaggle_output/submission'
os.makedirs(output_dir, exist_ok=True)

test_predictions_df.to_csv(os.path.join(output_dir, 'fraud.csv'), index=False)