In [12]:
# Imports for data handling
import numpy  as np
import pandas as pd
import datetime


# Imports for machine learning model
from sklearn.model_selection import train_test_split

# Splitting data for training and evaluation of the model

In [17]:
# Load the raw transactions; parse_dates converts the Timestamp column to datetimes at read time.
dataset = pd.read_csv(
    '../data/original_data.csv',
    parse_dates=['Timestamp'],
)
# Show (rows, columns) as the cell output.
dataset.shape

(5000, 13)

In [18]:
# Preview the first five rows to see which columns are available.
dataset.head(n=5)

Unnamed: 0,Transaction_ID,Timestamp,Vehicle_Type,FastagID,TollBoothID,Lane_Type,Vehicle_Dimensions,Transaction_Amount,Amount_paid,Geographical_Location,Vehicle_Speed,Vehicle_Plate_Number,Fraud_indicator
0,1,2023-01-06 11:20:00,Bus,FTG-001-ABC-121,A-101,Express,Large,350,120,"13.059816123454882, 77.77068662374292",65,KA11AB1234,Fraud
1,2,2023-01-07 14:55:00,Car,FTG-002-XYZ-451,B-102,Regular,Small,120,100,"13.059816123454882, 77.77068662374292",78,KA66CD5678,Fraud
2,3,2023-01-08 18:25:00,Motorcycle,,D-104,Regular,Small,0,0,"13.059816123454882, 77.77068662374292",53,KA88EF9012,Not Fraud
3,4,2023-01-09 02:05:00,Truck,FTG-044-LMN-322,C-103,Regular,Large,350,120,"13.059816123454882, 77.77068662374292",92,KA11GH3456,Fraud
4,5,2023-01-10 06:35:00,Van,FTG-505-DEF-652,B-102,Express,Medium,140,100,"13.059816123454882, 77.77068662374292",60,KA44IJ6789,Fraud


This dataset poses bigger data-handling challenges than the last project.
As you can see, there are many text columns, and some of them need to be processed to extract as much predictive power as possible.

Here are some ideas we will try:
- Instead of splitting the dataset randomly into train and test sets, split it chronologically
- Split date and time into separate features; fraud may happen more often at certain times
- Create historical features, for example: how many frauds has this customer had? How many frauds have occurred at this place?

# Timestamp analysis

In [None]:
# Derive calendar features from the parsed Timestamp column:
# the calendar date, the day of the month and the hour of the day.
dataset = dataset.assign(
    Date=dataset['Timestamp'].dt.date,
    Day=dataset['Timestamp'].dt.day,
    Hour=dataset['Timestamp'].dt.hour,
)
dataset.head(5)

Unnamed: 0,Transaction_ID,Timestamp,Vehicle_Type,FastagID,TollBoothID,Lane_Type,Vehicle_Dimensions,Transaction_Amount,Amount_paid,Geographical_Location,Vehicle_Speed,Vehicle_Plate_Number,Date,Hour,Day
0,1,2023-01-06 11:20:00,Bus,FTG-001-ABC-121,A-101,Express,Large,350,120,"13.059816123454882, 77.77068662374292",65,KA11AB1234,2023-01-06,11,6
1,2,2023-01-07 14:55:00,Car,FTG-002-XYZ-451,B-102,Regular,Small,120,100,"13.059816123454882, 77.77068662374292",78,KA66CD5678,2023-01-07,14,7
2,3,2023-01-08 18:25:00,Motorcycle,,D-104,Regular,Small,0,0,"13.059816123454882, 77.77068662374292",53,KA88EF9012,2023-01-08,18,8
3,4,2023-01-09 02:05:00,Truck,FTG-044-LMN-322,C-103,Regular,Large,350,120,"13.059816123454882, 77.77068662374292",92,KA11GH3456,2023-01-09,2,9
4,5,2023-01-10 06:35:00,Van,FTG-505-DEF-652,B-102,Express,Medium,140,100,"13.059816123454882, 77.77068662374292",60,KA44IJ6789,2023-01-10,6,10


In [None]:
# Count transactions per calendar year; the recorded output lists 2023 only.
transaction_years = dataset['Timestamp'].dt.year
category_counts = transaction_years.value_counts()
display(category_counts)


Timestamp
2023    5000
Name: count, dtype: int64

In [None]:
# Count transactions per calendar month (value_counts orders by frequency, not by month number).
# Every month is represented, with volumes easing from 546 in January to 324 in December.
# Idea: train on January through October and hold out November + December as the test set.
transaction_months = dataset['Timestamp'].dt.month
category_counts = transaction_months.value_counts()
display(category_counts)


Timestamp
1     546
3     514
2     511
4     462
5     445
6     418
7     378
8     374
9     363
10    338
11    327
12    324
Name: count, dtype: int64

In [None]:
# Chronological train/test split on the Timestamp column.
## Training on older transactions and testing on newer ones mimics how the model would be used
## on new data, which gives more confidence than a random split.
split_date = pd.to_datetime('2023-11-01')

# A missing Timestamp (NaT) satisfies neither `<` nor `>=`, so such rows would silently
# disappear from both splits. Fail loudly instead of losing data.
assert dataset['Timestamp'].notna().all(), 'Timestamp contains missing values; cannot split chronologically'

# A stable sort keeps the original row order among transactions that share a timestamp.
df_sorted = dataset.sort_values(by='Timestamp', kind='stable')

# .copy() makes each split an independent frame, so adding columns to it later does not
# trigger SettingWithCopyWarning (classic, non copy-on-write pandas).
train_df = df_sorted[df_sorted['Timestamp'] < split_date].copy()
test_df = df_sorted[df_sorted['Timestamp'] >= split_date].copy()

# Sanity checks: the two splits partition the data and neither one is empty.
assert len(train_df) + len(test_df) == len(dataset), 'train/test split lost or duplicated rows'
assert not train_df.empty and not test_df.empty, 'split_date leaves one of the splits empty'

In [None]:
# Separate the label column (Fraud_indicator) from the remaining feature columns for each split.
X_train, y_train = train_df.drop(columns=['Fraud_indicator']), train_df['Fraud_indicator']
X_test, y_test = test_df.drop(columns=['Fraud_indicator']), test_df['Fraud_indicator']

# Data exploration and cleaning