In [1]:
import pandas as pd
import datetime
import numpy as np 
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Never truncate long cell contents when a frame is displayed
pd.options.display.max_colwidth = None

# Read the transactions CSV; dtype=str keeps every column as text at load time
data_file = 'fraudTrain.csv'
df = pd.read_csv(filepath_or_buffer=data_file, dtype=str)

# Preview the first rows of the loaded frame
df.head()

Unnamed: 0,ID,Time,Card Number,merchant,category,Amount,firstName,lastName,trans_num,is_fraud
0,0,1/1/2019 12:00:00 AM,2700000000000000,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,0b242abb623afc578575680df30655b9,0
1,1,1/1/2019 12:00:00 AM,630000000000,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,1f76529f8574734946361c461b024d99,0
2,2,1/1/2019 12:00:00 AM,38900000000000,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,a1a22d70485983eac12b5b88dad1cf95,0
3,3,1/1/2019 12:01:00 AM,3530000000000000,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,6b849c168bdad6f867558c3793159a81,0
4,4,1/1/2019 12:03:00 AM,376000000000000,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,a41d7549acf90789359a9aa5346dcb46,0


In [3]:
# Count fully duplicated rows: duplicated() flags every row that repeats an
# earlier one, and summing the boolean flags gives the number of repeats.
num_duplicates = df.duplicated().sum()

# Print the number of duplicates
print(f"Number of duplicate rows: {num_duplicates}")

Number of duplicate rows: 0


In [4]:
# Report how many empty cells each column holds before any cleaning
missing_values = df.isna().sum()
print("Missing values per column:")
print(missing_values)

# Remove every row that lacks a first or last name, then report again
# so the effect of the removal is visible right below the first report
df = df.dropna(axis=0, subset=['firstName', 'lastName'])
missing_values = df.isna().sum()
print("Missing values per column:")
print(missing_values)

Missing values per column:
ID             0
Time           0
Card Number    0
merchant       0
category       0
Amount         1
firstName      1
lastName       1
trans_num      1
is_fraud       1
dtype: int64
Missing values per column:
ID             0
Time           0
Card Number    0
merchant       0
category       0
Amount         0
firstName      0
lastName       0
trans_num      0
is_fraud       0
dtype: int64


In [5]:
# Collect the dtype of every column. The CSV was read with dtype=str, so all
# columns are expected to show up as object (text) at this stage.
column_types = df.dtypes

print(column_types)

ID             object
Time           object
Card Number    object
merchant       object
category       object
Amount         object
firstName      object
lastName       object
trans_num      object
is_fraud       object
dtype: object


In [6]:
# --- Processing the "Time" column ---

# Parse the text timestamps (month/day/year, 12-hour clock with AM/PM marker) into datetimes
df['Time'] = pd.to_datetime(arg=df['Time'], format='%m/%d/%Y %I:%M:%S %p')

# Derive "Weekday": the full weekday name of each transaction
df['Weekday'] = df['Time'].dt.strftime(date_format='%A')

# Helper that labels a timestamp with the part of the day it falls in
def check_time_of_day(timestamp):
    """Classify a timestamp into a named part of the day.

    Parameters
    ----------
    timestamp
        Any object whose ``time()`` method returns a ``datetime.time``
        (for example a ``datetime.datetime``).

    Returns
    -------
    str
        "Morning" for [06:00, 12:00), "Afternoon" for [12:00, 18:00),
        "Evening" for [18:00, 22:00) and "Night" for everything else.
    """
    clock = timestamp.time()

    # Night wraps around midnight (22:00 up to 06:00), so rule it out first;
    # the remaining checks then only need an upper bound each.
    if clock < datetime.time(6, 0) or clock >= datetime.time(22, 0):
        return "Night"
    if clock < datetime.time(12, 0):
        return "Morning"
    if clock < datetime.time(18, 0):
        return "Afternoon"
    return "Evening"

    
# Label every transaction with its part of the day (element-wise over "Time")
df['Time of Day'] = df['Time'].map(check_time_of_day)

# Calendar features taken directly from the datetime accessor
df['Month'] = df['Time'].dt.month
df['Year'] = df['Time'].dt.year

# Preview the frame with the new time-derived columns
df.head()


Unnamed: 0,ID,Time,Card Number,merchant,category,Amount,firstName,lastName,trans_num,is_fraud,Weekday,Time of Day,Month,Year
0,0,2019-01-01 00:00:00,2700000000000000,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,0b242abb623afc578575680df30655b9,0,Tuesday,Night,1,2019
1,1,2019-01-01 00:00:00,630000000000,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,1f76529f8574734946361c461b024d99,0,Tuesday,Night,1,2019
2,2,2019-01-01 00:00:00,38900000000000,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,a1a22d70485983eac12b5b88dad1cf95,0,Tuesday,Night,1,2019
3,3,2019-01-01 00:01:00,3530000000000000,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,6b849c168bdad6f867558c3793159a81,0,Tuesday,Night,1,2019
4,4,2019-01-01 00:03:00,376000000000000,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,a41d7549acf90789359a9aa5346dcb46,0,Tuesday,Night,1,2019


In [7]:
# Pull the target labels out as a NumPy array before the column is removed.
# NOTE: the CSV was read with dtype=str, so these labels are text ('0'/'1').
y = np.array(df['is_fraud'])
print(y)

# Remove identifiers, the raw timestamp, customer names and the label itself,
# leaving only the feature columns in the frame
df = df.drop(
    labels=['ID', 'Time', 'trans_num', 'is_fraud', 'firstName', 'lastName'],
    axis='columns',
)
df.head()

['0' '0' '0' ... '0' '0' '0']


Unnamed: 0,Card Number,merchant,category,Amount,Weekday,Time of Day,Month,Year
0,2700000000000000,"fraud_Rippin, Kub and Mann",misc_net,4.97,Tuesday,Night,1,2019
1,630000000000,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Tuesday,Night,1,2019
2,38900000000000,fraud_Lind-Buckridge,entertainment,220.11,Tuesday,Night,1,2019
3,3530000000000000,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Tuesday,Night,1,2019
4,376000000000000,fraud_Keeling-Crist,misc_pos,41.96,Tuesday,Night,1,2019


In [8]:
# Min-max scaler with its default settings
scaler = MinMaxScaler()

# Learn the minimum and maximum of "Amount", then rescale the column with them
scaler.fit(df[['Amount']])
df['Amount'] = scaler.transform(df[['Amount']])
df.head()

Unnamed: 0,Card Number,merchant,category,Amount,Weekday,Time of Day,Month,Year
0,2700000000000000,"fraud_Rippin, Kub and Mann",misc_net,0.000137,Tuesday,Night,1,2019
1,630000000000,"fraud_Heller, Gutmann and Zieme",grocery_pos,0.00367,Tuesday,Night,1,2019
2,38900000000000,fraud_Lind-Buckridge,entertainment,0.007569,Tuesday,Night,1,2019
3,3530000000000000,"fraud_Kutch, Hermiston and Farrell",gas_transport,0.00152,Tuesday,Night,1,2019
4,376000000000000,fraud_Keeling-Crist,misc_pos,0.001415,Tuesday,Night,1,2019


In [None]:
# Label-encode the categorical text columns into integer codes.
# The encoded frame is built from `df`, the frame produced by the cells above;
# the previously referenced `data_df_clean` is never defined in this notebook,
# so the cell failed with a NameError on a fresh kernel.
# Working on a copy leaves `df` untouched and makes this cell safe to re-run.
data_df_encoded = df.copy()

categorical_columns = ['category', 'merchant']  # Replace with your categorical column names

# Keep one fitted encoder per column. A single shared LabelEncoder is refitted
# on every iteration, so only the last column's mapping would remain available
# for inverse_transform / classes_ lookups.
label_encoders = {}
for column in categorical_columns:
    le = LabelEncoder()
    data_df_encoded[column] = le.fit_transform(df[column])
    label_encoders[column] = le

# Display the first few rows of the DataFrame after label encoding
data_df_encoded.head()

In [None]:
# Convert the label column to boolean and the identifier columns to pandas'
# string dtype.
# NOTE(review): each column is converted only if it is present. The visible
# pipeline drops 'is_fraud' and 'trans_num' from `df` before encoding, so they
# may legitimately be absent here - confirm which frame this cell should see.
if 'is_fraud' in data_df_encoded.columns:
    # Go through a numeric dtype first: astype('bool') applied directly to text
    # labels treats every non-empty string (including '0') as True.
    data_df_encoded['is_fraud'] = (
        pd.to_numeric(data_df_encoded['is_fraud'])
        .astype('int64')  # raises on missing labels instead of silently mapping them to True
        .astype('bool')
    )

for column in ('Card Number', 'trans_num'):
    if column in data_df_encoded.columns:
        data_df_encoded[column] = data_df_encoded[column].astype('string')

# Display the DataFrame to verify the data type conversion
data_df_encoded.head()
