In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [27]:
df = pd.read_csv('Fraud_Detection_Dataset.csv')

In [29]:
df.shape

(51000, 12)

In [82]:
df.sample(10)

Unnamed: 0,Transaction_ID,User_ID,Transaction_Amount,Transaction_Type,Time_of_Transaction,Device_Used,Location,Previous_Fraudulent_Transactions,Account_Age,Number_of_Transactions_Last_24H,Payment_Method,Fraudulent
50184,T2742,3212,3271.89,Online Purchase,22.0,Unknown Device,Los Angeles,1,79,9,Credit Card,0
37146,T37147,2658,3595.83,Online Purchase,12.0,Tablet,New York,3,1,8,Debit Card,0
14257,T14258,1489,2124.12,POS Payment,3.0,Mobile,Seattle,3,75,6,Net Banking,0
6670,T6671,1498,897.42,Online Purchase,15.0,Desktop,Miami,4,31,14,Debit Card,0
35225,T35226,2595,1458.23,Bill Payment,5.0,Desktop,Seattle,0,8,10,Net Banking,0
6080,T6081,3804,1339.13,POS Payment,4.0,Desktop,Chicago,4,98,9,Debit Card,0
19053,T19054,1181,2677.62,Online Purchase,8.0,Desktop,Miami,4,34,13,Debit Card,0
28619,T28620,1670,3138.67,Bank Transfer,23.0,Tablet,San Francisco,3,104,5,Credit Card,0
16598,T16599,4495,3420.31,Bill Payment,15.0,Tablet,Los Angeles,2,84,10,Net Banking,0
38817,T38818,4343,,Online Purchase,,Tablet,Chicago,0,33,11,Credit Card,0


### 1. Parent Entropy

In [20]:
zero_prob = len(df[df['Fraudulent'] == 0]) / df.shape[0]
one_prob = len(df[df['Fraudulent'] == 1]) / df.shape[0]

P_entropy = (- (zero_prob * np.log2(zero_prob)) - (one_prob * np.log2(one_prob)))
print('Parent Entropy: ', P_entropy)

Parent Entropy:  0.28305586081966194


### 2. Children Entropy

In [38]:
df['Device_Used'].unique()

array(['Tablet', 'Mobile', 'Desktop', nan, 'Unknown Device'], dtype=object)

In [84]:
tablet = df[df['Device_Used'] == 'Tablet']
mobile = df[df['Device_Used'] == 'Mobile']
desktop = df[df['Device_Used'] == 'Desktop']
unknown = df[df['Device_Used'] == 'Unknown Device']

In [86]:
def probabilities(device):
    zero_prob = len(device[device['Fraudulent'] == 0]) / device.shape[0]
    one_prob = len(device[device['Fraudulent'] == 1]) / device.shape[0]

    return zero_prob, one_prob

In [88]:
tablet_zero_prob, tablet_one_prob = probabilities(tablet)

In [90]:
tablet_entropy = -(tablet_zero_prob * np.log2(tablet_zero_prob)) - (tablet_one_prob * np.log2(tablet_one_prob))
print('Tablet entropy is: ', tablet_entropy)

Tablet entropy is:  0.27250021083212694


In [92]:
mobile_zero_prob, mobile_one_prob = probabilities(mobile)

In [94]:
mobile_entropy = -(mobile_zero_prob * np.log2(mobile_zero_prob)) - (mobile_one_prob * np.log2(mobile_one_prob))
print('Mobile entropy is: ', mobile_entropy)

Mobile entropy is:  0.29324057697498446


In [96]:
desktop_zero_prob, desktop_one_prob = probabilities(desktop)

desktop_entropy = -(desktop_zero_prob * np.log2(desktop_zero_prob)) - (desktop_one_prob * np.log2(desktop_one_prob))
print('Desktop entropy is: ', desktop_entropy)

Desktop entropy is:  0.27506073663954844


In [98]:
unknown_zero_prob, unknown_one_prob = probabilities(unknown)

unknown_entropy = -(unknown_zero_prob * np.log2(unknown_zero_prob)) - (unknown_one_prob * np.log2(unknown_one_prob))
print('Unknown entropy is: ', unknown_entropy)

Unknown entropy is:  0.28221763445592507


### 3. Calculate Weighted Entropy for children

In [103]:
tablet_weight = len(tablet) / len(df)
mobile_weight = len(mobile) / len(df)
desktop_weight = len(desktop) / len(df)
unknown_weight = len(unknown) / len(df)

weighted_entropy = (tablet_weight * tablet_entropy) + (mobile_weight * mobile_entropy) + (desktop_weight * desktop_entropy) + (unknown_weight * unknown_entropy)

In [105]:
print('Weighted Entropy: ', weighted_entropy)

Weighted Entropy:  0.266720960221134


### 4. Calculate Information Gain

In [110]:
IG = P_entropy - weighted_entropy
print("Information Gain: ", IG)

Information Gain:  0.016334900598527935


### 5. Calculate information gain for all the columns
### The column which has the highest IG, the algorithm will select that column to split the data on