In [1]:
# Import the required libraries and dependencies
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import recall_score
import category_encoders as ce
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split, GridSearchCV
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.graph_objects as go

In [2]:
# Read csv file as DataFrame, with the first column as row index
data = pd.read_csv("./Resources/credit_card_transactions.csv")

In [None]:
df2 = data.copy()
# Convert trans_date_trans_time to to datetime and use that to create extra feature columns
# Added columns like dob, ft_age,ft_age_group, etc
df2['trans_date_trans_time'] = pd.to_datetime(df2['trans_date_trans_time'])
df2['dob'] = pd.to_datetime(df2['dob'])
df2['ft_age'] = df2['trans_date_trans_time'].dt.year - df2['dob'].dt.year
def age_segment(age):
    if age < 18:
        return 'Under 18'
    elif 19 <= age <= 28:
        return '19-28'
    elif 29 <= age <= 38:
        return '29-38'
    elif 39 <= age <= 48:
        return '39-48'
    elif 49 <= age <= 58:
        return '49-58'
    elif 59 <= age <= 68:
        return '59-68'
    elif 69 <= age <= 78:
        return '69-78'
    else:
        return '79+'

# Apply age segmentation
df2['ft_age_group'] = df2['ft_age'].apply(age_segment)
df2['ft_trans_hour'] = df2['trans_date_trans_time'].dt.hour
df2['ft_time_of_day'] = df2['trans_date_trans_time'].dt.hour.apply(
    lambda hour: 'Morning' if 6 <= hour < 12 else
    ('Afternoon' if 12 <= hour < 18 else
    ('Evening' if 18 <= hour < 24 else 'Night'))
)
df2['ft_trans_day'] = df2['trans_date_trans_time'].dt.day

# TRANSACTION DAY OF YEAR
df2['ft_trans_day_of_year'] = df2['trans_date_trans_time'].dt.dayofyear

# TRANSACTION MONTH
df2['ft_trans_month'] = df2['trans_date_trans_time'].dt.month

# TRANSACTION YEAR - Add a new column 'ft_trans_year' to represent the year of the transaction
df2['ft_trans_year'] = df2['trans_date_trans_time'].dt.year

# DAY OF WEEK - Add a new column 'day_of_week' to represent the day of the week (0=Monday, 6=Sunday)
df2['ft_day_of_week'] = df2['trans_date_trans_time'].dt.day_name()

# Define the Haversine Formula function that calculates the distance given two latitude/longitude points
def haversine(lat1, lon1, lat2, lon2):
    # Convert latitude and longitude from degrees to radians
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])

    # Haversine formula
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    radius = 6371  # Radius of Earth in kilometers
    return radius * c

# Create a new column for distance
df2['ft_distance_user_merchant'] = haversine(
    df2['lat'], df2['long'],
    df2['merch_lat'], df2['merch_long']
)

# Calculate the average distance of previous transactions for a user and compare the current transaction distance
user_avg_distance = df2.groupby('cc_num')['ft_distance_user_merchant'].transform('mean')
df2['ft_merchant_distance_from_user_mean'] = df2['ft_distance_user_merchant'] - user_avg_distance

merchant_transaction_counts = df2['merchant'].value_counts()
df2['ft_merchant_popularity'] = df2['merchant'].map(merchant_transaction_counts)

user_mean_amt = df2.groupby('cc_num')['amt'].transform('mean')
df2['ft_mean_amt_per_user'] = user_mean_amt

df2['ft_amt_deviation'] = df2['amt'] - df2['ft_mean_amt_per_user']

user_transaction_count = df2['cc_num'].value_counts()
df2['ft_transaction_count_per_user'] = df2['cc_num'].map(user_transaction_count)

fraud_rate_by_state = df2.groupby('state')['is_fraud'].mean()
df2['ft_state_fraud_rate'] = df2['state'].map(fraud_rate_by_state)

df2['ft_transaction_is_recurring'] = df2.duplicated(subset=['cc_num', 'merchant'], keep=False).astype(int)

# Define columns to drop
columns_to_drop = [
    'Unnamed: 0', 'trans_date_trans_time', 'cc_num', 'first', 'last', 'street', 'trans_num', 'lat', 
    'long', 'merch_zipcode', 'ft_age', 'merch_lat', 'merch_long', 'trans_num', 'unix_time', 'zip', 'dob'
]
df2_cleaned = df2.drop(columns=columns_to_drop)

# Step 1: Define the split points
train_val_size = 0.85  # 85% for train + validation
test_size = 0.15       # 15% for testing
# Compute the number of rows for train-validation and test sets
train_val_end = int(len(df2_cleaned) * train_val_size)
# Step 2: Split the data into train-validation and test sets
train_val_data = df2_cleaned.iloc[:train_val_end]
test_data = df2_cleaned.iloc[train_val_end:]
# Step 3: Separate features and target for each set
X_train_val = train_val_data.drop(columns=['is_fraud'])
y_train_val = train_val_data['is_fraud']
X_test = test_data.drop(columns=['is_fraud'])
y_test = test_data['is_fraud']

# Split the data into training and testing sets
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, random_state=42, stratify = y_train_val)

# Define categorical columns to encode
categorical_columns = ['merchant', 'category', 'gender', 'city', 'state', 
                       'job', 'ft_time_of_day', 'ft_age_group', 'ft_day_of_week']

# Initialize the target encoder
encoder = ce.TargetEncoder(cols=categorical_columns)

# Fit the encoder on X_train using y_train
X_train_encoded = encoder.fit_transform(X_train, y_train)

# Transform X_test using the already fitted encoder
X_val_encoded = encoder.transform(X_val)
X_test_encoded = encoder.transform(X_test)
# Initialize the StandardScaler
scaler = StandardScaler()

# Fit the scaler on X_train_encoded and transform it
X_train_scaled = scaler.fit_transform(X_train_encoded)

# Transform X_test_encoded using the fitted scaler
X_val_scaled = scaler.transform(X_val_encoded)

X_test_encoded.columns

Index(['merchant', 'category', 'amt', 'gender', 'city', 'state', 'city_pop',
       'job', 'ft_age_group', 'ft_trans_hour', 'ft_time_of_day',
       'ft_trans_day', 'ft_trans_day_of_year', 'ft_trans_month',
       'ft_trans_year', 'ft_day_of_week', 'ft_distance_user_merchant',
       'ft_merchant_distance_from_user_mean', 'ft_merchant_popularity',
       'ft_mean_amt_per_user', 'ft_amt_deviation',
       'ft_transaction_count_per_user', 'ft_state_fraud_rate',
       'ft_transaction_is_recurring'],
      dtype='object')

In [None]:
# Define columns in order of feature importance
columns_to_keep_final = ["amt","ft_merchant_popularity","category","city","ft_trans_hour","ft_amt_deviation","ft_mean_amt_per_user","ft_age_group","merchant","city_pop",
                         "ft_transaction_count_per_user","job","ft_trans_day_of_year","ft_trans_day","gender","ft_distance_user_merchant","ft_time_of_day","ft_day_of_week",
                         "ft_merchant_distance_from_user_mean","state","ft_state_fraud_rate","ft_trans_month","ft_trans_year","ft_transaction_is_recurring"]
# Create copy of the ordered columns
c = columns_to_keep_final

# Create arrays to house the scores
trainsscores = []
valscores = []
testscores = []

# forloop to calculate the scores while dropping the features one by one
for j in range(0, len(columns_to_keep_final)):
    c = c[0:len(columns_to_keep_final)-j]
    X_train_encoded_final = X_train_encoded.drop(columns=[col for col in X_train_encoded.columns if col not in c], errors='ignore')

    X_val_encoded_final = X_val_encoded.drop(columns=[col for col in X_train_encoded.columns if col not in c], errors='ignore')

    X_test_encoded_final = X_test_encoded.drop(columns=[col for col in X_train_encoded.columns if col not in c], errors='ignore')
    
    print(list(c))
    # Initialize the IsolationForest and train it and predict it
    i = IsolationForest(random_state=42)
    i.fit(X_train_encoded_final, y_train)
    i_train_predict = i.predict(X_train_encoded_final)
    i_val_predict = i.predict(X_val_encoded_final)
    i_test_predict = i.predict(X_test_encoded_final)
    # The output of Isolation Forest are -1 and 1 where -1 is the anomaly. Converting -1 to 1 and 1 to 0.
    for k in range(0,len(i_train_predict)):
        if i_train_predict[k] == -1:
            i_train_predict[k] = 1
        else:
            i_train_predict[k] = 0

    for k in range(0,len(i_val_predict)):
        if i_val_predict[k] == -1:
            i_val_predict[k] = 1
        else:
            i_val_predict[k] = 0

    for k in range(0,len(i_test_predict)):
        if i_test_predict[k] == -1:
            i_test_predict[k] = 1
        else:
            i_test_predict[k] = 0
    # Calculate the recall score and put it in the arrays
    trainsscores.append(recall_score(y_train,i_train_predict))
    valscores.append(recall_score(y_val,i_val_predict))
    testscores.append(recall_score(y_test,i_test_predict))

    print(f"Recall Score Train: {recall_score(y_train,i_train_predict)}")
    print(f"Recall Score Validation: {recall_score(y_val,i_val_predict)}")
    print(f"Recall Score Test: {recall_score(y_test,i_test_predict)}")
    print("")

print("Train Scores: ")
for k in range(0, len(trainsscores)):
    print(trainsscores[k])
print("")
print("Val Scores: ")
for k in range(0, len(valscores)):
    print(valscores[k])
print("")
print("Test Scores: ")
for k in range(0, len(testscores)):
    print(testscores[k])


['amt', 'ft_merchant_popularity', 'category', 'city', 'ft_trans_hour', 'ft_amt_deviation', 'ft_mean_amt_per_user', 'ft_age_group', 'merchant', 'city_pop', 'ft_transaction_count_per_user', 'job', 'ft_trans_day_of_year', 'ft_trans_day', 'gender', 'ft_distance_user_merchant', 'ft_time_of_day', 'ft_day_of_week', 'ft_merchant_distance_from_user_mean', 'state', 'ft_state_fraud_rate', 'ft_trans_month', 'ft_trans_year', 'ft_transaction_is_recurring']
Recall Score Train: 0.8345188284518829
Recall Score Validation: 0.8198367859384809
Recall Score Test: 0.884377758164166

['amt', 'ft_merchant_popularity', 'category', 'city', 'ft_trans_hour', 'ft_amt_deviation', 'ft_mean_amt_per_user', 'ft_age_group', 'merchant', 'city_pop', 'ft_transaction_count_per_user', 'job', 'ft_trans_day_of_year', 'ft_trans_day', 'gender', 'ft_distance_user_merchant', 'ft_time_of_day', 'ft_day_of_week', 'ft_merchant_distance_from_user_mean', 'state', 'ft_state_fraud_rate', 'ft_trans_month', 'ft_trans_year']
Recall Score Tra

In [None]:
# Create plot
fig = go.Figure()

fig.add_trace(go.Scatter(x=np.arange(0, len(trainsscores), 1),
                         y=trainsscores,mode="lines+markers",
                         marker = dict(size=[10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]),
                         name="Train Scores",
                         line=dict(color="forestgreen",width=4)))

fig.add_trace(go.Scatter(x=np.arange(0, len(valscores), 1),
                         y=valscores,mode="lines+markers",
                         marker = dict(size=[10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]),
                         name="Val Scores",
                         line=dict(color="goldenrod",width=4)))

fig.add_trace(go.Scatter(x=np.arange(0, len(testscores), 1),
                         y=testscores,mode="lines+markers",
                         marker = dict(size=[10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]),
                         name="Test Scores",
                         line=dict(color="indianred",width=4)))

fig.update_layout(title="Isolation Forest",
                  xaxis=dict(tickvals=np.arange(0, len(trainsscores), 1)),
                  xaxis_range = [0,23],
                  yaxis_range=[0.70, 0.90],
                  xaxis_title="Feature Dropped",
                  yaxis_title="Score",
                  width = 700,
                  margin=dict(l=0, r=0, b=50, t=50, pad=0),
                  showlegend=True)

fig.show()

In [None]:
# This code is similar to the code for the isolation forest but is used for AdaBoost.
# It takes a while to run so it was run once and the data was recorded and the code was commented out.

# columns_to_keep_final = ["amt","ft_merchant_popularity","category","city","ft_trans_hour","ft_amt_deviation","ft_mean_amt_per_user","ft_age_group","merchant","city_pop",
#                          "ft_transaction_count_per_user","job","ft_trans_day_of_year","ft_trans_day","gender","ft_distance_user_merchant","ft_time_of_day","ft_day_of_week",
#                          "ft_merchant_distance_from_user_mean","state","ft_state_fraud_rate","ft_trans_month","ft_trans_year","ft_transaction_is_recurring"]

# c = columns_to_keep_final

# atrainsscores = []
# avalscores = []
# atestscores = []

# for j in range(0, len(columns_to_keep_final)):
#     c = c[0:len(columns_to_keep_final)-j]
#     X_train_encoded_final = X_train_encoded.drop(columns=[col for col in X_train_encoded.columns if col not in c], errors='ignore')

#     X_val_encoded_final = X_val_encoded.drop(columns=[col for col in X_train_encoded.columns if col not in c], errors='ignore')

#     X_test_encoded_final = X_test_encoded.drop(columns=[col for col in X_train_encoded.columns if col not in c], errors='ignore')



#     print(list(X_train_encoded_final.columns))
#     a = AdaBoostClassifier(random_state=42)
#     a.fit(X_train_encoded_final, y_train)
#     a_train_predict = a.predict(X_train_encoded_final)
#     a_val_predict = a.predict(X_val_encoded_final)
#     a_test_predict = a.predict(X_test_encoded_final)

#     print(f"Recall Score Train: {recall_score(y_train,a_train_predict)}")
#     print(f"Recall Score Validation: {recall_score(y_val,a_val_predict)}")
#     print(f"Recall Score Test: {recall_score(y_test,a_test_predict)}")

#     atrainsscores.append(recall_score(y_train,a_train_predict))
#     avalscores.append(recall_score(y_val,a_val_predict))
#     atestscores.append(recall_score(y_test,a_test_predict))
#     print("")

# print("Train Scores: ")
# for k in range(0, len(atrainsscores)):
#     print(atrainsscores[k])

# print("Val Scores: ")
# for k in range(0, len(avalscores)):
#     print(avalscores[k])

# print("Test Scores: ")
# for k in range(0, len(atestscores)):
#     print(atestscores[k])

In [None]:
#This is the data generated by the above code, hardcoded in this section to prevent having the run 
# the lengthy code above to get the data again.
atrainsscores = [0.520083682,
0.520083682,
0.520083682,
0.520083682,
0.520083682,
0.520083682,
0.520083682,
0.520083682,
0.5035564854,
0.5035564854,
0.5035564854,
0.5190376569,
0.4991631799,
0.5056485356,
0.5056485356,
0.5010460251,
0.4780334728,
0.4780334728,
0.480125523,
0.4723849372,
0.2571129707,
0.1014644351,
0.1577405858,
0.3064853556]

avalscores = [0.5065913371,
0.5065913371,
0.5065913371,
0.5065913371,
0.5065913371,
0.5065913371,
0.5065913371,
0.5065913371,
0.4877589454,
0.4877589454,
0.4877589454,
0.5009416196,
0.475831764,
0.4821092279,
0.4821092279,
0.4733207784,
0.4651600753,
0.4651600753,
0.4626490898,
0.4450721908,
0.2190834903,
0.1155053358,
0.1519146265,
0.3145009416]

atestscores = [0.1385701677,
0.1385701677,
0.1385701677,
0.1385701677,
0.1385701677,
0.1385701677,
0.1385701677,
0.1385701677,
0.1350397176,
0.1350397176,
0.1350397176,
0.1359223301,
0.1350397176,
0.1438658429,
0.1438658429,
0.1394527802,
0.1429832304,
0.1429832304,
0.1253309797,
0.07325684025,
0.0158870256,
0.1226831421,
0.1712268314,
0.3053839365]

# Create plot
fig = go.Figure()

fig.add_trace(go.Scatter(x=np.arange(0, len(atrainsscores), 1),
                         y=atrainsscores,mode="lines+markers",
                         marker = dict(size=[10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]),
                         name="Train Scores",
                         line=dict(color="forestgreen",width=4)))

fig.add_trace(go.Scatter(x=np.arange(0, len(avalscores), 1),
                         y=avalscores,mode="lines+markers",
                         marker = dict(size=[10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]),
                         name="Val Scores",
                         line=dict(color="goldenrod",width=4)))

fig.add_trace(go.Scatter(x=np.arange(0, len(atestscores), 1),
                         y=atestscores,mode="lines+markers",
                         marker = dict(size=[10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]),
                         name="Test Scores",
                         line=dict(color="indianred",width=4)))

fig.update_layout(title="AdaBoost",
                  xaxis=dict(tickvals=np.arange(0, len(trainsscores), 1)),
                  xaxis_range = [0,23],
                  yaxis_range=[0, 0.60],
                  xaxis_title="Feature Dropped",
                  yaxis_title="Score",
                  width = 700,
                  margin=dict(l=0, r=0, b=50, t=50, pad=0),
                  showlegend=True)

fig.show()

In [None]:
# Create plot to compare the Isolation Forest and AdaBoost results in one plot
fig = go.Figure()

fig.add_trace(go.Scatter(x=np.arange(0, len(trainsscores), 1),
                         y=trainsscores,mode="lines+markers",
                         marker = dict(size=[10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]),
                         name="Isolation Forest Train Scores",
                         line=dict(color="forestgreen",width=4)))

fig.add_trace(go.Scatter(x=np.arange(0, len(valscores), 1),
                         y=valscores,mode="lines+markers",
                         marker = dict(size=[10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]),
                         name="Isolation Forest Val Scores",
                         line=dict(color="goldenrod",width=4)))

fig.add_trace(go.Scatter(x=np.arange(0, len(testscores), 1),
                         y=testscores,mode="lines+markers",
                         marker = dict(size=[10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]),
                         name="Isolation Forest Test Scores",
                         line=dict(color="indianred",width=4)))

fig.add_trace(go.Scatter(x=np.arange(0, len(atrainsscores), 1),
                         y=atrainsscores,mode="lines+markers",
                         marker = dict(size=[10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]),
                         name="AdaBoost Train Scores",
                         line=dict(color="lime",width=4)))

fig.add_trace(go.Scatter(x=np.arange(0, len(avalscores), 1),
                         y=avalscores,mode="lines+markers",
                         marker = dict(size=[10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]),
                         name="AdaBoost Val Scores",
                         line=dict(color="gold",width=4)))

fig.add_trace(go.Scatter(x=np.arange(0, len(atestscores), 1),
                         y=atestscores,mode="lines+markers",
                         marker = dict(size=[10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]),
                         name="AdaBoost Test Scores",
                         line=dict(color="tomato",width=4)))

fig.update_layout(title="Isolation Forest vs. AdaBoost",
                  xaxis=dict(tickvals=np.arange(0, len(trainsscores), 1)),
                  xaxis_range = [0,23],
                  yaxis_range=[0, 0.90],
                  xaxis_title="Feature Dropped",
                  yaxis_title="Score",
                  width = 700,
                  margin=dict(l=0, r=0, b=50, t=50, pad=0),
                  showlegend=True)

fig.show()

In [None]:
# Run the data against the adaboost to extract feature importances
a = AdaBoostClassifier(random_state=42)
a.fit(X_train_encoded, y_train)
a_train_predict = a.predict(X_train_encoded)
a_val_predict = a.predict(X_val_encoded)
a_test_predict = a.predict(X_test_encoded)
print(f"Recall Score Train: {recall_score(y_train,a_train_predict)}")
print(f"Recall Score Validation: {recall_score(y_val,a_val_predict)}")
print(f"Recall Score Test: {recall_score(y_test,a_test_predict)}")





Recall Score Train: 0.5200836820083682
Recall Score Validation: 0.5065913370998116
Recall Score Test: 0.13857016769638128


In [None]:
# Extract feature importances, sort, and remove 0 value features
f = a.feature_importances_
features = []
values = []
for i,v in enumerate(f):
	features.append(X_train_encoded.columns[i])
	values.append(v)

ll = pd.DataFrame.from_dict({"Features":features,"Values":values})
display(ll.sort_values(by="Values",ascending=False))

ll = pd.DataFrame(ll[ll.Values != 0]).sort_values(by="Values",ascending=False)
ll

Unnamed: 0,Features,Values
2,amt,0.34
9,ft_trans_hour,0.12
1,category,0.12
4,city,0.1
0,merchant,0.08
20,ft_amt_deviation,0.06
10,ft_time_of_day,0.06
18,ft_merchant_popularity,0.04
19,ft_mean_amt_per_user,0.02
12,ft_trans_day_of_year,0.02


Unnamed: 0,Features,Values
2,amt,0.34
1,category,0.12
9,ft_trans_hour,0.12
4,city,0.1
0,merchant,0.08
10,ft_time_of_day,0.06
20,ft_amt_deviation,0.06
18,ft_merchant_popularity,0.04
6,city_pop,0.02
7,job,0.02


In [None]:
# Calculating scores for isolation forest based on the top features from the
# adaboost feature importance list 
columns_to_keep_adaboostomptimized = list(ll["Features"])

iforesttrainscores = []
iforestvalscores = []
iforesttestscores = []
c = columns_to_keep_adaboostomptimized

for j in range(0,len(columns_to_keep_adaboostomptimized)):

    c = columns_to_keep_adaboostomptimized[0:len(columns_to_keep_adaboostomptimized)-j]

    iX_train_encoded = X_train_encoded.drop(columns=[col for col in X_train_encoded.columns if col not in c], errors='ignore')


    iX_val_encoded = X_val_encoded.drop(columns=[col for col in X_val_encoded.columns if col not in c], errors='ignore')


    iX_test_encoded = X_test_encoded.drop(columns=[col for col in X_test_encoded.columns if col not in c], errors='ignore')


    print(list(c))

    i = IsolationForest(random_state=42)
    i.fit(iX_train_encoded, y_train)
    i_train_predict = i.predict(iX_train_encoded)
    i_val_predict = i.predict(iX_val_encoded)
    i_test_predict = i.predict(iX_test_encoded)
    for k in range(0,len(i_train_predict)):
        if i_train_predict[k] == -1:
            i_train_predict[k] = 1
        else:
            i_train_predict[k] = 0

    for k in range(0,len(i_val_predict)):
        if i_val_predict[k] == -1:
            i_val_predict[k] = 1
        else:
            i_val_predict[k] = 0

    for k in range(0,len(i_test_predict)):
        if i_test_predict[k] == -1:
            i_test_predict[k] = 1
        else:
            i_test_predict[k] = 0

    print(f"Recall Score Train: {recall_score(y_train,i_train_predict)}")
    print(f"Recall Score Validation: {recall_score(y_val,i_val_predict)}")
    print(f"Recall Score Test: {recall_score(y_test,i_test_predict)}")
                
    iforesttrainscores.append(recall_score(y_train,i_train_predict))
    iforestvalscores.append(recall_score(y_val,i_val_predict))
    iforesttestscores.append(recall_score(y_test,i_test_predict))
    print("")

['amt', 'category', 'ft_trans_hour', 'city', 'merchant', 'ft_time_of_day', 'ft_amt_deviation', 'ft_merchant_popularity', 'city_pop', 'job', 'ft_trans_day_of_year', 'ft_mean_amt_per_user']
Recall Score Train: 0.8027196652719665
Recall Score Validation: 0.7947269303201506
Recall Score Test: 0.7572815533980582

['amt', 'category', 'ft_trans_hour', 'city', 'merchant', 'ft_time_of_day', 'ft_amt_deviation', 'ft_merchant_popularity', 'city_pop', 'job', 'ft_trans_day_of_year']
Recall Score Train: 0.8171548117154812
Recall Score Validation: 0.8072818581293157
Recall Score Test: 0.7634598411297441

['amt', 'category', 'ft_trans_hour', 'city', 'merchant', 'ft_time_of_day', 'ft_amt_deviation', 'ft_merchant_popularity', 'city_pop', 'job']
Recall Score Train: 0.8071129707112971
Recall Score Validation: 0.7903327055869429
Recall Score Test: 0.7546337157987644

['amt', 'category', 'ft_trans_hour', 'city', 'merchant', 'ft_time_of_day', 'ft_amt_deviation', 'ft_merchant_popularity', 'city_pop']
Recall Sc

In [None]:
# Create plot
fig = go.Figure()

fig.add_trace(go.Scatter(x=np.arange(0, len(iforesttrainscores), 1),
                         y=iforesttrainscores,mode="lines+markers",
                         marker = dict(size=[10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]),
                         name="Train Scores",
                         line=dict(color="forestgreen",width=4)))

fig.add_trace(go.Scatter(x=np.arange(0, len(iforestvalscores), 1),
                         y=iforestvalscores,mode="lines+markers",
                         marker = dict(size=[10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]),
                         name="Val Scores",
                         line=dict(color="goldenrod",width=4)))

fig.add_trace(go.Scatter(x=np.arange(0, len(iforesttestscores), 1),
                         y=iforesttestscores,mode="lines+markers",
                         marker = dict(size=[10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]),
                         name="Test Scores",
                         line=dict(color="indianred",width=4)))

fig.update_layout(title="Isolation Forest",
                  xaxis=dict(tickvals=np.arange(0, len(iforesttrainscores), 1)),
                  xaxis_range = [0,11],
                  yaxis_range=[0.70, 0.90],
                  xaxis_title="Feature Dropped",
                  yaxis_title="Score",
                  width = 700,
                  margin=dict(l=0, r=0, b=50, t=50, pad=0),
                  showlegend=True)

fig.show()

In [None]:
# Calculate the scores for isolation forest, this time keeping the feature
# that caused the major dip above. ft_time_of_day
f = a.feature_importances_
features = []
values = []
for i,v in enumerate(f):
	features.append(X_train_encoded.columns[i])
	values.append(v)

ll = pd.DataFrame.from_dict({"Features":features,"Values":values})
display(ll.sort_values(by="Values",ascending=False))

ll = pd.DataFrame(ll[ll.Values != 0]).sort_values(by="Values",ascending=False)
ll

columns_to_keep_adaboostomptimized = list(ll["Features"])

iforesttrainscores = []
iforestvalscores = []
iforesttestscores = []
c = columns_to_keep_adaboostomptimized

for j in range(0,len(columns_to_keep_adaboostomptimized)):
    c = columns_to_keep_adaboostomptimized[0:len(columns_to_keep_adaboostomptimized)-j]
    if j > 6:
        c.append(columns_to_keep_adaboostomptimized[(5)])

    iX_train_encoded = X_train_encoded.drop(columns=[col for col in X_train_encoded.columns if col not in c], errors='ignore')

    iX_val_encoded = X_val_encoded.drop(columns=[col for col in X_val_encoded.columns if col not in c], errors='ignore')

    iX_test_encoded = X_test_encoded.drop(columns=[col for col in X_test_encoded.columns if col not in c], errors='ignore')
    print(c)
    i = IsolationForest(random_state=42)
    i.fit(iX_train_encoded, y_train)
    i_train_predict = i.predict(iX_train_encoded)
    i_val_predict = i.predict(iX_val_encoded)
    i_test_predict = i.predict(iX_test_encoded)
    for k in range(0,len(i_train_predict)):
        if i_train_predict[k] == -1:
            i_train_predict[k] = 1
        else:
            i_train_predict[k] = 0

    for k in range(0,len(i_val_predict)):
        if i_val_predict[k] == -1:
            i_val_predict[k] = 1
        else:
            i_val_predict[k] = 0

    for k in range(0,len(i_test_predict)):
        if i_test_predict[k] == -1:
            i_test_predict[k] = 1
        else:
            i_test_predict[k] = 0
    
    print(f"Recall Score Train: {recall_score(y_train,i_train_predict)}")
    print(f"Recall Score Validation: {recall_score(y_val,i_val_predict)}")
    print(f"Recall Score Test: {recall_score(y_test,i_test_predict)}")
    print("")
                
    iforesttrainscores.append(recall_score(y_train,i_train_predict))
    iforestvalscores.append(recall_score(y_val,i_val_predict))
    iforesttestscores.append(recall_score(y_test,i_test_predict))


Unnamed: 0,Features,Values
2,amt,0.34
9,ft_trans_hour,0.12
1,category,0.12
4,city,0.1
0,merchant,0.08
20,ft_amt_deviation,0.06
10,ft_time_of_day,0.06
18,ft_merchant_popularity,0.04
19,ft_mean_amt_per_user,0.02
12,ft_trans_day_of_year,0.02


['amt', 'category', 'ft_trans_hour', 'city', 'merchant', 'ft_time_of_day', 'ft_amt_deviation', 'ft_merchant_popularity', 'city_pop', 'job', 'ft_trans_day_of_year', 'ft_mean_amt_per_user']
0.8027196652719665
0.7947269303201506
0.7572815533980582

['amt', 'category', 'ft_trans_hour', 'city', 'merchant', 'ft_time_of_day', 'ft_amt_deviation', 'ft_merchant_popularity', 'city_pop', 'job', 'ft_trans_day_of_year']
0.8171548117154812
0.8072818581293157
0.7634598411297441

['amt', 'category', 'ft_trans_hour', 'city', 'merchant', 'ft_time_of_day', 'ft_amt_deviation', 'ft_merchant_popularity', 'city_pop', 'job']
0.8071129707112971
0.7903327055869429
0.7546337157987644

['amt', 'category', 'ft_trans_hour', 'city', 'merchant', 'ft_time_of_day', 'ft_amt_deviation', 'ft_merchant_popularity', 'city_pop']
0.8213389121338912
0.8135593220338984
0.7811120917917035

['amt', 'category', 'ft_trans_hour', 'city', 'merchant', 'ft_time_of_day', 'ft_amt_deviation', 'ft_merchant_popularity']
0.8213389121338912
0.8

In [None]:
#Create plot
fig = go.Figure()

fig.add_trace(go.Scatter(x=np.arange(0, len(iforesttrainscores), 1),
                         y=iforesttrainscores,mode="lines+markers",
                         marker = dict(size=[10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]),
                         name="Train Scores",
                         line=dict(color="forestgreen",width=4)))

fig.add_trace(go.Scatter(x=np.arange(0, len(iforestvalscores), 1),
                         y=iforestvalscores,mode="lines+markers",
                         marker = dict(size=[10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]),
                         name="Val Scores",
                         line=dict(color="goldenrod",width=4)))

fig.add_trace(go.Scatter(x=np.arange(0, len(iforesttestscores), 1),
                         y=iforesttestscores,mode="lines+markers",
                         marker = dict(size=[10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]),
                         name="Test Scores",
                         line=dict(color="indianred",width=4)))

fig.update_layout(title="Isolation Forest",
                  xaxis=dict(tickvals=np.arange(0, len(iforesttrainscores), 1)),
                  xaxis_range = [0,11],
                  yaxis_range=[0.75, 0.90],
                  xaxis_title="Feature Dropped",
                  yaxis_title="Score",
                  width = 700,
                  margin=dict(l=0, r=0, b=50, t=50, pad=0),
                  showlegend=True)

fig.show()

In [27]:
#Hyperparameter tuning based on the highest performing columns.
#Code was run in chunks for n_estimators ranging from 10 to 10000
#max_samples ranging from 10 to 100000
#max_features ranging from 0.01 to 1.0
#the code below is just the last one ran
best_yeild_columns = ['amt', 'category', 'ft_trans_hour', 'ft_time_of_day']

X_train_encoded_final = X_train_encoded.drop(columns=[col for col in X_train_encoded.columns if col not in best_yeild_columns], errors='ignore')

param_grid = {
    "n_estimators" : [10,100,1000, 10000, 'auto'],
    "max_samples": [10, 100, 500, 5000],
    "max_features": [0.01],
    "random_state" : [42]
}

grid_search = GridSearchCV(
    estimator=IsolationForest(),
    param_grid=param_grid,
    scoring="recall",
    cv=2,   # 3-fold cross-validation
    verbose=3,
    n_jobs=-1  # Use all available cores
)


grid_search.fit(X_train_encoded_final,y_train)
print(grid_search.best_params_)

Fitting 2 folds for each of 20 candidates, totalling 40 fits




8 fits failed out of a total of 40.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
8 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\simra\anaconda3\envs\dev\lib\site-packages\sklearn\model_selection\_validation.py", line 890, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\simra\anaconda3\envs\dev\lib\site-packages\sklearn\base.py", line 1344, in wrapper
    estimator._validate_params()
  File "c:\Users\simra\anaconda3\envs\dev\lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\simra\anaconda3\envs\dev\lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_co

{'max_features': 0.01, 'max_samples': 10, 'n_estimators': 10, 'random_state': 42}


In [33]:
# test different hyperparameter values 
c = ['amt', 'category', 'ft_trans_hour', 'ft_time_of_day']
display(c)

ifinal_train_encoded = X_train_encoded.drop(columns=[col for col in X_train_encoded.columns if col not in c], errors='ignore')

ifinal_val_encoded = X_val_encoded.drop(columns=[col for col in X_val_encoded.columns if col not in c], errors='ignore')

ifinal_test_encoded = X_test_encoded.drop(columns=[col for col in X_test_encoded.columns if col not in c], errors='ignore')

i5 = IsolationForest(max_features=0.01, max_samples=10, n_estimators=1000, random_state=42)
i5.fit(ifinal_train_encoded, y_train)
i5_train_predict = i5.predict(ifinal_train_encoded)
i5_val_predict = i5.predict(ifinal_val_encoded)
i5_test_predict = i5.predict(ifinal_test_encoded)
for k in range(0,len(i5_train_predict)):
    if i5_train_predict[k] == -1:
        i5_train_predict[k] = 1
    else:
        i5_train_predict[k] = 0

for k in range(0,len(i5_val_predict)):
    if i5_val_predict[k] == -1:
        i5_val_predict[k] = 1
    else:
        i5_val_predict[k] = 0

for k in range(0,len(i5_test_predict)):
    if i5_test_predict[k] == -1:
        i5_test_predict[k] = 1
    else:
        i5_test_predict[k] = 0
            
i5foresttrainscores = (recall_score(y_train,i5_train_predict))
i5forestvalscores = (recall_score(y_val,i5_val_predict))
i5foresttestscores = (recall_score(y_test,i5_test_predict))

print(f"Recall Score Train: {i5foresttrainscores}")
print(f"Recall Score Validation: {i5forestvalscores}")
print(f"Recall Score Test: {i5foresttestscores}")

# 0.9893305439330544
# 0.9868173258003766
# 0.9885260370697264

['amt', 'category', 'ft_trans_hour', 'ft_time_of_day']

Recall Score Train: 0.9922594142259414
Recall Score Validation: 0.9899560577526679
Recall Score Test: 0.9894086496028244


In [None]:
#Create plot for scores for all team models
import plotly.graph_objects as go

models = ["XGBoost", "Stacking Classifier", "Logistic Regression", "Random Forest", "Keras", "CatBoost", "LightGBM", "IsolationForest", "AdaBoost"]

plot = go.Figure(data=[
    go.Bar(
    name = "Train Scores",
    x = models,
    y = [0.947, 0.9774123242, 0.2893, 0.0071, 1, 0.978, 0.818, 0.8345188285, 0.520083682],
    marker_color = 'forestgreen'
), go.Bar(
    name = "Validation Scores",
    x = models,
    y = [0.833, 0.8362989324, 0.262, 0.0063, 1, 0.838, 0.729, 0.8198367859, 0.5065913371],
    marker_color = 'goldenrod'
)]).update_layout(xaxis_title = "Models", yaxis_title = "Score", title = "Model Scores for Train & Validation Data")



plot.show()


In [None]:
# Create plot for scores of all models for feature tuned and hyperparameter tuned
import plotly.graph_objects as go

models = ["XGBoost Feature Tuned<br>(Adaboost for feature<br>importance)",
          "XGBoost Feature Tuned<br>(XGBoost for feature<br>importance) + HyperTuned",
          "Stacking Classifier<br>Feature Tuned<br>(XGBoost for<br>feature importance)",
          "CatBoost Feature Tuned<br>(XGBoost for<br>feature importance)",
          "LightGBM Feature Tuned<br>(XGBoost for<br>feature importance)",
          "IsolationForest Feature Tuned<br>(XGBoost for<br>feature importance)",
          "IsolationForest Feature Tuned<br>(Adaboost for<br>feature importance)",
          ]

plot = go.Figure(data=[
    go.Bar(
    name = "Train Scores",
    x = models,
    y = [0.707, 0.978, 0.9282742223, 0.974, 0.838, 0.8564853556, 0.8914225941422594],
    marker_color = 'forestgreen'
), go.Bar(
    name = "Validation Scores",
    x = models,
    y = [0.689, 0.875, 0.7959667853, 0.845, 0.77, 0.8468298807, 0.8857501569365976],
    marker_color = 'goldenrod'
)]).update_layout(xaxis_title = "Models", yaxis_title = "Score", title = "Optimized Model Scores for Train & Validation Data", width=1735)



plot.show()

In [None]:
# Create plot for test score of all models for feature tuned and hyperparameter tuned
import plotly.graph_objects as go

models = ["XGBoost Feature Tuned<br>(Adaboost for feature<br>importance)",
          "XGBoost Feature Tuned<br>(XGBoost for feature<br>importance) + HyperTuned",
          "Stacking Classifier<br>Feature Tuned<br>(XGBoost for<br>feature importance)",
          "CatBoost Feature Tuned<br>(XGBoost for<br>feature importance)",
          "LightGBM Feature Tuned<br>(XGBoost for<br>feature importance)",
          "IsolationForest Feature Tuned<br>(XGBoost for<br>feature importance)",
          "IsolationForest Feature Tuned<br>(Adaboost for<br>feature importance)"
          ]

plot = go.Figure(data=[
    go.Bar(
    name = "Train Scores",
    x = models,
    y = [0.707, 0.978, 0.9282742223, 0.974, 0.838, 0.8564853556, 0.8914225941422594],
    marker_color = 'forestgreen'
), go.Bar(
    name = "Validation Scores",
    x = models,
    y = [0.689, 0.875, 0.7959667853, 0.845, 0.77, 0.8468298807, 0.8857501569365976],
    marker_color = 'goldenrod'
), go.Bar(
    name = "Test Scores",
    x = models,
    y = [0.69, 0.18, 0.71, 0.499, 0.312, 0.8005295675, 0.8923212709620476],
    marker_color = 'indianred'
)]).update_layout(xaxis_title = "Models", yaxis_title = "Score", title = "Optimized Model Score for Test Data", width=1735)



plot.show()