In [None]:
import numpy as np
import pandas as pd

In [None]:
import os

# Location of the transactions CSV.
# Set the FRAUD_DATA_CSV environment variable to point at the file when running
# on another machine; when it is unset, the original local path is used so the
# notebook behaves exactly as before.
DATA_PATH = os.environ.get(
    "FRAUD_DATA_CSV",
    r"C:\Users\23059\OneDrive\Desktop\Amiira\Y3S1\fyp\FraudDetectionData.csv",
)

# Load the raw data into the frame every later cell works on.
df = pd.read_csv(DATA_PATH)

### Feature encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

# Turn the 'type' column into integer codes.
le = LabelEncoder()

# pop() removes 'type' from df and hands its values to the encoder,
# which fits on them and returns the encoded array in one step.
label = le.fit_transform(df.pop('type'))

# Put the encoded values back under the original name.
# Because the column was removed first, it is re-added as the last column of df.
df["type"] = label

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode 'nameDest' with a fresh encoder (this rebinds `le` and `label`).
le = LabelEncoder()

# Take the column out of df, encode it, then append the codes under the same
# name, so 'nameDest' ends up as the last column of df.
label = le.fit_transform(df.pop('nameDest'))
df["nameDest"] = label

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode 'nameOrig' with a fresh encoder (this rebinds `le` and `label`).
le = LabelEncoder()

# Take the column out of df, encode it, then append the codes under the same
# name, so 'nameOrig' ends up as the last column of df.
label = le.fit_transform(df.pop('nameOrig'))
df["nameOrig"] = label

### Stratified train-test split

In [None]:
from sklearn.model_selection import train_test_split

# Target is the 'isFraud' column; the features are every other column of df.
y = df['isFraud']
X = df.drop(columns='isFraud')

# Class proportions in the full data set, for comparison with the splits below.
print(y.value_counts(normalize=True))

# 80/20 split, stratified on the target, with a fixed random_state so the
# partition is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=18,
)

# Class proportions inside the training part, then inside the test part.
for y_part in (y_train, y_test):
    print(y_part.value_counts(normalize=True))

In [None]:
# Report the dimensions of the feature matrix and target vector of each split.
for caption, X_part, y_part in (
    ("Train set shape (X_train, y_train):", X_train, y_train),
    ("Test set shape (X_test, y_test):", X_test, y_test),
):
    print(caption, X_part.shape, y_part.shape)

## Handle outliers in train set

In [None]:
import random
import numpy as np
from scipy import stats

# Seed the generators used for resampling.
# The bootstrap below draws with NumPy, so seeding only Python's `random`
# module (as this cell did before) left the results different on every run.
# `random.seed` is retained so any later use of `random` stays seeded as before.
random.seed(11)
rng = np.random.default_rng(11)

# Columns whose upper tail is treated as outliers
cols_with_outliers = ['amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']

# Number of bootstrapped samples to draw per column
num_samples = 50

# Proportion of the upper tail to trim for each column
trim_props = {'amount': 0.12, 'oldbalanceOrg': 0.24, 'newbalanceOrig': 0.245, 'oldbalanceDest': 0.29, 'newbalanceDest': 0.3}

# Column name -> average of the bootstrapped right-trimmed means
train_trimmed_means = {}

# Work on an independent copy so the in-place replacement below cannot touch
# (or raise a SettingWithCopyWarning about) the frame X_train was split from.
X_train = X_train.copy()

for col_name in cols_with_outliers:

    trim_prop = trim_props[col_name]

    # Nothing to trim for this column: leave it untouched
    if trim_prop == 0:
        continue

    # Percentile above which a value counts as an outlier
    upper_pct = 100 * (1 - trim_prop)

    # Resample the raw values directly; this is the same bootstrap as drawing
    # row labels with replacement, without the cost of a label lookup per draw.
    col_values = X_train[col_name].to_numpy()

    # One right-trimmed mean per bootstrap sample. The samples themselves are
    # not kept: they were never read, and holding num_samples full-size
    # resamples per column only consumed memory.
    train_trimmed_means_list = []
    for _ in range(num_samples):
        train_sample = rng.choice(col_values, size=len(col_values), replace=True)

        # Right-trimmed mean: average of the values at or below the cut-off
        cutoff = np.percentile(train_sample, upper_pct)
        train_trimmed_means_list.append(train_sample[train_sample <= cutoff].mean())

    # Average the per-sample trimmed means to get the replacement value
    train_trimmed_means[col_name] = np.mean(train_trimmed_means_list)

    # Replace every training value above the column's own cut-off percentile.
    # The mask is built before the assignment, so it reflects the original values.
    outlier_mask = X_train[col_name] > np.percentile(col_values, upper_pct)
    X_train.loc[outlier_mask, col_name] = train_trimmed_means[col_name]

# Show the replacement value used for each column of the training set
print("Train set trimmed means: ", train_trimmed_means)

In [None]:
from scipy.stats import skew

# Print the skewness of each outlier-treated column of the training set,
# one value per line, in the same column order as before.
for skew_col in ('amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest'):
    skewness = skew(X_train[skew_col])
    print(skewness)

### Feature Selection

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Forest of 150 trees; random_state is fixed so repeated runs build the same model.
rfc = RandomForestClassifier(
    n_estimators=150,
    random_state=18,
)

# Fit on the training split; as the last expression, the fitted estimator is
# also shown as the cell output.
rfc.fit(X_train, y_train)

## Random Forest Impurity based feature importance

In [None]:
import matplotlib.pyplot as plt

# Impurity-based importance of each feature, in the column order the forest
# was fitted on.
importance_vals = rfc.feature_importances_
feature_names = np.asarray(X_train.columns)
n_features = len(importance_vals)

# Feature positions ordered from most to least important.
# The sort has to run on the array itself and then be reversed: sorting a
# reversed copy yields positions that no longer line up with importance_vals
# or with the column names, which put the wrong labels under the bars.
indices = np.argsort(importance_vals)[::-1]

# Plot the feature importance of the forest; bars and tick labels use the same
# ordering so each label names the bar above it.
plt.figure()
plt.title("Random Forest Impurity based feature importance")
plt.bar(range(n_features), importance_vals[indices])

plt.xticks(range(n_features), feature_names[indices], rotation=90)
plt.xlim([-1, n_features])
# Keep the original 0.2 ceiling, but grow it when a bar would otherwise be clipped.
plt.ylim([0, max(0.2, 1.05 * importance_vals.max())])
plt.tight_layout()
plt.show()

In [None]:
# Store the training column names on the fitted forest as a plain Python list.
# NOTE(review): scikit-learn normally records the feature names itself when it
# is fitted on a DataFrame - confirm this manual override is still needed.
rfc.feature_names_in_ = X_train.columns.tolist()

## Random Forest feature importance via permutation importance

In [None]:
# from mlxtend.evaluate import feature_importance
# from mlxtend.evaluate import feature_importance_permutation

# imp_vals, imp_all = feature_importance_permutation(
#     predict_method=rfc.predict,
#     X=X_test.values,
#     y=y_test.values,
#     metric='accuracy',
#     num_rounds=50,
#     seed=18
# )

In [None]:
# std = np.std(imp_all, axis=1)
# indices = np.argsort(imp_vals) [::-1]
# plt. figure()
# plt.title("Random Forest feature importance via permutation importance")
# plt.bar(range(X_train.shape[1]), imp_vals[indices], yerr=std[indices])

# plt.xticks(range(X_train.shape[1]), df.columns[1:] [indices], rotation=90)
# plt.xlim([-1, X_train.shape[1]])
# plt.ylim([0, 0.2])

# plt.tight_layout()
# plt. show()

In [None]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.datasets import load_breast_cancer
# from mlxtend.evaluate import feature_importance_permutation
# import matplotlib.pyplot as plt


# # Fit a random forest classifier to the training data
# rfc=RandomForestClassifier (n_estimators=150,random_state=18)
# rfc.fit(X_train,y_train)

# # Calculate feature importance using permutation feature importance
# imp_vals, imp_all = feature_importance_permutation(
#     predict_method=rfc.predict,
#     X=X_test.values,
#     y=y_test,
#     metric='accuracy',
#     num_rounds=50,
#     seed=18
# )

# # Get the feature names and sort the importance values
# feat_names = X_test.columns
# indices = np.argsort(imp_vals)[::-1]

# # Plot the feature importance using a bar chart
# plt.figure(figsize=(10,8))
# plt.title("Random Forest Permutation Feature Importance")
# plt.bar(range(X_test.shape[1]), imp_vals[indices])
# plt.xticks(range(X_test.shape[1]), feat_names[indices], rotation=90)
# plt.xlim([-1, X_test.shape[1]])
# plt.ylim([0, 0.2])
# plt.tight_layout()
# plt.show()
