# In [None]:
# Importing essential libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
#from sklearn.model_selection import train_test_split, GridSearchCV
#from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import warnings
warnings.filterwarnings('ignore')

# Load the raw transaction data.
# NOTE: append .head(5000) to the read_csv call to work on a smaller sample
# while testing (prevents crashing on the full file).
file_path = 'credit_card_transactions.csv'
df = pd.read_csv(file_path)

# Initial data overview.
print("initial overview:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()
#print("\nstats summarized:")
#print(df.describe())

#luhn Check for CC #s
#!pip install luhncheck
from luhncheck import is_luhn

#validate CC #s with luhn algo.
# The result of is_luhn() is tested for truthiness. Comparing it to the
# string 'False' (as an earlier version did) is never true for a boolean
# result, so invalid numbers were silently never counted.
check_counter = len(df['cc_num'])  # number of values run through the check
error_counter = sum(1 for cc in df['cc_num'] if not is_luhn(str(cc)))

print("Total credit card numbers checked:", df['cc_num'].count())
print("Errors found:", error_counter)
# Drop columns that are not needed for the rest of the analysis, reporting the
# DataFrame's deep memory footprint (in MB) after each step.
drop_steps = [
    # cc_num: the Luhn check found no errors, so the column can be removed.
    ('cc_num', "\nMemory after dropping cc_num:"),
    # trans_num: the unnamed leading column (the row number) is used as the
    # unique transaction ID instead, which saves a lot of memory.
    ('trans_num', "\nMemory after dropping trans_num:"),
    # names and street addresses add no real value alongside the other data.
    (['first', 'last', 'street'], "\nMemory after dropping name/street:"),
]
for columns_to_drop, memory_label in drop_steps:
    df.drop(columns=columns_to_drop, inplace=True)
    print(memory_label, df.memory_usage(deep=True).sum() / (1024 * 1024), "MB")

# Report per-column null counts before touching anything.
print("\nMissing Values BEFORE Imputation:\n", df.isna().sum())
# Fill the gaps in 'merch_zipcode' with the column's most frequent value.
# TODO: revisit -- this imputation step is still considered incomplete.
imputer_zip = SimpleImputer(strategy='most_frequent')
imputer_zip.fit(df[['merch_zipcode']])
df['merch_zipcode'] = imputer_zip.transform(df[['merch_zipcode']])
#print("\nMemory after imputing merch zip:", df.memory_usage(deep=True).sum() / (1024 * 1024), "MB")
# Same report again to confirm the column no longer has missing values.
print("\nMissing Values After Imputation:\n", df.isna().sum())

#function backing the not_local flag: does a larger customer-to-merchant distance mean more fraud? --ie Fraud while Traveling
def calculate_distance(row):
    """Return the great-circle distance in kilometres between customer and merchant.

    Uses the haversine formula on the ``lat``/``long`` (customer) and
    ``merch_lat``/``merch_long`` (merchant) fields, which are read as decimal
    degrees. A plain Euclidean distance on raw degrees is not a physical
    distance (one degree of longitude shrinks with latitude), so it cannot be
    compared meaningfully against a fixed threshold.

    Args:
        row: Any mapping-like object supporting ``row['lat']`` etc. -- a
            DataFrame row, a dict, or a whole DataFrame (the arithmetic is
            element-wise, so column inputs yield a column of distances).

    Returns:
        Distance in kilometres (scalar for scalar inputs, array-like for
        column inputs). NaN coordinates propagate to a NaN distance.
    """
    earth_radius_km = 6371.0  # mean Earth radius

    cust_lat = np.radians(row['lat'])
    merch_lat = np.radians(row['merch_lat'])
    delta_lat = merch_lat - cust_lat
    delta_long = np.radians(row['merch_long']) - np.radians(row['long'])

    haversine = (
        np.sin(delta_lat / 2) ** 2
        + np.cos(cust_lat) * np.cos(merch_lat) * np.sin(delta_long / 2) ** 2
    )
    # Clip guards against floating-point drift pushing the value just outside
    # [0, 1], which would make the square root / arcsin return NaN.
    return 2 * earth_radius_km * np.arcsin(np.sqrt(np.clip(haversine, 0.0, 1.0)))

# calculate_distance only performs element-wise column arithmetic, so it is
# called once on the whole frame instead of row by row via df.apply(axis=1),
# which runs a Python-level call per row and is very slow on a large frame.
# not_local is 1 when the distance exceeds 50 (in calculate_distance's units),
# otherwise 0; a NaN distance compares False and therefore yields 0.
df['not_local'] = (calculate_distance(df) > 50).astype(int)

#data vis with plotly
# Plot a random sample to reduce rendering time. The sample size is capped at
# the number of rows because df.sample raises ValueError when n exceeds the
# population (sampling without replacement), e.g. when df is a small test subset.
sample_size = min(10000, len(df))
df_sample = df.sample(n=sample_size, random_state=42)
fig = px.scatter_geo(df_sample,
                     lat='lat',
                     lon='long',
                     hover_name="merchant",
                     color="is_fraud",
                     title="Transaction Locations - Sample View")
fig.show()