In [1]:
# Pull in data and save it to a DataFrame

import pandas as pd

# Replace this with the path to your CSV file
file_path = '/Users/christopherfrye/Downloads/fraud_detection.csv'

# Load the CSV file into a pandas DataFrame
df = pd.read_csv(file_path)

In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import numpy as np
from IPython.display import display

# 1. Clean the action sequence text if not already cleaned
df['actions_clean'] = df['actions_str'].apply(lambda x: " ".join(x.split()) if isinstance(x, str) else "")

# 2. Generate TF-IDF matrix using 1–3 word n-grams
vectorizer = TfidfVectorizer(ngram_range=(1, 3))
X_tfidf = vectorizer.fit_transform(df['actions_clean'])

# 3. Target variable
y = df['is_fraud']

# 4. Train/test split
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42, stratify=y)

# 5. Train decision tree model
clf = DecisionTreeClassifier(max_depth=10, min_samples_leaf=10, random_state=42)
clf.fit(X_train, y_train)

# 6. Evaluate performance
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred, output_dict=True)
report_df = pd.DataFrame(report).transpose()

# 7. Extract most predictive action patterns
feature_importances = clf.feature_importances_
feature_names = vectorizer.get_feature_names_out()
top_indices = np.argsort(feature_importances)[::-1][:10]

top_features_df = pd.DataFrame({
    "N-gram": feature_names[top_indices],
    "Importance": feature_importances[top_indices]
})

# 8. Show results
display(top_features_df)
display(report_df)

Unnamed: 0,N-gram,Importance
0,setmobile administrative setmobile,0.299646
1,api accounts accounts_full,0.213082
2,getposinfo p2b getposinfo,0.118443
3,managealert alerts,0.116202
4,verifycode,0.079941
5,cloneuser cards statement,0.069231
6,api,0.027123
7,accounts_full api accounts,0.015141
8,askmember corporatemanagement cloneuser,0.013761
9,getemployers customer getibankusername,0.010073


Unnamed: 0,precision,recall,f1-score,support
0,0.999046,0.999714,0.99938,20949.0
1,0.0,0.0,0.0,20.0
accuracy,0.99876,0.99876,0.99876,0.99876
macro avg,0.499523,0.499857,0.49969,20969.0
weighted avg,0.998093,0.99876,0.998426,20969.0
