In [1]:
# Pull in data and save it to a DataFrame

import os

import pandas as pd

# Location of the fraud-detection CSV. The default is the original local path;
# set the FRAUD_DETECTION_CSV environment variable to point the notebook at a
# copy stored elsewhere instead of editing this cell.
file_path = os.environ.get(
    'FRAUD_DETECTION_CSV',
    '/Users/christopherfrye/Downloads/fraud_detection.csv',
)

# Load the CSV file into a pandas DataFrame
df = pd.read_csv(file_path)

In [4]:
import ast

# Step 1: Parse the 'actions_str' field as a list
def _split_actions(value):
    """Split a whitespace-delimited string into a list of tokens; return any non-string value unchanged."""
    if not isinstance(value, str):
        # Already-parsed lists and non-string cells pass through untouched,
        # which keeps this cell safe to re-run.
        return value
    return value.split()

df['actions_str'] = df['actions_str'].apply(_split_actions)

# Step 2: Define n-gram extraction
from collections import Counter

def extract_ngrams(action_list, n):
    """Return every run of ``n`` consecutive items of ``action_list`` as a tuple.

    Args:
        action_list: A sliceable sequence of actions.
        n: Number of consecutive items in each n-gram.

    Returns:
        A list of ``n``-length tuples in order of appearance. The list is
        empty when the sequence has fewer than ``n`` items or ``n`` is not
        positive.
    """
    # Build one view of the sequence per position inside the n-gram, each
    # shifted one step further than the previous one.
    shifted_views = []
    for offset in range(n):
        shifted_views.append(action_list[offset:])
    # zip stops at the shortest view, so only complete n-grams are produced.
    return [gram for gram in zip(*shifted_views)]

def get_top_ngrams(dataframe, n=2, top_n=10):
    """Count n-grams over every row's action sequence and return the most frequent.

    Args:
        dataframe: DataFrame with an 'actions_str' column holding one action
            sequence per row, either as a sequence of actions or as a
            whitespace-delimited string.
        n: Number of consecutive actions per n-gram.
        top_n: How many of the most frequent n-grams to return.

    Returns:
        A list of ``(ngram_tuple, count)`` pairs as produced by
        ``Counter.most_common(top_n)``.
    """
    ngram_counts = Counter()
    for actions in dataframe['actions_str']:
        if isinstance(actions, str):
            # A raw string would otherwise be sliced character by character;
            # tokenise it on whitespace so n-grams are built from actions.
            actions = actions.split()
        elif not hasattr(actions, '__getitem__'):
            # Values that cannot be sliced (e.g. None, or the float NaN pandas
            # uses for an empty cell) carry no actions, so skip the row
            # instead of raising TypeError.
            continue
        # Update the counter row by row rather than first materialising one
        # list with every n-gram of the whole frame.
        ngram_counts.update(extract_ngrams(actions, n))
    return ngram_counts.most_common(top_n)

# Step 3: Split into fraud and legit datasets
is_fraud_flag = df['is_fraud']
fraud_df = df.loc[is_fraud_flag == 1]
legit_df = df.loc[is_fraud_flag == 0]

# Step 4: Get top bigrams and trigrams
# Each line evaluates the legit subset first, then the fraud subset.
top_legit_bigrams, top_fraud_bigrams = (
    get_top_ngrams(subset, n=2) for subset in (legit_df, fraud_df)
)
top_legit_trigrams, top_fraud_trigrams = (
    get_top_ngrams(subset, n=3) for subset in (legit_df, fraud_df)
)

# Step 5: Format results
def _format_ngram_table(legit_ngrams, fraud_ngrams, label):
    """Lay out the legit and fraud top n-gram lists side by side in one DataFrame.

    Args:
        legit_ngrams: List of ``(ngram_tuple, count)`` pairs for legit rows.
        fraud_ngrams: List of ``(ngram_tuple, count)`` pairs for fraud rows.
        label: Column-name suffix for the n-gram columns, e.g. "Bigrams".

    Returns:
        DataFrame with the columns "Legit <label>", "Legit Count",
        "Fraud <label>" and "Fraud Count", one row per rank.
    """
    # The two lists can differ in length (one class may have fewer distinct
    # n-grams than requested); pad the shorter one so the DataFrame
    # constructor does not raise on unequal column lengths.
    n_rows = max(len(legit_ngrams), len(fraud_ngrams))
    columns = {}
    for group, ranked in (("Legit", legit_ngrams), ("Fraud", fraud_ngrams)):
        padding = [None] * (n_rows - len(ranked))
        columns[group + " " + label] = [">".join(gram) for gram, _ in ranked] + padding
        columns[group + " Count"] = [count for _, count in ranked] + padding
    return pd.DataFrame(columns)

bigram_df = _format_ngram_table(top_legit_bigrams, top_fraud_bigrams, "Bigrams")
trigram_df = _format_ngram_table(top_legit_trigrams, top_fraud_trigrams, "Trigrams")

from IPython.display import display

# Render both comparison tables, bigrams first and trigrams second.
display(bigram_df, trigram_df)

Unnamed: 0,Legit Bigrams,Legit Count,Fraud Bigrams,Fraud Count
0,/P2PREGISTRATION/ASKMEMBER>/CALENDAR/AUDITSEAR...,115394,/TRANSACTIONS/SERIES/POST>/LOANS/LIST,257
1,/TRANSACTIONS/SERIES/POST>/LOANS/LIST,96835,/P2PREGISTRATION/ASKMEMBER>/CALENDAR/AUDITSEAR...,251
2,/TEMPLATES/META>/PROFILE/USERPROFILE,94237,/TEMPLATES/META>/PROFILE/USERPROFILE,221
3,/P2PMEMBER/GETP2PMEMBERIDBYACTUALUSERID>/CATEG...,84789,/PROFILE/USERPROFILE>/WEALTH/PARTYID,194
4,/CAMPAIGN/GETBALANCE>/TAXFREE/GETTAXGOAL,74434,/TIMEDEPOSITS/DETAILEDACCOUNTS>/AUTHENTICATION...,189
5,/LOANS/LIST>/TEMPLATES/META,67300,/CAMPAIGN/GETBALANCE>/TAXFREE/GETTAXGOAL,187
6,/PROFILE/USERPROFILE>/WEALTH/PARTYID,67136,/WEALTH/PARTYID>/TIMEDEPOSITS/DETAILEDACCOUNTS,175
7,/TIMEDEPOSITS/DETAILEDACCOUNTS>/AUTHENTICATION...,64992,/CAMPAIGN/GETCAMPAIGNS>/TRANSACTIONS/SERIES/POST,173
8,/WEALTH/PARTYID>/TIMEDEPOSITS/DETAILEDACCOUNTS,61822,/AUTHENTICATION/FASTLOGIN>/ACCOUNTS/ACCOUNTS_FULL,169
9,/TEMPLATES/META>/ACCOUNTS/STATEMENTS,60393,/CARDS/FETCHCARDS>/USERROLE/GETAPPLICATIONROLES,169


Unnamed: 0,Legit Trigrams,Legit Count,Fraud Trigrams,Fraud Count
0,/P2PREGISTRATION/ASKMEMBER>/CALENDAR/AUDITSEAR...,57833,/WEALTH/PARTYID>/TIMEDEPOSITS/DETAILEDACCOUNTS...,136
1,/PROFILE/GETCUSTOMERRESPONSE>/AUTHENTICATION/G...,44745,/PROFILE/USERPROFILE>/WEALTH/PARTYID>/TIMEDEPO...,128
2,/AUTHENTICATION/GETUSERID>/PROFILE/USERPROFILE...,44611,/CAMPAIGN/GETBALANCE>/TAXFREE/GETTAXGOAL>/ACCO...,121
3,/PROFILE/USERPROFILE>/WEALTH/PARTYID>/TIMEDEPO...,41107,/TIMEDEPOSITS/DETAILEDACCOUNTS>/AUTHENTICATION...,119
4,/LOANS/LIST>/TEMPLATES/META>/PROFILE/USERPROFILE,40509,/TRANSACTIONS/SERIES/POST>/LOANS/LIST>/CAMPAIG...,110
5,/WEALTH/PARTYID>/TIMEDEPOSITS/DETAILEDACCOUNTS...,39664,/LOANS/LIST>/CAMPAIGN/GETBALANCE>/TAXFREE/GETT...,109
6,/TIMEDEPOSITS/DETAILEDACCOUNTS>/AUTHENTICATION...,37032,/P2PREGISTRATION/ASKMEMBER>/CALENDAR/AUDITSEAR...,108
7,/STATEMENTS/FETCHSTATEMENTSPRODUCTS>/CALENDAR/...,34843,/CAMPAIGN/GETCAMPAIGNS>/TRANSACTIONS/SERIES/PO...,107
8,/TEMPLATES/META>/PROFILE/USERPROFILE>/WEALTH/P...,33890,/TEMPLATES/META>/PROFILE/USERPROFILE>/WEALTH/P...,101
9,/CALENDAR/AUDITSEARCHV3>/TEMPLATES/META>/ACCOU...,32421,/PROFILE/GETCUSTOMERRESPONSE>/AUTHENTICATION/G...,90
