In [94]:
import os
import random
import time
from datetime import datetime

import pandas as pd
from groq import Groq

In [95]:
# Load the two structured source tables: one row per user, one row per card transaction.
users_csv_path = './structured/sd254_users.csv'
transactions_csv_path = './structured/credit_card_transactions-ibm_v2.csv'

users_df = pd.read_csv(users_csv_path)
transactions_df = pd.read_csv(transactions_csv_path)

In [96]:
# Read API key
# Prefer the GROQ_API_KEY environment variable so the secret does not have to
# live in a file next to the notebook; fall back to ./key.txt as before.
api_key = os.environ.get('GROQ_API_KEY', '').strip()
if not api_key:
    with open('./key.txt', 'r') as file:
        api_key = file.read().strip()

# Fail here with a clear message instead of on the first API request.
if not api_key:
    raise ValueError("No Groq API key found: set GROQ_API_KEY or put the key in ./key.txt")

groq_client = Groq(api_key=api_key)

In [98]:
# Function to classify users as normal or fraudulent
def classify_users(users_df, transactions_df):
    """Label every person in ``users_df`` as ``'fraudulent'`` or ``'normal'``.

    A person is fraudulent when at least one row of ``transactions_df`` has
    ``'Is Fraud?' == 'Yes'`` and a ``'User'`` value equal to that person's
    index label in ``users_df``.

    Args:
        users_df: DataFrame with a ``'Person'`` column; its index labels are
            matched against the transactions' ``'User'`` values.
        transactions_df: DataFrame with ``'User'`` and ``'Is Fraud?'`` columns.

    Returns:
        dict mapping each ``'Person'`` value to ``'fraudulent'`` or
        ``'normal'``. Keys are person names, so rows sharing a name share
        one label.
    """
    fraud_mask = transactions_df['Is Fraud?'] == 'Yes'

    # One user typically has many fraudulent transactions; dedupe the labels
    # so the lookup below does not materialize one row per transaction.
    fraudulent_user_labels = transactions_df.loc[fraud_mask, 'User'].unique()

    # A boolean mask (rather than .loc[labels]) simply skips labels that are
    # absent from users_df instead of raising KeyError.
    is_fraudulent_row = users_df.index.isin(fraudulent_user_labels)
    fraudulent_users = set(users_df.loc[is_fraudulent_row, 'Person'])

    return {
        person: 'fraudulent' if person in fraudulent_users else 'normal'
        for person in users_df['Person']
    }

In [None]:
# Function to generate a log entry using the GroqCloud library
def generate_log_entry(user_id, behavior_type, model="llama3-8b-8192"):
    """Ask the Groq chat model for synthetic behavior logs for one user.

    Sends a single system message built from ``user_id`` and
    ``behavior_type`` through the module-level ``groq_client``.

    Args:
        user_id: Identifier interpolated into the prompt.
        behavior_type: Behavior description interpolated into the prompt.
        model: Name of the chat model to query. Defaults to the model this
            function previously had hard-coded.

    Returns:
        The text of the first completion choice with surrounding whitespace
        removed, or ``""`` when the response carries no text content.
    """
    messages = [
        {"role": "system", "content": f"Generate a 15-25 log entries for user {user_id} indicating their behavior in a financial app or related activities. Behavior type: {behavior_type}. Make it diverse and interesting where these can be logs are things we track inside webpage/app leading up to the transaction."}
    ]

    # Call Groq to generate text
    chat_completion = groq_client.chat.completions.create(
        messages=messages,
        model=model,
    )

    # The message content may be None; guard so .strip() cannot raise
    # AttributeError in the middle of a long generation run.
    content = chat_completion.choices[0].message.content
    return (content or "").strip()

In [100]:
# Behavior descriptions handed to the LLM prompt, one set per user class.
# Defined once here instead of being rebuilt for every user in the loop.
_FRAUDULENT_BEHAVIORS = (
    "Attempted to withdraw cash at multiple ATMs in a short time frame.",
    "Used multiple accounts for transactions within the same hour.",
    "Reported a stolen card but continued to make purchases online.",
    "Received a large refund but immediately transferred funds out.",
    "Frequent changes in contact information and shipping addresses.",
    "Conducted several high-value purchases online using stolen credit card information.",
    "Logged in from multiple devices in a short time period, indicating possible account takeover.",
    "Purchased high-ticket items online using different shipping addresses.",
    "Attempted to transfer funds to multiple unverified accounts rapidly.",
    "Used virtual credit cards for transactions without the physical card.",
    "Reported suspicious activity on their account after unauthorized transactions were made.",
    "Consistently failed to verify identity when prompted by customer support.",
)

_NORMAL_BEHAVIORS = (
    "Made a purchase at a grocery store with a loyalty card.",
    "Browsed financial news articles for investment insights.",
    "Set a savings goal in the app and tracked progress weekly.",
    "Shared feedback about a recent purchase in the app.",
    "Engaged with customer support for account verification.",
    "Consistently used the same device for logging in and making purchases.",
    "Regularly monitored account activity for suspicious transactions.",
    "Participated in promotional campaigns to earn rewards.",
    "Saved frequently used payment methods for convenience.",
    "Updated account information regularly to maintain security.",
    "Requested transaction history for budgeting purposes.",
    "Inquired about investment options through the app.",
)

# Throttling thresholds used by this notebook (per minute). The token figure
# is compared against a whitespace word count, i.e. only an estimate.
_MAX_REQUESTS_PER_MINUTE = 30
_MAX_TOKENS_PER_MINUTE = 20000
_LOG_ENTRIES_PER_USER = 1  # Same for fraudulent and normal users


def _write_log_file(path, logs):
    """Write ``logs`` to ``path`` as 'User ID' headers followed by '- entry' lines."""
    # Explicit UTF-8: generated text may contain characters the platform's
    # default encoding cannot represent.
    with open(path, 'w', encoding='utf-8') as f:
        for log in logs:
            f.write(f"User ID: {log['user_id']}\n")
            for entry in log['log_entries']:
                f.write(f"- {entry}\n")
            f.write("\n")


def create_user_behavior_logs(users_df, transactions_df, total_users=100, output_dir='./unstructured'):
    """Generate LLM-written behavior logs for a random sample of users.

    Users are classified with ``classify_users``; about 10% of the sample is
    drawn from fraudulent users and the rest from normal users (fewer if a
    class does not have enough members). For each sampled user one log text
    is requested via ``generate_log_entry``. The shuffled result is split
    70/30 into training and validation sets, each written to a timestamped
    text file in ``output_dir``.

    Args:
        users_df: DataFrame with a ``'Person'`` column.
        transactions_df: Transactions frame passed on to ``classify_users``.
        total_users: Target number of users to sample.
        output_dir: Directory for the two output files; created if missing.

    Returns:
        list of ``{'user_id': ..., 'log_entries': [...]}`` dicts, training
        and validation entries together, in generation order.
    """
    user_classes = classify_users(users_df, transactions_df)

    # Calculate number of fraudulent and non-fraudulent users
    num_fraudulent = total_users // 10  # 10% fraudulent
    num_non_fraudulent = total_users - num_fraudulent  # Remaining 90% non-fraudulent

    # Separate users by type
    fraudulent_users = [user for user in users_df['Person'] if user_classes[user] == 'fraudulent']
    non_fraudulent_users = [user for user in users_df['Person'] if user_classes[user] == 'normal']

    # Randomly select the required number of users (capped at what is available)
    selected_fraudulent_users = random.sample(fraudulent_users, min(num_fraudulent, len(fraudulent_users)))
    selected_non_fraudulent_users = random.sample(non_fraudulent_users, min(num_non_fraudulent, len(non_fraudulent_users)))

    selected_users = selected_fraudulent_users + selected_non_fraudulent_users
    random.shuffle(selected_users)  # Shuffle to mix fraudulent and non-fraudulent users

    # Create the output directory before any API call so a bad path fails
    # immediately rather than after all the requests have been spent.
    os.makedirs(output_dir, exist_ok=True)

    logs = []

    # Request tracking for the current one-minute window
    request_count = 0
    token_count = 0

    for user_id in selected_users:
        if user_classes[user_id] == 'fraudulent':
            behaviors = _FRAUDULENT_BEHAVIORS
        else:
            behaviors = _NORMAL_BEHAVIORS

        log_entries = []
        for _ in range(_LOG_ENTRIES_PER_USER):
            if request_count >= _MAX_REQUESTS_PER_MINUTE:
                time.sleep(60)
                # A full minute has passed, so both counters start over.
                request_count = 0
                token_count = 0

            log_entry = generate_log_entry(user_id, random.choice(behaviors))
            request_count += 1
            token_count += len(log_entry.split())  # Estimate tokens used

            if token_count > _MAX_TOKENS_PER_MINUTE:
                time.sleep(60)
                # Same reasoning: everything sent so far is over a minute old.
                request_count = 0
                token_count = 0

            log_entries.append(log_entry)

        logs.append({'user_id': user_id, 'log_entries': log_entries})

    # Split logs into training and validation sets (70/30 split)
    num_training = int(len(logs) * 0.7)
    training_logs = logs[:num_training]
    validation_logs = logs[num_training:]

    # Write output to logs files
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    _write_log_file(
        os.path.join(output_dir, f'user_behavior_logs_training_{timestamp}.txt'),
        training_logs,
    )
    _write_log_file(
        os.path.join(output_dir, f'user_behavior_logs_validation_{timestamp}.txt'),
        validation_logs,
    )

    return logs

In [101]:
# Run the generation pipeline for a 60-user sample and keep the returned logs.
user_logs = create_user_behavior_logs(
    users_df=users_df,
    transactions_df=transactions_df,
    total_users=60,
)

In [102]:
# Sanity check: number of users that actually received generated logs.
n_logged_users = len(user_logs)
print(n_logged_users)

60


In [103]:
# Preview only the first few records: each entry holds a long generated text,
# and dumping the full list bloats the saved notebook output.
user_logs[:3]

[{'user_id': 'Alexia Ward',
  'log_entries': ['Here are 15-25 log entries for user Alexia Ward indicating their behavior in a financial app, tracking their progress in setting a savings goal:\n\n**Week 1**\n\n1. Alexia Ward created a new savings goal: Retirement Fund (May 10, 2023, 10:45 AM)\n2. Set a target amount: $10,000 within 12 months (May 10, 2023, 10:45 AM)\n3. Initial balance: $0 (May 10, 2023, 10:45 AM)\n4. Weekly progress: $0 (May 17, 2023, 10:00 AM)\n\n**Week 2**\n\n5. Deposited $500 into the goal (May 14, 2023, 4:15 PM)\n6. Weekly progress: $500 (May 21, 2023, 11:00 AM)\n7. Adjusted target date: 11 months (May 21, 2023, 11:15 AM)\n\n**Week 3**\n\n8. Deposited $200 into the goal (May 21, 2023, 11:30 AM)\n9. Weekly progress: $700 (May 28, 2023, 9:45 AM)\n10. Viewed savings progress report (May 28, 2023, 10:00 AM)\n\n**Week 4**\n\n11. Deposited $300 into the goal (May 28, 2023, 10:15 AM)\n12. Weekly progress: $1,000 (June 4, 2023, 9:30 AM)\n13. Adjusted target date: 10 months

In [None]:
# One-off experiment: ask the model to strip PII from a sample generated log.
# The instruction and the sample text are sent together as one system message.
anonymize_prompt = """
                     Anonymize this string so there's no pii remaining - for example, if you see a name, replace it with [name]. Here is the string: .
                     **Type:** Transaction Preparation  
**User:** Hazel Robinson  
**Behavior:**  
   - Hazel Robinson frequently viewed and modified her transaction history visibility settings, adjusting filters for date ranges and transaction amounts.  
   - Hazel Robinson initially viewed transaction history for the last 30 days and then repeatedly adjusted the range to 60 days, sometimes filtering for transactions over $50 or $100.  
   - Hazel Robinson changed settings multiple times to hide all transactions, view transactions exceeding certain amounts, and restore original filters.  
   - Hazel Robinson’s actions involved viewing transaction history both with and without filters, sometimes specifying a specific debit card ending in 4344.  
   - Leading up to the transaction, Hazel Robinson had a filter set for viewing the last 30 days of transaction history.
                  """

messages = [dict(role="system", content=anonymize_prompt)]

# Send the prompt to Groq
chat_completion = groq_client.chat.completions.create(
    model="llama3-8b-8192",
    messages=messages,
)

# Take the text of the first choice and show it
first_choice = chat_completion.choices[0]
output = first_choice.message.content.strip()
print(output)


In [104]:
# Recompute the class of every user, then look up the users that appear in the logs
user_classes = classify_users(users_df, transactions_df)
user_ids_in_logs = [record["user_id"] for record in user_logs]

# Partition the logged users by class in a single pass
users_by_class = {'fraudulent': [], 'normal': []}
for logged_user in user_ids_in_logs:
    label = user_classes.get(logged_user)
    if label in users_by_class:
        users_by_class[label].append(logged_user)

fraudulent_users = users_by_class['fraudulent']
non_fraudulent_users = users_by_class['normal']

# Who ended up in each class
print("Summary of User Classifications in Logs:")
print("Fraudulent Users:", fraudulent_users)
print("Non-Fraudulent Users:", non_fraudulent_users)

# How many users ended up in each class
print("\nCount of Users in Logs:")
fraudulent_total = len(fraudulent_users)
non_fraudulent_total = len(non_fraudulent_users)
print(f"Fraudulent Users Count: {fraudulent_total}")
print(f"Non-Fraudulent Users Count: {non_fraudulent_total}")

Summary of User Classifications in Logs:
Fraudulent Users: ['Jose Faraday', 'Alyson Lewis', 'Lillie Mitchell', 'Sadie Cruz', 'Dexter Thomas', 'Jacoby Long']
Non-Fraudulent Users: ['Alexia Ward', 'Samuel Fu', 'Ansley Miller', 'Forrest Wood', 'Merrick Barnes', 'Timothy Anderson', 'Demetrius Walker', 'Brenden Brown', 'Kaia Bennett', 'Edward Rivera', 'Mckinley Young', 'Addilynn Garcia', 'Cole Xi', 'Maleah Morris', 'Jessie Xi', 'Rayan Braun', 'Arianna Clark', 'Kristopher Nelson', 'Khaleesi Green', 'Elias El-Mafouk', 'Heidi Baker', 'Blaine Diaz', 'Douglas Abe', 'Jakob Roberts', 'Chana Myers', 'Kennedy Farhad', 'Selena Thompson', 'Jessica Evans', 'Vivian Rogers', 'Renee Brooks', 'Leighton Sullivan', 'Oliver Reed', 'Thea Harris', 'Joyce Fisher', 'Decker Gomez', 'Kallie Rodriguez', 'Pedro Phillips', 'Jazlyn Anwar', 'Dakota Rivera', 'Harmoni Fox', 'Cain Morgan', 'Yousef Long', 'Hugo Diaz', 'Kaelyn Anwar', 'Brenna Ramirez', 'Butch Brooks', 'Amirah Ryan', 'Danica Watson', 'Brendan Neufelder', 'Cou