In [2]:
import pandas as pd
import numpy as np

# Read the per-user addiction scores (one row per user, with the addicted_core flag)
df = pd.read_csv("experiment_data/Sample1000user_addiction_scores_with_flags.csv")
# Seed NumPy's global RNG right before sampling so the draw below is repeatable
# (no explicit random_state is passed to DataFrame.sample).
np.random.seed(42)

# Values of the addicted_core flag to sample from, in draw order
statuses = [0, 1]


def _take_up_to(group, limit=100):
    """Return at most `limit` rows of `group`, drawn without replacement."""
    return group.sample(n=min(limit, len(group)), replace=False)


# One sample per flag value; categories with fewer than 100 users are kept whole
sampled_users = [_take_up_to(df[df["addicted_core"] == flag]) for flag in statuses]

# Stack the per-category samples and write them out as JSON records
final_sample = pd.concat(sampled_users, ignore_index=True)
final_sample.to_json(
    "experiment_data/sampled_200_per_category.json",
    orient="records",
    force_ascii=False,
    indent=2,
)

print("Sampled users saved to 'sampled_200_per_category.json'")


Sampled users saved to 'sampled_200_per_category.json'


Collect the sampled users' comments

In [3]:
import os
import json
import pandas as pd
# Collect every comment written by one of the sampled users, then attach the
# users' addiction scores and save the result as JSON records.
sampled_user_set = set(final_sample["user_id"])
matched_comments = []
from pathlib import Path
current_dir = Path.cwd()
#  Move up one level to the project root
project_root = current_dir.parent
BASE_DIR = "youtube_comments_v2"
data_dir=project_root/BASE_DIR
# Column layout of matched_df; passed explicitly so the frame still has a
# "user_id" column (and the merge below still works) when nothing matches.
COMMENT_COLUMNS = ["user_id", "video_id", "comment_text", "comment_publishedAt"]
JSON_SUFFIX = ".json"
# Expected layout: <data_dir>/<video_type>/<subfolder>/<video_id>.json
for video_type in ["short", "medium", "long"]:
    type_path = os.path.join(data_dir, video_type)
    if not os.path.isdir(type_path):
        continue
    # os.listdir returns entries in arbitrary order; sort so the row order of
    # the saved file is the same on every run.
    for subfolder in sorted(os.listdir(type_path)):
        subfolder_path = os.path.join(type_path, subfolder)
        if not os.path.isdir(subfolder_path):
            continue
        for filename in sorted(os.listdir(subfolder_path)):
            if not filename.endswith(JSON_SUFFIX):
                continue
            # Strip only the trailing extension (str.replace would remove
            # every ".json" occurrence in the name).
            video_id = filename[: -len(JSON_SUFFIX)]
            file_path = os.path.join(subfolder_path, filename)
            try:
                with open(file_path, "r", encoding="utf-8") as f:
                    data = json.load(f)
                # "comments" may be absent or explicitly null; treat both as empty
                comments = data.get("comments") or []
                for comment in comments:
                    author = comment.get("author")
                    if author in sampled_user_set:
                        matched_comments.append({
                            "user_id": author,
                            "video_id": video_id,
                            "comment_text": comment.get("text", ""),
                            "comment_publishedAt": comment.get("publishedAt", ""),
                        })
            except Exception as e:
                # Best effort: report unreadable/malformed files and keep going
                print(f"Error reading {file_path}: {e}")
# Merge with addiction info
matched_df = pd.DataFrame(matched_comments, columns=COMMENT_COLUMNS)
print("matched_df columns:", matched_df.columns)
print("final_sample columns:", final_sample.columns)
merged_df = matched_df.merge(final_sample, on="user_id", how="left")  # Adds addicted_core, total_score
# Save result as JSON
os.makedirs("experiment_data", exist_ok=True)
merged_df.to_json("experiment_data/sampled200_users_matched_comments.json", orient="records", force_ascii=False, indent=2)
print("Saved to 'experiment_data/sampled200_users_matched_comments.json'")

matched_df columns: Index(['user_id', 'video_id', 'comment_text', 'comment_publishedAt'], dtype='object')
final_sample columns: Index(['user_id', 'salience', 'tolerance', 'mood_modification', 'withdrawal',
       'conflict', 'relapse', 'total_score', 'addicted_core'],
      dtype='object')
Saved to 'experiment_data/sampled200_users_matched_comments.json'


Metadata of the videos the comments were posted on

In [11]:
import os
import json
import pandas as pd

# Load comment data
comments_df = pd.read_json("experiment_data/sampled200_users_matched_comments.json")
# Remove duplicate comments from same user on same video at same timestamp,
# and drop total_score. DataFrame.drop returns a new frame, so the result has
# to be assigned back (a bare call leaves comments_df unchanged).
comments_df = (
    comments_df
    .drop_duplicates(subset=["user_id", "video_id", "comment_publishedAt"])
    .drop(columns=["total_score"])
)
from pathlib import Path
current_dir = Path.cwd()
#  Move up one level to the project root
project_root = current_dir.parent
BASE_DIR = "youtube_videos"
data_dir=project_root/BASE_DIR
JSON_SUFFIX = ".json"
# video_id -> metadata dict; a video_id seen more than once keeps the last file read
video_metadata = {}
for video_type in ["short", "medium", "long"]:
    type_path = os.path.join(data_dir, video_type)
    for root, _, files in os.walk(type_path):
        for file in files:
            if not file.endswith(JSON_SUFFIX):
                continue
            # Strip only the trailing extension
            video_id = file[: -len(JSON_SUFFIX)]
            file_path = os.path.join(root, file)
            try:
                with open(file_path, "r", encoding="utf-8") as f:
                    data = json.load(f)
                # Every nested lookup uses "or" so that a key that is present
                # but null behaves like a missing key instead of raising.
                snippet = data.get("snippet") or {}
                thumbnails = snippet.get("thumbnails") or {}
                hashtags = snippet.get("tags", [])  # List of tags
                # engagement = (video likeCount + video commentCount) / viewCount
                stats = data.get("statistics") or {}
                likeCount    = int(stats.get("likeCount")    or 0)
                commentCount = int(stats.get("commentCount") or 0)
                viewCount    = int(stats.get("viewCount")    or 0)
                # Undefined (None) when the video has no recorded views
                engagement   = (likeCount + commentCount) / viewCount if viewCount > 0 else None
                video_metadata[video_id] = {
                    "video_title": snippet.get("title", ""),
                    "video_length_type": video_type,
                    "text_description": snippet.get("description", ""),
                    "thumbnail_url": (thumbnails.get("medium") or {}).get("url", ""),
                    "hashtags": ", ".join(hashtags) if isinstance(hashtags, list) else "",
                    "engagement": engagement
                }
            except Exception as e:
                # Best effort: report unreadable/malformed files and keep going
                print(f"Error reading {file_path}: {e}")
# Merge metadata into comment DataFrame; comments whose video has no metadata
# file keep NaN in the metadata columns (left join).
meta_df = pd.DataFrame.from_dict(video_metadata, orient="index").reset_index().rename(columns={"index": "video_id"})
full_df = comments_df.merge(meta_df, on="video_id", how="left")
# Save final result 
full_df.to_json("experiment_data/enriched200_sampled_user_comments.json")
print(" Final enriched comment data saved to 'enriched200_sampled_user_comments.json'")

 Final enriched comment data saved to 'enriched200_sampled_user_comments.json'


In [18]:
import os
import json
import time
import pandas as pd
from openai import OpenAI, APIError, RateLimitError
import re
# CONFIG
# SECURITY: the API key is read from the OPENAI_API_KEY environment variable so
# the secret never lives in the notebook. Any key that was ever saved in a
# notebook or committed to version control must be treated as leaked and revoked.
# A missing variable raises KeyError here, before any API call is attempted.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
MODEL        = "gpt-4.1-mini"   # chat model used for persona generation
TEMPERATURE  = 0.5              # sampling temperature passed to the API
USERS_JSON    = "experiment_data/enriched200_sampled_user_comments.json"  # input: enriched comments
OUTPUT_JSON   = "experiment_data/simulated_user_persona.json"             # output: generated personas

# LOAD DATA
# Keep only the columns needed for persona generation. video_id is kept as
# well so that each comment row can still be tied to its video downstream.
df_users = pd.read_json(USERS_JSON, encoding="utf-8")
df_users = df_users[[
    "user_id","video_id","comment_text","comment_publishedAt",
    "addicted_core",
    "salience","tolerance","mood_modification",
    "withdrawal","conflict","relapse",
    "video_title","video_length_type",
    "text_description","thumbnail_url","hashtags","engagement"
]]

# RESUME CHECK
# Reload the personas saved by a previous (possibly interrupted) run so that
# rows already processed are not sent to the API again. The checkpoint is the
# JSON "records" file written by the main loop below. If the file is not valid
# JSON, json.load raises instead of the file being silently overwritten.
if os.path.exists(OUTPUT_JSON):
    with open(OUTPUT_JSON, "r", encoding="utf-8") as f:
        records = json.load(f)
    # The key must match the one built in the main loop:
    # (user_id, comment_publishedAt as text). Records without a timestamp
    # cannot be matched to a row and are therefore not counted as completed.
    completed = {
        (rec["user_id"], str(rec["comment_publishedAt"]))
        for rec in records
        if "comment_publishedAt" in rec
    }
    print(f"Resuming from {len(completed)} completed records.")
else:
    completed = set()
    records = []
# USER PERSONA BUILDER
def build_user_persona_prompt(row, history_n=25):
    """Build the persona-analysis prompt for the user that `row` belongs to.

    Args:
        row: One row of `df_users`; its user_id, addicted_core flag and the six
            score columns (salience, tolerance, mood_modification, withdrawal,
            conflict, relapse) are read.
        history_n: Maximum number of the user's comments to include. The
            comments are sorted by comment_publishedAt and the last
            `history_n` are kept.

    Returns:
        The prompt text (str), stripped of leading/trailing whitespace.

    Note:
        The comment history is looked up in the module-level `df_users` frame.
    """
    user_id    = row["user_id"]
    addicted   = "Yes" if row["addicted_core"] == 1 else "No"
    comps      = {
        "Salience": row["salience"],
        "Tolerance": row["tolerance"],
        "Mood Modification": row["mood_modification"],
        "Withdrawal": row["withdrawal"],
        "Conflict": row["conflict"],
        "Relapse": row["relapse"],
    }
    comp_line  = ", ".join(f"{k}={v}" for k,v in comps.items())

    hist = (
        df_users[df_users["user_id"] == user_id]
          .sort_values("comment_publishedAt")
          .tail(history_n)
          .to_dict(orient="records")
    )

    lines = []
    for e in hist:
        eng = e.get("engagement")
        eng_str = f"{eng:.3f}" if isinstance(eng, (int, float)) else str(eng)
        # The description is missing (None/NaN) when the video had no metadata
        # file; slicing a non-string would raise, so fall back to "".
        description = e["text_description"]
        if not isinstance(description, str):
            description = ""
        lines.append(
            f"- At {e['comment_publishedAt']}, \"{e['comment_text']}\"\n"
            f"    • Title: {e['video_title']}\n"
            f"    • Type: {e['video_length_type']}\n"
            f"    • Description: {description[:200]}...\n"
            f"    • Hashtags: {e['hashtags']}\n"
            f"    • Thumbnail URL: {e['thumbnail_url']}\n"
            f"    • Engagement: {eng_str}"
        )


    return f"""
You are an expert at analyzing user behavior and making inferences. Below is a user's recent activity on a short-video platform, including their comments and metadata for each video. The user also completed the Bergen Social Media Addiction Scale (BSMAS), which gives scores from 0 to 5 for each category (higher indicates greater risk).

Your task is to analyze this user's behavioral tendencies and construct a detailed persona by addressing the following:

1. Content Preferences
List the top 3 video themes or genres the user consistently engages with (e.g., ASMR, cooking, challenges).

2. Interaction Style
Describe in 2–3 words how the user's comments typically read (e.g., enthusiastic, critical, humorous).

3. Paragraph-Style Persona Summary
Write a single, 3–4 sentence paragraph that weaves together their main content preferences and interaction style into a coherent user persona.

In your 3–4-sentence paragraph, you could add a brief nod to their BSMAS profile—e.g. "With a high Withdrawal score, they often comment late at night seeking emotional relief." That way the persona itself calls out their most salient addiction-risk traits alongside their content tastes and tone.

User Profile:
- ID: {user_id}
- Addicted: {addicted}
- BSMAS Scores: {comp_line}

Recent Comments (last {len(lines)}):
{chr(10).join(lines)}

Analyze this user's behavior and provide the requested persona breakdown.
""".strip()
from openai import OpenAI, RateLimitError, APIError
import time
import csv

MAX_RETRIES = 3           # API attempts per row before the row is skipped
RETRY_DELAY_SECONDS = 5   # pause between attempts

# MAIN LOOP: one API call per comment row, checkpointing after every success.
# NOTE(review): the prompt is built from the user's profile and history only,
# so all rows of one user appear to yield the same prompt — consider
# generating a single persona per user instead. TODO confirm intent.
for _, row in df_users.iterrows():
    published_at = str(row["comment_publishedAt"])
    key = (row["user_id"], published_at)
    if key in completed:
        continue  # skip already processed
    prompt = build_user_persona_prompt(row)

    # Retry logic: the for/else falls into `else` only when no attempt succeeded
    for attempt in range(MAX_RETRIES):
        try:
            response = client.chat.completions.create(
                model=MODEL,
                messages=[{"role": "user", "content": prompt}],
                temperature=TEMPERATURE,
            )
            break  # success, exit retry loop
        except (RateLimitError, APIError) as e:
            print(f"Error: {e}, retrying in {RETRY_DELAY_SECONDS} seconds...")
            time.sleep(RETRY_DELAY_SECONDS)
    else:
        print(f"Failed after {MAX_RETRIES} retries")
        continue

    # message.content can be None (no text returned); store "" in that case
    persona = (response.choices[0].message.content or "").strip()
    record = {
        "user_id": row["user_id"],
        # Use the real video id when df_users carries it; otherwise fall back
        # to the title, which is what this field held before.
        "video_id": row.get("video_id", row["video_title"]),
        # Stored so the resume check can rebuild exactly the same key
        "comment_publishedAt": published_at,
        "persona": persona,
    }
    records.append(record)

    # Save progress incrementally (rewrites the whole file as JSON records,
    # the format the resume check reads back)
    pd.DataFrame(records).to_json(OUTPUT_JSON, orient="records", force_ascii=False, indent=2)
    completed.add(key)

    print(f"Processed {row['user_id']} at {row['comment_publishedAt']}")



Resuming from 21 completed records.
Processed @ii_MnBCT at 2024-07-12T20:09:16Z
Processed @BearerOfLightSonOfGod at 2023-07-14T02:32:15Z


KeyboardInterrupt: 