2_1_user_cards.ipynb

This script builds **user cards** for a binary classification task (*influencer vs observer*).

- Aggregates multiple tweets per user into a single text card (profile + behavior + tweet samples)
- Includes profile metadata (bio, location, account age, friends/followers via quoted_status)
- Adds behavioral stats (share of original tweets vs replies)
- Randomly samples up to 3 tweets per card (fewer when the user has fewer tweets)
- Uses a multi-card strategy for users with many tweets (1–2 cards per user)
- Cleans URLs to compact placeholders like `[LINK: domain]` to reduce noise

These user cards are then used as inputs to an encoder for user-level classification.


In [1]:
import os
import re
import numpy as np
import pandas as pd
from pandas import json_normalize
from urllib.parse import urlparse
from collections import Counter

# config
# fixed seed for numpy's global RNG; generate_user_cards samples tweets with
# np.random.choice, so seeding here makes the sampled cards repeatable
SEED = 42
np.random.seed(SEED)

# paths
# everything is resolved relative to the current working directory
ROOT_DIR = "."
OUT_DIR = os.path.join(ROOT_DIR, "intermediate")
os.makedirs(OUT_DIR, exist_ok=True)  # created at import/run time if missing

# inputs
# one json object per line (read with pd.read_json(..., lines=True) in main)
TRAIN_PATH = os.path.join(ROOT_DIR, "train.jsonl")
TEST_PATH = os.path.join(ROOT_DIR, "kaggle_test.jsonl")

# outputs
# one csv row per generated card
OUT_TRAIN = os.path.join(OUT_DIR, "user_cards_train.csv")
OUT_TEST = os.path.join(OUT_DIR, "user_cards_test.csv")

# twitter dates come in this specific format: "Wed Oct 10 20:19:24 +0000 2018"
TWITTER_DT_FORMAT = "%a %b %d %H:%M:%S %z %Y"

# card gen params

# upper bound on how many tweets are shown to the model in a single card
TWEETS_PER_CARD = 3
BIO_MAX_LEN = 160 # standard twitter bio limit; bios are cut to this many characters
TWEET_MAX_LEN = 320 # per-tweet character cap, applied when cleaning text and again when building the card
# NOTE(review): not referenced anywhere else in this file — presumably consumed
# by the downstream encoder step; confirm or remove
PROMPT_MAX_TOKENS = 512
MIN_TWEETS_FOR_TWO_CARDS = 5 # users with at least this many tweets get 2 cards (augmentation), others get 1
LABEL_COL = "label" # name of the label column read from the training frame

# utils

# pre-compiled pattern for any run of whitespace (spaces, tabs, newlines);
# compiled once at module level because sanitize_text is called per row
_re_ws = re.compile(r"\s+")


def sanitize_text(s, max_len=None):
    """
    collapses whitespace runs into single spaces and trims both ends

    args:
        s: value to clean; None and float NaN (the pandas missing-value marker)
           are treated as empty text, any other non-string goes through str()
        max_len: when not None, the cleaned text is cut to at most this many
           characters (applied after whitespace cleanup)

    returns:
        the cleaned string (possibly empty)
    """
    if not isinstance(s, str):
        # NaN is the only float that is not equal to itself; without this
        # check a missing value would end up in the card as the text "nan"
        is_missing = s is None or (isinstance(s, float) and s != s)
        s = "" if is_missing else str(s)

    cleaned = _re_ws.sub(" ", s).strip() # replaces newlines/tabs with single space

    if max_len is not None: # hard cut if text exceeds limit
        cleaned = cleaned[:max_len]

    return cleaned


def extract_domain(url):
    """
    extracts the lower-cased network location (host) from a url string,
    without a leading 'www.'

    args:
        url: candidate url; anything that is not a string, or is shorter than
             5 characters, is rejected

    returns:
        the domain as a lower-case string, or None when the input is rejected,
        cannot be parsed, or has no network location
    """
    # basic length check to filter out broken strings
    if not isinstance(url, str) or len(url) < 5:
        return None

    # urlparse only fills netloc when the string has a scheme or starts with
    # "//", so a bare "www.example.com/..." needs the "//" prefix to be parsed
    target = f"//{url}" if url[:4].lower() == "www." else url

    try:
        # lower-case first so the 'www.' check below is case-insensitive
        dom = urlparse(target).netloc.lower()
    except ValueError:
        # urlparse raises ValueError on malformed input (e.g. a broken IPv6 host)
        return None

    if dom.startswith("www."):  # strip 'www.' to standardize domains
        dom = dom[4:]

    return dom or None


# matches http(s) urls, bare "www." links and bare "t.co/" short links;
# the look-behinds stop the two bare forms from matching in the middle of a
# longer word or host name (e.g. the tail of "reddit.co/abc" or "awww.cute")
_TWEET_URL_RE = re.compile(
    r'https?://\S+|(?<![\w.-])www\.\S+|(?<![\w.-])t\.co/\S+'
)

# true only when the host itself is t.co (optionally behind http/https),
# not when the text "t.co/" merely appears somewhere inside another url
_TCO_URL_RE = re.compile(r'(?:https?://)?t\.co(?:[/?#]|$)')


def clean_tweet_urls(text):
    """
    replaces every url found in the tweet text with a compact placeholder

    t.co links (twitter's shortener) become "[LINK]" because the short link
    says nothing about the destination; any other url becomes
    "[LINK: domain]" when extract_domain can recover a domain, else "[LINK]"

    args:
        text: tweet text; non-string values are returned unchanged

    returns:
        the text with urls replaced, or the original value if it is not a string
    """
    if not isinstance(text, str):
        return text

    def replace_url(match):
        """
        internal callback deciding the placeholder for one matched url
        """
        url = match.group(0)

        # t.co is twitter's shortener so it doesn't tell us the destination -> generic tag
        if _TCO_URL_RE.match(url):
            return "[LINK]"

        # for real urls, we try to show the domain to give context to the model
        domain = extract_domain(url)
        if domain:
            return f"[LINK: {domain}]"

        # fallback if domain extraction fails
        return "[LINK]"

    return _TWEET_URL_RE.sub(replace_url, text)


def get_core_tweet_text(row):
    """
    picks the best available text field of a tweet row

    fields are tried from most to least complete ('extended_tweet.full_text',
    then 'full_text', then 'text'); the first one holding a non-blank string
    wins, and an empty string is returned when none qualifies
    """
    preferred_fields = ("extended_tweet.full_text", "full_text", "text")

    for field in preferred_fields:
        if field not in row:
            continue

        candidate = row[field]
        # skip missing values, non-strings and whitespace-only text
        if isinstance(candidate, str) and candidate.strip():
            return candidate

    return ""


def build_user_key(df):
    """
    adds a 'user_key' column derived from the account creation date

    the frame is modified in place: it is first stably sorted by
    'user.created_at', then the key is that column as a string with missing
    values replaced by "NA"; nothing is returned
    """
    created_col = "user.created_at"

    # stable (mergesort) in-place sort so rows with the same date keep their order
    df.sort_values(created_col, kind="mergesort", inplace=True)

    # missing creation dates all collapse into the single key "NA"
    filled = df[created_col].fillna("NA")
    df["user_key"] = filled.astype(str)


def majority_label(labels):
    """
    reduces the labels of one user to a single class

    non-numeric entries are discarded, the remaining values are averaged and
    the mean is rounded to an int; np.nan is returned when nothing usable is left
    """
    # coerce junk to NaN, then drop it
    numeric = pd.to_numeric(labels, errors="coerce").dropna()

    if numeric.empty:
        return np.nan

    # rounding the mean gives the more frequent class
    mean_label = numeric.values.mean()
    return int(round(mean_label))


def get_behavioral_type(row):
    """
    classifies a single tweet as "quote", "reply" or "original"

    args:
        row: mapping-like tweet record supporting .get() (dict or pandas row);
             reads 'is_quote_status', 'quoted_status_id_str' and
             'in_reply_to_user_id_str'

    returns:
        "quote" if the quote flag is set or a quoted status id is present,
        otherwise "reply" if a reply-to user id is present, otherwise "original"
    """
    # a missing flag arrives as None/NaN; bool(NaN) is True, so the flag must
    # be checked for missingness before it is interpreted as a boolean
    quote_flag = row.get("is_quote_status")
    is_quote = bool(pd.notna(quote_flag)) and bool(quote_flag)

    # quote status takes precedence as it involves sharing content
    if is_quote or pd.notna(row.get("quoted_status_id_str")):
        return "quote"

    # check if it is a direct reply to another user
    if pd.notna(row.get("in_reply_to_user_id_str")):
        return "reply"

    # if neither, it is considered original content
    return "original"


def compute_behavior_stats(tweet_types):
    """
    turns a list of tweet types into percentage shares

    returns a dict with 'pct_original' and 'pct_reply' (0-100 floats); quotes
    only contribute to the denominator, and an empty input yields 0.0 for both
    """
    stats = {"pct_original": 0.0, "pct_reply": 0.0}

    if tweet_types:
        tally = Counter(tweet_types)
        n_all = len(tweet_types)

        # a Counter reports 0 for types that never occurred
        stats["pct_original"] = tally["original"] / n_all * 100.0
        stats["pct_reply"] = tally["reply"] / n_all * 100.0

    return stats


# card generation

def generate_user_cards(df, is_train=True):
    """
    builds the text 'user cards' (one or two per user) from a tweet-level frame

    for every 'user_key' group the function collects profile metadata, computes
    behavior percentages, randomly samples tweets and assembles a prompt string

    args:
        df: tweet-level dataframe; must contain 'user_key' and 'tweet_datetime'
            (and LABEL_COL when is_train is true). other columns are optional
            and fall back to empty/zero values when absent. the frame is
            modified in place: 'tweet_type' and 'user_profile_domain' columns
            are added
        is_train: when true, a per-user 'user_label' (majority of LABEL_COL)
            is added to every output row

    returns:
        a dataframe with one row per card and the columns 'user_key',
        'card_id', 'prompt', 'n_tweets_total', 'n_tweets_in_card',
        'n_cards_for_user', 'pct_original', 'pct_reply' (+ 'user_label' in
        training mode)

    note:
        tweets are drawn with np.random.choice, i.e. from numpy's global RNG,
        so the output depends on the RNG state at call time and on call order
    """
    rows = []

    # pre-calculate tweet types for the entire dataset for efficiency
    # (writes a new column into the caller's dataframe)
    df["tweet_type"] = df.apply(get_behavioral_type, axis=1)

    # try to extract a domain from the user's profile url field
    if "user.url" in df.columns:
        df["user_profile_domain"] = df["user.url"].map(extract_domain)
    else:
        df["user_profile_domain"] = None

    # process each user group individually; sort=False keeps groups in order of
    # first appearance in df
    for user_key, g in df.groupby("user_key", sort=False):
        # reset index so positions 0..n_tweets-1 can be used for sampling
        g = g.reset_index(drop=True)
        n_tweets = len(g)

        #  extract metadata

        def first_non_null(col):
            """returns the first non-missing value of `col` within this user's rows,
            or None when the column is absent or entirely missing"""
            if col not in g.columns:
                return None
            vals = g[col].dropna()
            return vals.iloc[0] if len(vals) > 0 else None

        def get_max_int(col):
            """returns the largest numeric value of `col` within this user's rows as an int,
            or 0 when the column is absent or has no numeric values"""
            if col not in g.columns:
                return 0
            vals = pd.to_numeric(g[col], errors="coerce").dropna()
            return int(vals.max()) if len(vals) > 0 else 0

        # clean up bio and location text (location is capped at 50 characters)
        user_desc = sanitize_text(first_non_null("user.description") or "", max_len=BIO_MAX_LEN)
        user_location = sanitize_text(first_non_null("user.location") or "", max_len=50)

        # gather key activity metrics (0 also stands for "not available")
        listed_count = get_max_int("user.listed_count")
        statuses_count = get_max_int("user.statuses_count")
        favourites_count = get_max_int("user.favourites_count")

        # retrieve inferred stats from the global quoted mapping (computed in main)
        quoted_friends = first_non_null("user_friends_from_quoted")
        quoted_followers = first_non_null("user_followers_from_quoted")

        #  calc account age
        user_created = first_non_null("user_created_at_dt")
        last_tweet_dt = g["tweet_datetime"].max()

        # age = time between account creation and the user's latest tweet in the
        # data, in years; stays "Unknown" if either date is missing or the
        # difference is negative
        age_str = "Unknown"
        if pd.notna(user_created) and pd.notna(last_tweet_dt):
            age_days = (last_tweet_dt - user_created).days
            if age_days >= 0:
                age_str = f"{age_days / 365.25:.1f}y"

        #  profile links
        # only the domain of the user's first row is considered, so the set
        # holds at most one entry (the [:3] cap below never trims anything)
        domains = {g["user_profile_domain"].iloc[0]} if g["user_profile_domain"].iloc[0] else set()
        links_str = ", ".join(list(domains)[:3]) if domains else ""


        #  behavior stats
        # computed over all of the user's tweets, not just the sampled ones
        behavior_stats = compute_behavior_stats(g["tweet_type"].tolist())

        #  user label
        # compute the ground truth label only if we are in training mode
        user_label = majority_label(g[LABEL_COL]) if is_train else None

        #  card logic
        # data augmentation: if user has enough tweets, generate 2 different cards
        n_cards = 1 if n_tweets < MIN_TWEETS_FOR_TWO_CARDS else 2

        # determine distribution of tweets across cards
        if n_cards == 1:
            tweets_per_card_list = [min(n_tweets, TWEETS_PER_CARD)]
        else:
            # card 1 is full, card 2 takes the remainder (with a minimum of 2),
            # capped at TWEETS_PER_CARD
            tweets_per_card_list = [
                TWEETS_PER_CARD,
                min(TWEETS_PER_CARD, max(2, n_tweets - TWEETS_PER_CARD))
            ]

        # positions of tweets already placed on an earlier card of this user
        used_indices = set()

        # loop over cards; the profile/behavior header is identical on every
        # card of a user, only the sampled tweets differ
        for card_id in range(n_cards):
            # filter out tweets already used in previous cards for this user
            available = [i for i in range(n_tweets) if i not in used_indices]

            # failsafe: if we ran out of unique tweets, reset pool
            if not available:
                available = list(range(n_tweets))

            n_select = min(tweets_per_card_list[card_id], len(available))

            # randomly select tweets for this card: draw positions into
            # `available` without replacement, then map them back to row positions
            selected_indices = list(np.random.choice(len(available), size=n_select, replace=False))
            selected_indices = [available[i] for i in selected_indices]

            # mark selected tweets as used
            used_indices.update(selected_indices)

            # slice the dataframe to get the specific tweets (in drawn order)
            selected_tweets = g.iloc[selected_indices]

            #  build prompt

            lines = [
                "## USER PROFILE ANALYSIS",
                "",
                "### ACCOUNT INFO",
            ]

            # bio and location lines are omitted entirely when empty
            if user_desc:
                lines.append(f'Bio: "{user_desc}"')

            if user_location:
                lines.append(f'Location: "{user_location}"')

            # combine numeric stats into a single compact line
            lines.append(
                f"Age: {age_str} | Listed: {listed_count} | Tweets: {statuses_count} | Likes: {favourites_count}"
            )

            # append extra inferred stats if they exist (rounded to whole numbers
            # because the mapping stores averages)
            extra_parts = []
            if (quoted_friends is not None) and not pd.isna(quoted_friends):
                extra_parts.append(f"Friends: {int(round(quoted_friends))}")
            if (quoted_followers is not None) and not pd.isna(quoted_followers):
                extra_parts.append(f"Followers: {int(round(quoted_followers))}")

            if extra_parts:
                lines.append(" | ".join(extra_parts))

            # append extracted links if any
            if links_str:
                lines.append(f"Links: {links_str}")

            lines.append("")

            # behavioral section showing percentage of original vs replies
            lines.append("### BEHAVIOR PATTERN")
            lines.append(f"Original content: {behavior_stats['pct_original']:.0f}% | Replies: {behavior_stats['pct_reply']:.0f}%")
            lines.append("")

            # sample tweets section, numbered from 1
            lines.append("### SAMPLE TWEETS")

            for i, (_, row_t) in enumerate(selected_tweets.iterrows(), 1):
                text = row_t.get("tweet_text_clean", "")

                # defensive cap; main already truncates tweet_text_clean to TWEET_MAX_LEN
                if isinstance(text, str) and len(text) > TWEET_MAX_LEN:
                    text = text[:TWEET_MAX_LEN]

                # add explicit prefixes to help the model understand context
                tweet_type = row_t.get("tweet_type", "original")
                if tweet_type == "reply":
                    type_prefix = "[Reply] "
                elif tweet_type == "quote":
                    type_prefix = "[Quote] "
                else:
                    type_prefix = "[Original] "

                lines.append(f"{i}. {type_prefix}{text}")

            # final classification question for the model
            lines.extend([
                "",
                "### CLASSIFICATION",
                "Is this user an INFLUENCER (creates content) or OBSERVER (replies/consumes)?",
                "Answer: influencer or observer"
            ])

            # output row: one record per card

            row = {
                "user_key": user_key,
                "card_id": card_id,
                "prompt": "\n".join(lines),
                "n_tweets_total": n_tweets,
                "n_tweets_in_card": len(selected_indices),
                "n_cards_for_user": n_cards,
                "pct_original": behavior_stats["pct_original"],
                "pct_reply": behavior_stats["pct_reply"],
            }

            # the same user-level label is attached to every card of the user
            if is_train:
                row["user_label"] = user_label

            rows.append(row)

    return pd.DataFrame(rows)


# main

def main():
    """
    end-to-end driver for the card build

    reads the train/test jsonl dumps, derives the text and datetime helper
    columns, assigns user keys, attaches friend/follower estimates recovered
    from quoted tweets, generates the cards and writes both csv files
    """
    print("Build user cards")

    # 1. load data
    print("\n1. Loading data")

    def load_flat(path):
        """reads one jsonl file and flattens its nested records into a dataframe"""
        records = pd.read_json(path, lines=True).to_dict(orient="records")
        return json_normalize(records)

    train_df = load_flat(TRAIN_PATH)
    test_df = load_flat(TEST_PATH)
    both_frames = (train_df, test_df)

    print(f"  Train: {train_df.shape}, Test: {test_df.shape}")

    #  2. preprocess columns
    print("\n2. Preparing columns")

    def to_clean_text(raw):
        """normalizes urls, collapses whitespace and truncates one tweet text"""
        return sanitize_text(clean_tweet_urls(raw), max_len=TWEET_MAX_LEN)

    def parse_twitter_dates(values):
        """parses twitter-formatted timestamps; unparseable entries become NaT"""
        return pd.to_datetime(values, format=TWITTER_DT_FORMAT, errors="coerce")

    for frame in both_frames:
        # best available text field, then its cleaned version
        frame["tweet_text_raw"] = frame.apply(get_core_tweet_text, axis=1)
        frame["tweet_text_clean"] = frame["tweet_text_raw"].apply(to_clean_text)

        # tweet timestamp and account creation timestamp (for the age line)
        frame["tweet_datetime"] = parse_twitter_dates(frame.get("created_at"))
        frame["user_created_at_dt"] = parse_twitter_dates(frame.get("user.created_at"))

    # build keys to group tweets by user (sorts each frame in place)
    for frame in both_frames:
        build_user_key(frame)

    # 2bis. global map for quoted users

    def build_global_quoted_mapping(first_df, second_df):
        """
        builds a lookup table, indexed by a quoted user's creation date, holding
        the mean friend and follower counts observed for that date across both frames
        """
        key_col = "quoted_status.user.created_at"
        friends_col = "quoted_status.user.friends_count"
        followers_col = "quoted_status.user.followers_count"
        needed = [key_col, friends_col, followers_col]

        collected = []

        # both splits are scanned to maximize coverage
        for source in (first_df, second_df):
            if any(col not in source.columns for col in needed):
                continue

            # keep only rows that actually carry quoted-user data
            subset = source[needed].dropna(subset=[key_col]).copy()
            if subset.empty:
                continue

            # force both metrics to numeric, junk becomes NaN
            for metric in (friends_col, followers_col):
                subset[metric] = pd.to_numeric(subset[metric], errors="coerce")

            collected.append(subset)

        if not collected:
            # nothing usable: return an empty table with the expected layout
            empty_table = pd.DataFrame(
                columns=[
                    key_col,
                    "mapped_user_friends_count",
                    "mapped_user_followers_count"
                ]
            )
            return empty_table.set_index(key_col)

        stacked = pd.concat(collected, ignore_index=True)

        # the same user can be quoted several times with different counts,
        # so the observations are averaged per creation date
        averaged = stacked.groupby(key_col).agg({
            friends_col: "mean",
            followers_col: "mean",
        })

        return averaged.rename(columns={
            friends_col: "mapped_user_friends_count",
            followers_col: "mapped_user_followers_count",
        })

    # build the mapping table once using all available data
    quoted_mapping = build_global_quoted_mapping(train_df, test_df)

    # attach the mapped stats to every tweet row
    for frame in both_frames:
        if "user.created_at" not in frame.columns or quoted_mapping.empty:
            # no way to look anything up: leave both columns missing
            frame["user_friends_from_quoted"] = np.nan
            frame["user_followers_from_quoted"] = np.nan
            continue

        # look up each author's creation date in the quoted-user table
        created = frame["user.created_at"]
        frame["user_friends_from_quoted"] = created.map(
            quoted_mapping["mapped_user_friends_count"]
        )
        frame["user_followers_from_quoted"] = created.map(
            quoted_mapping["mapped_user_followers_count"]
        )

    # 3. generate cards
    print("\n3. Generating cards")

    train_cards = generate_user_cards(train_df, is_train=True)
    test_cards = generate_user_cards(test_df, is_train=False)

    print(f"  Train cards: {train_cards.shape}, Test cards: {test_cards.shape}")

    # how many users ended up with one card vs two
    cards_per_user = train_cards.groupby("user_key").size()
    n_single = (cards_per_user == 1).sum()
    n_double = (cards_per_user == 2).sum()

    print(f"\n  Distribution: 1 card={n_single} users, 2 cards={n_double} users")

    # 4. save

    train_cards.to_csv(OUT_TRAIN, index=False)
    test_cards.to_csv(OUT_TEST, index=False)

    print(f"\n[done] Saved: {OUT_TRAIN}")
    print(f"[done] Saved: {OUT_TEST}")

    # quick look at the label balance, counted once per user
    if "user_label" in train_cards.columns:
        per_user = train_cards.drop_duplicates("user_key")

        n_obs = (per_user["user_label"] == 0).sum()
        n_inf = (per_user["user_label"] == 1).sum()

        print(f"\nLabel distribution: Observer={n_obs}, Influencer={n_inf}")

    print("\n[done] Processing complete!")

if __name__ == "__main__":
    main()

Build user cards

1. Loading data
  Train: (154914, 193), Test: (103380, 191)

2. Preparing columns

3. Generating cards
  Train cards: (61337, 9), Test cards: (40894, 8)

  Distribution: 1 card=55 users, 2 cards=30641 users

[done] Saved: ./intermediate/user_cards_train.csv
[done] Saved: ./intermediate/user_cards_test.csv

Label distribution: Observer=16390, Influencer=14306

[done] Processing complete!


```text
Build user cards

1. Loading data
  Train: (154914, 193), Test: (103380, 191)

2. Preparing columns

3. Generating cards
  Train cards: (61337, 9), Test cards: (40894, 8)

  Distribution: 1 card=55 users, 2 cards=30641 users

[done] Saved: ./intermediate/user_cards_train.csv
[done] Saved: ./intermediate/user_cards_test.csv

Label distribution: Observer=16390, Influencer=14306

[done] Processing complete!
```
