In [37]:
#!pip install emoji catboost

In [38]:

from __future__ import annotations
import typing
import json
import pathlib
import os
import hashlib

import numpy as np
import pandas as pd

from nltk.corpus import stopwords
import emoji

import sklearn
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, GroupShuffleSplit

import catboost

from tqdm import tqdm

import IPython
from IPython.display import display

In [39]:
# Kaggle presence is inferred from the KAGGLE_DOCKER_IMAGE environment variable.
IS_KAGGLE = os.environ.get("KAGGLE_DOCKER_IMAGE") is not None

# Directory holding the competition files: the Kaggle input mount when running
# there, otherwise the current working directory.
if IS_KAGGLE:
    DATASETS = pathlib.Path("/kaggle/input/influencers-or-observers-predicting-social-roles/Kaggle2025")
else:
    DATASETS = pathlib.Path(".")

DATASET_TRAIN = DATASETS.joinpath("train.jsonl")
DATASET_KAGGLE = DATASETS.joinpath("kaggle_test.jsonl")

# Where load_json() keeps its Parquet caches.
CACHE_DIR = pathlib.Path(".")

In [40]:
# Seed NumPy's legacy global RNG so that later np.random.* draws are repeatable.
np.random.seed(42)

# Data loading

In [41]:
def load_json(path: pathlib.Path, cache: bool = False) -> pd.DataFrame:
    """Load a JSON Lines file into a flattened DataFrame, optionally cached as Parquet.

    Nested objects are flattened by ``pd.json_normalize`` into dot-separated
    column names (e.g. ``user.name``).

    Args:
        path: Path to the ``.jsonl`` file (one JSON document per line).
        cache: When true, a Parquet copy named ``<stem>_raw.parquet`` is kept in
            ``CACHE_DIR``; if it already exists it is read instead of the JSON.

    Returns:
        One row per non-blank input line.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    path_pq = None
    if cache:
        # Only build the cache path when caching is requested.
        path_pq = (CACHE_DIR / path.name).with_stem(f"{path.stem}_raw").with_suffix(".parquet")
        if path_pq.exists():
            return pd.read_parquet(path_pq)

    # This leaves things to be desired, since there's no way to specify dtypes
    # and it assumes float instead of int, causing a loss in precision...
    # But I guess it only matters for ids, which we'll probably discard in preprocessing anyway
    # Blank / whitespace-only lines are skipped instead of crashing json.loads.
    records = [json.loads(line) for line in path.read_bytes().splitlines() if line.strip()]
    result = pd.json_normalize(records)

    if cache:
        result.to_parquet(path_pq)
        # Return the round-tripped frame so the first (cache-miss) run yields the
        # same value representation as every later cache-hit run.
        return pd.read_parquet(path_pq)

    return result


In [42]:
# Load both datasets, reusing the Parquet caches in CACHE_DIR when they exist.
train_data = load_json(DATASET_TRAIN, cache=True)
kaggle_data = load_json(DATASET_KAGGLE, cache=True)

# Preprocessing

In [47]:
# Peek at the raw profile-image URLs. The column name still contains a dot
# because this frame has not gone through preprocess() yet.
train_data["user.profile_image_url"]

0         http://pbs.twimg.com/profile_images/1366376468...
1         http://pbs.twimg.com/profile_images/1169958485...
2         http://pbs.twimg.com/profile_images/3581457682...
3         http://pbs.twimg.com/profile_images/1325759926...
4         http://pbs.twimg.com/profile_images/1302966540...
                                ...                        
154909    http://pbs.twimg.com/profile_images/1280896070...
154910    http://pbs.twimg.com/profile_images/1197061859...
154911    http://pbs.twimg.com/profile_images/1376673696...
154912    http://pbs.twimg.com/profile_images/1376265062...
154913    http://pbs.twimg.com/profile_images/1361816836...
Name: user.profile_image_url, Length: 154914, dtype: object

In [50]:
def preprocess(df: pd.DataFrame) -> pd.DataFrame:
    """First-stage cleanup of a raw tweet frame as returned by ``load_json``.

    Steps, in order:
      1. replace dots in column names with underscores;
      2. add ``is_reply`` and ``user_hash`` (a hash of the user's description,
         creation date and profile image URL, falling back to the tweet's
         ``id_str`` when all three are missing);
      3. drop id-like columns and columns annotated below as constant;
      4. add ``full_text``, ``source_url``, ``source_name`` and ``misc_text``;
      5. convert the ``*created_at`` columns to POSIX timestamps
         (unparseable values are coerced to missing).

    The input frame is not modified: the initial ``rename`` returns a new frame.

    Args:
        df: Raw frame with dot-separated column names.

    Returns:
        The cleaned frame.

    Raises:
        KeyError: If one of the columns referenced below is absent.
    """
    # For technical reasons, any text columns we want to use should have no dots in their names.
    # The simplest way to achieve this is to replace all dots indiscriminately.

    df = df.rename(columns=lambda x: x.replace(".", "_"))

    df["is_reply"] = df["in_reply_to_status_id"].notna()

    user_id_key = ["user_description", "user_created_at", "user_profile_image_url"]
    user_key_frame = df[user_id_key]
    user_key_text = user_key_frame.fillna("<NA>").astype(str).agg(''.join, axis=1)
    # Rows with no user information at all are keyed by their own tweet id
    # instead of all collapsing into one shared "<NA><NA><NA>" hash.
    has_no_user_info = user_key_frame.isna().all(axis=1)
    df["user_hash"] = user_key_text.where(~has_no_user_info, df["id_str"]).map(fast_hash)

    # Each label is listed once (the original list repeated two of them).
    df = df.drop(columns=[
        "in_reply_to_status_id_str",
        # "in_reply_to_status_id",
        "in_reply_to_user_id_str",
        "in_reply_to_user_id",
        "quoted_status_id_str",
        "quoted_status_id",
        "id_str",
        "quoted_status_in_reply_to_status_id_str",
        "quoted_status_in_reply_to_status_id",
        "quoted_status_in_reply_to_user_id_str",
        "quoted_status_in_reply_to_user_id",
        "quoted_status_user_id_str",
        "quoted_status_user_id",
        # "quoted_status_permalink_expanded",
        "quoted_status_permalink_display",
        "quoted_status_permalink_url",
        "quoted_status_quoted_status_id",
        "quoted_status_quoted_status_id_str",
        # "quoted_status_place_id",
        # "place_id",
        "lang",  # Always "fr"
        "retweeted",  # Always False
        "filter_level",  # Always "low"
        "geo",  # Always None
        "place",  # Always None
        "coordinates",  # Always None
        "contributors",  # Always None
        "quote_count",  # Always 0
        "reply_count",  # Always 0
        "retweet_count",  # Always 0
        "favorite_count",  # Always 0
        "favorited",  # Always False
        "quoted_status_geo",  # Always None
        "quoted_status_place",  # Always None
        "quoted_status_coordinates",  # Always None
        "quoted_status_retweeted",  # Always False
        "quoted_status_filter_level",  # Always "low"
        "quoted_status_contributors",  # Always None
        "quoted_status_user_utc_offset",  # Always None
        "quoted_status_user_lang",  # Always None
        "quoted_status_user_time_zone",  # Always None
        "quoted_status_user_follow_request_sent",  # Always None
        "quoted_status_user_following",  # Always None
        "quoted_status_user_notifications",  # Always None
        "user_default_profile_image",  # Always False
        "user_protected",  # Always False
        "user_contributors_enabled",  # Always False
        "user_lang",  # Always None
        "user_notifications",  # Always None
        "user_following",  # Always None
        "user_utc_offset",  # Always None
        "user_time_zone",  # Always None
        "user_follow_request_sent",  # Always None
    ])

    df["full_text"] = df.apply(extract_full_text, axis=1)

    # `source` looks like: <a href="URL" rel="nofollow">NAME</a>
    # Split it into [URL, NAME]; anything that does not split into exactly two
    # parts (including a missing `source`, which stays a scalar NA through the
    # .str accessors) becomes pd.NA instead of raising on len().
    source_split = (
        df["source"]
        .str.removeprefix("<a href=\"")
        .str.removesuffix("</a>")
        .str.split("\" rel=\"nofollow\">")
        .map(lambda parts: parts if pd.api.types.is_list_like(parts) and len(parts) == 2 else pd.NA)
    )
    df["source_url"] = source_split.map(lambda x: x[0], na_action="ignore")
    df["source_name"] = source_split.map(lambda x: x[1], na_action="ignore")

    # One free-text field gathering the client name and the reply / quote targets.
    # Missing values are rendered by str.format as-is (e.g. "nan").
    df["misc_text"] = df.apply(
        lambda x: "via: {0}; reply: @{1}; quote: @{2} {3}".format(x["source_name"], x["in_reply_to_screen_name"], x["quoted_status_user_screen_name"], x["quoted_status_user_name"]), axis=1,
    )

    # Parse the date strings, then turn each parsed Timestamp into a POSIX timestamp;
    # values that fail to parse are coerced to missing and left untouched by the map.
    dt_cols = ["created_at", "quoted_status_created_at", "quoted_status_user_created_at", "user_created_at"]
    df[dt_cols] = df[dt_cols].apply(pd.to_datetime, format="%a %b %d %H:%M:%S %z %Y", errors="coerce").map(pd.Timestamp.timestamp, na_action="ignore")

    return df

def extract_full_text(tweet: pd.Series) -> str:
    """Return the tweet's extended full text when present, else its plain ``text``.

    Args:
        tweet: A row exposing ``text`` and ``extended_tweet_full_text``.

    Returns:
        ``extended_tweet_full_text`` unless it is missing, in which case ``text``.
    """
    short_text: str = tweet["text"]
    extended_text = tweet["extended_tweet_full_text"]
    return short_text if pd.isna(extended_text) else extended_text

def fast_hash(content: str) -> str:
    """Return the 16-byte BLAKE2s digest of ``content`` (UTF-8 encoded) as 32 hex characters."""
    return hashlib.blake2s(content.encode('utf-8'), digest_size=16).hexdigest()


In [51]:
# Separate the target from the features, then run the first preprocessing
# stage on both the training features and the Kaggle test frame.
y_train = train_data["label"]

X_train_pre = preprocess(train_data.drop("label", axis=1))
X_kaggle_pre = preprocess(kaggle_data)

In [52]:
# For every column, the dtype pandas infers once missing values are removed.
inferred_dtypes = pd.Series({
    name: values.dropna().convert_dtypes().dtype
    for name, values in X_train_pre.items()
})


invalid value encountered in cast


invalid value encountered in cast



In [53]:
# Lift pandas' row/column display limits for this one call so every column is listed.
with pd.option_context("display.max_rows", None, "display.max_columns", None):
    display(inferred_dtypes)

in_reply_to_status_id                                           Float64
created_at                                                        Int64
source                                                   string[python]
in_reply_to_screen_name                                  string[python]
is_quote_status                                                 boolean
text                                                     string[python]
truncated                                                       boolean
timestamp_ms                                             string[python]
challenge_id                                                      Int64
quoted_status_extended_tweet_entities_urls                       object
quoted_status_extended_tweet_entities_hashtags                   object
quoted_status_extended_tweet_entities_user_mentions              object
quoted_status_extended_tweet_entities_symbols                    object
quoted_status_extended_tweet_full_text                   string[

In [54]:
# Kind assigned to each column; everything starts as "unknown" and is resolved
# step by step by the cells below.
column_kinds: dict[str, typing.Literal["unknown", "num", "bool", "text", "cat", "skip"]] = \
    dict.fromkeys(X_train_pre.columns, "unknown")

def _mark_resolved():
    """Remove from ``inferred_dtypes`` every column whose kind is no longer "unknown".

    Mutates the global ``inferred_dtypes`` in place; columns that were already
    removed are ignored.
    """
    resolved_columns = [name for name, kind in column_kinds.items() if kind != "unknown"]
    inferred_dtypes.drop(resolved_columns, axis=0, errors="ignore", inplace=True)

In [55]:
# challenge_id is marked "skip": it is the row identifier written to the
# submission files, not a feature.
column_kinds["challenge_id"] = "skip"
_mark_resolved()

In [56]:
# Every still-unresolved column whose inferred dtype is a nullable integer or
# float is treated as numeric.
print("Inferring kind as num for:")
for col in inferred_dtypes[(inferred_dtypes == "Int64") | (inferred_dtypes == "Float64")].index:
    print(f"> {col}")
    column_kinds[col] = "num"

_mark_resolved()

Inferring kind as num for:
> in_reply_to_status_id
> created_at
> quoted_status_created_at
> quoted_status_retweet_count
> quoted_status_favorite_count
> quoted_status_quote_count
> quoted_status_reply_count
> quoted_status_user_friends_count
> quoted_status_user_listed_count
> quoted_status_user_favourites_count
> quoted_status_user_created_at
> quoted_status_user_statuses_count
> quoted_status_user_followers_count
> user_listed_count
> user_favourites_count
> user_created_at
> user_statuses_count


In [57]:
# Every still-unresolved column whose inferred dtype is nullable boolean is
# treated as a boolean feature.
print("Inferring kind as bool for:")
for col in inferred_dtypes[inferred_dtypes == "boolean"].index:
    print(f"> {col}")
    column_kinds[col] = "bool"

_mark_resolved()

Inferring kind as bool for:
> is_quote_status
> truncated
> quoted_status_is_quote_status
> quoted_status_favorited
> quoted_status_truncated
> quoted_status_user_default_profile_image
> quoted_status_user_is_translator
> quoted_status_user_protected
> quoted_status_user_geo_enabled
> quoted_status_user_verified
> quoted_status_user_contributors_enabled
> quoted_status_user_profile_background_tile
> quoted_status_user_profile_use_background_image
> quoted_status_user_default_profile
> user_is_translator
> user_geo_enabled
> user_profile_background_tile
> user_profile_use_background_image
> user_default_profile
> possibly_sensitive
> quoted_status_possibly_sensitive
> quoted_status_scopes_followers
> is_reply


In [58]:
# Still-unresolved columns whose every non-missing value is a sequence.
# Sequences read back from the Parquet cache are np.ndarray, while a frame built
# straight from JSON holds plain Python lists, so both forms are accepted.
# Missing values are skipped by na_action="ignore" and count as truthy in .all().
list_like_cols: list[str] = [
    col
    for col in inferred_dtypes.index
    if X_train_pre[col].map(lambda x: isinstance(x, (list, tuple, np.ndarray)), na_action="ignore").all()
]

In [59]:
# List-valued columns are not used as features for now, so all of them are skipped.
print("Inferring kind as skip for:")
for col in list_like_cols:
    # # Variable-length lists
    # if X_train[col].dropna().map(len).unique().size > 2:
    print(f"> {col}")
    column_kinds[col] = "skip"

_mark_resolved()

# TODO: Split fixed-length list columns, aka the following, into separate scalar columns:
# tuple_cols = [
#     "coordinates_coordinates",
#     "display_text_range",
#     "entities_media",
#     "extended_tweet_display_text_range",
#     "geo_coordinates",
#     "place_bounding_box_coordinates",
#     "quoted_status_coordinates_coordinates",
#     "quoted_status_display_text_range",
#     "quoted_status_entities_media",
#     "quoted_status_extended_tweet_display_text_range",
#     "quoted_status_geo_coordinates",
#     "quoted_status_place_bounding_box_coordinates",
#     "quoted_status_withheld_in_countries",
#     "withheld_in_countries",
# ]

Inferring kind as skip for:
> quoted_status_extended_tweet_entities_urls
> quoted_status_extended_tweet_entities_hashtags
> quoted_status_extended_tweet_entities_user_mentions
> quoted_status_extended_tweet_entities_symbols
> quoted_status_extended_tweet_display_text_range
> quoted_status_entities_urls
> quoted_status_entities_hashtags
> quoted_status_entities_user_mentions
> quoted_status_entities_symbols
> entities_urls
> entities_hashtags
> entities_user_mentions
> entities_symbols
> display_text_range
> extended_tweet_entities_urls
> extended_tweet_entities_hashtags
> extended_tweet_entities_user_mentions
> extended_tweet_entities_symbols
> extended_tweet_display_text_range
> quoted_status_extended_entities_media
> quoted_status_entities_media
> quoted_status_display_text_range
> extended_tweet_extended_entities_media
> extended_tweet_entities_media
> quoted_status_extended_tweet_extended_entities_media
> quoted_status_extended_tweet_entities_media
> place_bounding_box_coordinates


In [None]:
# Columns named "*_type" or "*_kind" are treated as categorical features.
print("Inferring kind as cat for:")
for col in inferred_dtypes.index:
    col: str
    if col.endswith(("_type", "_kind")):
        print(f"> {col}")
        column_kinds[col] = "cat"

_mark_resolved()

Inferring kind as cat for:
> quoted_status_user_translator_type
> user_translator_type
> place_bounding_box_type
> place_place_type
> quoted_status_place_bounding_box_type
> quoted_status_place_place_type
> quoted_status_geo_type
> quoted_status_coordinates_type
> geo_type
> coordinates_type


In [61]:
# Hand-picked free-text columns; these are the ones later passed to CatBoost
# as text features.
print("Inferring kind as text for:")
for col in [
    "full_text",
    "user_description",
    "misc_text",
    "quoted_status_text",
    "quoted_status_extended_tweet_full_text",
    "quoted_status_user_description",
]:
    print(f"> {col}")
    column_kinds[col] = "text"

_mark_resolved()

Inferring kind as text for:
> full_text
> user_description
> misc_text
> quoted_status_text
> quoted_status_extended_tweet_full_text
> quoted_status_user_description


In [62]:
# Any string column that has not been claimed by an earlier step is skipped.
print("Inferring kind as skip for:")
for col in inferred_dtypes[inferred_dtypes == "string[python]"].index:
    print(f"> {col}")
    column_kinds[col] = "skip"

_mark_resolved()

Inferring kind as skip for:
> source
> in_reply_to_screen_name
> text
> timestamp_ms
> quoted_status_source
> quoted_status_in_reply_to_screen_name
> quoted_status_lang
> quoted_status_user_profile_image_url_https
> quoted_status_user_profile_background_image_url
> quoted_status_user_profile_background_image_url_https
> quoted_status_user_screen_name
> quoted_status_user_profile_link_color
> quoted_status_user_profile_background_color
> quoted_status_user_profile_sidebar_border_color
> quoted_status_user_profile_text_color
> quoted_status_user_profile_image_url
> quoted_status_user_url
> quoted_status_user_profile_banner_url
> quoted_status_user_name
> quoted_status_user_location
> quoted_status_user_profile_sidebar_fill_color
> quoted_status_permalink_expanded
> user_profile_image_url_https
> user_profile_background_image_url
> user_profile_background_image_url_https
> user_profile_link_color
> user_profile_background_color
> user_profile_sidebar_border_color
> user_profile_text_color

In [63]:
# Sanity check: every column must have been given a kind by the cells above.
assert len(inferred_dtypes) == 0, "Some columns have not been categorized!"

In [64]:
# Rebind column_kinds from a dict to a Series so later code can select columns
# with boolean masks (column_kinds == "num", .isin([...])).
# NOTE(review): the name changes type here; cells above expect the dict form.
column_kinds: pd.Series = pd.Series(column_kinds)

In [70]:
def preprocess2(df: pd.DataFrame) -> pd.DataFrame:
    """Second-stage preprocessing: encode / fill columns according to ``column_kinds``.

    - "bool" columns become integers: True -> 1, False -> -1, missing -> 0;
    - "cat" columns have missing values replaced by "none" and are cast to str;
    - "text" columns have missing values replaced by "" and are cast to str;
    - "num" columns have missing values replaced by 0 and go through ``convert_dtypes``.

    Columns of any other kind are left as they are.

    Args:
        df: Frame produced by ``preprocess``.

    Returns:
        A new frame; the input is left untouched so this can safely be re-run
        on the same input.

    Raises:
        KeyError: If a "bool" column holds a value other than True, False or a
            missing value.
    """
    # skip_cols = list(column_kinds[column_kinds == "skip"].index)
    # df = df.drop(columns=skip_cols, errors="ignore")

    # Work on a copy: assigning into the caller's frame would make a second run
    # start from already-encoded values.
    df = df.copy()

    bool_codes = {True: 1, False: -1, None: 0}
    bool_cols = list(column_kinds[column_kinds == "bool"].index)
    # Any missing marker (None, NaN, pd.NA) is normalised to None before the
    # lookup; unexpected values still raise KeyError.
    df[bool_cols] = df[bool_cols].map(
        lambda value: bool_codes[None if pd.isna(value) else value]
    ).astype(int)

    cat_cols = list(column_kinds[column_kinds == "cat"].index)
    df[cat_cols] = df[cat_cols].fillna("none").astype(str)

    text_cols = list(column_kinds[column_kinds == "text"].index)
    df[text_cols] = df[text_cols].fillna("").astype(str)

    num_cols = list(column_kinds[column_kinds == "num"].index)
    df[num_cols] = df[num_cols].fillna(0).convert_dtypes()

    return df


In [71]:
# Second preprocessing stage, applied identically to the training and Kaggle frames.
X_train = preprocess2(X_train_pre)
X_kaggle = preprocess2(X_kaggle_pre)


Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`


invalid value encountered in cast


invalid value encountered in cast


Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`


invalid value encountered in cast


invalid value encountered in cast



# Models

In [72]:
def train_test_split_group(
    X: pd.DataFrame,
    y: pd.Series,
    group: pd.Series,
    test_size: float = 0.15,
    random_state: int = 42
) -> tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
    """Split ``X`` / ``y`` once with ``GroupShuffleSplit``, grouping rows by ``group``.

    Args:
        X: Feature frame.
        y: Labels, positionally aligned with ``X``.
        group: Group label of each row, positionally aligned with ``X``.
        test_size: Passed to ``GroupShuffleSplit`` as ``test_size``.
        random_state: Seed passed to ``GroupShuffleSplit``.

    Returns:
        ``(X_train, X_val, y_train, y_val)``.
    """
    splitter = GroupShuffleSplit(
        n_splits=1,
        test_size=test_size,
        random_state=random_state,
    )

    # n_splits=1, so the first (and only) split is the one we want.
    fit_rows, holdout_rows = next(splitter.split(X, y, groups=group))

    X_fit, X_holdout = X.iloc[fit_rows], X.iloc[holdout_rows]
    y_fit, y_holdout = y.iloc[fit_rows], y.iloc[holdout_rows]
    return X_fit, X_holdout, y_fit, y_holdout


In [73]:
# Hold out 15% for validation, grouping by user_hash so that rows sharing a
# user_hash all land on the same side of the split.
X_train_for_real, X_val, y_train_for_real, y_val = train_test_split_group(
    X_train,
    y_train,
    X_train["user_hash"],
    test_size=0.15,
    random_state=42,
)

In [74]:
def make_pool(X, y) -> catboost.Pool:
    """Build a CatBoost ``Pool`` from the columns selected by ``column_kinds``.

    Only "num", "cat", "bool" and "text" columns are kept, in that group order
    ("cat" and "bool" columns are both declared as categorical features).

    Args:
        X: Frame produced by ``preprocess2``.
        y: Labels, or ``None`` for unlabeled data.

    Returns:
        The pool wrapping the selected columns.
    """
    def _columns_of(*kinds: str) -> list[str]:
        # Column names whose kind is one of `kinds`, in column_kinds order.
        return list(column_kinds[column_kinds.isin(list(kinds))].index)

    numeric = _columns_of("num")
    categorical = _columns_of("cat", "bool")
    textual = _columns_of("text")

    selected = X[numeric + categorical + textual]

    return catboost.Pool(
        data=selected,
        label=y,
        cat_features=categorical,
        text_features=textual,
    )

# Pools for fitting, validation and the unlabeled Kaggle test set.
train_pool = make_pool(X_train_for_real, y_train_for_real)
val_pool = make_pool(X_val, y_val)
kaggle_pool = make_pool(X_kaggle, None)


In [75]:
# Classifier configuration. task_type="GPU" with devices="0" requests training
# on GPU 0; CatBoost's training artefacts are written under train_dir.
model = catboost.CatBoostClassifier(
    depth=10,
    learning_rate=0.1,
    iterations=2000,
    loss_function="Logloss",
    eval_metric="Accuracy",
    train_dir="./models/v12/t1",
    task_type="GPU",
    devices="0",
    random_seed=42,
)


In [None]:
# param_grid = {
#     "depth": [4, 6, 8, 10],
#     "learning_rate": [0.05, 0.1],
#     "iterations": [1000, 2000],
# }

# model.grid_search(
#     param_grid,
#     train_pool,
#     verbose=100,
#     plot=True,
#     plot_file="models/v12/gridsearch/grid_search.info",
# )

In [76]:
# Fit on the training pool while tracking the eval metric on the validation
# pool (logged every 100 iterations), then save the fitted model to disk.
model.fit(
    train_pool,
    eval_set=val_pool,
    verbose=100,
    plot=True,
    plot_file="models/v12/t1/fit.info",
)

model.save_model("models/v12/t1/model.cbm")

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.8752897	test: 0.8184275	best: 0.8184275 (0)	total: 297ms	remaining: 9m 54s
100:	learn: 0.9139748	test: 0.8285935	best: 0.8285935 (95)	total: 11s	remaining: 3m 26s
200:	learn: 0.9228805	test: 0.8334406	best: 0.8338266 (198)	total: 20.3s	remaining: 3m 1s
300:	learn: 0.9298410	test: 0.8347274	best: 0.8347274 (300)	total: 29.7s	remaining: 2m 47s
400:	learn: 0.9353196	test: 0.8356711	best: 0.8361858 (395)	total: 38.9s	remaining: 2m 35s
500:	learn: 0.9391874	test: 0.8367434	best: 0.8367434 (488)	total: 48s	remaining: 2m 23s
600:	learn: 0.9425232	test: 0.8380732	best: 0.8381590 (594)	total: 57.2s	remaining: 2m 13s
700:	learn: 0.9455247	test: 0.8381590	best: 0.8385021 (688)	total: 1m 6s	remaining: 2m 3s
800:	learn: 0.9487618	test: 0.8384592	best: 0.8387595 (792)	total: 1m 17s	remaining: 1m 55s
900:	learn: 0.9512618	test: 0.8395316	best: 0.8396603 (895)	total: 1m 27s	remaining: 1m 47s
1000:	learn: 0.9538225	test: 0.8397890	best: 0.8403895 (973)	total: 1m 38s	remaining: 1m 38s
1100:	

In [77]:
# Predicted labels for the Kaggle test set.
preds = model.predict(kaggle_pool)

In [85]:
def reconcile_answers(
    preds: typing.Sequence[int],
    user_hashes: typing.Optional[typing.Sequence[str]] = None,
) -> typing.Sequence[int]:
    """Make the predictions of each user consistent by majority vote.

    For every user whose rows received both labels, all of that user's rows are
    set to the label predicted most often for them; exact ties are broken with
    ``np.random.randint``. Users whose rows already agree are left unchanged.

    Args:
        preds: Predicted labels (0 or 1), one per row.
        user_hashes: User key of each row, positionally aligned with ``preds``.
            Defaults to ``X_kaggle["user_hash"]``.

    Returns:
        A list of reconciled labels in the original row order.

    Raises:
        ValueError: If ``preds`` and ``user_hashes`` differ in length.
    """
    if user_hashes is None:
        user_hashes = X_kaggle["user_hash"].tolist()

    users = list(user_hashes)
    labels = [int(label) for label in np.asarray(preds).ravel()]
    if len(users) != len(labels):
        raise ValueError(
            f"Got {len(labels)} predictions for {len(users)} rows"
        )

    # Per user: [number of rows predicted 0, number of rows predicted 1].
    per_user_stats: dict[str, list[int]] = dict()
    for user, label in zip(users, labels):
        per_user_stats.setdefault(user, [0, 0])[label] += 1

    per_user_correct: dict[str, int] = dict()
    for user, (zeros, ones) in per_user_stats.items():
        if zeros == 0 or ones == 0:
            continue  # Already unanimous.

        # The coin is drawn for every disagreeing user, tie or not, so the
        # sequence of numbers consumed from the global RNG does not depend on
        # how many ties there are.
        coin = int(np.random.randint(0, 2))
        if zeros > ones:
            per_user_correct[user] = 0
        elif ones > zeros:
            per_user_correct[user] = 1
        else:
            per_user_correct[user] = coin

    return [per_user_correct.get(user, label) for user, label in zip(users, labels)]

In [78]:
# Submission file built from the raw model predictions.
output = pd.DataFrame({"ID": kaggle_data["challenge_id"], "Prediction": preds})
output.to_csv("models/v12/predictions-v12.csv", index=False)

In [86]:
# Submission file built from the predictions after per-user reconciliation.
output = pd.DataFrame({"ID": kaggle_data["challenge_id"], "Prediction": reconcile_answers(preds)})
output.to_csv("models/v12/predictions-v12-reconciled.csv", index=False)