In [25]:
#!pip install emoji catboost

In [26]:

from __future__ import annotations
import typing
import json
import pathlib
import os
import time
import datetime

import numpy as np
import pandas as pd

from nltk.corpus import stopwords
import emoji

import sklearn
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

import catboost

from tqdm import tqdm

import IPython
from IPython.display import display

In [27]:
# The presence of this environment variable is used as the "running on Kaggle" marker.
IS_KAGGLE = os.environ.get("KAGGLE_DOCKER_IMAGE") is not None

# Root folder holding the competition files: the mounted input dataset on Kaggle,
# the working directory everywhere else.
if IS_KAGGLE:
    DATASETS = pathlib.Path("/kaggle/input/influencers-or-observers-predicting-social-roles/Kaggle2025")
else:
    DATASETS = pathlib.Path(".")

DATASET_TRAIN = DATASETS.joinpath("train.jsonl")
DATASET_KAGGLE = DATASETS.joinpath("kaggle_test.jsonl")

# Folder in which the parquet copies of the parsed datasets are stored.
CACHE_DIR = pathlib.Path(".")

In [28]:
# Seed NumPy's global random generator so NumPy-based randomness repeats across runs.
np.random.seed(42)

# Data loading

In [29]:
def load_json(path: pathlib.Path, cache: bool = False) -> pd.DataFrame:
    """Load a JSON Lines file into a flattened DataFrame.

    Each non-blank line of ``path`` is parsed as one JSON document and the
    resulting records are flattened with ``pd.json_normalize`` (nested keys
    become dot-separated column names).

    Args:
        path: Location of the ``.jsonl`` file.
        cache: When true, a parquet copy named ``<stem>_raw.parquet`` inside
            ``CACHE_DIR`` is returned if it exists, and written after parsing
            otherwise.

    Returns:
        One row per JSON line.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.

    NOTE(review): list-valued cells presumably come back as Python lists from
    a fresh parse but as numpy arrays from the parquet cache - confirm before
    relying on either representation downstream.
    """
    path_pq = None
    if cache:
        # The cache location is only needed (and only resolved) when caching is requested.
        path_pq = (CACHE_DIR / path.name).with_stem(f"{path.stem}_raw").with_suffix(".parquet")
        if path_pq.exists():
            return pd.read_parquet(path_pq)

    # This leaves things to be desired, since there's no way to specify dtypes
    # and it assumes float instead of int, causing a loss in precision...
    # But I guess it only matters for ids, which we'll probably discard in preprocessing anyway
    records = [
        json.loads(line)
        for line in path.read_bytes().splitlines()
        if line.strip()  # blank / whitespace-only lines are not JSON documents
    ]
    result = pd.json_normalize(records)

    if path_pq is not None:
        path_pq.parent.mkdir(parents=True, exist_ok=True)
        result.to_parquet(path_pq)

    return result


In [30]:
# Read both datasets; cache=True lets load_json reuse (or create) its parquet copies.
train_data = load_json(DATASET_TRAIN, cache=True)
kaggle_data = load_json(DATASET_KAGGLE, cache=True)

# Preprocessing

In [31]:
_TWEET_TIMESTAMP_FORMAT = "%a %b %d %H:%M:%S %z %Y"


def _source_pair(parts):
    """Return ``parts`` when it is a two-item ``[url, name]`` split, otherwise ``pd.NA``."""
    try:
        return parts if len(parts) == 2 else pd.NA
    except TypeError:
        # A missing ``source`` reaches this point as NaN/None, which has no len().
        return pd.NA


def _epoch_seconds(column: pd.Series) -> pd.Series:
    """Parse ``created_at``-style strings into float seconds since the Unix epoch (NaN if unparsable)."""
    parsed = pd.to_datetime(column, format=_TWEET_TIMESTAMP_FORMAT, errors="coerce", utc=True)
    return (parsed - pd.Timestamp("1970-01-01", tz="UTC")).dt.total_seconds()


def preprocess(df: pd.DataFrame) -> pd.DataFrame:
    """First preprocessing pass over a raw (json-normalized) tweet frame.

    Steps, in order:
      1. replace dots in column names with underscores;
      2. add ``is_reply`` (whether ``in_reply_to_status_id`` is present);
      3. drop id-like and constant-valued columns (absent ones are ignored);
      4. add ``full_text`` (the extended text when present, else ``text``);
      5. split ``source`` into ``source_url`` / ``source_name``
         (``pd.NA`` when the value is missing or not in the expected anchor format);
      6. add ``misc_text``, a single string combining source name, reply target and quoted user;
      7. convert the four timestamp columns to float seconds since the epoch
         (NaN where the value is missing or unparsable).

    Args:
        df: Raw frame; it is not modified.

    Returns:
        A new, preprocessed frame.

    Raises:
        KeyError: If a column read by steps 2 or 4-7 is absent.
    """
    # For technical reasons, any text columns we want to use should have no dots in their names.
    # The simplest way to achieve this is to replace all dots indiscriminately.
    df = df.rename(columns=lambda x: x.replace(".", "_"))

    df["is_reply"] = df["in_reply_to_status_id"].notna()

    df = df.drop(
        columns=[
            "in_reply_to_status_id_str",
            # "in_reply_to_status_id",
            "in_reply_to_user_id_str",
            "in_reply_to_user_id",
            "quoted_status_id_str",
            "quoted_status_id",
            "id_str",
            "quoted_status_in_reply_to_status_id_str",
            "quoted_status_in_reply_to_status_id",
            "quoted_status_in_reply_to_user_id_str",
            "quoted_status_in_reply_to_user_id",
            "quoted_status_user_id_str",
            "quoted_status_user_id",
            # "quoted_status_permalink_expanded",
            "quoted_status_permalink_display",
            "quoted_status_permalink_url",
            "quoted_status_quoted_status_id",
            "quoted_status_quoted_status_id_str",
            # "quoted_status_place_id",
            # "place_id",
            "lang",  # Always "fr"
            "retweeted",  # Always False
            "filter_level",  # Always "low"
            "geo",  # Always None
            "place",  # Always None
            "coordinates",  # Always None
            "contributors",  # Always None
            "quote_count",  # Always 0
            "reply_count",  # Always 0
            "retweet_count",  # Always 0
            "favorite_count",  # Always 0
            "favorited",  # Always False
            "quoted_status_geo",  # Always None
            "quoted_status_place",  # Always None
            "quoted_status_coordinates",  # Always None
            "quoted_status_retweeted",  # Always False
            "quoted_status_filter_level",  # Always "low"
            "quoted_status_contributors",  # Always None
            "quoted_status_user_utc_offset",  # Always None
            "quoted_status_user_lang",  # Always None
            "quoted_status_user_time_zone",  # Always None
            "quoted_status_user_follow_request_sent",  # Always None
            "quoted_status_user_following",  # Always None
            "quoted_status_user_notifications",  # Always None
            "user_default_profile_image",  # Always False
            "user_protected",  # Always False
            "user_contributors_enabled",  # Always False
            "user_lang",  # Always None
            "user_notifications",  # Always None
            "user_following",  # Always None
            "user_utc_offset",  # Always None
            "user_time_zone",  # Always None
            "user_follow_request_sent",  # Always None
        ],
        # A dataset that never contained one of these columns is fine: the goal is only
        # that they are gone afterwards (same policy as the skip-column drop in preprocess2).
        errors="ignore",
    )

    # Prefer the extended text whenever it is present; fall back to the short text otherwise.
    extended_text = df["extended_tweet_full_text"]
    df["full_text"] = df["text"].where(extended_text.isna(), extended_text)

    # `source` looks like: <a href="URL" rel="nofollow">NAME</a>
    source_split = (
        df["source"]
        .str.removeprefix('<a href="')
        .str.removesuffix("</a>")
        .str.split('" rel="nofollow">')
        .map(_source_pair)
    )
    df["source_url"] = source_split.map(lambda pair: pair[0], na_action="ignore")
    df["source_name"] = source_split.map(lambda pair: pair[1], na_action="ignore")

    # Missing parts are rendered through str.format as-is (e.g. "nan" / "<NA>").
    df["misc_text"] = [
        "via: {0}; reply: @{1}; quote: @{2} {3}".format(*parts)
        for parts in zip(
            df["source_name"],
            df["in_reply_to_screen_name"],
            df["quoted_status_user_screen_name"],
            df["quoted_status_user_name"],
        )
    ]

    dt_cols = ["created_at", "quoted_status_created_at", "quoted_status_user_created_at", "user_created_at"]
    for col in dt_cols:
        # Plain float64 (NaN for missing) instead of an object column mixing floats and NaT.
        df[col] = _epoch_seconds(df[col])

    return df


def extract_full_text(tweet: pd.Series) -> str:
    """Return the tweet's extended text when it is present, otherwise its plain ``text``."""
    short_text, long_text = tweet["text"], tweet["extended_tweet_full_text"]
    return short_text if pd.isna(long_text) else long_text


In [32]:
# Separate the target from the features, then run the first preprocessing pass on both datasets.
y_train = train_data["label"]

X_train_pre = preprocess(train_data.drop(columns="label"))
X_kaggle_pre = preprocess(kaggle_data)

In [33]:
# For every training column: the dtype pandas infers once missing values are removed.
# Later cells consume (and shrink) this series while assigning a kind to each column.
inferred_dtypes = pd.Series({
    name: values.dropna().convert_dtypes().dtype
    for name, values in X_train_pre.items()
})

  if (arr.astype(int) == arr).all():
  if (arr.astype(int) == arr).all():


In [34]:
# Lift the row/column display limits just for this cell so every column is listed.
with pd.option_context("display.max_rows", None, "display.max_columns", None):
    display(inferred_dtypes)

in_reply_to_status_id                                           Float64
created_at                                                        Int64
source                                                   string[python]
in_reply_to_screen_name                                  string[python]
is_quote_status                                                 boolean
text                                                     string[python]
truncated                                                       boolean
timestamp_ms                                             string[python]
challenge_id                                                      Int64
quoted_status_extended_tweet_entities_urls                       object
quoted_status_extended_tweet_entities_hashtags                   object
quoted_status_extended_tweet_entities_user_mentions              object
quoted_status_extended_tweet_entities_symbols                    object
quoted_status_extended_tweet_full_text                   string[

In [35]:
# Kind assigned to each training column; everything starts out as "unknown"
# and is resolved step by step in the cells below.
column_kinds: dict[str, typing.Literal["unknown", "num", "bool", "text", "cat", "skip"]] = {
    name: "unknown" for name in X_train_pre.columns
}

def _mark_resolved():
    """Remove every column that already has a kind from ``inferred_dtypes`` (modified in place)."""
    settled = [name for name, kind in column_kinds.items() if kind != "unknown"]
    # errors="ignore": columns settled by an earlier call are no longer in the series.
    inferred_dtypes.drop(labels=settled, axis=0, errors="ignore", inplace=True)

In [36]:
# challenge_id is kept out of the model inputs; it is the row identifier
# written as the "ID" column of the submission file at the end of the notebook.
column_kinds["challenge_id"] = "skip"
_mark_resolved()

In [37]:
# Columns whose inferred dtype is a nullable integer or float are treated as numeric.
print("Inferring kind as num for:")
for col in inferred_dtypes.loc[(inferred_dtypes == "Int64") | (inferred_dtypes == "Float64")].index:
    column_kinds[col] = "num"
    print(">", col)

_mark_resolved()

Inferring kind as num for:
> in_reply_to_status_id
> created_at
> quoted_status_created_at
> quoted_status_retweet_count
> quoted_status_favorite_count
> quoted_status_quote_count
> quoted_status_reply_count
> quoted_status_user_friends_count
> quoted_status_user_listed_count
> quoted_status_user_favourites_count
> quoted_status_user_created_at
> quoted_status_user_statuses_count
> quoted_status_user_followers_count
> user_listed_count
> user_favourites_count
> user_created_at
> user_statuses_count


In [38]:
# Columns whose inferred dtype is the nullable boolean are treated as booleans.
print("Inferring kind as bool for:")
for col in inferred_dtypes.loc[inferred_dtypes == "boolean"].index:
    column_kinds[col] = "bool"
    print(">", col)

_mark_resolved()

Inferring kind as bool for:
> is_quote_status
> truncated
> quoted_status_is_quote_status
> quoted_status_favorited
> quoted_status_truncated
> quoted_status_user_default_profile_image
> quoted_status_user_is_translator
> quoted_status_user_protected
> quoted_status_user_geo_enabled
> quoted_status_user_verified
> quoted_status_user_contributors_enabled
> quoted_status_user_profile_background_tile
> quoted_status_user_profile_use_background_image
> quoted_status_user_default_profile
> user_is_translator
> user_geo_enabled
> user_profile_background_tile
> user_profile_use_background_image
> user_default_profile
> possibly_sensitive
> quoted_status_possibly_sensitive
> quoted_status_scopes_followers
> is_reply


In [39]:
# Still-unclassified columns whose non-missing values are all sequences.
# Both Python lists/tuples and numpy arrays are accepted: which of the two shows up
# depends on whether the data came straight from the JSON parse or from the parquet
# cache, and checking for np.ndarray alone leaves list-valued columns unclassified.
# Note: a column without any non-missing value also satisfies `.all()`.
list_like_cols: list[str] = [
    col
    for col in inferred_dtypes.index
    if X_train_pre[col].map(lambda x: isinstance(x, (list, tuple, np.ndarray)), na_action="ignore").all()
]

In [40]:
# Every list-like column is skipped for now, fixed-length ones included
# (a filter keeping only variable-length lists was tried and is disabled;
# splitting fixed-length lists into scalar columns is the TODO noted below).
print("Inferring kind as skip for:")
for col in list_like_cols:
    column_kinds[col] = "skip"
    print(">", col)

_mark_resolved()

# TODO: Split fixed-length list columns, aka the following, into separate scalar columns:
# tuple_cols = [
#     "coordinates_coordinates",
#     "display_text_range",
#     "entities_media",
#     "extended_tweet_display_text_range",
#     "geo_coordinates",
#     "place_bounding_box_coordinates",
#     "quoted_status_coordinates_coordinates",
#     "quoted_status_display_text_range",
#     "quoted_status_entities_media",
#     "quoted_status_extended_tweet_display_text_range",
#     "quoted_status_geo_coordinates",
#     "quoted_status_place_bounding_box_coordinates",
#     "quoted_status_withheld_in_countries",
#     "withheld_in_countries",
# ]

Inferring kind as skip for:
> quoted_status_extended_tweet_entities_urls
> quoted_status_extended_tweet_entities_hashtags
> quoted_status_extended_tweet_entities_user_mentions
> quoted_status_extended_tweet_entities_symbols
> quoted_status_extended_tweet_display_text_range
> quoted_status_entities_urls
> quoted_status_entities_hashtags
> quoted_status_entities_user_mentions
> quoted_status_entities_symbols
> entities_urls
> entities_hashtags
> entities_user_mentions
> entities_symbols
> display_text_range
> extended_tweet_entities_urls
> extended_tweet_entities_hashtags
> extended_tweet_entities_user_mentions
> extended_tweet_entities_symbols
> extended_tweet_display_text_range
> quoted_status_extended_entities_media
> quoted_status_entities_media
> quoted_status_display_text_range
> extended_tweet_extended_entities_media
> extended_tweet_entities_media
> quoted_status_extended_tweet_extended_entities_media
> quoted_status_extended_tweet_entities_media
> place_bounding_box_coordinates


In [41]:
# Remaining columns named "*_type" or "*_kind" are treated as categorical.
print("Inferring kind as cat for:")
for col in inferred_dtypes.index:
    if not col.endswith(("_type", "_kind")):
        continue
    column_kinds[col] = "cat"
    print(">", col)

_mark_resolved()

Inferring kind as cat for:
> quoted_status_user_translator_type
> user_translator_type
> place_bounding_box_type
> place_place_type
> quoted_status_place_bounding_box_type
> quoted_status_place_place_type
> quoted_status_geo_type
> quoted_status_coordinates_type
> geo_type
> coordinates_type


In [42]:
# Every remaining string column is treated as free text.
# The dtype is matched by class rather than by the literal "string[python]" so the
# selection does not depend on which string storage backend pandas picked.
print("Inferring kind as text for:")
for col in inferred_dtypes[
    inferred_dtypes.map(lambda dtype: isinstance(dtype, pd.StringDtype)).astype(bool)
].index:
    print(f"> {col}")
    column_kinds[col] = "text"

_mark_resolved()

Inferring kind as text for:
> source
> in_reply_to_screen_name
> text
> timestamp_ms
> quoted_status_extended_tweet_full_text
> quoted_status_source
> quoted_status_in_reply_to_screen_name
> quoted_status_text
> quoted_status_lang
> quoted_status_user_profile_image_url_https
> quoted_status_user_profile_background_image_url
> quoted_status_user_description
> quoted_status_user_profile_background_image_url_https
> quoted_status_user_screen_name
> quoted_status_user_profile_link_color
> quoted_status_user_profile_background_color
> quoted_status_user_profile_sidebar_border_color
> quoted_status_user_profile_text_color
> quoted_status_user_profile_image_url
> quoted_status_user_url
> quoted_status_user_profile_banner_url
> quoted_status_user_name
> quoted_status_user_location
> quoted_status_user_profile_sidebar_fill_color
> quoted_status_permalink_expanded
> user_profile_image_url_https
> user_profile_background_image_url
> user_description
> user_profile_background_image_url_https
> use

In [43]:
# Safety net: every column must have been given a kind by the cells above.
assert len(inferred_dtypes) == 0, "Some columns have not been categorized!"

In [44]:
# Switch from dict to Series so later cells can select column names with boolean masks.
# NOTE(review): this rebinds `column_kinds` to a different type than the cells above created.
column_kinds: pd.Series = pd.Series(column_kinds)

In [45]:
def preprocess2(df: pd.DataFrame) -> pd.DataFrame:
    """Second preprocessing pass: encode each column according to its kind in ``column_kinds``.

    * ``skip`` columns are dropped (absent ones are ignored);
    * ``bool`` columns become integers: True -> 1, False -> -1, missing -> 0;
    * ``cat`` columns become strings, with missing values replaced by ``"none"``;
    * ``text`` columns become strings, with missing values replaced by ``""``;
    * ``num`` columns get missing values replaced by 0 and are passed through ``convert_dtypes``.

    Args:
        df: Output of the first preprocessing pass; it is not modified.

    Returns:
        A new frame with the encodings above applied.

    Raises:
        KeyError: If a non-skip column listed in ``column_kinds`` is absent from ``df``.
    """
    def encode_tristate(value):
        # Any missing marker (None, NaN, pd.NA) maps to 0; a plain dict lookup
        # only knows None and raises KeyError on the others.
        if pd.isna(value):
            return 0
        return 1 if value else -1

    skip_cols = list(column_kinds[column_kinds == "skip"].index)
    df = df.drop(columns=skip_cols, errors="ignore")

    bool_cols = list(column_kinds[column_kinds == "bool"].index)
    df[bool_cols] = df[bool_cols].map(encode_tristate).astype(int)

    cat_cols = list(column_kinds[column_kinds == "cat"].index)
    df[cat_cols] = df[cat_cols].fillna("none").astype(str)

    text_cols = list(column_kinds[column_kinds == "text"].index)
    df[text_cols] = df[text_cols].fillna("").astype(str)

    num_cols = list(column_kinds[column_kinds == "num"].index)
    df[num_cols] = df[num_cols].fillna(0).convert_dtypes()

    return df


In [46]:
# Apply the kind-based encoding to both the training and the Kaggle test features.
X_train = preprocess2(X_train_pre)
X_kaggle = preprocess2(X_kaggle_pre)

  df[num_cols] = df[num_cols].fillna(0).convert_dtypes()
  if (arr.astype(int) == arr).all():
  if (arr.astype(int) == arr).all():
  df[num_cols] = df[num_cols].fillna(0).convert_dtypes()
  if (arr.astype(int) == arr).all():
  if (arr.astype(int) == arr).all():


# Models

In [47]:
# Hold out 15% of the labelled data for validation (fixed seed for a repeatable split).
X_train_for_real, X_val, y_train_for_real, y_val = train_test_split(X_train, y_train, test_size=0.15, random_state=42)

In [48]:
def make_pool(X, y) -> catboost.Pool:
    """Wrap features (and optional labels) in a CatBoost ``Pool``.

    Only the columns classified as num, cat, bool or text in ``column_kinds`` are
    kept, in that order; cat and bool columns are declared as categorical features
    and text columns as text features. ``y`` may be ``None`` for unlabelled data.
    """
    def columns_of(*kinds):
        return list(column_kinds[column_kinds.isin(kinds)].index)

    numeric = columns_of("num")
    categorical = columns_of("cat", "bool")
    textual = columns_of("text")

    return catboost.Pool(
        data=X[numeric + categorical + textual],
        label=y,
        cat_features=categorical,
        text_features=textual,
    )

# One pool per dataset; the Kaggle pool has no labels.
train_pool = make_pool(X_train_for_real, y_train_for_real)
val_pool = make_pool(X_val, y_val)
kaggle_pool = make_pool(X_kaggle, None)


In [None]:
# Grid-search tree depth, learning rate and iteration count for a GPU-trained CatBoost classifier.
# NOTE(review): the recorded run of this cell stopped with
#   "CatBoostError: ... Dictionary size is 0, check out data or try to decrease
#    occurrence_lower_bound parameter".
# Presumably at least one text feature ends up with an empty token dictionary - TODO confirm
# which column, then either stop passing it as a text feature or adjust the text processing.
model = catboost.CatBoostClassifier(
    loss_function="Logloss",
    random_seed=42,
    eval_metric="Accuracy",
    task_type="GPU",
    devices="0",
    # save_snapshot=True,
    # snapshot_interval=300,
    train_dir="./models/v12/gridsearch",
)

# Candidate values; every combination is evaluated by grid_search.
param_grid = {
    "depth": [4, 6, 8, 10],
    "learning_rate": [0.05, 0.1],
    "iterations": [1000, 2000],
}

# Only train_pool is searched on; val_pool is not passed here.
model.grid_search(
    param_grid,
    train_pool,
    verbose=100,
    plot=True,
    plot_file="models/v12/gridsearch/grid_search.info",
)


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.7984147	test: 0.7995519	best: 0.7995519 (0)	total: 148ms	remaining: 2m 27s
1:	learn: 0.8018986	test: 0.8007670	best: 0.8007670 (1)	total: 274ms	remaining: 2m 16s
2:	learn: 0.8072432	test: 0.8066145	best: 0.8066145 (2)	total: 359ms	remaining: 1m 59s
3:	learn: 0.8074616	test: 0.8077916	best: 0.8077916 (3)	total: 439ms	remaining: 1m 49s
4:	learn: 0.8075470	test: 0.8083991	best: 0.8083991 (4)	total: 516ms	remaining: 1m 42s
5:	learn: 0.8116670	test: 0.8118165	best: 0.8118165 (5)	total: 607ms	remaining: 1m 40s
6:	learn: 0.8118568	test: 0.8121583	best: 0.8121583 (6)	total: 710ms	remaining: 1m 40s
7:	learn: 0.8111639	test: 0.8112470	best: 0.8121583 (6)	total: 785ms	remaining: 1m 37s
8:	learn: 0.8112778	test: 0.8118165	best: 0.8121583 (6)	total: 859ms	remaining: 1m 34s
9:	learn: 0.8117999	test: 0.8118165	best: 0.8121583 (6)	total: 938ms	remaining: 1m 32s
10:	learn: 0.8119423	test: 0.8121962	best: 0.8121962 (10)	total: 1.01s	remaining: 1m 30s
11:	learn: 0.8125878	test: 0.8110951	best

CatBoostError: catboost/private/libs/feature_estimator/text_feature_estimators.cpp:89: Dictionary size is 0, check out data or try to decrease occurrence_lower_bound parameter

In [None]:
# Persist the fitted model (.cbm file) next to the grid-search artifacts.
model.save_model("models/v12/best.cbm")

In [None]:
# Predict labels for the Kaggle test set.
# `devices` is a training option, not a `predict` argument, so passing it raises a TypeError.
# Prediction is left on the default (CPU) evaluator.
# NOTE(review): GPU model application appears to be limited to models without
# categorical/text features - confirm before re-adding task_type="GPU" here.
preds = model.predict(kaggle_pool)

In [None]:
# Build the submission table (row identifier + predicted label) and write it as CSV without the index.
output = pd.DataFrame({"ID": kaggle_data["challenge_id"], "Prediction": preds})
output.to_csv("models/v12/predictions-v12.csv", index=False)