In [139]:
from __future__ import annotations
import typing
import json
import pathlib
import os

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import transformers
import transformers.modeling_outputs
from transformers import AutoTokenizer, AutoModel

from nltk.corpus import stopwords

import sklearn
from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline

In [67]:
# Pick the dataset root: inside a Kaggle kernel (detected through the
# KAGGLE_DOCKER_IMAGE environment variable) the competition files live under
# /kaggle/input; everywhere else they are expected in the working directory.
if "KAGGLE_DOCKER_IMAGE" in os.environ:
    DATASETS = pathlib.Path(
        "/kaggle/input/influencers-or-observers-predicting-social-roles/Kaggle2025"
    )
else:
    DATASETS = pathlib.Path(".")

# Labelled training split and the unlabelled split used for the submission.
DATASET_TRAIN = DATASETS.joinpath("train.jsonl")
DATASET_KAGGLE = DATASETS.joinpath("kaggle_test.jsonl")

# Directory that receives the Parquet caches written by load_json().
CACHE_DIR = pathlib.Path(".")

# Data loading

In [68]:
def load_json(path: pathlib.Path, cache: bool = False) -> pd.DataFrame:
    """Load a JSON Lines file into a flattened DataFrame.

    Every non-blank line of ``path`` is parsed as one JSON document and the
    resulting records are flattened with ``pd.json_normalize`` (nested keys
    become dotted column names such as ``quoted_status.user.id``).

    Args:
        path: Location of the ``.jsonl`` file.
        cache: When true, the parsed frame is stored as
            ``CACHE_DIR / "<stem>_raw.parquet"`` and later calls return that
            file instead of re-parsing. The cache is never invalidated here:
            delete the Parquet file if the source data changes.

    Returns:
        One row per JSON document.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    path_pq = None
    if cache:
        # Only build the cache path when caching is requested.
        path_pq = (CACHE_DIR / path.name).with_stem(f"{path.stem}_raw").with_suffix(".parquet")
        if path_pq.exists():
            return pd.read_parquet(path_pq)

    # Blank lines (e.g. a trailing newline followed by whitespace) are skipped
    # instead of being handed to json.loads, which would raise on them.
    records = [
        json.loads(line)
        for line in path.read_bytes().splitlines()
        if line.strip()
    ]

    # This leaves things to be desired, since there's no way to specify dtypes
    # here: ID columns come out as floats (see the preview of the frame), which
    # loses precision on large IDs. It only matters for ids, which are
    # discarded in preprocessing anyway.
    result = pd.json_normalize(records)

    if path_pq is not None:
        result.to_parquet(path_pq)

    return result


In [69]:
# Parse both competition splits; with cache=True later runs reuse the Parquet
# copies written next to the notebook instead of re-parsing the JSON Lines.
train_data = load_json(path=DATASET_TRAIN, cache=True)
kaggle_data = load_json(path=DATASET_KAGGLE, cache=True)

In [74]:
# Inspect the flattened schema produced by json_normalize.
train_data.columns.tolist()

['in_reply_to_status_id_str',
 'in_reply_to_status_id',
 'created_at',
 'in_reply_to_user_id_str',
 'source',
 'quoted_status_id',
 'retweet_count',
 'retweeted',
 'geo',
 'filter_level',
 'in_reply_to_screen_name',
 'is_quote_status',
 'id_str',
 'in_reply_to_user_id',
 'favorite_count',
 'text',
 'place',
 'lang',
 'quote_count',
 'favorited',
 'coordinates',
 'truncated',
 'timestamp_ms',
 'reply_count',
 'quoted_status_id_str',
 'contributors',
 'challenge_id',
 'label',
 'quoted_status.extended_tweet.entities.urls',
 'quoted_status.extended_tweet.entities.hashtags',
 'quoted_status.extended_tweet.entities.user_mentions',
 'quoted_status.extended_tweet.entities.symbols',
 'quoted_status.extended_tweet.full_text',
 'quoted_status.extended_tweet.display_text_range',
 'quoted_status.in_reply_to_status_id_str',
 'quoted_status.in_reply_to_status_id',
 'quoted_status.created_at',
 'quoted_status.in_reply_to_user_id_str',
 'quoted_status.source',
 'quoted_status.retweet_count',
 'quoted_

In [75]:
# Preview the first few raw rows.
train_data.head(n=5)

Unnamed: 0,in_reply_to_status_id_str,in_reply_to_status_id,created_at,in_reply_to_user_id_str,source,quoted_status_id,retweet_count,retweeted,geo,filter_level,...,quoted_status.geo.coordinates,quoted_status.geo.type,quoted_status.coordinates.coordinates,quoted_status.coordinates.type,geo.coordinates,geo.type,coordinates.coordinates,coordinates.type,withheld_in_countries,quoted_status.withheld_in_countries
0,,,Wed Mar 17 13:01:59 +0000 2021,,"<a href=""http://twitter.com/download/iphone"" r...",1.372171e+18,0,False,,low,...,,,,,,,,,,
1,,,Wed Mar 17 13:02:06 +0000 2021,,"<a href=""https://mobile.twitter.com"" rel=""nofo...",1.372171e+18,0,False,,low,...,,,,,,,,,,
2,1.372163743040344e+18,1.372164e+18,Wed Mar 17 13:02:58 +0000 2021,1.0668199362727526e+18,"<a href=""https://mobile.twitter.com"" rel=""nofo...",,0,False,,low,...,,,,,,,,,,
3,,,Wed Mar 17 13:03:00 +0000 2021,,"<a href=""https://mobile.twitter.com"" rel=""nofo...",,0,False,,low,...,,,,,,,,,,
4,1.3721647270617047e+18,1.372165e+18,Wed Mar 17 13:03:04 +0000 2021,1522353968.0,"<a href=""https://mobile.twitter.com"" rel=""nofo...",,0,False,,low,...,,,,,,,,,,


# Preprocessing

In [None]:
def preprocess(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of a raw tweet frame.

    The input frame is left untouched, so the function can be re-run on the
    same raw data. Three things happen:

    * ID-like and constant columns are removed. Columns from the list that
      are absent from ``df`` are skipped, because the flattened schema depends
      on which nested fields occur in each split.
    * ``is_reply`` is added: true where ``in_reply_to_status_id`` is present.
    * ``full_text`` is added: ``extended_tweet.full_text`` where available,
      otherwise ``text`` (the vectorised equivalent of ``extract_full_text``).

    Args:
        df: Raw frame as returned by ``load_json``. Must contain
            ``in_reply_to_status_id`` and ``text``.

    Returns:
        A new DataFrame with the columns described above.

    Raises:
        KeyError: If ``in_reply_to_status_id`` or ``text`` is missing.
    """
    # Computed from the raw frame, before the ID column is dropped.
    is_reply = df["in_reply_to_status_id"].notna()

    # Remove various ID fields
    # TODO: Could we actually use them for something?
    # Note: challenge_id and label seem to be added for the kaggle challenge
    dropped_columns = [
        "in_reply_to_status_id_str",
        "in_reply_to_status_id",
        "in_reply_to_user_id_str",
        "in_reply_to_user_id",
        "quoted_status_id_str",
        "quoted_status_id",
        "id_str",
        "quoted_status.in_reply_to_status_id_str",
        "quoted_status.in_reply_to_status_id",
        "quoted_status.in_reply_to_user_id_str",
        "quoted_status.in_reply_to_user_id",
        "quoted_status.id_str",
        "quoted_status.id",
        "quoted_status.user.id_str",
        "quoted_status.user.id",
        "quoted_status_permalink.expanded",
        "quoted_status_permalink.display",
        "quoted_status_permalink.url",
        "quoted_status.quoted_status_id",
        "quoted_status.quoted_status_id_str",
        "quoted_status.place.id",
        "lang",  # Always "fr"
        "retweeted",  # Always False
        "filter_level",  # Always "low"
        "geo",  # Always None
        "place",  # Always None
        "coordinates",  # Always None
        "contributors",  # Always None
        "quote_count",  # Always 0
        "reply_count",  # Always 0
        "retweet_count",  # Always 0
        "favorite_count",  # Always 0
        "favorited",  # Always False
        "quoted_status.geo",  # Always None
        "quoted_status.place",  # Always None
        "quoted_status.coordinates",  # Always None
        "quoted_status.retweeted",  # Always False
        "quoted_status.filter_level",  # Always "low"
        "quoted_status.contributors",  # Always None
        "quoted_status.user.utc_offset",  # Always None
        "quoted_status.user.lang",  # Always None
        "quoted_status.user.time_zone",  # Always None
        "quoted_status.user.follow_request_sent",  # Always None
        "quoted_status.user.following",  # Always None
        "quoted_status.user.notifications",  # Always None
        "user.default_profile_image",  # Always False
        "user.protected",  # Always False
        "user.contributors_enabled",  # Always False
    ]

    # drop() returns a new frame, so every assignment below touches only the
    # copy and never the caller's data.
    result = df.drop(columns=dropped_columns, errors="ignore")
    result["is_reply"] = is_reply

    # Prefer the untruncated text when the tweet carries one.
    if "extended_tweet.full_text" in result.columns:
        extended = result["extended_tweet.full_text"]
        result["full_text"] = extended.where(extended.notna(), result["text"])
    else:
        result["full_text"] = result["text"]

    return result


def extract_full_text(tweet: pd.Series) -> str:
    """Return the most complete text available for a single tweet row.

    Args:
        tweet: One row holding ``text`` and ``extended_tweet.full_text``.

    Returns:
        ``extended_tweet.full_text`` unless it is missing (NA), in which case
        the plain ``text`` field is returned.
    """
    fallback: str = tweet["text"]
    extended = tweet["extended_tweet.full_text"]
    return fallback if pd.isna(extended) else extended


In [137]:
# Separate the target from the features, then run both splits through the
# same preprocessing. The Kaggle split carries no label column.
y_train = train_data["label"]
X_train = preprocess(train_data.drop(columns=["label"]))

X_kaggle = preprocess(kaggle_data)

# Models

In [138]:
# Columns that survive preprocessing.
X_train.columns.tolist()

['created_at',
 'source',
 'in_reply_to_screen_name',
 'is_quote_status',
 'text',
 'truncated',
 'timestamp_ms',
 'challenge_id',
 'quoted_status.extended_tweet.entities.urls',
 'quoted_status.extended_tweet.entities.hashtags',
 'quoted_status.extended_tweet.entities.user_mentions',
 'quoted_status.extended_tweet.entities.symbols',
 'quoted_status.extended_tweet.full_text',
 'quoted_status.extended_tweet.display_text_range',
 'quoted_status.created_at',
 'quoted_status.source',
 'quoted_status.retweet_count',
 'quoted_status.in_reply_to_screen_name',
 'quoted_status.is_quote_status',
 'quoted_status.favorite_count',
 'quoted_status.text',
 'quoted_status.lang',
 'quoted_status.quote_count',
 'quoted_status.favorited',
 'quoted_status.truncated',
 'quoted_status.reply_count',
 'quoted_status.entities.urls',
 'quoted_status.entities.hashtags',
 'quoted_status.entities.user_mentions',
 'quoted_status.entities.symbols',
 'quoted_status.user.friends_count',
 'quoted_status.user.profile_ima

In [None]:
# TODO: discard quoted_status.lang != "fr"?
# TODO: some tweets are images

In [140]:
NUM_CLASSES = 2


class TweetClassifier(nn.Module):
    """Two-layer MLP head on top of a pretrained text encoder plus metadata.

    The text is embedded with the encoder's first-token (CLS) vector, that
    vector is concatenated with a fixed-size metadata vector, and the result
    goes through ``fc1 -> ReLU -> fc2`` to produce ``NUM_CLASSES`` logits.
    """

    # The tokenizer is a plain Hugging Face tokenizer, not an nn.Module.
    tokenizer: transformers.PreTrainedTokenizerBase
    encoder: nn.Module
    metadata_dim: int
    fc1: nn.Module
    fc2: nn.Module

    def __init__(
        self,
        pretrained_encoder: str = "Geotrend/distilbert-base-en-fr-cased",
        metadata_dim: int = 16,
        hidden_dim: int = 128,
        max_length: int = 256,
        freeze_encoder: bool = True,
    ):
        """Build the classifier.

        Args:
            pretrained_encoder: Name or path passed to ``from_pretrained`` for
                both the tokenizer and the encoder.
            metadata_dim: Number of metadata features per sample.
            hidden_dim: Width of the hidden layer of the head.
            max_length: Maximum token length; longer texts are truncated.
            freeze_encoder: When true (the default, matching the previous
                behaviour) the encoder weights are not trained and the encoder
                is kept in eval mode. Pass ``False`` to fine-tune it.
        """
        super().__init__()

        self.tokenizer = AutoTokenizer.from_pretrained(pretrained_encoder)
        self.encoder = AutoModel.from_pretrained(pretrained_encoder)
        self.freeze_encoder = freeze_encoder

        # Don't finetune the encoder... yet?
        if freeze_encoder:
            for param in self.encoder.parameters():
                param.requires_grad = False
            self.encoder.eval()

        self.encoder_dim = self.encoder.config.hidden_size
        self.max_length = max_length

        self.metadata_dim = metadata_dim

        self.fc1 = nn.Linear(self.encoder_dim + metadata_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, NUM_CLASSES)

    def train(self, mode: bool = True) -> TweetClassifier:
        """Switch train/eval mode while keeping a frozen encoder in eval mode.

        ``nn.Module.train`` recurses into every submodule, which would put a
        frozen encoder back into training mode (dropout active) and make the
        supposedly fixed text embeddings stochastic.
        """
        super().train(mode)
        if self.freeze_encoder:
            self.encoder.eval()
        return self

    @property
    def device(self) -> torch.device:
        """Device of the first registered parameter."""
        return next(self.parameters()).device

    def encode_text(self, texts: list[str]) -> torch.Tensor:
        """Embed raw texts with the encoder.

        Args:
            texts: Batch of strings to tokenize (padded, truncated to
                ``max_length``).

        Returns:
            First-token (CLS) embeddings, ``[batch, encoder_dim]``.
        """
        encoded: transformers.BatchEncoding = self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        ).to(self.device)

        outputs: transformers.modeling_outputs.BaseModelOutput
        if self.freeze_encoder:
            # No parameter of a frozen encoder needs gradients.
            with torch.no_grad():
                outputs = self.encoder(**encoded)
        else:
            outputs = self.encoder(**encoded)

        cls_embeddings = outputs.last_hidden_state[:, 0, :]  # CLS token
        return cls_embeddings  # [batch, encoder_dim]

    def forward(
        self,
        texts: list[str] | torch.Tensor,
        metadata: torch.Tensor,
    ) -> dict[str, torch.Tensor]:
        """Classify a batch of tweets.

        Args:
            texts: Either raw strings (embedded with ``encode_text``) or a
                tensor that is used directly as the text vectors, presumably
                pre-computed embeddings of shape ``[batch, encoder_dim]``.
            metadata: Tensor of shape ``[batch, metadata_dim]``.

        Returns dict with:
            "logits": tensor [batch_size, num_classes]
            "probs": tensor [batch_size, num_classes]
        """
        device = self.device
        head_dtype = self.fc1.weight.dtype

        if isinstance(texts, torch.Tensor):
            # Pre-computed vectors may live on another device or use another
            # float width (e.g. float64 from NumPy); align them with the head.
            text_vecs = texts.to(device=device, dtype=head_dtype)
        else:
            text_vecs = self.encode_text(texts)  # [B, encoder_dim]
        batch_size = text_vecs.shape[0]

        # Metadata is often built from integer/float64 features; cast it so the
        # concatenation and the linear layer see a single dtype.
        metadata = metadata.to(device=device, dtype=head_dtype)
        assert metadata.shape == (batch_size, self.metadata_dim), (
            f"metadata must have shape {(batch_size, self.metadata_dim)}, "
            f"got {tuple(metadata.shape)}"
        )

        x = torch.cat([text_vecs, metadata], dim=1)

        hidden = F.relu(self.fc1(x))
        logits = self.fc2(hidden)
        probs = F.softmax(logits, dim=-1)

        return {
            "logits": logits,
            "probs": probs,
        }


In [None]:
# Load a list of common French stop words (e.g., "le", "la", "de")
french_stop_words = stopwords.words("french")

print("\nBuilding model pipeline...")

# Create a scikit-learn Pipeline. This chains steps together.
# Data will flow from "tfidf" (text to numbers) to "clf" (classifier).
model_pipeline = Pipeline([
    # Step 1: TfidfVectorizer - converts text into a matrix of TF-IDF features
    ("tfidf", TfidfVectorizer(
        stop_words=french_stop_words, # Remove French stop words
        max_df=0.7,       # Ignore words that appear in > 70% of tweets (too common)
        min_df=3,         # Ignore words that appear in < 3 tweets (too rare)
        max_features=1000, # Keep only the top 1000 features
        ngram_range=(1, 2),  # Include 1-word (unigrams) and 2-word (bigrams) sequences
    )),
    # Step 2: Classifier - Logistic Regression
    ("clf", LogisticRegression(
        random_state=42,    # For reproducible results
        solver="liblinear"  # Good solver for this type of problem
    ))
])

print("\nRunning 5-Fold Cross-Validation on training data...")

# Use StratifiedKFold to ensure class proportions are maintained in each fold
# This is important for datasets that might be imbalanced
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# cross_val_score will train and test the pipeline 5 times
# using the K-fold splits of the *training data*
scores = cross_val_score(
    model_pipeline,          # The pipeline to evaluate
    X_train["full_text"],  # Features from training set
    y_train,               # Labels from training set
    cv=kfold,              # The stratified 5-fold splitter
    scoring="accuracy"     # The metric to evaluate
)

# Print the cross-validation results
print(f"K-Fold Accuracy Scores: {scores}")
print(f"Mean K-Fold Accuracy: {np.mean(scores) * 100:.2f}%")
print(f"Std Dev K-Fold Accuracy: {np.std(scores) * 100:.2f}%")


print("\nTraining final model on all training data...")
# Now that we've validated the model, train it on ALL available training data
model_pipeline.fit(X_train["full_text"], y_train)
print("Training complete.")

print("\n--- Final Model Evaluation on Held-Out Test Set ---")
# NOTE: despite the banner above, nothing is evaluated here. This cell only
# predicts labels for the Kaggle split; the cross-validation scores above are
# the only accuracy estimate it produces.
# The pipeline automatically applies the TF-IDF transform and then predicts
y_pred_test = model_pipeline.predict(X_kaggle["full_text"])

# Prepare the submission file in the required "ID", "Prediction" format.
# predict() returns one value per input row in order, so the two columns are
# paired by position. Resetting the index keeps the pairing correct even if
# X_kaggle ever stops having a default 0..n-1 index (an index-aligned concat
# would then silently produce NaN / misaligned rows).
output = pd.DataFrame({
    "ID": X_kaggle["challenge_id"].reset_index(drop=True),
    "Prediction": y_pred_test,
})
# Save the submission file as a CSV
output.to_csv("logistic_regression.csv", index=False)


Building model pipeline...

Running 5-Fold Cross-Validation on training data...
K-Fold Accuracy Scores: [0.62634348 0.6281832  0.62114708 0.62337411 0.63036602]
Mean K-Fold Accuracy: 62.59%
Std Dev K-Fold Accuracy: 0.33%

Training final model on all training data...
Training complete.

--- Final Model Evaluation on Held-Out Test Set ---


In [104]:
print("\nTraining Dummy (Most Frequent)...")
# Create a DummyClassifier that always predicts the most frequent class
# This is a baseline to see if our Logistic Regression model is actually learning anything
dummy_mf = DummyClassifier(strategy="most_frequent")

# "Train" the dummy model (it just finds the most frequent class in y_train)
dummy_mf.fit(X_train["full_text"], y_train)

# Make predictions on the Kaggle data (it will predict the same class for all rows)
y_pred_test = dummy_mf.predict(X_kaggle["full_text"])

# Prepare and save the dummy submission file.
# IDs and predictions are paired by position; resetting the index avoids the
# silent NaN / misaligned rows an index-aligned concat would give if X_kaggle
# ever had a non-default index.
output = pd.DataFrame({
    "ID": X_kaggle["challenge_id"].reset_index(drop=True),
    "Prediction": y_pred_test,
})
output.to_csv("dummy.csv", index=False)


Training Dummy (Most Frequent)...


In [106]:
# Training accuracy of the most-frequent-class baseline.
dummy_mf.score(X=X_train["full_text"], y=y_train)

0.5336767496804679