In [4]:
from __future__ import annotations
import typing
import json
import pathlib
import os
import time

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split
from torch.optim import AdamW
from torch.nn.utils import clip_grad_norm_

import transformers
import transformers.modeling_outputs
from transformers import AutoTokenizer, AutoModel

from nltk.corpus import stopwords

import sklearn
from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline

from tqdm import tqdm

In [5]:
DATASETS = pathlib.Path(
    "."
    if "KAGGLE_DOCKER_IMAGE" not in os.environ
    else "/kaggle/input/influencers-or-observers-predicting-social-roles/Kaggle2025"
)

DATASET_TRAIN = DATASETS / "train.jsonl"
DATASET_KAGGLE = DATASETS / "kaggle_test.jsonl"

CACHE_DIR = pathlib.Path(".")

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

# Data loading

In [7]:
def load_json(path: pathlib.Path, cache: bool = False) -> pd.DataFrame:
    path_pq = (CACHE_DIR / path.name).with_stem(f"{path.stem}_raw").with_suffix(".parquet")
    
    if cache and path_pq.exists():
        return pd.read_parquet(path_pq)
    
    # This leaves things to be desired, since there's no way to specify dtypes
    # and it assumes float instead of int, causing a loss in precision...
    # But I guess it only matters for ids, which we'll probably discard in preprocessing anyway
    result = pd.json_normalize(list(map(json.loads, path.read_bytes().splitlines())))
    
    if cache:
        result.to_parquet(path_pq)
    
    return result


In [8]:
train_data = load_json(DATASET_TRAIN, cache=True)
kaggle_data = load_json(DATASET_KAGGLE, cache=True)

# Preprocessing

In [9]:

def preprocess(df: pd.DataFrame) -> pd.DataFrame:
    df["is_reply"] = df["in_reply_to_status_id"].notna()
    
    # Remove various ID fields
    # TODO: Could we actually use them for something?
    # Note: challenge_id and label seem to be added for the kaggle challenge
    df = df.drop(columns=[
        "in_reply_to_status_id_str",
        "in_reply_to_status_id",
        "in_reply_to_user_id_str",
        "in_reply_to_user_id",
        "quoted_status_id_str",
        "quoted_status_id",
        "id_str",
        "quoted_status.in_reply_to_status_id_str",
        "quoted_status.in_reply_to_status_id",
        "quoted_status.in_reply_to_user_id_str",
        "quoted_status.in_reply_to_user_id",
        "quoted_status.id_str",
        "quoted_status.id",
        "quoted_status.user.id_str",
        "quoted_status.user.id",
        "quoted_status_permalink.expanded",
        "quoted_status_permalink.display",
        "quoted_status_permalink.url",
        "quoted_status.quoted_status_id",
        "quoted_status.quoted_status_id_str",
        "quoted_status.place.id",
        "place.id",
        "lang",  # Always "fr"
        "retweeted",  # Always False
        "filter_level",  # Always "low"
        "geo",  # Always None
        "place",  # Always None
        "coordinates",  # Always None
        "contributors",  # Always None
        "quote_count",  # Always 0
        "reply_count",  # Always 0
        "retweet_count",  # Always 0
        "favorite_count",  # Always 0
        "favorited",  # Always False
        "quoted_status.geo",  # Always None
        "quoted_status.place",  # Always None
        "quoted_status.coordinates",  # Always None
        "quoted_status.retweeted",  # Always False
        "quoted_status.filter_level",  # Always "low"
        "quoted_status.contributors",  # Always None
        "quoted_status.user.utc_offset",  # Always None
        "quoted_status.user.lang",  # Always None
        "quoted_status.user.time_zone",  # Always None
        "quoted_status.user.follow_request_sent",  # Always None
        "quoted_status.user.following",  # Always None
        "quoted_status.user.notifications",  # Always None
        "user.default_profile_image",  # Always False
        "user.protected",  # Always False
        "user.contributors_enabled",  # Always False
        "user.lang",  # Always None
        "user.notifications",  # Always None
        "user.following",  # Always None
        "user.utc_offset",  # Always None
        "user.time_zone",  # Always None
        "user.follow_request_sent",  # Always None
    ])
    
    # TODO: Augment text with other string features?
    df["full_text"] = df.apply(lambda tweet: extract_full_text(tweet), axis=1)
    
    return df


def extract_full_text(tweet: pd.Series) -> str:
    text: str = tweet["text"]
    
    if not pd.isna(tweet["extended_tweet.full_text"]):
        text = tweet["extended_tweet.full_text"]
    
    return text


In [10]:
X_train = train_data.drop("label", axis=1)
y_train = train_data["label"]

X_kaggle = kaggle_data

X_train = preprocess(X_train)
X_kaggle = preprocess(X_kaggle)

# Models

In [11]:
# TODO: discard quoted_status.lang != "fr"?
# TODO: some tweets are images

In [25]:

NUM_CLASSES = 2

class TweetClassifier(nn.Module):
    tokenizer: nn.Module
    encoder: nn.Module
    metadata_dim: int
    md_layernorm: nn.Module
    fc1: nn.Module
    fc2: nn.Module
    
    def __init__(
        self,
        pretrained_encoder: str = "camembert-base",  # "Geotrend/distilbert-base-en-fr-cased", "flaubert/flaubert_base_cased", "flaubert/flaubert_small_cased"
        metadata_dim: int = 16,
        hidden_dim: int = 128,
        max_length: int = 256,
    ):
        super().__init__()
        
        self.tokenizer = AutoTokenizer.from_pretrained(pretrained_encoder)
        self.encoder = AutoModel.from_pretrained(pretrained_encoder)

        # Don't finetune the encoder... yet?
        for param in self.encoder.parameters():
            param.requires_grad = False
        
        self.encoder_dim = self.encoder.config.hidden_size
        self.max_length = max_length

        self.metadata_dim = metadata_dim
        self.md_layernorm = nn.LayerNorm(metadata_dim)
        
        self.fc1 = nn.Linear(self.encoder_dim + metadata_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, NUM_CLASSES)
    
    @property
    def device(self) -> torch.device:
        return next(self.parameters()).device
    
    def encode_text(self, texts: list[str]) -> torch.Tensor:
        encoded: torch.Tensor = self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        ).to(self.device)

        outputs: transformers.modeling_outputs.BaseModelOutput = self.encoder(**encoded)
        cls_embeddings = outputs.last_hidden_state[:, 0, :]  # CLS token
        return cls_embeddings  # [batch, encoder_dim]
    
    def forward(
        self,
        texts: list[str] | torch.Tensor,
        metadata: torch.Tensor,
    ) -> dict[str, torch.Tensor]:
        """
        Returns dict with:
            "logits": tensor [batch_size, num_classes]
            "probs": tensor [batch_size, num_classes]
        """
        device = self.device
        batch_size = len(texts)
        
        if isinstance(texts, torch.Tensor):
            text_vecs = texts
        else:
            text_vecs = self.encode_text(texts)  # [B, encoder_dim]

        metadata = metadata.to(device)
        assert metadata.shape == (batch_size, self.metadata_dim)
        
        metadata = self.md_layernorm(metadata)
        
        x = torch.cat([text_vecs, metadata], dim=1)

        hidden = F.relu(self.fc1(x))
        logits = self.fc2(hidden)
        probs = F.softmax(logits, dim=-1)

        return {
            "logits": logits,
            "probs": probs,
        }


In [26]:
METADATA_DIM = 13  # Note: update when adding new fields!!

def extract_metadata(df: pd.DataFrame) -> torch.Tensor:
    md: list[pd.Series] = []

    def md_add_bool(col: str):
        md.append(df[col].apply(lambda x: (1 if x else -1) if pd.notnull(x) else 0))

    md_add_bool("is_quote_status")
    md_add_bool("is_reply")
    md_add_bool("possibly_sensitive")
    md_add_bool("quoted_status.user.verified")

    def md_add_len(col: str):
        # pd.notnull considered lists as not scalar
        md.append(df[col].apply(lambda x: len(x) if x is not None and (isinstance(x, list) or pd.notnull(x)) else 0))

    md_add_len("full_text")
    md_add_len("extended_tweet.entities.urls")
    md_add_len("extended_tweet.entities.hashtags")
    md_add_len("extended_tweet.entities.user_mentions")
    md_add_len("extended_tweet.entities.symbols")

    def md_add_time(col: str):
        tmp = df[col].apply(lambda x: time.mktime(time.strptime(x, "%a %b %d %H:%M:%S %z %Y")) if pd.notnull(x) else pd.NA)
        md.append(tmp.fillna(tmp.mean()))

    md_add_time("created_at")  # TODO: Same as timestamp_ms / 1000?
    md_add_time("quoted_status.user.created_at")

    def md_add_num(col: str):
        tmp = df[col].apply(pd.to_numeric)
        md.append(tmp.fillna(tmp.mean()))

    md_add_num("quoted_status.user.followers_count")
    md_add_num("timestamp_ms")

    return torch.from_numpy(np.array(md)).transpose(0, 1).float()


In [27]:
class TweetDataset(Dataset):
    texts: list[str]
    metadata: torch.Tensor
    labels: torch.Tensor
    device: torch.device
    
    def __init__(self, df: pd.DataFrame, labels: pd.Series, device: torch.device = device):
        self.texts = df["full_text"].tolist()
        self.metadata = extract_metadata(df).to(device)
        self.labels = torch.tensor(labels, dtype=torch.long, device=device)
        self.device = device

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return {
            "text": self.texts[idx],
            "metadata": self.metadata[idx],
            "label": self.labels[idx],
        }


def collate_fn(batch):
    texts = [x["text"] for x in batch]
    metadata = torch.stack([x["metadata"] for x in batch])
    labels = torch.stack([x["label"] for x in batch])
    return texts, metadata, labels


In [28]:
def train_model(
    model: TweetClassifier,
    train_loader: DataLoader,
    epochs: int = 3,
    lr: float = 2e-4,
    weight_decay: float = 0.01,
    max_grad_norm: float = 1.0,
    device: torch.device = device,
) -> TweetClassifier:
    model.to(device)
    optimizer = AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)

    criterion = torch.nn.CrossEntropyLoss()

    for epoch in range(1, epochs + 1):
        print(f"Epoch {epoch}/{epochs}")
        model.train()
        total_loss = 0.0

        status_bar = tqdm(train_loader, desc="Training")

        for texts, metadata, labels in status_bar:
            texts: list[str]
            metadata: torch.Tensor
            labels: torch.Tensor
            
            metadata = metadata.to(device)
            labels = labels.to(device)

            optimizer.zero_grad(set_to_none=True)

            out = model(
                texts=texts,
                metadata=metadata,
            )
            logits = out["logits"]
            
            loss = criterion(logits, labels)
            loss.backward()
            clip_grad_norm_(model.parameters(), max_grad_norm)  # TODO: ?
            optimizer.step()

            total_loss += loss.item()
            status_bar.set_postfix({"loss": total_loss / (status_bar.n + 1)})

        print(f"Train Loss: {total_loss / len(train_loader):.4f}")

    return model


In [34]:
def evaluate_model(
    model: TweetClassifier,
    data_loader: DataLoader,
    device: torch.device = device,
) -> tuple[float, float]:
    """
    Returns loss and accuracy
    """
    
    model.eval()
    criterion = torch.nn.CrossEntropyLoss()

    total_loss = 0.0
    correct = 0
    count = 0

    with torch.no_grad():
        for texts, metadata, labels in data_loader:
            texts: list[str]
            metadata: torch.Tensor
            labels: torch.Tensor
            metadata = metadata.to(device)
            labels = labels.to(device)

            out = model(
                texts=texts,
                metadata=metadata,
            )
            logits: torch.Tensor = out["logits"]
            
            loss: torch.Tensor = criterion(logits, labels)
            total_loss += loss.item()
            preds = logits.argmax(dim=-1)
            correct += (preds == labels).sum().item()
            count += labels.size(0)

    return total_loss / len(data_loader), correct / count


In [30]:
model = TweetClassifier(
    pretrained_encoder="camembert-base",
    metadata_dim=METADATA_DIM,
    hidden_dim=128,
    max_length=256
).to(device)

In [31]:
full_train_ds = TweetDataset(X_train, y_train, device=device)

train_ds, val_ds = random_split(full_train_ds, [0.9, 0.1])

  md.append(tmp.fillna(tmp.mean()))


In [32]:
train_loader = DataLoader(
    train_ds,
    batch_size=32,
    shuffle=True,
    collate_fn=collate_fn,
)

model = train_model(model, train_loader, epochs=3, device=device)
torch.save(model.state_dict(), "model-v1.pt")

Epoch 1/3


Training: 100%|██████████| 4357/4357 [15:26<00:00,  4.70it/s, loss=0.636]


Train Loss: 0.6363
Epoch 2/3


Training: 100%|██████████| 4357/4357 [15:37<00:00,  4.65it/s, loss=0.621]


Train Loss: 0.6208
Epoch 3/3


Training: 100%|██████████| 4357/4357 [15:36<00:00,  4.65it/s, loss=0.616]


Train Loss: 0.6163


In [None]:
val_loader = DataLoader(
    val_ds,
    batch_size=32,
    shuffle=False,
    collate_fn=collate_fn,
)

evaluate_model(model, val_loader, device=device)

In [206]:
# Load a list of common French stop words (e.g., "le", "la", "de")
french_stop_words = stopwords.words("french")

print("\nBuilding model pipeline...")

# Create a scikit-learn Pipeline. This chains steps together.
# Data will flow from "tfidf" (text to numbers) to "clf" (classifier).
model_pipeline = Pipeline([
    # Step 1: TfidfVectorizer - converts text into a matrix of TF-IDF features
    ("tfidf", TfidfVectorizer(
        stop_words=french_stop_words, # Remove French stop words
        max_df=0.7,       # Ignore words that appear in > 70% of tweets (too common)
        min_df=3,         # Ignore words that appear in < 3 tweets (too rare)
        max_features=1000, # Keep only the top 1000 features
        ngram_range=(1, 2),  # Include 1-word (unigrams) and 2-word (bigrams) sequences
    )),
    # Step 2: Classifier - Logistic Regression
    ("clf", LogisticRegression(
        random_state=42,    # For reproducible results
        solver="liblinear"  # Good solver for this type of problem
    ))
])

print("\nRunning 5-Fold Cross-Validation on training data...")

# Use StratifiedKFold to ensure class proportions are maintained in each fold
# This is important for datasets that might be imbalanced
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# cross_val_score will train and test the pipeline 5 times
# using the K-fold splits of the *training data*
scores = cross_val_score(
    model_pipeline,          # The pipeline to evaluate
    X_train["full_text"],  # Features from training set
    y_train,               # Labels from training set
    cv=kfold,              # The stratified 5-fold splitter
    scoring="accuracy"     # The metric to evaluate
)

# Print the cross-validation results
print(f"K-Fold Accuracy Scores: {scores}")
print(f"Mean K-Fold Accuracy: {np.mean(scores) * 100:.2f}%")
print(f"Std Dev K-Fold Accuracy: {np.std(scores) * 100:.2f}%")


print("\nTraining final model on all training data...")
# Now that we've validated the model, train it on ALL available training data
model_pipeline.fit(X_train["full_text"], y_train)
print("Training complete.")

print("\n--- Final Model Evaluation on Held-Out Test Set ---")
# Use the trained pipeline to make predictions on the unseen Kaggle data
# The pipeline automatically applies the TF-IDF transform and then predicts
y_pred_test = model_pipeline.predict(X_kaggle["full_text"])

# Prepare the submission file
# Combine the "challenge_id" from the Kaggle data with our predictions
output = pd.concat([X_kaggle["challenge_id"], pd.DataFrame(y_pred_test)], axis=1,ignore_index=True)
# Rename columns to match the required submission format
output.columns = ["ID", "Prediction"]
# Save the submission file as a CSV
output.to_csv("logistic_regression.csv", index=False)


Building model pipeline...

Running 5-Fold Cross-Validation on training data...


KeyboardInterrupt: 

In [104]:
print("\nTraining Dummy (Most Frequent)...")
# Create a DummyClassifier that always predicts the most frequent class
# This is a baseline to see if our Logistic Regression model is actually learning anything
dummy_mf = DummyClassifier(strategy="most_frequent")

# "Train" the dummy model (it just finds the most frequent class in y_train)
dummy_mf.fit(X_train["full_text"], y_train)

# Make predictions on the Kaggle data (it will predict the same class for all rows)
y_pred_test = dummy_mf.predict(X_kaggle["full_text"])

# Prepare and save the dummy submission file
output = pd.concat([X_kaggle["challenge_id"], pd.DataFrame(y_pred_test)], axis=1,ignore_index=True)
output.columns = ["ID", "Prediction"]
output.to_csv("dummy.csv", index=False)


Training Dummy (Most Frequent)...


In [106]:
dummy_mf.score(X_train["full_text"], y_train)

0.5336767496804679