In [1]:
from __future__ import annotations
import typing
import json
import pathlib
import os

import numpy as np
import pandas as pd
import torch

from nltk.corpus import stopwords

import sklearn
from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline

In [None]:
DATASETS = pathlib.Path(
    "."
    if "KAGGLE_DOCKER_IMAGE" not in os.environ
    else "/kaggle/input/influencers-or-observers-predicting-social-roles/Kaggle2025"
)

DATASET_TRAIN = DATASETS / "train.jsonl"
DATASET_KAGGLE = DATASETS / "kaggle_test.jsonl"

CACHE_DIR = pathlib.Path(".")

# Data loading

In [None]:
def load_json(path: pathlib.Path, cache: bool = False) -> pd.DataFrame:
    path_pq = (CACHE_DIR / path_pq.name).with_stem(f"{path.stem}_raw").with_suffix(".parquet")
    
    if cache and path_pq.exists():
        return pd.read_parquet(path_pq)
    
    # This leaves things to be desired, since there's no way to specify dtypes
    # and it assumes float instead of int, causing a loss in precision...
    # But I guess it only matters for ids, which we'll probably discard in preprocessing anyway
    result = pd.json_normalize(list(map(json.loads, path.read_bytes().splitlines())))
    
    if cache:
        result.to_parquet(path_pq)
    
    return result


In [4]:
train_data = load_json(DATASET_TRAIN, cache=True)
kaggle_data = load_json(DATASET_KAGGLE, cache=True)

OSError: [Errno 30] Read-only file system: '/kaggle/input/influencers-or-observers-predicting-social-roles/Kaggle2025/train_raw.parquet'

In [None]:
train_data.select_dtypes(include=["float64"]).head(5)

Unnamed: 0,in_reply_to_status_id,quoted_status_id,geo,in_reply_to_user_id,place,coordinates,quoted_status.in_reply_to_status_id,quoted_status.retweet_count,quoted_status.geo,quoted_status.in_reply_to_user_id,...,quoted_status.user.favourites_count,quoted_status.user.id,quoted_status.user.lang,quoted_status.user.time_zone,quoted_status.user.statuses_count,quoted_status.user.follow_request_sent,quoted_status.user.followers_count,quoted_status.user.following,quoted_status.user.notifications,quoted_status.quoted_status_id
0,,1.372171e+18,,,,,,43.0,,,...,427.0,492648852.0,,,23714.0,,1338833.0,,,
1,,1.372171e+18,,,,,,46.0,,,...,427.0,492648852.0,,,23714.0,,1338833.0,,,
2,1.372164e+18,,,1.06682e+18,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,1.372165e+18,,,1522354000.0,,,,,,,...,,,,,,,,,,


In [None]:
list(train_data.columns)

['in_reply_to_status_id_str',
 'in_reply_to_status_id',
 'created_at',
 'in_reply_to_user_id_str',
 'source',
 'quoted_status_id',
 'retweet_count',
 'retweeted',
 'geo',
 'filter_level',
 'in_reply_to_screen_name',
 'is_quote_status',
 'id_str',
 'in_reply_to_user_id',
 'favorite_count',
 'text',
 'place',
 'lang',
 'quote_count',
 'favorited',
 'coordinates',
 'truncated',
 'timestamp_ms',
 'reply_count',
 'quoted_status_id_str',
 'contributors',
 'challenge_id',
 'label',
 'quoted_status.extended_tweet.entities.urls',
 'quoted_status.extended_tweet.entities.hashtags',
 'quoted_status.extended_tweet.entities.user_mentions',
 'quoted_status.extended_tweet.entities.symbols',
 'quoted_status.extended_tweet.full_text',
 'quoted_status.extended_tweet.display_text_range',
 'quoted_status.in_reply_to_status_id_str',
 'quoted_status.in_reply_to_status_id',
 'quoted_status.created_at',
 'quoted_status.in_reply_to_user_id_str',
 'quoted_status.source',
 'quoted_status.retweet_count',
 'quoted_

In [None]:
train_data.head(5)

Unnamed: 0,in_reply_to_status_id_str,in_reply_to_status_id,created_at,in_reply_to_user_id_str,source,quoted_status_id,retweet_count,retweeted,geo,filter_level,...,quoted_status.geo.coordinates,quoted_status.geo.type,quoted_status.coordinates.coordinates,quoted_status.coordinates.type,geo.coordinates,geo.type,coordinates.coordinates,coordinates.type,withheld_in_countries,quoted_status.withheld_in_countries
0,,,Wed Mar 17 13:01:59 +0000 2021,,"<a href=""http://twitter.com/download/iphone"" r...",1.372171e+18,0,False,,low,...,,,,,,,,,,
1,,,Wed Mar 17 13:02:06 +0000 2021,,"<a href=""https://mobile.twitter.com"" rel=""nofo...",1.372171e+18,0,False,,low,...,,,,,,,,,,
2,1.372163743040344e+18,1.372164e+18,Wed Mar 17 13:02:58 +0000 2021,1.0668199362727526e+18,"<a href=""https://mobile.twitter.com"" rel=""nofo...",,0,False,,low,...,,,,,,,,,,
3,,,Wed Mar 17 13:03:00 +0000 2021,,"<a href=""https://mobile.twitter.com"" rel=""nofo...",,0,False,,low,...,,,,,,,,,,
4,1.3721647270617047e+18,1.372165e+18,Wed Mar 17 13:03:04 +0000 2021,1522353968.0,"<a href=""https://mobile.twitter.com"" rel=""nofo...",,0,False,,low,...,,,,,,,,,,


In [None]:
X_train = train_data.drop("label", axis=1)
y_train = train_data["label"]

X_kaggle = kaggle_data

# Preprocessing

In [None]:
def preprocess(df: pd.DataFrame) -> pd.DataFrame:
    # TODO: First establish is_reply & similar flags?
    
    # Remove various ID fields
    # TODO: Could we actually use them for something?
    # Note: challenge_id and label seem to be added for the kaggle challenge
    df = df.drop(columns=[
        "in_reply_to_status_id_str",
        "in_reply_to_status_id",
        "in_reply_to_user_id_str",
        "in_reply_to_user_id",
        "quoted_status_id_str",
        "quoted_status_id",
        "id_str",
        "quoted_status.in_reply_to_status_id_str",
        "quoted_status.in_reply_to_status_id",
        "quoted_status.in_reply_to_user_id_str",
        "quoted_status.in_reply_to_user_id",
        "quoted_status.id_str",
        "quoted_status.id",
        "quoted_status.user.id_str",
        "quoted_status.user.id",
        "quoted_status_permalink.expanded",
        "quoted_status_permalink.display",
        "quoted_status_permalink.url",
        "quoted_status.quoted_status_id",
        "quoted_status.quoted_status_id_str",
        "quoted_status.place.id",
    ])
    
    df["full_text"] = df.apply(lambda tweet: extract_full_text(tweet), axis=1)
    
    return df


def extract_full_text(tweet: pd.Series) -> str:
    text: str = tweet["text"]
    
    if not pd.isna(tweet["extended_tweet.full_text"]):
        text = tweet["extended_tweet.full_text"]
    
    return text


In [None]:
# TODO: Put the preprocessing into the pipeline? Or not

X_train = preprocess(X_train)
X_kaggle = preprocess(X_kaggle)

# X_train.to_parquet("X_train.parquet")
# X_kaggle.to_parquet("X_kaggle.parquet")

In [None]:
# Note: rerun this to skip
# X_train = pd.read_parquet("X_train.parquet")
# X_kaggle = pd.read_parquet("X_kaggle.parquet")

# Models

In [None]:
# Load a list of common French stop words (e.g., "le", "la", "de")
french_stop_words = stopwords.words("french")

print("\nBuilding model pipeline...")

# Create a scikit-learn Pipeline. This chains steps together.
# Data will flow from "tfidf" (text to numbers) to "clf" (classifier).
model_pipeline = Pipeline([
    # Step 1: TfidfVectorizer - converts text into a matrix of TF-IDF features
    ("tfidf", TfidfVectorizer(
        stop_words=french_stop_words, # Remove French stop words
        max_df=0.7,       # Ignore words that appear in > 70% of tweets (too common)
        min_df=3,         # Ignore words that appear in < 3 tweets (too rare)
        max_features=1000, # Keep only the top 1000 features
        ngram_range=(1, 2),  # Include 1-word (unigrams) and 2-word (bigrams) sequences
    )),
    # Step 2: Classifier - Logistic Regression
    ("clf", LogisticRegression(
        random_state=42,    # For reproducible results
        solver="liblinear"  # Good solver for this type of problem
    ))
])

print("\nRunning 5-Fold Cross-Validation on training data...")

# Use StratifiedKFold to ensure class proportions are maintained in each fold
# This is important for datasets that might be imbalanced
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# cross_val_score will train and test the pipeline 5 times
# using the K-fold splits of the *training data*
scores = cross_val_score(
    model_pipeline,          # The pipeline to evaluate
    X_train["full_text"],  # Features from training set
    y_train,               # Labels from training set
    cv=kfold,              # The stratified 5-fold splitter
    scoring="accuracy"     # The metric to evaluate
)

# Print the cross-validation results
print(f"K-Fold Accuracy Scores: {scores}")
print(f"Mean K-Fold Accuracy: {np.mean(scores) * 100:.2f}%")
print(f"Std Dev K-Fold Accuracy: {np.std(scores) * 100:.2f}%")


print("\nTraining final model on all training data...")
# Now that we've validated the model, train it on ALL available training data
model_pipeline.fit(X_train["full_text"], y_train)
print("Training complete.")

print("\n--- Final Model Evaluation on Held-Out Test Set ---")
# Use the trained pipeline to make predictions on the unseen Kaggle data
# The pipeline automatically applies the TF-IDF transform and then predicts
y_pred_test = model_pipeline.predict(X_kaggle["full_text"])

# Prepare the submission file
# Combine the "challenge_id" from the Kaggle data with our predictions
output = pd.concat([X_kaggle["challenge_id"], pd.DataFrame(y_pred_test)], axis=1,ignore_index=True)
# Rename columns to match the required submission format
output.columns = ["ID", "Prediction"]
# Save the submission file as a CSV
output.to_csv("logistic_regression.csv", index=False)


Building model pipeline...

Running 5-Fold Cross-Validation on training data...
K-Fold Accuracy Scores: [0.62634348 0.6281832  0.62114708 0.62337411 0.63036602]
Mean K-Fold Accuracy: 62.59%
Std Dev K-Fold Accuracy: 0.33%

Training final model on all training data...
Training complete.

--- Final Model Evaluation on Held-Out Test Set ---


In [104]:
print("\nTraining Dummy (Most Frequent)...")
# Create a DummyClassifier that always predicts the most frequent class
# This is a baseline to see if our Logistic Regression model is actually learning anything
dummy_mf = DummyClassifier(strategy="most_frequent")

# "Train" the dummy model (it just finds the most frequent class in y_train)
dummy_mf.fit(X_train["full_text"], y_train)

# Make predictions on the Kaggle data (it will predict the same class for all rows)
y_pred_test = dummy_mf.predict(X_kaggle["full_text"])

# Prepare and save the dummy submission file
output = pd.concat([X_kaggle["challenge_id"], pd.DataFrame(y_pred_test)], axis=1,ignore_index=True)
output.columns = ["ID", "Prediction"]
output.to_csv("dummy.csv", index=False)


Training Dummy (Most Frequent)...


In [106]:
dummy_mf.score(X_train["full_text"], y_train)

0.5336767496804679