# Translate all Reddit and Twitter messages

This parallelized code was run on an A100 GPU to translate **890K Reddit posts** and **1.2M Tweets**. Because it relies on the free model *Helsinki-NLP/opus-mt-mul-en*, which limits the input to **512 tokens**, each message must be **tokenized and trimmed to a safe length** (384 tokens in the code below) before translation. This extra per-message step significantly slows down the overall process, even when using a GPU.

As a result, the translation took approximately **3 hours** for the 890K Reddit messages and around **18 hours** for the 1.2M Tweets.


In [None]:
# ===== SETUP =====
!pip install polars transformers accelerate sentencepiece -q

import torch
import polars as pl
from datetime import datetime
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from google.colab import drive

# Make the Drive-hosted CSV files reachable from this Colab runtime.
drive.mount("/content/drive")

# ===== MODEL & PIPELINE =====

TRANSLATOR_MODEL = "Helsinki-NLP/opus-mt-mul-en"

# Token budgets: generation cap and the (smaller) input truncation limit.
MAX_MODEL_LEN = 512
SAFE_INPUT_TOKENS = 384  # input truncation limit

# Pick the pipeline device index and the weight precision together:
# half precision on the GPU, full precision on the CPU.
use_cuda = torch.cuda.is_available()
if use_cuda:
    device, dtype = 0, torch.float16
else:
    device, dtype = -1, torch.float32

print("CUDA available:", use_cuda)
print("Using device:", device)

tokenizer = AutoTokenizer.from_pretrained(TRANSLATOR_MODEL)
# `dtype` already falls back to float32 without CUDA, so it is passed as is.
model = AutoModelForSeq2SeqLM.from_pretrained(TRANSLATOR_MODEL, torch_dtype=dtype)

translation_pipe = pipeline(
    "translation",
    model=model,
    tokenizer=tokenizer,
    device=device,
)


# ===== DATA LOADING =====

def load_timestamp_text_2018_2019(csv_path: str, part: int | None = None) -> pl.DataFrame:
    """
    Load the timestamp/text CSV using Polars, ensure chronological order,
    and optionally return one of six equal-sized time-slices for 2018.

    Parameters
    ----------
    csv_path : str
        Path to the CSV file. It must have a header row and exactly the
        columns ``timestamp`` and ``text``.
    part : int or None
        If 1-6, return the corresponding 1/6 time-slice of [2018-01-01, 2019-01-01).
        If None, return the full dataset.

    Returns
    -------
    pl.DataFrame
        Rows sorted by ``timestamp``; restricted to the requested slice when
        ``part`` is given.

    Raises
    ------
    ValueError
        If ``part`` is not None and lies outside 1..6. The check runs before
        the file is read so a bad argument fails immediately.
    """
    # Validate first: reading and sorting the CSV is the expensive step.
    if part is not None and not (1 <= part <= 6):
        raise ValueError("part must be an integer between 1 and 6, or None.")

    schema = {
        "timestamp": pl.Datetime,
        "text": pl.Utf8,
    }

    df = (
        pl.read_csv(
            csv_path,
            schema=schema,
            has_header=True,
            separator=",",
        )
        .sort("timestamp")  # ensure time order
    )

    # If no part requested, return full df
    if part is None:
        return df

    # Fixed absolute boundaries for the year
    year_start = datetime(2018, 1, 1)
    year_end = datetime(2019, 1, 1)

    # Split [year_start, year_end) into 6 equal time intervals
    step = (year_end - year_start) / 6  # timedelta

    slice_start = year_start + step * (part - 1)
    slice_end = year_start + step * part

    # Keep rows whose timestamp falls into the half-open [slice_start, slice_end)
    return df.filter(
        (pl.col("timestamp") >= slice_start) &
        (pl.col("timestamp") < slice_end)
    )


# ===== TRANSLATION HELPERS =====

def trim_for_model(text: str) -> str:
    """Return `text` shortened to at most SAFE_INPUT_TOKENS tokens.

    Non-string or blank input yields "". Text that already fits the budget is
    returned unchanged; only over-long text is cut at the token level and
    decoded back to a string.
    """
    if not isinstance(text, str) or not text.strip():
        return ""

    # Ask for one token more than the budget: that is enough to tell whether
    # trimming is needed, and bounding the length keeps the tokenizer from
    # warning about sequences longer than the model maximum.
    ids = tokenizer.encode(
        text,
        add_special_tokens=False,
        truncation=True,
        max_length=SAFE_INPUT_TOKENS + 1,
    )
    if len(ids) <= SAFE_INPUT_TOKENS:
        # Skip the encode/decode round trip for text that fits: decoding with
        # skip_special_tokens=True is not guaranteed to reproduce the input.
        return text

    return tokenizer.decode(ids[:SAFE_INPUT_TOKENS], skip_special_tokens=True)


def translate_list(texts, batch_size: int = 128):
    """
    Translate every usable entry of `texts` to English.

    Entries that are not strings, or that are blank, are left as "" in the
    result. The returned list always has the same length and ordering as
    `texts`.
    """
    results = [""] * len(texts)

    # Remember where each translatable entry came from.
    positions = [
        pos for pos, item in enumerate(texts)
        if isinstance(item, str) and item.strip()
    ]
    if not positions:
        return results

    payload = [trim_for_model(texts[pos]) for pos in positions]

    # One pipeline call over the whole non-empty subset.
    generated = translation_pipe(
        payload,
        batch_size=batch_size,
        truncation=True,
        max_length=MAX_MODEL_LEN,  # generation cap (output)
    )

    # Each pipeline result is read as a dict holding "translation_text".
    for pos, record in zip(positions, generated):
        results[pos] = record.get("translation_text", "")

    return results


def translate_df(
    df: pl.DataFrame,
    text_col: str = "text",
    translation_col: str = "text_en",
    batch_size: int = 128,
) -> pl.DataFrame:
    """Return `df` with an extra column `translation_col` holding the English
    translation of `df[text_col]`, produced by `translate_list`."""
    english = pl.Series(
        translation_col,
        translate_list(df[text_col].to_list(), batch_size=batch_size),
    )
    return df.with_columns(english)


def translate_func(input_path, output_path, part, batch_size=256, chunk_size=100_000):
    """Translate one time-slice of the input CSV and write the result as CSV.

    Parameters
    ----------
    input_path : str
        CSV accepted by `load_timestamp_text_2018_2019`.
    output_path : str
        Destination CSV; receives the loaded columns plus ``text_en``.
    part : int or None
        Time-slice selector forwarded to `load_timestamp_text_2018_2019`.
    batch_size : int
        Batch size forwarded to the translation pipeline.
    chunk_size : int
        Number of rows translated per `translate_df` call.
    """
    # Load only the requested slice of the data.
    df = load_timestamp_text_2018_2019(input_path, part=part)
    print("Loaded shape:", df.shape)

    print("Using batch size:", batch_size)

    # Work through the frame in row chunks so progress is visible and each
    # pipeline call handles a bounded number of texts.
    translated_chunks = []

    for start in range(0, len(df), chunk_size):
        end = min(start + chunk_size, len(df))
        print(f"Translating rows {start}–{end}...")

        df_chunk = df.slice(start, end - start)
        df_chunk_translated = translate_df(
            df_chunk,
            text_col="text",
            translation_col="text_en",
            batch_size=batch_size,
        )
        translated_chunks.append(df_chunk_translated)

    if translated_chunks:
        # Concatenate all translated chunks
        df_all = pl.concat(translated_chunks)
    else:
        # A slice with no rows produces no chunks, and pl.concat rejects an
        # empty list; still emit a file with the expected columns.
        df_all = df.with_columns(pl.Series("text_en", [], dtype=pl.Utf8))

    print("Final shape:", df_all.shape)
    print(df_all.head(5))

    # Save to CSV
    df_all.write_csv(output_path)
    print("Saved to:", output_path)

# ===== MAIN: LOAD, TRANSLATE IN CHUNKS, SAVE =====

input_path = "/content/drive/MyDrive/SDA/tweets_2018_2019.csv"
output_template = "/content/drive/MyDrive/SDA/tweets_test_{part}.csv"

# Translate the six time-slices one after another; each slice gets its own
# output file so a finished slice is already saved if a later one fails.
for part in range(1, 7):
    translate_func(input_path, output_template.format(part=part), part)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
CUDA available: True
Using device: 0


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
`torch_dtype` is deprecated! Use `dtype` instead!
Device set to use cuda:0


Loaded shape: (207198, 2)
Using batch size: 256
Translating rows 0–100000...
Translating rows 100000–200000...
Translating rows 200000–207198...
Final shape: (207198, 3)
shape: (5, 3)
┌─────────────────────┬─────────────────────────────────┬─────────────────────────────────┐
│ timestamp           ┆ text                            ┆ text_en                         │
│ ---                 ┆ ---                             ┆ ---                             │
│ datetime[μs]        ┆ str                             ┆ str                             │
╞═════════════════════╪═════════════════════════════════╪═════════════════════════════════╡
│ 2018-01-01 00:00:01 ┆ Bitcoin - BTC                   ┆ Bitcoin - BTC Price: $13,941.1… │
│                     ┆ Price: $13,941.1…               ┆                                 │
│ 2018-01-01 00:00:01 ┆ 2018年01月01日 10:00            ┆ 01 January 2018 10:00 [DOGE] 1… │
│                     ┆ [DOGE建]                        ┆                           

Token indices sequence length is longer than the specified maximum sequence length for this model (661 > 512). Running this sequence through the model will result in indexing errors


Translating rows 100000–200000...
Translating rows 200000–300000...
Translating rows 300000–314636...
Final shape: (314636, 3)
shape: (5, 3)
┌─────────────────────┬─────────────────────────────────┬─────────────────────────────────┐
│ timestamp           ┆ text                            ┆ text_en                         │
│ ---                 ┆ ---                             ┆ ---                             │
│ datetime[μs]        ┆ str                             ┆ str                             │
╞═════════════════════╪═════════════════════════════════╪═════════════════════════════════╡
│ 2018-03-02 20:00:00 ┆ Today's 4pm ET auction: 246.00… ┆ Today's 4pm ET auction: 246.00… │
│ 2018-03-02 20:00:01 ┆ Bitcoin - BTC                   ┆ Bitcoin - BTC Price: $11,080.9… │
│                     ┆ Price: $11,080.9…               ┆                                 │
│ 2018-03-02 20:00:02 ┆ 03/03 06:00現在(Zaif調べ)       ┆ 03/03 06:00 now (Zaif survey) … │
│                     ┆            

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Translating rows 100000–200000...
Translating rows 200000–253018...
Final shape: (253018, 3)
shape: (5, 3)
┌─────────────────────┬─────────────────────────────────┬─────────────────────────────────┐
│ timestamp           ┆ text                            ┆ text_en                         │
│ ---                 ┆ ---                             ┆ ---                             │
│ datetime[μs]        ┆ str                             ┆ str                             │
╞═════════════════════╪═════════════════════════════════╪═════════════════════════════════╡
│ 2018-07-02 12:00:00 ┆ Bitcoin (4.02): $6,636.81       ┆ Bitcoin (4.02): $6,636.81 Ethe… │
│                     ┆ Ethe…                           ┆                                 │
│ 2018-07-02 12:00:01 ┆ 2018/07/02（月）23:00           ┆ 2018/07/02 (month) 23:00 Bitco… │
│                     ┆ ビットコインの価格は7…          ┆                                 │
│ 2018-07-02 12:00:01 ┆ Bitcoin - BTC                   ┆ Bitcoin - BTC Price:

# Sentiment analysis of all Reddit and Twitter messages


This script performs large-scale sentiment analysis on translated Reddit and Twitter datasets using the **CardiffNLP Twitter RoBERTa sentiment model**. The code is optimized for GPU execution (A100 in this case), leveraging **PyTorch**, **Transformers**, and **Polars** for efficient preprocessing and batching.

In [None]:
# ===== SETUP =====
!pip install -q polars transformers accelerate sentencepiece

import re
import torch
import polars as pl
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from google.colab import drive

drive.mount("/content/drive")

# ===== DEVICE =====
# Half precision on the GPU, full precision on the CPU.
use_cuda = torch.cuda.is_available()
if use_cuda:
    device, dtype = torch.device("cuda"), torch.float16
else:
    device, dtype = torch.device("cpu"), torch.float32
print("CUDA available:", use_cuda, "| device:", device)

# ===== SENTIMENT MODEL (GPU) =====
# Great for tweet-like text; works fine on translated reddit too.
SENTIMENT_MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"

sent_tokenizer = AutoTokenizer.from_pretrained(SENTIMENT_MODEL)
# `dtype` already falls back to float32 without CUDA, so it is passed as is.
sent_model = AutoModelForSequenceClassification.from_pretrained(
    SENTIMENT_MODEL,
    torch_dtype=dtype,
).to(device)
sent_model.eval()

# Resolve the class indices from the model config by label name; when the
# config does not expose all three names, keep the 0/1/2 ordering below.
id2label = {int(idx): name for idx, name in sent_model.config.id2label.items()}
labels_lower = {name.lower(): idx for idx, name in id2label.items()}
NEG_ID, NEU_ID, POS_ID = 0, 1, 2
if all(name in labels_lower for name in ("negative", "neutral", "positive")):
    NEG_ID = labels_lower["negative"]
    NEU_ID = labels_lower["neutral"]
    POS_ID = labels_lower["positive"]

# Cheap normalisation patterns applied before scoring.
_URL = re.compile(r"https?://\S+|www\.\S+")
_MENTION = re.compile(r"@\w+")
_WS = re.compile(r"\s+")

def clean_text(t: str) -> str:
    """Replace URLs and @mentions with a space, collapse whitespace runs and
    strip the ends. Non-string or blank input returns ""."""
    if isinstance(t, str) and t.strip():
        without_noise = _MENTION.sub(" ", _URL.sub(" ", t))
        return _WS.sub(" ", without_noise).strip()
    return ""

def sentiment_list_gpu(texts, batch_size: int = 1024, max_length: int = 256):
    """
    Score each entry of `texts` with the sentiment model.

    Entries that are not strings, are blank, or become empty after
    `clean_text` (for example a lone URL or @mention) are not sent to the
    model and keep the defaults: score 0.0 and label "neu".

    Returns:
      scores: list[float] in [-1, 1]  (P(pos) - P(neg))
      labels: list[str] in {"pos","neu","neg"}
      Both lists have the same length and order as `texts`.
    """
    n = len(texts)
    scores = [0.0] * n
    labels = ["neu"] * n

    # Collect the texts that still have content after cleaning, together with
    # their positions in the original list.
    idxs = []
    cleaned = []
    for i, t in enumerate(texts):
        if not isinstance(t, str) or not t.strip():
            continue
        c = clean_text(t)
        if c:
            idxs.append(i)
            cleaned.append(c)

    if not idxs:
        return scores, labels

    for start in range(0, len(cleaned), batch_size):
        batch_texts = cleaned[start:start + batch_size]

        enc = sent_tokenizer(
            batch_texts,
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors="pt",
        )
        enc = {k: v.to(device) for k, v in enc.items()}

        with torch.inference_mode():
            logits = sent_model(**enc).logits
            # Upcast before the softmax so the probabilities are computed in
            # float32 even when the model weights are half precision.
            probs = torch.softmax(logits.float(), dim=-1).cpu()

        # Pull the whole batch out of the tensor at once instead of converting
        # one element per row.
        batch_scores = (probs[:, POS_ID] - probs[:, NEG_ID]).tolist()  # [-1, 1]
        batch_preds = probs.argmax(dim=-1).tolist()

        for j, (score, pred) in enumerate(zip(batch_scores, batch_preds)):
            orig_i = idxs[start + j]
            scores[orig_i] = score
            labels[orig_i] = "pos" if pred == POS_ID else "neg" if pred == NEG_ID else "neu"

    return scores, labels

def add_sentiment_df(
    df: pl.DataFrame,
    text_col: str = "text_en",
    score_col: str = "sentiment_score",
    label_col: str = "sentiment_label",
    batch_size: int = 1024,
    max_length: int = 256,
) -> pl.DataFrame:
    """Return `df` with two extra columns: `score_col` (numeric sentiment
    score) and `label_col` (sentiment label), both computed from
    `df[text_col]` by `sentiment_list_gpu`."""
    score_values, label_values = sentiment_list_gpu(
        df[text_col].to_list(),
        batch_size=batch_size,
        max_length=max_length,
    )
    new_columns = [
        pl.Series(score_col, score_values),
        pl.Series(label_col, label_values),
    ]
    return df.with_columns(*new_columns)

# ===== CHUNKED FILE PROCESSOR =====
def score_file_in_chunks(
    input_path: str,
    output_path: str,
    text_col: str = "text_en",
    chunk_size: int = 100_000,
    batch_size: int = 1024,
    max_length: int = 256,
):
    """Read a CSV, add sentiment columns chunk by chunk, and write a new CSV.

    Parameters
    ----------
    input_path : str
        CSV to read; must contain the column named by `text_col`.
    output_path : str
        Destination CSV; receives the input columns plus ``sentiment_score``
        and ``sentiment_label``.
    text_col : str
        Column holding the text to score.
    chunk_size : int
        Number of rows passed to `add_sentiment_df` at a time.
    batch_size, max_length : int
        Forwarded to `add_sentiment_df`.
    """
    # The whole file is read into memory; chunking only bounds the scoring step.
    df = pl.read_csv(input_path)
    print("Loaded:", input_path, "shape:", df.shape)

    out_chunks = []
    for start in range(0, len(df), chunk_size):
        end = min(start + chunk_size, len(df))
        print(f"Scoring rows {start}–{end}...")

        chunk = df.slice(start, end - start)
        chunk = add_sentiment_df(
            chunk,
            text_col=text_col,
            batch_size=batch_size,
            max_length=max_length,
        )
        out_chunks.append(chunk)

    if out_chunks:
        out = pl.concat(out_chunks)
    else:
        # A file with no data rows produces no chunks, and pl.concat rejects
        # an empty list; still emit a file with the expected columns.
        out = df.with_columns(
            pl.Series("sentiment_score", [], dtype=pl.Float64),
            pl.Series("sentiment_label", [], dtype=pl.Utf8),
        )

    out.write_csv(output_path)
    print("Saved:", output_path, "shape:", out.shape)

# ===== EXAMPLE: score your 6 translated parts =====
# NOTE(review): the translation cell writes files named tweets_test_<n>.csv,
# while this loop reads tweets_<n>.csv; confirm the files were renamed.
for i in range(1, 7):
    # Each input already carries the text_en column.
    inp = f"/content/drive/MyDrive/SDA/tweets_{i}.csv"
    out = f"/content/drive/MyDrive/SDA/tweets_{i}_sent.csv"
    score_file_in_chunks(
        inp,
        out,
        text_col="text_en",
        chunk_size=100_000,
        batch_size=1024,
        max_length=256,
    )

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
CUDA available: True | device: cuda


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Loaded: /content/drive/MyDrive/SDA/tweets_1.csv shape: (207198, 3)
Scoring rows 0–100000...
Scoring rows 100000–200000...
Scoring rows 200000–207198...
Saved: /content/drive/MyDrive/SDA/tweets_1_sent.csv shape: (207198, 5)
Loaded: /content/drive/MyDrive/SDA/tweets_2.csv shape: (314636, 3)
Scoring rows 0–100000...
Scoring rows 100000–200000...
Scoring rows 200000–300000...
Scoring rows 300000–314636...
Saved: /content/drive/MyDrive/SDA/tweets_2_sent.csv shape: (314636, 5)
Loaded: /content/drive/MyDrive/SDA/tweets_3.csv shape: (254821, 3)
Scoring rows 0–100000...
Scoring rows 100000–200000...
Scoring rows 200000–254821...
Saved: /content/drive/MyDrive/SDA/tweets_3_sent.csv shape: (254821, 5)
Loaded: /content/drive/MyDrive/SDA/tweets_4.csv shape: (253018, 3)
Scoring rows 0–100000...
Scoring rows 100000–200000...
Scoring rows 200000–253018...
Saved: /content/drive/MyDrive/SDA/tweets_4_sent.csv shape: (253018, 5)
Loaded: /content/drive/MyDrive/SDA/tweets_5.csv shape: (65364, 3)
Scoring rows