# Propaganda Paper — Telegram Analysis

End-to-end pipeline: scrape → filter → clean → lemmatize → analyse.

This notebook uses modules from the `src/` package. See `README.md` for setup instructions.


In [None]:
import pandas as pd

from src.config import (
    RAW_CSV, PRECLEANED_CSV, CLEAN_CSV,
    CYRILLIC_CSV, LATIN_CSV, GREEK_CSV,
    CYRILLIC_LEMMATIZED_CSV, GREEK_LEMMATIZED_CSV,
    WORD_FREQ_CSV, BIGRAMS_CSV, TRIGRAMS_CSV,
)
from src.scraping.telegram import scrape_channels
from src.preprocessing.filtering import filter_messages, tag_categories
from src.preprocessing.text_cleaning import clean_and_split
from src.analysis.lemmatization import lemmatize_column, lemmatize_greek_column
from src.analysis.frequency import ensure_list_column, word_frequency, compute_ngrams


## 1. Scrape Telegram channels


In [None]:
# Fresh scrape (needs Telegram credentials in .env) — uncomment to use:
# df_raw = await scrape_channels()

# Default: read the raw messages that are already on disk.
df_raw = pd.read_csv(RAW_CSV)
print("Loaded {} raw messages".format(len(df_raw)))

# Last expression of the cell, so the preview is rendered as rich output.
df_raw.head()

## 2. Filter & tag messages


In [None]:
# Run the filtering step first, then the category-tagging step on its output.
df_filtered = filter_messages(df_raw)
df_tagged = tag_categories(df_filtered)

# Persist the tagged frame; the next stage reloads it from this file.
n_tagged = len(df_tagged)
df_tagged.to_csv(PRECLEANED_CSV, index=False)
print(f"Saved {n_tagged} pre-cleaned rows to {PRECLEANED_CSV}")

## 3. Clean text & split by script


In [None]:
# Reload the pre-cleaned rows written by the previous stage and split them.
# `clean_and_split` returns four frames: the full set plus one per script.
df_clean = pd.read_csv(PRECLEANED_CSV)
df_all, df_ru, df_en, df_gr = clean_and_split(df_clean)

# Write each frame to its own CSV (same order as the tuple above).
for frame, path in (
    (df_all, CLEAN_CSV),
    (df_ru, CYRILLIC_CSV),
    (df_en, LATIN_CSV),
    (df_gr, GREEK_CSV),
):
    frame.to_csv(path, index=False)

print(f"Saved clean data ({len(df_all)} total, {len(df_ru)} Russian, {len(df_en)} English, {len(df_gr)} Greek)")


## 4. Lemmatize Russian and Greek posts


In [None]:
# Both halves of this cell re-read their input from disk, so the cell can be
# re-run on its own without depending on frames left over from earlier cells.

# Russian (Cyrillic) — stanza ru pipeline
df_cyr = lemmatize_column(pd.read_csv(CYRILLIC_CSV))
df_cyr.to_csv(CYRILLIC_LEMMATIZED_CSV, index=False)
print(f"Saved Russian lemmatized data to {CYRILLIC_LEMMATIZED_CSV}")

# Greek — stanza el pipeline
# NOTE: this rebinds `df_gr` (previously the un-lemmatized Greek split).
df_gr = lemmatize_greek_column(pd.read_csv(GREEK_CSV))
df_gr.to_csv(GREEK_LEMMATIZED_CSV, index=False)
print(f"Saved Greek lemmatized data to {GREEK_LEMMATIZED_CSV}")


## 5. Word frequency & n-gram analysis (Russian corpus)


In [None]:
# Reload the lemmatized Russian corpus and pass its `lemmas` column through
# `ensure_list_column` (presumably restores list values after the CSV
# round-trip — confirm in src.analysis.frequency).
df_ru = pd.read_csv(CYRILLIC_LEMMATIZED_CSV).assign(
    lemmas=lambda frame: ensure_list_column(frame['lemmas'])
)

# Word frequency: save the full table, show the first 30 rows.
wf = word_frequency(df_ru['lemmas'])
wf.to_csv(WORD_FREQ_CSV, index=False)
print("Top 30 Russian words:", wf.head(30).to_string(index=False), sep="\n")

# Bigrams (n=2) and trigrams (n=3), both computed with min_freq=3.
bg, tg = (compute_ngrams(df_ru['lemmas'], n=size, min_freq=3) for size in (2, 3))
bg.to_csv(BIGRAMS_CSV, index=False)
tg.to_csv(TRIGRAMS_CSV, index=False)

print("\nTop 20 bigrams:", bg.head(20).to_string(index=False), sep="\n")
print("\nTop 20 trigrams:", tg.head(20).to_string(index=False), sep="\n")

## 6. Propaganda classification (TODO)

Once a trained model and full corpus are available, import and use
`src.classification.model.predict` here.


In [None]:
# from src.classification.model import predict
# predictions = predict(df_ru['text_cleaned'].tolist())