## Imports

In [1]:
import json
import pandas as pd
import re
from nltk.corpus import stopwords
import nltk
from collections import defaultdict
import threading
import dl_translate as dlt
import torch
import fasttext
from langdetect import detect
from googletrans import Translator
import threading

  from .autonotebook import tqdm as notebook_tqdm


## Data Loading

In [3]:
# Load first JSONL file as an example
parliament_transcripts_94_99 = []
with open("data/jsonl/parliament_transcripts_94_99.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        parliament_transcripts_94_99.append(json.loads(line))

print(f"Loaded {len(parliament_transcripts_94_99)} entries")

Loaded 225 entries


In [4]:
print(json.dumps(parliament_transcripts_94_99[0], indent=2))

{
  "date": "07-05-1999",
  "link": "https://www.europarl.europa.eu/doceo/document/CRE-4-1999-05-07-ITM-001_EN.html",
  "sections": [
    {
      "topic": "",
      "paragraphs": [
        "Sitting of Friday, 7 May 1999",
        "IN THE CHAIR: MRS SCHLEICHER\nVice\u2010President",
        "(The sitting was opened at 9.05 a.m.)",
        "President . \u2013 It took a few moments before we could open the sitting, because I was afraid that the lights were going out all over Europe, but thankfully they have now come on again.",
        "Today we have the very last sitting of this parliamentary term and probably also the last sitting to be held in this Chamber. I think that many people will be glad if we can then move into a new Chamber, but for quite a few this is also their last day's sitting and it is no doubt an occasion tinged with sadness. I also see that a good many colleagues have already embarked on the election campaign and the House is no longer full to capacity. I should like t

In [5]:
# See first record
# print(json.dumps(data[0], indent=2))

# Or preview several records quickly
for entry in parliament_transcripts_94_99[:3]:
    print(entry)

{'date': '07-05-1999', 'link': 'https://www.europarl.europa.eu/doceo/document/CRE-4-1999-05-07-ITM-001_EN.html', 'sections': [{'topic': '', 'paragraphs': ['Sitting of Friday, 7 May 1999', 'IN THE CHAIR: MRS SCHLEICHER\nVice‐President', '(The sitting was opened at 9.05 a.m.)', 'President . – It took a few moments before we could open the sitting, because I was afraid that the lights were going out all over Europe, but thankfully they have now come on again.', "Today we have the very last sitting of this parliamentary term and probably also the last sitting to be held in this Chamber. I think that many people will be glad if we can then move into a new Chamber, but for quite a few this is also their last day's sitting and it is no doubt an occasion tinged with sadness. I also see that a good many colleagues have already embarked on the election campaign and the House is no longer full to capacity. I should like to thank all those who are staying on here today, the so‐called Friday club, 

In [6]:
# load the other 6 terms (JSONL files)
parliament_transcripts_99_04 = []
with open("data/jsonl/parliament_transcripts_99_04.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        parliament_transcripts_99_04.append(json.loads(line))

print(f"Loaded {len(parliament_transcripts_99_04)} entries")


parliament_transcripts_04_09 = []
with open("data/jsonl/parliament_transcripts_04_09.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        parliament_transcripts_04_09.append(json.loads(line))

print(f"Loaded {len(parliament_transcripts_04_09)} entries")


parliament_transcripts_09_14 = []
with open("data/jsonl/parliament_transcripts_09_14.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        parliament_transcripts_09_14.append(json.loads(line))

print(f"Loaded {len(parliament_transcripts_09_14)} entries")


parliament_transcripts_14_19 = []
with open("data/jsonl/parliament_transcripts_14_19.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        parliament_transcripts_14_19.append(json.loads(line))

print(f"Loaded {len(parliament_transcripts_14_19)} entries")


parliament_transcripts_19_24 = []
with open("data/jsonl/parliament_transcripts_19_24.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        parliament_transcripts_19_24.append(json.loads(line))

print(f"Loaded {len(parliament_transcripts_19_24)} entries")


parliament_transcripts_24_25 = []
with open("data/jsonl/parliament_transcripts_24_25.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        parliament_transcripts_24_25.append(json.loads(line))

print(f"Loaded {len(parliament_transcripts_24_25)} entries")

Loaded 314 entries
Loaded 308 entries
Loaded 250 entries
Loaded 279 entries
Loaded 272 entries
Loaded 58 entries


This is okay but we want a different format. More complex preprocessing steps are required.

## Data Preprocessing

In order to have the data from the JSONL file in the right format, we need to:
- first translate any remaining non-english texts
- remove all topics where topic text is null or empty
- remove all topics with 0 or just 1 paragraph inside the paragraphs list, such as this example because it is worthless:

"topic": "1. Opening of the sitting",
      "paragraphs": [
        "(The daily session started at 8:30.)"
      ]
- remove duplicate topics if there are some with the same title, unless their paragraphs are different, in that case i want to merge them into one
- remove all empty paragraphs or paragraphs containing just special characters or stopwords and not actual words (because there are filler paragraphs like "***" or "...")
- finally merge all paragraphs for one topic together and concatenate them with a space " ", and inside the dataframe call that column text
- for the dataframe drop / dont include the link, so i just want columns date, topic, text (concatenated paragraphs of one topic)

### Translation using HuggingFace Transformers

This approach was not used in the end as it did not provide an improvement.

In [6]:
# use a dl translator model (default model is m2m100)
model = dlt.TranslationModel(device="cuda")

In [7]:
print(torch.__version__)

2.6.0+cu118


In [8]:
print(torch.cuda.is_available())    # Should be True
print(torch.cuda.get_device_name(0))

True
NVIDIA GeForce RTX 4070 Laptop GPU


In [25]:
fasttext_lang_model = fasttext.load_model("lid.176.ftz")

In [26]:
def fasttext_detect(text):
    try:
        lang = fasttext_lang_model.predict(text.strip().replace("\n", " "), k=1)[0][0]
        return lang.replace("__label__", "")
    except:
        return "unknown"

In [27]:
sample_text = "Paldies par sadarbību! Mēs esam gandarīti par Latvijas prezidentūras sasniegumiem."

# Detect the language using fastText
detected_source = fasttext_detect(sample_text)
print("Detected source:", detected_source)

# Translate using the detected source
translated_sample = model.translate(
    sample_text,
    source=detected_source,
    target="en"
)

print("Translated:", translated_sample)

Detected source: lv
Translated: Thank you for cooperation!We are pleased with the achievements of the Presidency of Latvia.


In [36]:
def translate_term_data(term_data, model, batch_size=32, verbose=True):
    for item in term_data:
        if verbose:
            print(f"  Translating date '{item['date']}'")
        for section in item["sections"]:
            paragraphs = section.get("paragraphs", [])
            if not paragraphs:
                continue

            # Clean and filter weird/empty paragraphs
            cleaned_paragraphs = []
            detected_sources = []
            valid_indices = []

            for idx, p in enumerate(paragraphs):
                p_clean = p.strip()
                if not p_clean or p_clean in {"...", ".", "-", "–", "—"}:
                    continue
                cleaned_paragraphs.append(p_clean)
                detected_sources.append(fasttext_detect(p_clean))
                valid_indices.append(idx)

            # Translate in batches
            translated = paragraphs.copy()
            for i in range(0, len(cleaned_paragraphs), batch_size):
                batch = cleaned_paragraphs[i:i + batch_size]
                batch_sources = detected_sources[i:i + batch_size]

                translated_batch = [
                    model.translate(p, source=src, target="en")
                    for p, src in zip(batch, batch_sources)
                ]

                for rel_idx, trans in enumerate(translated_batch):
                    translated[valid_indices[i + rel_idx]] = trans

            # Overwrite original paragraphs
            section["paragraphs"] = translated


In [50]:
def translate_term_data_grouped(term_data, model, batch_size=32, verbose=True):
    for item in term_data:
        if verbose:
            print(f"\nTranslating date '{item['date']}'")

        for section in item["sections"]:
            paragraphs = section.get("paragraphs", [])
            if not paragraphs:
                continue

            indexed_paragraphs = []
            for idx, p in enumerate(paragraphs):
                p = p.strip()
                if len(p) < 3 or p in {"...", "***"}:
                    continue
                try:
                    lang = fasttext_detect(p)
                    indexed_paragraphs.append((idx, p, lang))
                except Exception as e:
                    if verbose:
                        print(f"  Skipping paragraph (index {idx}): lang detect failed -> {e}")
                    continue

            if verbose:
                print(f"  Detected {len(indexed_paragraphs)} paragraphs to translate")

            # Group by language
            lang_groups = {}
            for idx, p, lang in indexed_paragraphs:
                lang_groups.setdefault(lang, []).append((idx, p))

            translated_final = paragraphs.copy()

            for lang, group in lang_groups.items():
                idxs, texts = zip(*group)

                try:
                    translated = model.translate(
                        list(texts),
                        source=lang,
                        target="en",
                        batch_size=batch_size,
                        verbose=verbose
                    )
                    for i, t in zip(idxs, translated):
                        translated_final[i] = t
                except Exception as e:
                    if verbose:
                        print(f"  Failed to translate group (lang={lang}) -> {e}")

            section["paragraphs"] = translated_final



In [58]:
term_data_test = [
    {
        "date": "02-02-2024",
        "link": "https://example.com/session2",
        "sections": [
            {
                "topic": "Session Introduction",
                "paragraphs": [
                    "Bienvenue à tous. Nous sommes réunis aujourd'hui pour discuter des politiques de transport durable.",
                    "Es ist entscheidend, dass wir unsere Ziele für 2030 mit Entschlossenheit und Klarheit verfolgen.",
                    "Gracias por su asistencia. El cambio climático exige respuestas rápidas y coordinadas.",
                    "Mēs esam vienojušies sākt jaunu iniciatīvu pilsētas mobilitātei."
                ]
            },
            {
                "topic": "Digital Infrastructure",
                "paragraphs": [
                    "L'innovation numérique est au cœur de la transformation économique de l'Union européenne.",
                    "Die Mitgliedstaaten müssen gemeinsam in digitale Infrastrukturen investieren.",
                    "Abbiamo bisogno di una visione comune per il futuro digitale dell'Europa.",
                    "Dziękujemy za możliwość zabrania głosu w tej ważnej sprawie."
                ]
            },
            {
                "topic": "One English Comment",
                "paragraphs": [
                    "Thank you for allowing me to briefly comment on this."
                ]
            }
        ]
    }
]


In [38]:
translate_term_data(term_data_test, model)

  Translating date '02-02-2024'


In [52]:
translate_term_data_grouped(term_data_test, model)


Translating date '02-02-2024'
  Detected 4 paragraphs to translate


100%|██████████| 1/1 [00:00<00:00,  3.38it/s]
100%|██████████| 1/1 [00:00<00:00,  2.50it/s]
100%|██████████| 1/1 [00:00<00:00,  3.71it/s]
100%|██████████| 1/1 [00:00<00:00,  2.78it/s]


  Detected 4 paragraphs to translate


100%|██████████| 1/1 [00:00<00:00,  3.03it/s]
100%|██████████| 1/1 [00:00<00:00,  4.13it/s]
100%|██████████| 1/1 [00:00<00:00,  3.40it/s]
100%|██████████| 1/1 [00:00<00:00,  2.91it/s]


  Detected 1 paragraphs to translate


100%|██████████| 1/1 [00:00<00:00,  3.35it/s]


In [53]:
for section in term_data_test[0]["sections"]:
    print(f"\n--- {section['topic']} ---")
    for p in section["paragraphs"]:
        print(p)


--- Session Introduction ---
We are gathered today to discuss sustainable transport policy.
It is crucial that we pursue our 2030 goals with determination and clarity.
Climate change requires quick and coordinated responses.
We have agreed to launch a new initiative for urban mobility.

--- Digital Infrastructure ---
Digital innovation is at the heart of the European Union’s economic transformation.
Member States must invest in digital infrastructure together.
We need a common vision for the digital future of Europe.
Thank you for the possibility of voting on this important issue.

--- One English Comment ---
Thank you for allowing me to briefly comment on this.


Now we can translate the actual data. We will translate each term one by one so we can choose which one to translate.
The first 4 terms should be translated already as it was done during the web scraping process.
The second half of the 5th and the last two terms should be translated here.
NOTE: googletrans is not working properly especially with these large texts, that is why we are using a transformer model from dl_translate.

In [61]:
translate_term_data_grouped(parliament_transcripts_94_99, model)


Translating date '07-05-1999'
  Detected 24 paragraphs to translate


100%|██████████| 1/1 [07:10<00:00, 430.98s/it]
100%|██████████| 1/1 [00:00<00:00,  2.00it/s]
100%|██████████| 1/1 [00:00<00:00,  2.05it/s]


  Detected 40 paragraphs to translate


  0%|          | 0/2 [04:48<?, ?it/s]


KeyboardInterrupt: 

### Translation using googletrans and langdetect

In [10]:
translator = Translator()

def detect_and_translate_paragraphs(term_data):
    last_date = None

    for item in term_data:
        date = item["date"]
        if date != last_date:
            print(f"Translating sections for new date: {date}")
            last_date = date

        for section in item.get("sections", []):
            paragraphs = section.get("paragraphs", [])
            new_paragraphs = []

            for p in paragraphs:
                if not isinstance(p, str) or not p.strip() or p.strip() in {".", "...", "---", "—", "***"}:
                    new_paragraphs.append(p)
                    continue

                try:
                    lang = detect(p)
                    if lang != "en":
                        translated = translator.translate(p, src=lang, dest="en").text
                        new_paragraphs.append(translated)
                    else:
                        new_paragraphs.append(p)
                except Exception as e:
                    print(f"  Translation failed on {date}: {e}")
                    new_paragraphs.append(p)

            section["paragraphs"] = new_paragraphs

In [68]:
test_term_data = [
    {
        "date": "01-01-2024",
        "link": "https://example.com",
        "sections": [
            {
                "topic": "Opening Remarks",
                "paragraphs": [
                    "Bienvenue à tous dans cette session importante.",
                    "Danke für Ihre Aufmerksamkeit.",
                    "Thank you for being here.",
                    "..."
                ]
            },
            {
                "topic": "Climate Policy",
                "paragraphs": [
                    "La politique climatique doit être renforcée.",
                    "The climate strategy must be strengthened.",
                    ""
                ]
            }
        ]
    }
]


In [69]:
detect_and_translate_paragraphs(test_term_data)

Translating sections for new date: 01-01-2024


In [72]:
for section in test_term_data[0]["sections"]:
    print(f"\n--- {section['topic']} ---")
    for p in section["paragraphs"]:
        print(p)



--- Opening Remarks ---
Welcome everyone to this important session.
Thanks for your attention.
Thank you for being here.
...

--- Climate Policy ---
Climate policy must be reinforced.
The climate strategy must be strengthened.



In [79]:
detect_and_translate_paragraphs(parliament_transcripts_14_19)


Translating sections for new date: 18-04-2019
  Translation failed on 18-04-2019: No features in text.
  Translation failed on 18-04-2019: No features in text.
Translating sections for new date: 17-04-2019
  Translation failed on 17-04-2019: The read operation timed out
Translating sections for new date: 16-04-2019
  Translation failed on 16-04-2019: The read operation timed out
Translating sections for new date: 15-04-2019
Translating sections for new date: 04-04-2019
Translating sections for new date: 03-04-2019
Translating sections for new date: 28-03-2019
  Translation failed on 28-03-2019: No features in text.
Translating sections for new date: 27-03-2019
  Translation failed on 27-03-2019: No features in text.
  Translation failed on 27-03-2019: No features in text.
Translating sections for new date: 26-03-2019
Translating sections for new date: 25-03-2019
  Translation failed on 25-03-2019: No features in text.
  Translation failed on 25-03-2019: No features in text.
Translating

In [None]:
detect_and_translate_paragraphs(parliament_transcripts_19_24)

Translating sections for new date: 25-04-2024
  Translation failed on 25-04-2024: The read operation timed out
  Translation failed on 25-04-2024: timed out
  Translation failed on 25-04-2024: No features in text.
Translating sections for new date: 24-04-2024
  Translation failed on 24-04-2024: The read operation timed out
  Translation failed on 24-04-2024: the JSON object must be str, bytes or bytearray, not NoneType
  Translation failed on 24-04-2024: No features in text.
  Translation failed on 24-04-2024: No features in text.
  Translation failed on 24-04-2024: No features in text.
Translating sections for new date: 23-04-2024
  Translation failed on 23-04-2024: No features in text.
  Translation failed on 23-04-2024: The read operation timed out
  Translation failed on 23-04-2024: the JSON object must be str, bytes or bytearray, not NoneType
  Translation failed on 23-04-2024: The read operation timed out
  Translation failed on 23-04-2024: timed out
Translating sections for new 

In [73]:
detect_and_translate_paragraphs(parliament_transcripts_24_25)

Translating sections for new date: 10-07-2025
  Translation failed on 10-07-2025: No features in text.
Translating sections for new date: 09-07-2025
  Translation failed on 09-07-2025: The read operation timed out
  Translation failed on 09-07-2025: 'NoneType' object is not iterable
Translating sections for new date: 08-07-2025
Translating sections for new date: 07-07-2025
  Translation failed on 07-07-2025: No features in text.
Translating sections for new date: 19-06-2025
Translating sections for new date: 18-06-2025
  Translation failed on 18-06-2025: the JSON object must be str, bytes or bytearray, not NoneType
Translating sections for new date: 17-06-2025
Translating sections for new date: 16-06-2025
  Translation failed on 16-06-2025: No features in text.
Translating sections for new date: 22-05-2025
Translating sections for new date: 21-05-2025
  Translation failed on 21-05-2025: No features in text.
  Translation failed on 21-05-2025: the JSON object must be str, bytes or bytea

### Forming a DataFrame

In [8]:
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bakir\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
def is_valid_paragraph(p):
    """Check if paragraph is not empty, not just symbols, and has more than just stopwords."""
    text = p.strip()
    if not text or re.fullmatch(r"[^a-zA-Z]+", text):
        return False
    words = re.findall(r'\b\w+\b', text.lower())
    return any(w not in stop_words for w in words)

In [10]:
def preprocess_transcripts(data):
    rows = []

    for entry in data:
        date = entry["date"]
        topic_dict = defaultdict(list)

        # this skips the last section in each entry because the last one usually contains
        # just the final words for the closure of the sitting
        sections = entry.get("sections", [])[:-1]

        for section in sections:
            topic = section.get("topic")
            topic = topic.strip() if isinstance(topic, str) else ""
            paragraphs = section.get("paragraphs", [])

            valid_paragraphs = [p for p in paragraphs if is_valid_paragraph(p)]
            if not topic or len(valid_paragraphs) <= 1:
                continue

            topic_dict[topic].extend(valid_paragraphs)

        for topic, paras in topic_dict.items():
            unique_paras = list(dict.fromkeys(p.strip() for p in paras if is_valid_paragraph(p)))
            if len(unique_paras) > 1:
                full_text = " ".join(unique_paras)
                rows.append({
                    "date": date,
                    "topic": topic,
                    "text": full_text
                })

    return pd.DataFrame(rows)

In [None]:
# apply preprocessing to each term's data separately
# not in a for loop so it is done separately and some terms can be skipped if needed
# parliament_transcripts_94_99_df = preprocess_transcripts(parliament_transcripts_94_99)
# parliament_transcripts_99_04_df = preprocess_transcripts(parliament_transcripts_99_04)
# parliament_transcripts_04_09_df = preprocess_transcripts(parliament_transcripts_04_09)
# parliament_transcripts_09_14_df = preprocess_transcripts(parliament_transcripts_09_14)
# parliament_transcripts_14_19_df = preprocess_transcripts(parliament_transcripts_14_19)
# parliament_transcripts_19_24_df = preprocess_transcripts(parliament_transcripts_19_24)
# parliament_transcripts_24_25_df = preprocess_transcripts(parliament_transcripts_24_25)


In [15]:
parliament_transcripts_94_99_df = preprocess_transcripts(parliament_transcripts_94_99)

In [16]:
parliament_transcripts_99_04_df = preprocess_transcripts(parliament_transcripts_99_04)

In [17]:
parliament_transcripts_04_09_df = preprocess_transcripts(parliament_transcripts_04_09)

In [18]:
parliament_transcripts_09_14_df = preprocess_transcripts(parliament_transcripts_09_14)

In [None]:
parliament_transcripts_14_19_df = preprocess_transcripts(parliament_transcripts_14_19)

In [11]:
parliament_transcripts_19_24_df = preprocess_transcripts(parliament_transcripts_19_24)

In [None]:
parliament_transcripts_24_25_df = preprocess_transcripts(parliament_transcripts_24_25)

In [19]:
# example dataframe structure
parliament_transcripts_94_99_df.head()

Unnamed: 0,date,topic,text
0,07-05-1999,1. VOTES,President . – We shall now move on to the vote...
1,07-05-1999,2. Potato starch,President . – The next item is the proposal fo...
2,07-05-1999,3. Hannover 2000,President . – The next item is the report (A4‐...
3,07-05-1999,4. European textiles market,President . – The next item is the joint debat...
4,06-05-1999,1. Approval of the Minutes,President . – The Minutes of yesterday's sitti...


In [20]:
parliament_transcripts_94_99_df.info()
parliament_transcripts_94_99_df.describe(include='all')  # for categorical overview
parliament_transcripts_94_99_df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1912 entries, 0 to 1911
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   date    1912 non-null   object
 1   topic   1912 non-null   object
 2   text    1912 non-null   object
dtypes: object(3)
memory usage: 44.9+ KB


date     0
topic    0
text     0
dtype: int64

In [21]:
# get text column of first row
first_row_text = parliament_transcripts_94_99_df.iloc[0]["text"]
print(first_row_text)

President . – We shall now move on to the votes. Proposal for a Council Regulation (ECSC, EC, EURATOM) incorporating daily subsistence allowance rates for officials on mission within the European territory of the Member States of the European Union for Austria, Finland and Sweden into Article 13 of Annex VII to the Staff Regulations of officials of the European Communities (COM(99)0133 – C4‐0226/99‐99/0076(CNS)) (Parliament approved the Commission proposal) Proposal for a Council Regulation amending Regulation (EC, Euratom, ECSC) No 259/68 laying down the Staff Regulations of Officials of the European Communities and the Conditions of Employment of Other Servants of the Communities (COM(99)0102 – C4‐0159/99‐99/0065(CNS)) President . – The next item is in principle the report by Mr Jarzembowski. However, I have to tell the House that we have not yet received the opinion of the Committee of the Regions. With your agreement, I should like to defer this item. We are waiting for the informa

## CSV File Creation

In [22]:
parliament_transcripts_94_99_df.to_csv("data/csv/parliament_transcripts_94_99.csv", index=False)

In [23]:
parliament_transcripts_99_04_df.to_csv("data/csv/parliament_transcripts_99_04.csv", index=False)

In [24]:
parliament_transcripts_04_09_df.to_csv("data/csv/parliament_transcripts_04_09.csv", index=False)

In [25]:
parliament_transcripts_09_14_df.to_csv("data/csv/parliament_transcripts_09_14.csv", index=False)

In [None]:
parliament_transcripts_14_19_df.to_csv("data/csv/parliament_transcripts_14_19.csv", index=False)

In [12]:
parliament_transcripts_19_24_df.to_csv("data/csv/parliament_transcripts_19_24.csv", index=False)

In [None]:
parliament_transcripts_24_25_df.to_csv("data/csv/parliament_transcripts_24_25.csv", index=False)

parliament_transcripts_24_25_df saved as CSV file.


## Combined CSV Creation

In [2]:
# load separate terms if needed (if you want to skip the previous steps)
parliament_transcripts_94_99_df = pd.read_csv("data/csv/parliament_transcripts_94_99.csv")
parliament_transcripts_99_04_df = pd.read_csv("data/csv/parliament_transcripts_99_04.csv")
parliament_transcripts_04_09_df = pd.read_csv("data/csv/parliament_transcripts_04_09.csv")
parliament_transcripts_09_14_df = pd.read_csv("data/csv/parliament_transcripts_09_14.csv")
parliament_transcripts_14_19_df = pd.read_csv("data/csv/parliament_transcripts_14_19.csv")
parliament_transcripts_19_24_df = pd.read_csv("data/csv/parliament_transcripts_19_24.csv")
parliament_transcripts_24_25_df = pd.read_csv("data/csv/parliament_transcripts_24_25.csv")

In [3]:
# this is to create a combined CSV file with all terms
# this file should be used for topic modeling
# can also be used for other analyses


# add term column to each dataframe
parliament_transcripts_94_99_df['term'] = '94_99'
parliament_transcripts_99_04_df['term'] = '99_04'
parliament_transcripts_04_09_df['term'] = '04_09'
parliament_transcripts_09_14_df['term'] = '09_14'
parliament_transcripts_14_19_df['term'] = '14_19'
parliament_transcripts_19_24_df['term'] = '19_24'
parliament_transcripts_24_25_df['term'] = '24_25'

combined_df = pd.concat([
    parliament_transcripts_94_99_df,
    parliament_transcripts_99_04_df,
    parliament_transcripts_04_09_df,
    parliament_transcripts_09_14_df,
    parliament_transcripts_14_19_df,
    parliament_transcripts_19_24_df,
    parliament_transcripts_24_25_df
])

combined_df.to_csv("data/csv/combined_terms.csv", index=False)

print("Combined CSV file created successfully.")

Combined CSV file created successfully.


In [6]:
combined_df.head()

Unnamed: 0,date,topic,text,term
0,07-05-1999,1. VOTES,President . – We shall now move on to the vote...,94_99
1,07-05-1999,2. Potato starch,President . – The next item is the proposal fo...,94_99
2,07-05-1999,3. Hannover 2000,President . – The next item is the report (A4‐...,94_99
3,07-05-1999,4. European textiles market,President . – The next item is the joint debat...,94_99
4,06-05-1999,1. Approval of the Minutes,President . – The Minutes of yesterday's sitti...,94_99
