In [1]:
import os
import re
import csv
import pandas as pd

import time
import requests
from bs4 import BeautifulSoup

from langdetect import detect
from deep_translator import GoogleTranslator

In [None]:
BASE_URL = "https://www.consiglio.vda.it/app/oggettidelconsiglio/dettaglio?pk_documento={}&versione=R"

OUTPUT_FOLDER = "./downloads"
if not os.path.exists(OUTPUT_FOLDER):
    os.makedirs(OUTPUT_FOLDER)

#funzione per scraperare e salvare i resoconti dal sito del consiglio Valle
def scrape_and_save(doc_id):
    url = BASE_URL.format(doc_id)

    response = requests.get(url)
    if response.status_code != 200:
        print(f"Documento {doc_id} non trovato (HTTP {response.status_code}).")
        return None

    soup = BeautifulSoup(response.text, "html.parser")
    page_text = soup.get_text(separator="\n", strip=True)

    output_file = os.path.join(OUTPUT_FOLDER, f"{doc_id}.txt")
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(page_text)
    
    print(f"Documento {doc_id} salvato in: {output_file}")
    return output_file


def main(start_id, end_id):
    csv_data = []
    print(f"Inizio lo scraping per i documenti dal {start_id} al {end_id}...")

    for doc_id in range(start_id, end_id + 1):
        print(f"Processo il documento {doc_id}...")
        file_path = scrape_and_save(doc_id)

        if file_path:
            csv_data.append({"ID_file": doc_id, "path_src": file_path})

        # Pausa per evitare di sovraccaricare il server (ad esempio 2-5 secondi)
        time.sleep(2 + (3 * doc_id % 5))  # Variamo la pausa per evitare un pattern troppo prevedibile

    # Salvataggio dei risultati nel file CSV
    csv_file = "csv_paths.csv"
    print(f"Salvataggio dei risultati nel file CSV: {csv_file}")
    with open(csv_file, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=["ID_file", "path_src"])
        writer.writeheader()
        writer.writerows(csv_data)

    print("Completato! Tutti i documenti salvati nella cartella:", OUTPUT_FOLDER)

if __name__ == "__main__":
    main(start_id=47950, end_id=48020)

In [2]:
#funzione per pulire i file txt dalle parti inutili presenti nella pagina web del consiglio Valle
def first_clean(target_folder, keyword, mid_strings, end_keyword, csv_paths):
    df = pd.read_csv(csv_paths)

    if not os.path.exists(target_folder):
        os.makedirs(target_folder)

    df["path_clean"] = ""

    for idx, row in df.iterrows():
        source_path = row["path_src"]
        if source_path.endswith(".txt"):
            filename = os.path.basename(source_path)
            target_path = os.path.join(target_folder, filename)

            df.at[idx, "path_clean"] = target_path

            with open(source_path, "r", encoding="utf-8") as file:
                content = file.read()

            if keyword in content:
                content = keyword + content.split(keyword, 1)[1] 

            for mid_string in mid_strings:
                content = re.sub(re.escape(mid_string), "", content) 

            if end_keyword in content:
                content = content.split(end_keyword, 1)[0]  

            with open(target_path, "w", encoding="utf-8") as file:
                file.write(content)

    df.to_csv(csv_paths, index=False)

#funzione per estrarre informazioni dai file txt: oggetto, legislatura e classificazione
def first_info(pattern1, pattern2, pattern3, csv_paths):
    df_paths = pd.read_csv(csv_paths)

    df_paths['language'] = ""
    df_paths['object'] = ""
    df_paths['legislature'] = ""
    df_paths['class'] = ""

    for idx, row in df_paths.iterrows():
        source_path = row["path_clean"]
        if source_path.endswith(".txt"):
            filename = os.path.basename(source_path)
            filenum = int(os.path.splitext(filename)[0])

            with open(source_path, "r", encoding="utf-8") as file:
                content = file.read()

            match1 = re.search(pattern1, content)

            if match1:
                stringa = match1.group(1).strip()
                ogg, leg = stringa.split("/", 1)

                df_paths.loc[df_paths['ID_file'] == filenum, 'object'] = ogg
                df_paths.loc[df_paths['ID_file'] == filenum, 'legislature'] = leg

            match2 = re.search(pattern2, content, re.DOTALL)
            
            if match2:
                classe = match2.group(1).strip()
                classe = classe.replace("\n", ", ").strip()
                df_paths.loc[df_paths['ID_file'] == filenum, 'class'] = classe

            try:
                lang = detect(content)
                if lang == 'fr':
                    df_paths.loc[df_paths['ID_file'] == filenum, 'language'] = 'fr'
                elif lang == 'it':
                    df_paths.loc[df_paths['ID_file'] == filenum, 'language'] = 'it'
                else:
                    df_paths.loc[df_paths['ID_file'] == filenum, 'language'] = 'other'
            except:
                df_paths.loc[df_paths['ID_file'] == filenum, 'language'] = 'error'

    df_paths.to_csv(csv_paths, index=False)

In [3]:
target_folder = "./clean"

csv_paths = "./csv_paths.csv"
csv_details = "./csv_details.csv"
csv_cons = "./csv_cons.csv"
csv_chunks = "./csv_chunks.csv"

In [4]:
keyword = "Classificazione"
mid_strings = ["Precedente", "Successivo", "Resoconto integrale del dibattito dell'aula. I documenti allegati sono reperibili nel link \"iter atto\"."]
end_keyword = "Informativa cookies"

first_clean(target_folder, keyword, mid_strings, end_keyword, csv_paths)

pattern1 = r'(?:OGGETTO N\.|OBJET N°)(.*?)\s*-'
pattern2 = r'Classificazione\s*(.*?)\s*Oggetto'
pattern3 = r'object'

first_info(pattern1, pattern2, pattern3, csv_paths)

In [5]:
#funzione per stabilire i nomi dei consiglieri presenti nella legislatura presa in considerazione
def define_names(csv_cons, leg):
    df_cons = pd.read_csv(csv_cons)
    names = []

    print(f"Filtrando per legislatura: {leg}")
    filtered_cons = df_cons[df_cons['legislature'].str.contains(leg, case=False, na=False)]
    
    surname_counts = filtered_cons['surname'].value_counts()

    for idx, row in filtered_cons.iterrows():
        surname = row["surname"]
        name = row["name"]

        if surname_counts[surname] > 1:
            names.append(f"{surname} {name[0]}.")
        else:
            names.append(surname)

    print(f"Numero di nomi trovati per la legislatura {leg}: {len(names)}")
    return filtered_cons, names

#funzione per isolare i singoli interventi, specificando chi interviene (con relativi dettagli) e il contenuto dell'intervento
def isolate_chunk(csv_paths, csv_cons):
    df_paths = pd.read_csv(csv_paths)
    chunk_list = []

    for idx, row in df_paths.iterrows():
        leg = row["legislature"]
        language = row["language"]
        
        print(f"Processando file {row['path_clean']} per la legislatura {leg}...")
        
        try:
            df_cons, list_cons = define_names(csv_cons, leg)

            with open(row["path_clean"], 'r') as f:
                text = f.read()

                all_matches = []
                for name in list_cons:
                    pattern = r"(?i)" + re.escape(name) + r"\s?\([^\)]*\)\s?-"
                    matches = list(re.finditer(pattern, text))
                    all_matches.extend(matches)

                president_pattern = r"(?i)Presidente\s?-"
                president_matches = list(re.finditer(president_pattern, text))
                all_matches.extend(president_matches)

                all_matches.sort(key=lambda match: match.start())

                president_data = None
                if all_matches:
                    first_match = all_matches[0]
                    if re.match(president_pattern, first_match.group(0), re.IGNORECASE):
                        president_data = None
                    else:
                        surname = first_match.group(0)
                        words = surname.split()
                        surname = " ".join(words[:2])
                        if (words[1].endswith('.') or words[1].startswith('(')):
                            surname = words[0]
                        person_info = df_cons[df_cons["surname"] == surname].iloc[0]
                        president_data = {
                            "surname": person_info["surname"],
                            "name": person_info["name"],
                            "year_birth": person_info["year_birth"],
                            "gender": person_info["gender"],
                            "group": 'Presidente'
                        }

                chunk_idx = 1
                for i, match in enumerate(all_matches):
                    start_pos = match.end() 
                    end_pos = all_matches[i + 1].start() if i + 1 < len(all_matches) else len(text)

                    chunk = text[start_pos:end_pos].replace("\n", " ").strip()

                    if chunk: 
                        chunk = chunk.strip()
                        if re.match(president_pattern, match.group(0), re.IGNORECASE):
                            if president_data:
                                surname = president_data["surname"]
                                first_name = president_data["name"]
                                year_birth = president_data["year_birth"]
                                gender = president_data["gender"]
                                group = president_data["group"]
                            else:
                                surname = "Presidente"
                                first_name = "N/A"
                                year_birth = "N/A"
                                gender = "N/A"
                                group = "N/A"
                        else:
                            surname = match.group(0)
                            party = re.search(r'\((.*?)\)', surname)
                            group = party.group(1) if party else "N/A"
                            words = surname.split()
                            surname = " ".join(words[:2])
                            if (words[1].endswith('.') or words[1].startswith('(')):
                                surname = words[0]
                            try:
                                person_info = df_cons[df_cons["surname"] == surname].iloc[0]
                                first_name = person_info["name"]
                                year_birth = person_info["year_birth"]
                                gender = person_info["gender"]
                            except IndexError:
                                first_name = "N/A"
                                year_birth = "N/A"
                                gender = "N/A"
                                surname = "N/A"
                                group = "N/A"

                        chunk_list.append({
                            "ID_file": row["ID_file"],
                            "leg": leg,
                            "class": row["class"],
                            "language": language,
                            "surname": surname,
                            "name": first_name,
                            "year_birth": year_birth,
                            "gender": gender,
                            "group": group,
                            "position": chunk_idx,
                            "length": len(chunk),
                            "chunk": chunk
                        })

                        chunk_idx += 1
            print('File ok')

        except Exception as e:
            print(f"Errore nel file {row['path_clean']}: {e}. Saltando questa entry.")
            continue

    df_chunks = pd.DataFrame(chunk_list)
    print(f"Numero totale di chunk processati: {len(chunk_list)}")
    return df_chunks

df = isolate_chunk(csv_paths, csv_cons)

print("Salvataggio del file csv_chunks.csv...")
df.to_csv("csv_chunks.csv", index=False)
print("File salvato con successo!")

Processando file ./clean/47950.txt per la legislatura XVI...
Filtrando per legislatura: XVI
Numero di nomi trovati per la legislatura XVI: 38
File ok
Processando file ./clean/47951.txt per la legislatura XVI...
Filtrando per legislatura: XVI
Numero di nomi trovati per la legislatura XVI: 38
Errore nel file ./clean/47951.txt: single positional indexer is out-of-bounds. Saltando questa entry.
Processando file ./clean/47952.txt per la legislatura XVI...
Filtrando per legislatura: XVI
Numero di nomi trovati per la legislatura XVI: 38
File ok
Processando file ./clean/47953.txt per la legislatura XVI...
Filtrando per legislatura: XVI
Numero di nomi trovati per la legislatura XVI: 38
File ok
Processando file ./clean/47954.txt per la legislatura XVI...
Filtrando per legislatura: XVI
Numero di nomi trovati per la legislatura XVI: 38
File ok
Processando file ./clean/47955.txt per la legislatura XVI...
Filtrando per legislatura: XVI
Numero di nomi trovati per la legislatura XVI: 38
File ok
Proces

In [6]:
#funzione per tradurre i chunk da francese a italiano
def translate(csv_chunks):
    df = pd.read_csv(csv_chunks)
    
    filtered_df = df[df['language'] == 'fr']
    
    new_rows = []
    
    for index, row in filtered_df.iterrows():
        translated_chunk = GoogleTranslator(source='fr', target='it').translate(row['chunk'])
        
        new_row = row.copy()
        new_row['language'] = 'it'
        new_row['chunk'] = translated_chunk  

        new_rows.append(new_row)

    new_df = pd.DataFrame(new_rows)
    df = pd.concat([df, new_df], ignore_index=True)
    
    return df

df = translate("csv_chunks.csv")

filtered_df = df[df['language'] == 'it']
filtered_df.to_csv("csv_chunks_it.csv", index=False)

In [7]:
def split_text(text, max_length=4500):
    """Divide un testo in blocchi di lunghezza massima max_length."""
    return [text[i:i+max_length] for i in range(0, len(text), max_length)]

def translate_text(text, source_lang, target_lang):
    """Traduce un testo lungo gestendo il limite di lunghezza."""
    chunks = split_text(text, max_length=4500)
    translated_chunks = [
        GoogleTranslator(source=source_lang, target=target_lang).translate(chunk)
        for chunk in chunks
    ]
    return ''.join(translated_chunks)

def translate_en(csv_chunks):
    df = pd.read_csv(csv_chunks)
    
    filtered_fr = df[df['language'] == 'fr']
    filtered_it = df[df['language'] == 'it']
    
    new_fr_rows = []
    new_it_rows = []
    
    for index, row in filtered_fr.iterrows():
        translated_chunk = translate_text(row['chunk'], source_lang='fr', target_lang='en')
        new_row = row.copy()
        new_row['language'] = 'en'
        new_row['chunk'] = translated_chunk
        new_fr_rows.append(new_row)

    for index, row in filtered_it.iterrows():
        translated_chunk = translate_text(row['chunk'], source_lang='it', target_lang='en')
        new_row = row.copy()
        new_row['language'] = 'en'
        new_row['chunk'] = translated_chunk
        new_it_rows.append(new_row)

    new_df = pd.concat([pd.DataFrame(new_fr_rows), pd.DataFrame(new_it_rows)], ignore_index=True)
    
    return new_df

df = translate_en("csv_chunks.csv")

df.to_csv("csv_chunks_en.csv", index=False)

In [8]:
df = pd.read_csv("csv_chunks_en.csv")

df_filtered = df[df['group'] != 'Presidente']

df_filtered.to_csv("csv_chunks_en_filtered.csv", index=False)

df = pd.read_csv("csv_chunks_it.csv")

df_filtered = df[df['group'] != 'Presidente']

df_filtered.to_csv("csv_chunks_it_filtered.csv", index=False)