# Nuages de mots

In [None]:
from collections import Counter
from wordcloud import WordCloud
import os
import nltk
from nltk.corpus import stopwords
from IPython.display import Image, display

# Download the NLTK stopword corpus if it is not already available locally
nltk.download('stopwords')

# Start from NLTK's French stopwords and extend them with corpus-specific
# noise: frequent function words, adverbs, and short fragments that look
# like OCR artefacts (e.g. "aveo", "ruo").
# NOTE: every entry must be followed by a comma. Adjacent string literals are
# silently concatenated by Python, which previously merged "hui" and
# "souvent" into the useless entry "huisouvent".
sw = stopwords.words("french")
sw += [
    "les", "plus", "cette", "fait", "faire", "être", "deux", "comme", "dont",
    "tout", "ils", "bien", "sans", "peut", "tous", "après", "ainsi", "donc",
    "cet", "sous", "celle", "entre", "encore", "toutes", "pendant", "moins",
    "dire", "cela", "non", "faut", "trois", "aussi", "dit", "avoir", "doit",
    "contre", "depuis", "autres", "van", "het", "autre", "jusqu", "ville",
    "rossel", "dem", "etc", "mod", "bel", "ruo", "adr", "ecr", "aveo", "app",
    "gar", "bx", "dos", "wai", "pet", "dés", "fer", "sal", "com", "quelques",
    "rien", "dès", "première", "puis", "chef", "cas", "car", "cinq", "sujet",
    "pris", "hier", "fin", "elles", "effet", "declare", "ici", "voici",
    "celui", "trop", "doivent", "suite", "matin", "soir", "tres", "chez",
    "bon", "fois", "lieu", "quatre", "jours", "demande", "beaucoup", "dejà",
    "bureau", "paix", "tant", "maintenat", "aucune", "grands", "avant",
    "point", "grandes", "parce", "prendre", "alors", "heure", "vient", "afin",
    "vers", "mis", "mardi", "pourrait", "services",
    # "hui" is the token left over once "aujourd'hui" is split on the apostrophe
    "hui",
    "souvent", "peut-être", "vraiment", "peu", "très", "toute", "plusieurs",
    "certains", "certaine", "certaines", "divers", "diverse", "assez",
    "tellement", "aujourd'hui", "demain", "jamais", "parfois", "rarement",
    "quelquefois", "actuellement", "déjà", "enfin", "ensuite", "envers",
    "nouveau", "nouvelle", "premier", "grand", "grande", "même", "toujours",
    "seulement", "simplement", "probablement", "particulièrement",
    "notamment", "surtout",
]

# A set gives O(1) membership tests during token filtering and drops any
# entry that duplicates one already present in NLTK's list
sw = set(sw)



# Text-cleaning helper
def clean_text(year, folder=None):
    """Filter the raw text of one year and save the surviving words.

    Reads ``{year}.txt``, tokenizes it with ``nltk.wordpunct_tokenize`` and
    keeps the tokens that are longer than two characters, purely alphabetic,
    and whose lowercase form is not in the module-level stopword set ``sw``.
    Kept tokens are upper-cased, joined with single spaces and written to
    ``{year}_clean.txt``.

    Args:
        year: Value used to build the input and output file names.
        folder: Directory holding both files; when ``None`` the bare file
            names are used, i.e. paths relative to the working directory.

    Returns:
        A status message naming the output path.
    """
    source_name = f"{year}.txt"
    target_name = f"{year}_clean.txt"
    if folder is not None:
        source_name = os.path.join(folder, source_name)
        target_name = os.path.join(folder, target_name)

    with open(source_name, 'r', encoding='utf-8') as src:
        raw = src.read()

    survivors = []
    for token in nltk.wordpunct_tokenize(raw):
        # Discard short tokens and anything containing digits or punctuation
        if len(token) <= 2 or not token.isalpha():
            continue
        # Stopwords are stored in lowercase, so compare case-insensitively
        if token.lower() in sw:
            continue
        survivors.append(token.upper())

    with open(target_name, "w", encoding='utf-8') as dst:
        dst.write(" ".join(survivors))

    return f'Output has been written in {target_name}!'

# Years to process: 1945 through 1955 inclusive (range excludes its upper bound)
years = range(1945, 1956)

# Directory holding the source text files, and a scratch directory for the
# per-year intermediate files and images
txt_path = '../data/txteconomique'
temp_path = '../data/tmp'
# makedirs also creates missing parent directories, and exist_ok=True avoids
# the race between checking for the directory and creating it
os.makedirs(temp_path, exist_ok=True)

# For each year: merge its source files, clean the text, then render and
# display a word cloud
for year in years:
    # Select the regular files in txt_path whose name contains the year
    txts = [f for f in os.listdir(txt_path) if os.path.isfile(os.path.join(txt_path, f)) and str(year) in f]

    # Load the content of each selected file
    content_list = []
    for txt in txts:
        with open(os.path.join(txt_path, txt), 'r', encoding='utf-8') as file:
            content_list.append(file.read())

    # Write the merged content to a per-year temporary file
    with open(os.path.join(temp_path, f'{year}.txt'), 'w', encoding='utf-8') as file:
        file.write(' '.join(content_list))

    # Clean the text (writes {year}_clean.txt next to the merged file)
    clean_text(year, folder=temp_path)

    # clean_text only returns a status message, so read the cleaned text
    # back from disk
    with open(os.path.join(temp_path, f'{year}_clean.txt'), 'r', encoding='utf-8') as file:
        after = file.read()

    # Count word occurrences
    frequencies = Counter(after.split())
    if not frequencies:
        # generate_from_frequencies raises ValueError when given no words;
        # skip this year instead of aborting the remaining ones
        print(f'No words left for {year} after cleaning; skipping word cloud.')
        continue

    # Render the word cloud and save it as an image
    cloud = WordCloud(width=2000, height=1000, background_color='white').generate_from_frequencies(frequencies)
    image_path = os.path.join(temp_path, f"{year}.png")
    cloud.to_file(image_path)

    # Show the word cloud in the notebook
    display(Image(filename=image_path))
