# **Traitement_données**

In [1]:
import re
import math
import collections
import os
import docx
import pandas as pd
from nltk.corpus import stopwords
from collections import Counter
import joblib

In [2]:
def read_documents(folder_path, language):
    texts = []
    for filename in os.listdir(folder_path):
        if filename.endswith(".docx"):
            doc_path = os.path.join(folder_path, filename)
            doc = docx.Document(doc_path)
            text = " ".join([para.text for para in doc.paragraphs])
            texts.append((filename, text, language))  
    return texts

In [3]:
def clean_text(text):
    text = text.lower()  # Normalisation en minuscules
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Suppression des chiffres et de la ponctuation
    text = text.strip()
    return text

In [4]:
def letter_frequencies(text):
    text = text.replace(" ", "")
    total = len(text)
    if total == 0:
        return {letter: 0 for letter in "abcdefghijklmnopqrstuvwxyz"}  
    letter_counts = Counter(text)
    return {letter: count / total for letter, count in letter_counts.items()}

In [5]:
def entropy(text):
    freqs = letter_frequencies(text)  
    return -sum(freq * math.log2(freq) for freq in freqs.values() if freq > 0)

In [6]:
def count_stopwords(text, stopwords_list):
    words = text.split()
    stopword_counts = {stopword: 0 for stopword in stopwords_list}  
    for word in words:
        if word in stopwords_list:
            stopword_counts[word] += 1  
    return stopword_counts

In [7]:
def analyze_documents(texts, stopwords_english, stopwords_french, stopwords_italian, stopwords_spanish):
    results = []
    for filename, text, language in texts:  
        cleaned_text = clean_text(text)
        ent = entropy(cleaned_text)
        freqs = letter_frequencies(cleaned_text)

        # Compute stopword frequencies
        english_stopwords_count = count_stopwords(cleaned_text, stopwords_english)
        french_stopwords_count = count_stopwords(cleaned_text, stopwords_french)
        italian_stopwords_count = count_stopwords(cleaned_text, stopwords_italian)
        spanish_stopwords_count = count_stopwords(cleaned_text, stopwords_spanish)

        # Store results
        result = {
            "Filename": filename,
            "Language": language,  
            "Entropy": ent
        }

        # Add letter frequencies
        result.update({letter: freqs.get(letter, 0) for letter in "abcdefghijklmnopqrstuvwxyz"})

        # Add stopword frequencies
        result.update({"English_Stopwords_" + word: english_stopwords_count.get(word, 0) for word in stopwords_english})
        result.update({"French_Stopwords_" + word: french_stopwords_count.get(word, 0) for word in stopwords_french})
        result.update({"italian_Stopwords_" + word: italian_stopwords_count.get(word, 0) for word in stopwords_italian})
        result.update({"spanish_Stopwords_" + word: spanish_stopwords_count.get(word, 0) for word in stopwords_spanish})
        

        results.append(result)
    return results

In [8]:
# Liste des stopwords
stopwords_english = [
    "and", "are", "be", "but",
    "for", "from", "have", "in", "is", "it", "no", 
    "not", "of", "that", "the","this", "to", "with"
]

stopwords_french = [
    "au", "avec", "ce", "de", "des", "elle", 
    "et", "est", "il", "je", "la", "le", "les", 
    "mais", "ou", "par", "pas", "plus", "pour", "que", "qui", "si", 
    "sur", "un", "une"
]

stopwords_italian =  [
    "e", "di", "il", "la", "lo", "i", "gli", "le", 
    "che", "in", "vi", "si", "per", "ma", 
    "con", "un", "una", "su", "non", "questo", "quello"
]

stopwords_spanish = [
    "y", "de", "la", "las", "el", "los", "que", "a", "en", "no",
    "un", "unos", "es", "con", "por", "para", "del", "se", "lo",
    "pero", "como", "su", "al", "me", "le", "tener", "sin"
]


english_texts = read_documents(r"C:\Users\Lenovo\Desktop\projet theorie info\Train\data\data-ang","english")
french_texts = read_documents(r"C:\Users\Lenovo\Desktop\projet theorie info\Train\data\data-fr","french")
italian_texts = read_documents(r"C:\Users\Lenovo\Desktop\projet theorie info\Train\data\data-ital", "italian")
spanish_texts = read_documents(r"C:\Users\Lenovo\Desktop\projet theorie info\Train\data\data-espa", "spanish")

data = (analyze_documents(english_texts, stopwords_english, stopwords_french, stopwords_italian, stopwords_spanish) +
    analyze_documents(french_texts, stopwords_english, stopwords_french, stopwords_italian, stopwords_spanish) +
    analyze_documents(italian_texts, stopwords_english, stopwords_french, stopwords_italian, stopwords_spanish) +
    analyze_documents(spanish_texts, stopwords_english, stopwords_french, stopwords_italian, stopwords_spanish))

df = pd.DataFrame(data)
df

Unnamed: 0,Filename,Language,Entropy,a,b,c,d,e,f,g,...,spanish_Stopwords_se,spanish_Stopwords_lo,spanish_Stopwords_pero,spanish_Stopwords_como,spanish_Stopwords_su,spanish_Stopwords_al,spanish_Stopwords_me,spanish_Stopwords_le,spanish_Stopwords_tener,spanish_Stopwords_sin
0,001.docx,english,4.206203,0.081529,0.016306,0.027551,0.030925,0.122294,0.020242,0.026427,...,0,0,0,0,0,0,0,0,0,0
1,002.docx,english,4.153032,0.085165,0.018773,0.036630,0.036630,0.131868,0.012821,0.017399,...,0,0,0,0,0,0,0,0,0,0
2,003.docx,english,4.183186,0.078103,0.016770,0.034020,0.036895,0.120747,0.020125,0.020604,...,0,0,0,0,0,0,0,0,0,0
3,004.docx,english,4.146091,0.093236,0.014625,0.025594,0.047166,0.145521,0.025229,0.017550,...,0,0,0,0,0,0,0,0,0,0
4,005.docx,english,4.125936,0.098566,0.011769,0.032733,0.049651,0.142700,0.020596,0.018757,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,spn47.docx,spanish,3.998068,0.142857,0.020344,0.037975,0.067360,0.117089,0.005877,0.009946,...,7,1,0,1,4,3,0,1,0,1
196,spn49.docx,spanish,4.037988,0.125159,0.019092,0.042003,0.056003,0.119219,0.005091,0.014425,...,8,2,1,4,5,2,0,0,0,5
197,spn5.docx,spanish,4.021242,0.125166,0.014153,0.041575,0.060593,0.118532,0.003981,0.009730,...,7,0,0,0,6,2,0,2,0,2
198,spn7.docx,spanish,4.041274,0.130310,0.015274,0.041050,0.049642,0.117422,0.006205,0.010979,...,5,9,2,3,7,4,0,7,0,0


In [9]:
import pandas as pd

df["Language"] = df["Language"].map({"english": 1, "french": 0,"italian":2,"spanish":3})
df

Unnamed: 0,Filename,Language,Entropy,a,b,c,d,e,f,g,...,spanish_Stopwords_se,spanish_Stopwords_lo,spanish_Stopwords_pero,spanish_Stopwords_como,spanish_Stopwords_su,spanish_Stopwords_al,spanish_Stopwords_me,spanish_Stopwords_le,spanish_Stopwords_tener,spanish_Stopwords_sin
0,001.docx,1,4.206203,0.081529,0.016306,0.027551,0.030925,0.122294,0.020242,0.026427,...,0,0,0,0,0,0,0,0,0,0
1,002.docx,1,4.153032,0.085165,0.018773,0.036630,0.036630,0.131868,0.012821,0.017399,...,0,0,0,0,0,0,0,0,0,0
2,003.docx,1,4.183186,0.078103,0.016770,0.034020,0.036895,0.120747,0.020125,0.020604,...,0,0,0,0,0,0,0,0,0,0
3,004.docx,1,4.146091,0.093236,0.014625,0.025594,0.047166,0.145521,0.025229,0.017550,...,0,0,0,0,0,0,0,0,0,0
4,005.docx,1,4.125936,0.098566,0.011769,0.032733,0.049651,0.142700,0.020596,0.018757,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,spn47.docx,3,3.998068,0.142857,0.020344,0.037975,0.067360,0.117089,0.005877,0.009946,...,7,1,0,1,4,3,0,1,0,1
196,spn49.docx,3,4.037988,0.125159,0.019092,0.042003,0.056003,0.119219,0.005091,0.014425,...,8,2,1,4,5,2,0,0,0,5
197,spn5.docx,3,4.021242,0.125166,0.014153,0.041575,0.060593,0.118532,0.003981,0.009730,...,7,0,0,0,6,2,0,2,0,2
198,spn7.docx,3,4.041274,0.130310,0.015274,0.041050,0.049642,0.117422,0.006205,0.010979,...,5,9,2,3,7,4,0,7,0,0


# **Approche_Entropy**

In [11]:
import math
import os
import re
from collections import Counter
from docx import Document

def load_text_from_docx(file_path):
    """Charger le texte depuis un fichier Word."""
    doc = Document(file_path)
    full_text = [para.text for para in doc.paragraphs]
    return ' '.join(full_text)

def clean_text(text):
    """Normaliser le texte en minuscules et supprimer chiffres/ponctuation."""
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return text.strip()

def calculate_entropy(text):
    """Calculer l'entropie d'un texte selon la formule de Shannon."""
    text = clean_text(text)
    char_counts = Counter(text)
    total_chars = len(text)
    if total_chars == 0:
        return 0
    probabilities = [count / total_chars for count in char_counts.values()]
    return -sum(p * math.log2(p) for p in probabilities if p > 0)

def compute_average_entropy(folder_path):
    """
    Parcourt tous les fichiers DOCX dans le dossier,
    calcule l'entropie de chaque document et retourne l'entropie moyenne.
    """
    entropies = []
    for filename in os.listdir(folder_path):
        if filename.endswith(".docx"):
            file_path = os.path.join(folder_path, filename)
            text = load_text_from_docx(file_path)
            entropies.append(calculate_entropy(text))
    if len(entropies) == 0:
        return 0
    return sum(entropies) / len(entropies)

# --- Calcul des entropies moyennes pour chaque langue ---
folder_path_en = r"C:\Users\Lenovo\Desktop\projet theorie info\Train\data\data-ang"
folder_path_fr = r"C:\Users\Lenovo\Desktop\projet theorie info\Train\data\data-fr"
folder_path_ital = r"C:\Users\Lenovo\Desktop\projet theorie info\Train\data\data-ital"
folder_path_spa = r"C:\Users\Lenovo\Desktop\projet theorie info\Train\data\data-espa"


average_entropy_en = compute_average_entropy(folder_path_en)
average_entropy_fr = compute_average_entropy(folder_path_fr)
average_entropy_ital = compute_average_entropy(folder_path_ital)
average_entropy_spa = compute_average_entropy(folder_path_spa)

print("Entropie moyenne (Anglais) :", average_entropy_en)
print("Entropie moyenne (Français) :", average_entropy_fr)
print("Entropie moyenne (Italian) :", average_entropy_ital)
print("Entropie moyenne (Spanish) :", average_entropy_spa)

def predict_language(document_path, avg_entropies):

    text = load_text_from_docx(document_path)
    doc_entropy = calculate_entropy(text)
    # On choisit la langue dont l'entropie moyenne est la plus proche de celle du document
    predicted_lang = min(avg_entropies, key=lambda lang: abs(avg_entropies[lang] - doc_entropy))
    return predicted_lang

# Dictionnaire des entropies moyennes par langue
avg_entropies = {
    'en': average_entropy_en,
    'fr': average_entropy_fr,
    'ital': average_entropy_ital,
    'espa': average_entropy_spa
}

# --- Prédiction sur un nouveau document ---
test_document_path = r"C:\Users\Lenovo\Desktop\projet theorie info\Train\data\data-ang\003.docx"  

predicted_language = predict_language(test_document_path, avg_entropies)
print(f"La langue prédite pour le document est : {predicted_language}")

Entropie moyenne (Anglais) : 4.118724018101505
Entropie moyenne (Français) : 4.005512462639387
Entropie moyenne (Italian) : 3.9455749374147286
Entropie moyenne (Spanish) : 3.9688806221404334
La langue prédite pour le document est : en


# **Approche_ML**

In [12]:
columns_to_normalize = ['a', 'b', 'c', 'd', 'e', 'f', 'g', "h","i","j","k","l","m","n","o","p","q","r","s","t","u","v","w","x","y","z",
     "English_Stopwords_and" ,"English_Stopwords_are", "English_Stopwords_be", "English_Stopwords_but","English_Stopwords_for","English_Stopwords_from",
    "English_Stopwords_have","English_Stopwords_in", "English_Stopwords_is", "English_Stopwords_it","English_Stopwords_no","English_Stopwords_not",
    "English_Stopwords_of", "English_Stopwords_that", "English_Stopwords_the", "English_Stopwords_this","English_Stopwords_to","English_Stopwords_with", 
    "French_Stopwords_au","French_Stopwords_avec","French_Stopwords_ce","French_Stopwords_de", "French_Stopwords_des", "French_Stopwords_elle",
    "French_Stopwords_et","French_Stopwords_est","French_Stopwords_il","French_Stopwords_je", "French_Stopwords_la", "French_Stopwords_le", "French_Stopwords_les",
    "French_Stopwords_mais", "French_Stopwords_ou","French_Stopwords_par","French_Stopwords_pas",  "French_Stopwords_plus", "French_Stopwords_pour",
    "French_Stopwords_que","French_Stopwords_qui", "French_Stopwords_si", "French_Stopwords_sur", "French_Stopwords_un", "French_Stopwords_une", "italian_Stopwords_e", "italian_Stopwords_di", 
    "italian_Stopwords_il",  "italian_Stopwords_la", "italian_Stopwords_lo", "italian_Stopwords_i", "italian_Stopwords_gli", "italian_Stopwords_le", "italian_Stopwords_che", 
    "italian_Stopwords_in",  "italian_Stopwords_vi", "italian_Stopwords_si", "italian_Stopwords_per", "italian_Stopwords_ma", 
    "italian_Stopwords_con", "italian_Stopwords_un", "italian_Stopwords_una", "italian_Stopwords_su", "italian_Stopwords_non", "italian_Stopwords_questo", 
    "italian_Stopwords_quello","spanish_Stopwords_y", "spanish_Stopwords_de", "spanish_Stopwords_la", "spanish_Stopwords_las",
    "spanish_Stopwords_el", "spanish_Stopwords_los", "spanish_Stopwords_que", "spanish_Stopwords_a",
    "spanish_Stopwords_en", "spanish_Stopwords_no", "spanish_Stopwords_un", "spanish_Stopwords_unos",
    "spanish_Stopwords_es", "spanish_Stopwords_con", "spanish_Stopwords_por", "spanish_Stopwords_para",
    "spanish_Stopwords_del", "spanish_Stopwords_se", "spanish_Stopwords_lo", "spanish_Stopwords_pero",
    "spanish_Stopwords_como", "spanish_Stopwords_su", "spanish_Stopwords_al", "spanish_Stopwords_me",
    "spanish_Stopwords_le", "spanish_Stopwords_tener", "spanish_Stopwords_sin","Entropy"
]

# Normalize each column by dividing by its total sum
for col in columns_to_normalize:
    df[col] = df[col] / df[col].sum()

df

Unnamed: 0,Filename,Language,Entropy,a,b,c,d,e,f,g,...,spanish_Stopwords_se,spanish_Stopwords_lo,spanish_Stopwords_pero,spanish_Stopwords_como,spanish_Stopwords_su,spanish_Stopwords_al,spanish_Stopwords_me,spanish_Stopwords_le,spanish_Stopwords_tener,spanish_Stopwords_sin
0,001.docx,1,0.005198,0.004080,0.006471,0.003456,0.003526,0.004673,0.007906,0.008159,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000
1,002.docx,1,0.005132,0.004262,0.007450,0.004594,0.004176,0.005039,0.005007,0.005372,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000
2,003.docx,1,0.005170,0.003909,0.006655,0.004267,0.004206,0.004614,0.007860,0.006361,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000
3,004.docx,1,0.005124,0.004666,0.005804,0.003210,0.005377,0.005561,0.009854,0.005418,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000
4,005.docx,1,0.005099,0.004933,0.004671,0.004106,0.005660,0.005453,0.008044,0.005791,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,spn47.docx,3,0.004941,0.007149,0.008074,0.004763,0.007679,0.004474,0.002295,0.003071,...,0.011667,0.004149,0.000000,0.010638,0.012048,0.008287,0.0,0.000785,0.0,0.008929
196,spn49.docx,3,0.004990,0.006263,0.007577,0.005268,0.006384,0.004556,0.001989,0.004453,...,0.013333,0.008299,0.008850,0.042553,0.015060,0.005525,0.0,0.000000,0.0,0.044643
197,spn5.docx,3,0.004970,0.006264,0.005617,0.005215,0.006908,0.004529,0.001555,0.003004,...,0.011667,0.000000,0.000000,0.000000,0.018072,0.005525,0.0,0.001570,0.0,0.017857
198,spn7.docx,3,0.004994,0.006521,0.006062,0.005149,0.005659,0.004487,0.002424,0.003389,...,0.008333,0.037344,0.017699,0.031915,0.021084,0.011050,0.0,0.005495,0.0,0.000000


In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

X = df.drop(columns=["Filename", "Language"])
y = df["Language"]


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

In [14]:
model1 = LogisticRegression(max_iter=1000, random_state=42)
model1.fit(X_train, y_train)


y_pred = model1.predict(X_test)

# Évaluation du modèle
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Précision :", accuracy)
print("Rapport de classification :\n", report)

Précision : 0.9833333333333333
Rapport de classification :
               precision    recall  f1-score   support

           0       1.00      0.93      0.97        15
           1       1.00      1.00      1.00        15
           2       0.94      1.00      0.97        15
           3       1.00      1.00      1.00        15

    accuracy                           0.98        60
   macro avg       0.98      0.98      0.98        60
weighted avg       0.98      0.98      0.98        60



In [15]:
y_pred

array([0, 0, 3, 3, 3, 2, 0, 3, 3, 1, 1, 2, 0, 1, 0, 1, 2, 0, 2, 2, 3, 0,
       1, 2, 0, 0, 0, 0, 1, 1, 2, 2, 1, 0, 2, 1, 3, 3, 2, 3, 1, 1, 0, 3,
       1, 2, 1, 2, 3, 2, 3, 3, 1, 1, 2, 2, 2, 0, 3, 3], dtype=int64)

In [16]:
joblib.dump(model1, "logistic_regression_model.pkl")
print("Modèle sauvegardé avec succès !")

Modèle sauvegardé avec succès !


In [17]:
from sklearn.svm import SVC

In [18]:
# Création et entraînement du modèle SVM
model3 = SVC(random_state=42)
model3.fit(X_train, y_train)

y_pred = model3.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Précision :", accuracy)
print("Rapport de classification :\n", report)

Précision : 0.9833333333333333
Rapport de classification :
               precision    recall  f1-score   support

           0       0.94      1.00      0.97        15
           1       1.00      1.00      1.00        15
           2       1.00      1.00      1.00        15
           3       1.00      0.93      0.97        15

    accuracy                           0.98        60
   macro avg       0.98      0.98      0.98        60
weighted avg       0.98      0.98      0.98        60



In [19]:
joblib.dump(model3, "svm_model.pkl")
print("Modèle SVM sauvegardé avec succès !")

Modèle SVM sauvegardé avec succès !


In [20]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

model4 = RandomForestClassifier(random_state=42)

# Fit the model on the training data
model4.fit(X_train, y_train)

# Make predictions on the test data
y_pred_rf = model4.predict(X_test)

accuracy_rf = accuracy_score(y_test, y_pred_rf)
report_rf = classification_report(y_test, y_pred_rf)

print("Précision du Random Forest :", accuracy_rf)
print("Rapport de classification du Random Forest:\n", report_rf)

Précision du Random Forest : 1.0
Rapport de classification du Random Forest:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        15
           1       1.00      1.00      1.00        15
           2       1.00      1.00      1.00        15
           3       1.00      1.00      1.00        15

    accuracy                           1.00        60
   macro avg       1.00      1.00      1.00        60
weighted avg       1.00      1.00      1.00        60



In [21]:
joblib.dump(model4, "random_forest_model.pkl")
print("Modèle Random Forest sauvegardé avec succès !")

Modèle Random Forest sauvegardé avec succès !
