In [1]:
from lingua import Language, LanguageDetectorBuilder

Détection de Langue avec Lingua

In [2]:
# Initialisation du détecteur (optimisé pour FR/EN)
detector = LanguageDetectorBuilder.from_languages(
    Language.FRENCH, 
    Language.ENGLISH
).with_preloaded_language_models().build()

In [None]:
def detect_language_lingua(text):
    result = detector.detect_language_of(text)
    return "fr" if result == Language.FRENCH else "en" if result == Language.ENGLISH else "unknown"

In [4]:
print(detect_language_lingua("Webstart with ssl and encrypted password in network.cfg"))

en


In [14]:
print(detect_language_lingua("Correction Erreurs pour CMAM - Préparation de la base de données"))

fr


Pipeline Complet avec Traduction

In [6]:
from transformers import pipeline
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm


In [7]:
# Chargez un modèle de traduction léger
translator = pipeline(
    "translation_en_to_fr", 
    model="Helsinki-NLP/opus-mt-en-fr",
    device=-1  # CPU
)

source.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

Device set to use cpu


In [9]:
def process_text(text):
    lang = detect_language_lingua(text)
    if lang == 'en':
        try:
            return translator(text, max_length=100)[0]['translation_text']
        except:
            return text  # En cas d'erreur, conserve l'original
    return text

In [10]:

# Traitement parallèle des 16k tickets
def translate_corpus(texts, workers=4):
    with ThreadPoolExecutor(max_workers=workers) as executor:
        return list(tqdm(executor.map(process_text, texts), total=len(texts)))


In [11]:
import pandas as pd

# Charger le fichier CSV
df = pd.read_csv('C:/Users/my.kassem/Desktop/exploratory data analysis/df_filtre_status_type_description_resolution_copy.csv')

In [12]:
summaries = df['Summary'].tolist()  # Convertit la colonne en liste



In [19]:
from collections import Counter

def detect_language_with_confidence(text):
    if not isinstance(text, str) or text.strip() == "":
        return "unknown"
    
    result = detector.compute_language_confidence_values(text)
    if result:
        best_match = max(result, key=lambda x: x.value)
        if best_match.value > 0.7:  # Seulement si la confiance est > 70%
            return "fr" if best_match.language == Language.FRENCH else "en" if best_match.language == Language.ENGLISH else "unknown"
    return "unknown"

# Détecter la langue pour chaque Description avec confiance
df['Detected_Language_Desc'] = df['Description'].apply(detect_language_with_confidence)

# Détecter la langue pour chaque Summary avec confiance
df['Detected_Language_Summary'] = df['Summary'].apply(detect_language_with_confidence)

# Compter l'apparition de chaque langue pour Description
language_counts_desc = Counter(df['Detected_Language_Desc'])

# Compter l'apparition de chaque langue pour Summary
language_counts_summary = Counter(df['Detected_Language_Summary'])

# Afficher les statistiques pour Description
print("Répartition des langues dans le champ Description :")
for lang, count in language_counts_desc.items():
    print(f"{lang}: {count} occurrences")

# Afficher les statistiques pour Summary
print("\nRépartition des langues dans le champ Summary :")
for lang, count in language_counts_summary.items():
    print(f"{lang}: {count} occurrences")

# Aperçu des résultats
df[['Description', 'Detected_Language_Desc', 'Summary', 'Detected_Language_Summary']].head(10)


Répartition des langues dans le champ Description :
en: 8988 occurrences
fr: 6626 occurrences
unknown: 367 occurrences

Répartition des langues dans le champ Summary :
fr: 4318 occurrences
en: 9964 occurrences
unknown: 1699 occurrences


Unnamed: 0,Description,Detected_Language_Desc,Summary,Detected_Language_Summary
0,"Bonjour à tous,\r\nNous avons CACEIS qui nous ...",en,CACEIS-PROD-IN-CP - Licence expirée depuis le ...,fr
1,"Due to improvements done with DHRD-100447, th...",en,TST - Manual override / Audit must be retested,en
2,||Version details||Environment infos||\r\n| * ...,en,Can't Start Workers,en
3,||Version details||Environment infos||\r\n| * ...,en,[AWS] S3 services failure with AWS_ROLE channel,en
4,||Version details||Environment infos||\r\n|VER...,en,[Regression] : NPE raised when using the Share...,en
5,When we got a version Mismatch when we try to ...,en,FIL-PROD-IN (IBOR) SaveObjectsImmediately remo...,en
6,basically bake push images as multiarch even t...,en,retention - fix multi-arch image deletion...,unknown
7,Since the url defined in the communication cha...,en,US11 - Constitution of the complete url,en
8,||Version details||Environment infos||\r\n| * ...,en,[nxdh-docker][k8s] Unable to connect to thick ...,en
9,Diego / Islem - Image manquante sur artifactor...,fr,retention - fix images being deleted when they...,en


In [21]:
#1. Vérifier et Nettoyer le Texte
import re

def clean_text(text):
    if not isinstance(text, str):
        return ""
    text = text.replace("\r", "").replace("\n", " ")  # Supprime les retours à la ligne
    text = re.sub(r'\s+', ' ', text).strip()  # Supprime les espaces multiples
    return text

df['Description_Clean'] = df['Description'].apply(clean_text)
df['Summary_Clean'] = df['Summary'].apply(clean_text)


In [22]:
#2. Ajuster le Seuil de Confiance
def detect_language_with_adjusted_confidence(text, threshold=0.6):  # Ajusté à 60%
    if not isinstance(text, str) or text.strip() == "":
        return "unknown"

    result = detector.compute_language_confidence_values(text)
    if result:
        best_match = max(result, key=lambda x: x.value)
        if best_match.value > threshold:
            return "fr" if best_match.language == Language.FRENCH else "en" if best_match.language == Language.ENGLISH else "unknown"
    return "unknown"


In [23]:
def detect_language_with_mean_confidence(text):
    if not isinstance(text, str) or text.strip() == "":
        return "unknown"

    confidence_values = detector.compute_language_confidence_values(text)
    if not confidence_values:
        return "unknown"

    lang_conf = {str(l.language): l.value for l in confidence_values}
    fr_conf = lang_conf.get("Language.FRENCH", 0)
    en_conf = lang_conf.get("Language.ENGLISH", 0)

    # Comparaison en utilisant la moyenne
    if fr_conf >= en_conf and fr_conf > 0.6:
        return "fr"
    elif en_conf > fr_conf and en_conf > 0.6:
        return "en"
    return "unknown"


In [26]:
# Nettoyer les textes
df['Description_Clean'] = df['Description'].apply(clean_text)
df['Summary_Clean'] = df['Summary'].apply(clean_text)

# Détection améliorée des langues
df['Detected_Language_Desc'] = df['Description_Clean'].apply(detect_language_with_mean_confidence)
df['Detected_Language_Summary'] = df['Summary_Clean'].apply(detect_language_with_mean_confidence)

# Statistiques finales
from collections import Counter
language_counts_desc = Counter(df['Detected_Language_Desc'])
language_counts_summary = Counter(df['Detected_Language_Summary'])

# Affichage des statistiques
print("Répartition des langues dans le champ Description :")
for lang, count in language_counts_desc.items():
    print(f"{lang}: {count} occurrences")

print("\nRépartition des langues dans le champ Summary :")
for lang, count in language_counts_summary.items():
    print(f"{lang}: {count} occurrences")

# Aperçu des résultats avec texte nettoyé
df[['Description', 'Description_Clean', 'Detected_Language_Desc', 
    'Summary', 'Summary_Clean', 'Detected_Language_Summary']].head(10)


Répartition des langues dans le champ Description :
en: 9101 occurrences
fr: 6697 occurrences
unknown: 183 occurrences

Répartition des langues dans le champ Summary :
fr: 4696 occurrences
en: 10489 occurrences
unknown: 796 occurrences


Unnamed: 0,Description,Description_Clean,Detected_Language_Desc,Summary,Summary_Clean,Detected_Language_Summary
0,"Bonjour à tous,\r\nNous avons CACEIS qui nous ...","Bonjour à tous, Nous avons CACEIS qui nous rem...",en,CACEIS-PROD-IN-CP - Licence expirée depuis le ...,CACEIS-PROD-IN-CP - Licence expirée depuis le ...,fr
1,"Due to improvements done with DHRD-100447, th...","Due to improvements done with DHRD-100447, the...",en,TST - Manual override / Audit must be retested,TST - Manual override / Audit must be retested,en
2,||Version details||Environment infos||\r\n| * ...,||Version details||Environment infos|| | * *VE...,en,Can't Start Workers,Can't Start Workers,en
3,||Version details||Environment infos||\r\n| * ...,||Version details||Environment infos|| | * *VE...,en,[AWS] S3 services failure with AWS_ROLE channel,[AWS] S3 services failure with AWS_ROLE channel,en
4,||Version details||Environment infos||\r\n|VER...,||Version details||Environment infos|| |VERSIO...,en,[Regression] : NPE raised when using the Share...,[Regression] : NPE raised when using the Share...,en
5,When we got a version Mismatch when we try to ...,When we got a version Mismatch when we try to ...,en,FIL-PROD-IN (IBOR) SaveObjectsImmediately remo...,FIL-PROD-IN (IBOR) SaveObjectsImmediately remo...,en
6,basically bake push images as multiarch even t...,basically bake push images as multiarch even t...,en,retention - fix multi-arch image deletion...,retention - fix multi-arch image deletion...,en
7,Since the url defined in the communication cha...,Since the url defined in the communication cha...,en,US11 - Constitution of the complete url,US11 - Constitution of the complete url,en
8,||Version details||Environment infos||\r\n| * ...,||Version details||Environment infos|| | * *VE...,en,[nxdh-docker][k8s] Unable to connect to thick ...,[nxdh-docker][k8s] Unable to connect to thick ...,en
9,Diego / Islem - Image manquante sur artifactor...,Diego / Islem - Image manquante sur artifactor...,fr,retention - fix images being deleted when they...,retention - fix images being deleted when they...,en


In [30]:
# Fonction de traduction (en français -> anglais)
def translate_text(text):
    if text:
        try:
            return translator(text, max_length=100)[0]['translation_text']
        except Exception as e:
            print(f"Erreur de traduction: {e}")
            return text
    return text

In [31]:
# Appliquer la détection de la langue sur la colonne Summary et traduire uniquement si la langue est le français
def translate_summary(df):
    # Détecter la langue pour chaque ligne de Summary
    df['Detected_Language_Summary'] = df['Summary_Clean'].apply(detect_language_with_mean_confidence)
    
    # Traduire uniquement si la langue détectée est le français
    df['Translated_Summary'] = df.apply(
        lambda row: translate_text(row['Summary_Clean']) if row['Detected_Language_Summary'] == 'fr' else row['Summary_Clean'], axis=1
    )
    
    return df[['Summary', 'Detected_Language_Summary', 'Translated_Summary']]

In [33]:
# Filtrer les lignes où la langue détectée dans Summary est 'fr' (français)
french_summary_df = df[df['Detected_Language_Summary'] == 'fr']

# Afficher les 30 premières lignes du DataFrame filtré
print(french_summary_df[['Summary', 'Detected_Language_Summary']].head(30))


                                               Summary  \
0    CACEIS-PROD-IN-CP - Licence expirée depuis le ...   
15   Problème d'accès au client lourd DH à partir d...   
19                         [TBF] Import csv simulation   
46                                    Json file Import   
47                              [TBF] Json file Export   
53   DEKABANK-SIT-RQ-DH- Impossible de lancer visag...   
61   SGSS-SIT-PB-  [Migration V6] Problème de perf ...   
109  AMUNDI-PROD-PB KYC - Problème de rafraichissem...   
112  LBP-PROD-RQ- Demande d'archives sans  le binai...   
115        Besoin de la v. 6.5.1.1 dockerisée resigné    
119  CNP-SIT-PB- problème connexion ldap suite Migr...   
139  Régression FeedProcs doublons dans NXDH6.5 . A...   
157  Problèmes avec le packageInstallerService dans...   
177                   [BPSS/Manaos]Problème de licence   
181  LODH-SIT-PB- Gestion de la déconnexion de la b...   
185                        Client lourd ne s'ouvre pas   
204  Problème 

In [39]:
# Traduire les lignes filtrées où la langue est détectée comme français
french_summary_texts = french_summary_df['Summary'].tolist()

# Appliquer la traduction
translated_texts = translate_corpus(french_summary_texts)

# Ajouter les traductions dans une nouvelle colonne 'Translated_Summary'
french_summary_df['Translated_Summary'] = translated_texts

# Afficher les résultats
print(french_summary_df[['Summary', 'Translated_Summary']].head(30))


100%|██████████| 4696/4696 [00:00<00:00, 245224.75it/s]

                                               Summary  \
0    CACEIS-PROD-IN-CP - Licence expirée depuis le ...   
15   Problème d'accès au client lourd DH à partir d...   
19                         [TBF] Import csv simulation   
46                                    Json file Import   
47                              [TBF] Json file Export   
53   DEKABANK-SIT-RQ-DH- Impossible de lancer visag...   
61   SGSS-SIT-PB-  [Migration V6] Problème de perf ...   
109  AMUNDI-PROD-PB KYC - Problème de rafraichissem...   
112  LBP-PROD-RQ- Demande d'archives sans  le binai...   
115        Besoin de la v. 6.5.1.1 dockerisée resigné    
119  CNP-SIT-PB- problème connexion ldap suite Migr...   
139  Régression FeedProcs doublons dans NXDH6.5 . A...   
157  Problèmes avec le packageInstallerService dans...   
177                   [BPSS/Manaos]Problème de licence   
181  LODH-SIT-PB- Gestion de la déconnexion de la b...   
185                        Client lourd ne s'ouvre pas   
204  Problème 


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  french_summary_df['Translated_Summary'] = translated_texts
