In [1]:
# Importer les bibliothèques nécessaires
from lingua import Language, LanguageDetectorBuilder
from transformers import pipeline
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import pandas as pd
import re


In [2]:
# Build the language detector, restricted to French and English,
# with its language models preloaded.
detector = (
    LanguageDetectorBuilder
    .from_languages(Language.FRENCH, Language.ENGLISH)
    .with_preloaded_language_models()
    .build()
)

In [3]:
def detect_language_lingua(text):
    """Return a short language code for ``text`` using the module-level ``detector``.

    Returns:
        "fr" when the detector reports French, "en" when it reports English,
        and "unknown" for any other detection result.
    """
    detected = detector.detect_language_of(text)
    if detected == Language.FRENCH:
        return "fr"
    if detected == Language.ENGLISH:
        return "en"
    return "unknown"

In [4]:
# Load a lightweight English -> French translation model (Helsinki-NLP)
translator = pipeline(
    task="translation_en_to_fr",
    model="Helsinki-NLP/opus-mt-en-fr",
    device=-1,  # run on CPU
)


Device set to use cpu


In [5]:
# Text-cleaning helper
def clean_text(text):
    """Normalise whitespace in ``text``.

    Carriage returns are removed, newlines become spaces, every run of
    whitespace is collapsed to a single space and the result is stripped.
    Any non-string input (e.g. NaN from pandas) yields an empty string.
    """
    if isinstance(text, str):
        # Drop "\r" first so Windows "\r\n" line endings become a single space.
        single_line = text.replace("\r", "")
        single_line = single_line.replace("\n", " ")
        collapsed = re.sub(r'\s+', ' ', single_line)
        return collapsed.strip()
    return ""

In [28]:
# Translation function
def translate_corpus(texts, workers=4):
    """Translate the English entries of ``texts`` with the module-level ``translator``.

    Each text is language-detected with ``detect_language_lingua``; only texts
    detected as 'en' are sent to ``translator``. Every other entry is returned
    unchanged, so the output has the same length and order as the input.

    Args:
        texts: Iterable of texts. Non-string or blank entries are passed
            through untouched instead of being language-detected.
        workers: Number of threads used to process the texts.

    Returns:
        A list with one item per input text: the translation for English
        texts, or the original entry otherwise (including when translation
        fails).
    """
    import logging

    logger = logging.getLogger(__name__)

    def process_text(text):
        # Entries such as NaN or "" cannot be detected or translated; an
        # exception raised here would abort the whole executor.map().
        if not isinstance(text, str) or not text.strip():
            return text
        if detect_language_lingua(text) != 'en':
            return text
        try:
            return translator(text, max_length=512)[0]['translation_text']
        except Exception as exc:
            # Best effort: keep the original text, but do not hide the failure.
            # ``Exception`` (not a bare except) lets KeyboardInterrupt/SystemExit through.
            logger.warning("Translation failed, keeping original text: %s", exc)
            return text

    # Materialise once so any iterable is accepted and tqdm gets a total.
    items = list(texts)
    with ThreadPoolExecutor(max_workers=workers) as executor:
        return list(tqdm(executor.map(process_text, items), total=len(items)))

In [7]:
# Load the CSV file into `df`.
# NOTE(review): this is an absolute, machine-specific Windows path, so the cell
# only runs where that exact file exists; consider a configurable data directory.
df = pd.read_csv('C:/Users/my.kassem/Desktop/exploratory data analysis/df_filtre_status_type_description_resolution_copy.csv')

In [8]:
# Display the loaded frame (the rendered output shows only its first and last rows).
df

Unnamed: 0,Key,Type,Summary,Description,Status,Resolution,Resolution Date,Release Note,Priority,Created,...,Components,Fix Versions,Parent Issue,Watchers,Issue Links,Subtasks,Comments,Module-Feature,Module,Feature
0,DHRD-103268,Incident,CACEIS-PROD-IN-CP - Licence expirée depuis le ...,"Bonjour à tous,\r\nNous avons CACEIS qui nous ...",Resolved,Fixed,2025-01-23T10:45:39.000+0100,,Blocker,2025-01-22T13:49:58.000+0100,...,Oth/other,,,1,,,[2025-01-22T13:58:13.000+0100] Hamid Ameziani:...,{'self': 'https://nx-jira8.my-nx.com/rest/api/...,00 - Installation & Tech Admin,Installation - Run Components
1,DHRD-103216,Story,TST - Manual override / Audit must be retested,"Due to improvements done with DHRD-100447, th...",Done,Fixed,2025-01-24T10:36:04.000+0100,,Minor,2025-01-21T11:59:10.000+0100,...,,,,0,outward: depends on DHRD-100447 [Done] - Story...,,[2025-01-21T12:08:22.000+0100] Loic Brossard: ...,{'self': 'https://nx-jira8.my-nx.com/rest/api/...,05 - Web Client,Web Navigation & Searching
2,DHRD-103195,Problem,Can't Start Workers,||Version details||Environment infos||\r\n| * ...,Resolved,Fixed,2025-01-22T16:24:24.000+0100,,Blocker,2025-01-20T17:23:57.000+0100,...,,,,0,inward: blocks DHRD-100501 [Blocked] - [DH-7.1...,,[2025-01-20T18:49:05.000+0100] Abir Arrari: Tr...,{'self': 'https://nx-jira8.my-nx.com/rest/api/...,02 - System Admin,"Scheduler, Tasks & Monitoring"
3,DHRD-103180,Bug,[AWS] S3 services failure with AWS_ROLE channel,||Version details||Environment infos||\r\n| * ...,Done,Fixed,2025-01-22T14:36:37.000+0100,,Blocker,2025-01-20T16:06:24.000+0100,...,,"DH-7.1.0.0, DH-7.2.0.0",,0,inward: is created by DHRD-98328 [New] - [Clou...,,[2025-01-20T16:23:44.000+0100] Mukesh Mishra: ...,{'self': 'https://nx-jira8.my-nx.com/rest/api/...,16 - Cloud & DevOps,AWS Deployment
4,DHRD-103113,Bug,[Regression] : NPE raised when using the Share...,||Version details||Environment infos||\r\n|VER...,Done,Fixed,2025-01-23T09:26:04.000+0100,,Blocker,2025-01-17T08:22:51.000+0100,...,,"DH-7.1.0.0, DH-7.2.0.0",,0,outward: includes DHRD-103220 [To be tested] -...,,[2025-01-20T16:01:08.000+0100] Chaid Fatoumbi:...,{'self': 'https://nx-jira8.my-nx.com/rest/api/...,15 - Tools,Evaluator
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15976,DHRD-7282,Bug,Import error : deadlock failure,"When importing a file, the error ""* Main error...",Closed,Fixed,2012-02-02T04:00:00.000+0100,"When importing data, the multisource module ha...",,2012-02-02T04:00:00.000+0100,...,C/import,"DH-3.2-00b, DH-3.3-00",,0,outward: includes DHRD-21348 [Test failed] - [...,,,{'self': 'https://nx-jira8.my-nx.com/rest/api/...,06 - Mapping & Connectivity,
15977,DHRD-7281,Improvement,Evolutions mineures sur l'environnement des bu...,Bizness rule:\no chercher une autre bizule en ...,Closed,Fixed,2012-02-02T03:56:00.000+0100,Some bugs have been fixed and various improvem...,,2012-02-02T03:56:00.000+0100,...,Cor/rule dict.,DH-3.3-00,,0,outward: includes DHRD-21347 [Validated] - [DH...,,,{'self': 'https://nx-jira8.my-nx.com/rest/api/...,01 - Core Admin,
15978,DHRD-7280,Bug,L'icone des mails interne en attente est HS...,L'icone de boîte mail situé en bas de l'écran ...,Closed,Fixed,2012-02-02T03:53:00.000+0100,,,2012-02-02T03:53:00.000+0100,...,Gui/components,DH-3.3-00,,0,outward: includes DHRD-21346 [Validated] - [DH...,,,{'self': 'https://nx-jira8.my-nx.com/rest/api/...,04 - Rich Client,
15979,DHRD-7279,Request,Ratings,Titres : \n - Les ratings côté titre sont faux...,Closed,Fixed,2012-02-01T04:13:00.000+0100,,,2012-02-01T04:13:00.000+0100,...,Oth/other,,,0,,,,{'self': 'https://nx-jira8.my-nx.com/rest/api/...,09 - Data Enrichment,


In [9]:
# Clean the free-text columns; the originals are kept and the cleaned
# versions are stored in new *_Clean columns.
df['Description_Clean'] = df['Description'].map(clean_text)
df['Summary_Clean'] = df['Summary'].map(clean_text)

In [10]:
# Detect the language of each cleaned Summary
df['Detected_Language_Summary'] = df['Summary_Clean'].map(detect_language_lingua)

In [11]:
# Keep only the rows whose Summary was detected as French ('fr')
french_summary_df = df.loc[df['Detected_Language_Summary'].eq('fr')]


In [12]:
# Take the first 10 Summary values of the French tickets as a plain list
french_summary_texts = french_summary_df['Summary'].iloc[:10].tolist()

In [None]:
# Apply the translation.
# NOTE(review): translate_corpus only sends texts detected as 'en' to the
# translator (an en -> fr model), while this list was built from rows whose
# summary was detected as 'fr' — so this call presumably returns the texts
# unchanged. Confirm whether English rows (or a fr -> en model) were intended.
translated_texts = translate_corpus(french_summary_texts)

100%|██████████| 10/10 [00:00<?, ?it/s]


In [31]:
# Display the list returned by translate_corpus for the sampled summaries.
translated_texts

['CACEIS-PROD-IN-CP - Licence expirée depuis le 22/12/2024',
 "Problème d'accès au client lourd DH à partir des environnement APAC aws",
 '[TBV] Import Excel simulation',
 '[TBF] Import csv simulation',
 'Json file Import',
 '[TBF] Json file Export',
 'DEKABANK-SIT-RQ-DH- Impossible de lancer visage pour faire une démo - urgent',
 'SGSS-SIT-PB-  [Migration V6] Problème de perf sur les import',
 'AMUNDI-PROD-PB KYC - Problème de rafraichissement après sauvegarde de dossier',
 "LBP-PROD-RQ- Demande d'archives sans  le binaire Log4J"]