In [9]:
import pandas as pd
import re

# Load the source CSV. With sep=None and the python engine, pandas detects
# the column delimiter itself instead of assuming a comma.
fichier = r"C:/Users/AED-BBR/AED EXPERTISES/CLOUD - GENERAL/BASE DIGITALE/INFORMATIQUE/00 - ELH-BBR/Extraction_Sofia_numéro_tel/SOFIA - Locataires - Téls non interprétables.csv"
df = pd.read_csv(
    fichier,
    sep=None,
    engine="python",
    encoding="utf-8",
)

# Country calling codes (without the leading '+') mapped to the country
# label written to the output. The United Kingdom (44) is deliberately absent.
indicatifs_pays = dict([
    ('33', 'France'),
    ('32', 'Belgique'),
    ('41', 'Suisse'),
    ('49', 'Allemagne'),
    ('34', 'Espagne'),
    ('39', 'Italie'),
    ('212', 'Maroc'),
    ('213', 'Algérie'),
    ('216', 'Tunisie'),
])

# Extraction des numéros
def extract_phone_numbers(text):
    """Extract phone-number digit strings from a free-text cell.

    Args:
        text: Any cell value. Missing values (None / NaN) yield an empty list;
            everything else is converted with ``str()``.

    Returns:
        A list of digit-only strings, in order of appearance.

        * A candidate written in international form (leading ``+`` or ``00``)
          whose number part has at most 15 digits (the E.164 maximum) is
          returned whole, e.g. ``"+33 6 12 34 56 78"`` -> ``"33612345678"``
          and ``"0033612345678"`` -> ``"0033612345678"``.
        * Any other candidate is cut into consecutive chunks of 10 digits
          (9 for a final shorter chunk), which separates several national
          numbers written next to each other; a trailing remainder shorter
          than 9 digits is discarded.
    """
    if pd.isna(text):
        return []

    # Longest international number (country code included) allowed by E.164.
    max_international_digits = 15

    # Keep only characters that can be part of a written phone number.
    text = re.sub(r'[^0-9+./\-\s]', ' ', str(text))

    numbers = []
    for candidate in re.findall(r'\+?\d[\d\s./\-]{7,}\d', text):
        digits_only = re.sub(r'\D', '', candidate)

        # Digits that follow the international prefix, if there is one.
        if candidate.startswith('+'):
            international_part = digits_only
        elif digits_only.startswith('00'):
            international_part = digits_only[2:]
        else:
            international_part = ''

        # Chunking an 11+ digit international number would drop its last
        # digits, so keep it whole when it is a plausible single number.
        if (len(digits_only) > 10
                and 0 < len(international_part) <= max_international_digits):
            numbers.append(digits_only)
            continue

        numbers.extend(re.findall(r'\d{9,10}', digits_only))
    return numbers

# Nettoyage et détection pays
def clean_and_identify(number):
    """Normalise one phone number and look up its country.

    Args:
        number: Phone number as a string; every character other than a
            digit or '+' is ignored.

    Returns:
        A ``(normalised_number, country)`` tuple. ``country`` is a value of
        ``indicatifs_pays`` when the digits after '+' start with one of its
        keys, 'France' for an otherwise unmatched 9-digit number, and
        "Inconnu" in every other case.
    """
    normalized = re.sub(r'[^\d+]', '', number)

    # Bring every spelling to a '+'-prefixed form: '00' is the international
    # dialling prefix, a single leading '0' is treated as a French number.
    if normalized[:2] == '00':
        normalized = '+' + normalized[2:]
    elif normalized[:1] == '0':
        normalized = '+33' + normalized[1:]
    elif normalized[:1] != '+':
        normalized = '+' + normalized

    # '+44' followed by exactly 9 characters is re-labelled as French.
    if normalized.startswith('+44') and len(normalized) == 12:
        normalized = '+33' + normalized[3:]

    # Try prefixes of 1 to 4 characters after the '+', shortest first.
    candidate_codes = (normalized[1:1 + width] for width in range(1, 5))
    matched_code = next(
        (code for code in candidate_codes if code in indicatifs_pays),
        None,
    )
    if matched_code is not None:
        return normalized, indicatifs_pays[matched_code]

    # No known calling code: a bare 9-digit number is assumed to be French.
    bare_digits = re.sub(r'[^\d]', '', normalized)
    if len(bare_digits) == 9:
        return '+33' + bare_digits, 'France'

    return normalized, "Inconnu"

# Analysis: for every cell of the 'Erreur' column, extract the numbers,
# normalise them, and derive one status and one country value per row.
extracted_numbers, status_list, country_list = [], [], []

for cell in df['Erreur']:
    identified = [clean_and_identify(raw) for raw in extract_phone_numbers(cell)]
    cleaned_numbers = [cleaned for cleaned, _ in identified]
    countries = [country for _, country in identified]

    if not cleaned_numbers or all(c == "Inconnu" for c in countries):
        # Nothing extracted, or no number could be tied to a country.
        status = "incorrect"
        countries = "Incorrect"
    elif all(c == "France" for c in countries):
        status = "valide_fr"
    else:
        # NOTE(review): a row mixing 'France' and 'Inconnu' numbers also
        # lands here -- confirm this is the intended classification.
        status = "valide_international"

    extracted_numbers.append(cleaned_numbers)
    status_list.append(status)
    country_list.append(countries)

# Add the result columns (in this order) to the DataFrame.
new_columns = {
    "numeros_extraits": extracted_numbers,
    "statut_numeros": status_list,
    "pays": country_list,
}
for column_name, column_values in new_columns.items():
    df[column_name] = column_values

# Save the result next to the notebook.
df.to_csv("numeros_telephones_analyses.csv", index=False)
print("✅ Résultat enregistré dans 'numeros_telephones_analyses.csv'")


✅ Résultat enregistré dans 'numeros_telephones_analyses.csv'
