In [13]:
import pandas as pd
import phonenumbers
from phonenumbers import geocoder
import re

def extract_french_like_numbers(text):
    """Extract French-looking phone numbers from free text.

    Parameters
    ----------
    text : object
        Cell content; anything for which ``pd.isna`` is true yields ``[]``.
        Other values are converted with ``str()``.

    Returns
    -------
    list[str]
        Numbers in national 10-digit form (``0`` followed by a digit 1-9 and
        eight more digits), in order of appearance. Duplicates are kept.

    Notes
    -----
    Numbers written with the French international prefix (``+33``, ``0033``,
    optionally followed by ``(0)``) are rewritten to the national trunk ``0``
    *before* tokenising. Previously the prefix handling ran on digit-only
    candidates of 9-10 digits, so it could never match: ``+33612345678`` was
    dropped and ``+33 6 12 34 56 78`` was returned as ``0336123456``.
    """
    if pd.isna(text):
        return []

    # Keep digits and '+' only; every other character becomes a separator.
    cleaned = re.sub(r'[^\d\+]', ' ', str(text))

    # "+33" may be followed by spaces; "0033" must be contiguous and must not
    # be preceded by a digit, so that a national number such as
    # "06 00 33 12 34" is left untouched. An optional trunk "0" (from "(0)")
    # is absorbed, and the next digit must be 1-9.
    cleaned = re.sub(r'(?:\+ *33|(?<!\d)0033) *(?:0 *)?(?=[1-9])', ' 0', cleaned)

    blocks = re.findall(r'\d+', cleaned)

    # Greedily concatenate consecutive digit blocks until at least 9 digits
    # are collected; keep the group when it has 9 or 10 digits.
    candidates = []
    i = 0
    while i < len(blocks):
        group = blocks[i]
        j = i + 1
        while len(group) < 9 and j < len(blocks):
            group += blocks[j]
            j += 1
        if 9 <= len(group) <= 10:
            candidates.append(group)
            i = j  # the blocks used by this candidate are consumed
        else:
            i += 1

    normalized = []
    for num in candidates:
        # A 9-digit candidate is assumed to be missing its leading trunk "0".
        phone = '0' + num if len(num) == 9 else num
        if re.fullmatch(r'0[1-9]\d{8}', phone):
            normalized.append(phone)

    return normalized

def analyze_numbers(text, default_region="FR"):
    """Validate the numbers found in ``text`` and classify the cell.

    Parameters
    ----------
    text : object
        Cell content passed to ``extract_french_like_numbers``.
    default_region : str, optional
        Region used by ``phonenumbers.parse`` for numbers without an
        international prefix (default ``"FR"``).

    Returns
    -------
    tuple
        ``(valid_numbers, status, countries)`` where ``valid_numbers`` is a
        list of E.164 strings, ``status`` is one of ``"incorrect"``,
        ``"valide_fr"`` or ``"valide_international"``, and ``countries`` is a
        list of country names, or the string ``"Incorrect"`` when the status
        is ``"incorrect"``.
    """
    valid_numbers = []
    countries = []

    for raw in extract_french_like_numbers(text):
        try:
            parsed = phonenumbers.parse(raw, default_region)
        except phonenumbers.NumberParseException:
            # Only parse failures are skipped; any other error is a real
            # problem and must surface instead of being silently swallowed.
            continue

        if not phonenumbers.is_valid_number(parsed):
            continue

        valid_numbers.append(
            phonenumbers.format_number(parsed, phonenumbers.PhoneNumberFormat.E164)
        )
        # Use the country name, not description_for_number: the description
        # may be a locality (city/region) for geographic numbers, which made
        # the `== "France"` comparison below fail for French landlines.
        country = geocoder.country_name_for_number(parsed, "fr")
        countries.append(country if country else "Inconnu")

    if not valid_numbers:
        return [], "incorrect", "Incorrect"
    elif all(c == "France" for c in countries):
        return valid_numbers, "valide_fr", countries
    elif all(c == "Inconnu" for c in countries):
        return valid_numbers, "incorrect", "Incorrect"
    else:
        return valid_numbers, "valide_international", countries

# Load the CSV file; sep=None lets the python engine detect the delimiter
fichier = r"C:/Users/AED-BBR/AED EXPERTISES/CLOUD - GENERAL/BASE DIGITALE/INFORMATIQUE/00 - ELH-BBR/Extraction_Sofia_numéro_tel/SOFIA - Locataires - Téls non interprétables.csv"
df = pd.read_csv(fichier, sep=None, engine="python", encoding="utf-8")

# Analyse every value of the 'Erreur' column (one result triple per row)
analyses = [analyze_numbers(cellule) for cellule in df['Erreur']]
numeros_extraits = [resultat[0] for resultat in analyses]
statut_numeros = [resultat[1] for resultat in analyses]
pays = [resultat[2] for resultat in analyses]

# Attach the three result columns, in this order
for colonne, valeurs in (
    ("numeros_extraits", numeros_extraits),
    ("statut_numeros", statut_numeros),
    ("pays", pays),
):
    df[colonne] = valeurs

# Save the enriched table next to the notebook
df.to_csv("numeros_telephones_analyses.csv", index=False)
print("✅ Résultat enregistré dans 'numeros_telephones_analyses.csv'")

✅ Résultat enregistré dans 'numeros_telephones_analyses.csv'


In [12]:
import pandas as pd
import re

# Load the source file; sep=None lets the python engine detect the delimiter
fichier = r"C:/Users/AED-BBR/AED EXPERTISES/CLOUD - GENERAL/BASE DIGITALE/INFORMATIQUE/00 - ELH-BBR/Extraction_Sofia_numéro_tel/SOFIA - Locataires - Téls non interprétables.csv"
df = pd.read_csv(fichier, sep=None, engine="python", encoding="utf-8")

# Country calling code (without '+') -> country name
indicatifs_pays = {
    '33': 'France',
    '32': 'Belgique',
    '41': 'Suisse',
    '49': 'Allemagne',
    '34': 'Espagne',
    '39': 'Italie',
    '212': 'Maroc',
    '213': 'Algérie',
    '216': 'Tunisie',
}

# 🔹 Drop duplicates while keeping first-seen order
def remove_duplicates_preserve_order(lst):
    """Return a new list with the first occurrence of each item of ``lst``.

    Items must be hashable. The input is not modified.
    """
    # dict keys are unique and keep insertion order, so the first occurrence
    # of every item survives in its original position.
    return list(dict.fromkeys(lst))

# 🔹 Number extraction
def extract_french_like_numbers(text):
    """Extract French-looking phone numbers from free text.

    Parameters
    ----------
    text : object
        Cell content; anything for which ``pd.isna`` is true yields ``[]``.
        Other values are converted with ``str()``.

    Returns
    -------
    list[str]
        Numbers in national 10-digit form (``0`` followed by a digit 1-9 and
        eight more digits), in order of appearance. Duplicates are kept.

    Notes
    -----
    The French international prefix (``+33``, ``0033``, optionally followed
    by ``(0)``) is rewritten to the national trunk ``0`` before tokenising.
    The former prefix substitution ran on digit-only 9-10 digit candidates
    and could never match, so ``+33612345678`` was dropped and
    ``+33 6 12 34 56 78`` was returned as ``0336123456``.
    """
    if pd.isna(text):
        return []

    # Keep digits and '+' only; every other character becomes a separator.
    cleaned = re.sub(r'[^\d\+]', ' ', str(text))

    # "+33" may be followed by spaces; "0033" must be contiguous and not
    # preceded by a digit, so a national number like "06 00 33 12 34" is left
    # alone. An optional trunk "0" is absorbed; the next digit must be 1-9.
    cleaned = re.sub(r'(?:\+ *33|(?<!\d)0033) *(?:0 *)?(?=[1-9])', ' 0', cleaned)

    blocks = re.findall(r'\d+', cleaned)

    # Greedily concatenate consecutive digit blocks until at least 9 digits
    # are collected; keep the group when it has 9 or 10 digits.
    candidates = []
    i = 0
    while i < len(blocks):
        group = blocks[i]
        j = i + 1
        while len(group) < 9 and j < len(blocks):
            group += blocks[j]
            j += 1
        if 9 <= len(group) <= 10:
            candidates.append(group)
            i = j  # the blocks used by this candidate are consumed
        else:
            i += 1

    normalized = []
    for num in candidates:
        # A 9-digit candidate is assumed to be missing its leading trunk "0".
        phone = '0' + num if len(num) == 9 else num
        if re.fullmatch(r'0[1-9]\d{8}', phone):
            normalized.append(phone)

    return normalized

# 🔹 Clean-up and country detection
def clean_and_identify(number):
    """Normalise ``number`` to a '+<digits>' string and look up its country.

    Returns a ``(normalised_number, country)`` tuple. ``country`` comes from
    the module-level ``indicatifs_pays`` table, is ``'France'`` for an
    otherwise unmatched 9-digit number, and ``"Inconnu"`` when nothing fits.
    """
    compact = re.sub(r'[^\d+]', '', number)

    # Bring the number to international notation:
    #   "00…" -> "+…", national "0…" -> "+33…", bare digits -> "+digits".
    if compact.startswith('00'):
        compact = '+' + compact[2:]
    elif compact.startswith('0'):
        compact = '+33' + compact[1:]
    elif not compact.startswith('+'):
        compact = '+' + compact

    # Try the 1- to 4-character prefixes that follow the '+', shortest first.
    prefixes = (compact[1:1 + width] for width in range(1, 5))
    matched = next((p for p in prefixes if p in indicatifs_pays), None)
    if matched is not None:
        return compact, indicatifs_pays[matched]

    # No known calling code: a 9-digit number is treated as French.
    digits_only = re.sub(r'[^\d]', '', compact)
    if len(digits_only) == 9:
        return '+33' + digits_only, 'France'

    return compact, "Inconnu"

# 🔹 Analysis: one (numbers, status, countries) result per 'Erreur' value
extracted_numbers, status_list, country_list = [], [], []

for text in df['Erreur']:
    identified = [clean_and_identify(num) for num in extract_french_like_numbers(text)]

    # De-duplicate numbers and countries independently within the cell
    cleaned_numbers = remove_duplicates_preserve_order([pair[0] for pair in identified])
    countries = remove_duplicates_preserve_order([pair[1] for pair in identified])

    # A cell is "incorrect" when it holds no number or only unknown countries,
    # "valide_fr" when every country is France, otherwise international.
    if not cleaned_numbers or all(c == "Inconnu" for c in countries):
        status = "incorrect"
        countries = "Incorrect"
    elif all(c == "France" for c in countries):
        status = "valide_fr"
    else:
        status = "valide_international"

    extracted_numbers.append(cleaned_numbers)
    status_list.append(status)
    country_list.append(countries)

# 🔹 Attach the result columns
for column, values in (
    ("numeros_extraits", extracted_numbers),
    ("statut_numeros", status_list),
    ("pays", country_list),
):
    df[column] = values

# 🔹 Save
df.to_csv("numeros_telephones_analyses.csv", index=False)
print("✅ Résultat enregistré dans 'numeros_telephones_analyses.csv'")


✅ Résultat enregistré dans 'numeros_telephones_analyses.csv'


In [11]:
import pandas as pd
import re

# 🔹 Load the source file; sep=None lets the python engine detect the delimiter
fichier = r"C:/Users/AED-BBR/AED EXPERTISES/CLOUD - GENERAL/BASE DIGITALE/INFORMATIQUE/00 - ELH-BBR/Extraction_Sofia_numéro_tel/SOFIA - Locataires - Téls non interprétables.csv"
df = pd.read_csv(fichier, sep=None, engine="python", encoding="utf-8")

# 🔹 Country calling code (without '+') -> country name
indicatifs_pays = {
    '33': 'France',
    '32': 'Belgique',
    '41': 'Suisse',
    '49': 'Allemagne',
    '34': 'Espagne',
    '39': 'Italie',
    '351': 'Portugal',
    '352': 'Luxembourg',
    '212': 'Maroc',
    '213': 'Algérie',
    '216': 'Tunisie',
}

# 🔹 Drop duplicates while keeping first-seen order
def remove_duplicates_preserve_order(lst):
    """Return the items of ``lst`` without repeats, first occurrence first.

    Items must be hashable. A new list is returned; ``lst`` is untouched.
    """
    already_seen = set()
    # set.add returns None, so the `or` both records the item and lets a
    # first-time item through the filter.
    return [
        element
        for element in lst
        if not (element in already_seen or already_seen.add(element))
    ]

# 🔹 Step 1: extraction of French numbers only
def extract_french_numbers(text):
    """Extract French phone numbers from free text, in ``+33`` form.

    Parameters
    ----------
    text : object
        Cell content; anything for which ``pd.isna`` is true yields ``[]``.
        Other values are converted with ``str()``.

    Returns
    -------
    list[str]
        Numbers formatted as ``+33`` followed by nine digits, in order of
        appearance. Duplicates are kept.

    Notes
    -----
    The French international prefix (``+33``, ``0033``, optionally followed
    by ``(0)``) is rewritten to the national trunk ``0`` before tokenising.
    The former prefix substitution ran on digit-only 9-10 digit candidates
    and could never match, so ``+33612345678`` was dropped and
    ``+33 6 12 34 56 78`` was returned as ``+33336123456``.
    """
    if pd.isna(text):
        return []

    # Keep digits and '+' only; every other character becomes a separator.
    cleaned = re.sub(r'[^\d\+]', ' ', str(text))

    # "+33" may be followed by spaces; "0033" must be contiguous and not
    # preceded by a digit, so a national number like "06 00 33 12 34" is left
    # alone. An optional trunk "0" is absorbed; the next digit must be 1-9.
    cleaned = re.sub(r'(?:\+ *33|(?<!\d)0033) *(?:0 *)?(?=[1-9])', ' 0', cleaned)

    blocks = re.findall(r'\d+', cleaned)

    # Greedily concatenate consecutive digit blocks until at least 9 digits
    # are collected; keep the group when it has 9 or 10 digits.
    candidates = []
    i = 0
    while i < len(blocks):
        group = blocks[i]
        j = i + 1
        while len(group) < 9 and j < len(blocks):
            group += blocks[j]
            j += 1
        if 9 <= len(group) <= 10:
            candidates.append(group)
            i = j  # the blocks used by this candidate are consumed
        else:
            i += 1

    normalized = []
    for num in candidates:
        # A 9-digit candidate is assumed to be missing its leading trunk "0".
        phone = '0' + num if len(num) == 9 else num
        if re.fullmatch(r'0[1-9]\d{8}', phone):
            # Swap the national trunk "0" for the +33 calling code.
            normalized.append('+33' + phone[1:])

    return normalized

# 🔹 Step 2: extraction of international numbers
def extract_international_numbers(text):
    """Extract candidate international numbers from free text.

    Returns a list of strings of the form '+' followed by 9 to 15 digits, in
    order of appearance. Missing values (``pd.isna``) yield ``[]``.

    NOTE: digit blocks are merged only until 9 digits are reached, so a
    number written in many short groups can be cut before its last group.
    """
    if pd.isna(text):
        return []

    # Everything except digits and '+' becomes a separator; the tokens below
    # are digit-only, so '+' signs never reach the candidates.
    digit_runs = re.findall(r'\d{1,}', re.sub(r'[^\d\+]', ' ', str(text)))
    total = len(digit_runs)

    candidates = []
    start = 0
    while start < total:
        merged = digit_runs[start]
        stop = start + 1
        while stop < total and len(merged) < 9:
            merged += digit_runs[stop]
            stop += 1
        if 9 <= len(merged) <= 15:
            candidates.append(merged)
            start = stop  # skip the runs consumed by this candidate
        else:
            start += 1

    numbers = []
    for digits in candidates:
        # "00" is the international access prefix; otherwise the digits are
        # taken to already start with the calling code.
        with_plus = '+' + (digits[2:] if digits.startswith('00') else digits)
        if re.fullmatch(r'\+\d{9,15}', with_plus):
            numbers.append(with_plus)

    return numbers

# 🔹 Country detection
def identify_country(number):
    """Return the country whose calling code prefixes ``number``.

    ``number`` is expected in '+<digits>' form: the first character is
    skipped and the next 1 to 4 characters are tried, shortest first, against
    the module-level ``indicatifs_pays`` table. Returns ``"Inconnu"`` when no
    prefix is known.
    """
    prefixes = (number[1:1 + width] for width in range(1, 5))
    return next(
        (indicatifs_pays[prefix] for prefix in prefixes if prefix in indicatifs_pays),
        "Inconnu",
    )

# 🔹 Analysis: French numbers first, international numbers as a fallback.
# Both steps run in a single pass over the column values. The former second
# pass read df.loc[i, 'Erreur'] with a positional counter, which is only
# correct when the index is the default 0..n-1 range.
extracted_numbers, status_list, country_list = [], [], []

for text in df['Erreur']:
    # Step 1: French numbers only
    french_numbers = remove_duplicates_preserve_order(extract_french_numbers(text))
    if french_numbers:
        extracted_numbers.append(french_numbers)
        status_list.append("valide_fr")
        country_list.append(['France'] * len(french_numbers))
        continue

    # Step 2: no French number found, look for international numbers
    intl_numbers = remove_duplicates_preserve_order(extract_international_numbers(text))
    intl_countries = remove_duplicates_preserve_order(
        [identify_country(num) for num in intl_numbers]
    )

    if any(c != "Inconnu" for c in intl_countries):
        status = "valide_international"
        countries = intl_countries
    else:
        # Either nothing was found or no calling code was recognised; any
        # extracted numbers are still kept for inspection.
        status = "incorrect"
        countries = "Incorrect"

    extracted_numbers.append(intl_numbers)
    status_list.append(status)
    country_list.append(countries)

# 🔹 Attach the result columns
df["numeros_extraits"] = extracted_numbers
df["statut_numeros"] = status_list
df["pays"] = country_list

# 🔹 Save
df.to_csv("numeros_telephones_analyses.csv", index=False)
print("✅ Résultat enregistré dans 'numeros_telephones_analyses.csv'")


✅ Résultat enregistré dans 'numeros_telephones_analyses.csv'
