In [8]:
pip install phonenumbers

Note: you may need to restart the kernel to use updated packages.


In [84]:
import pandas as pd
import phonenumbers
from phonenumbers import geocoder
import re

def _fr_national_part(digits, has_fr_plus):
    """Return ``digits`` with a leading French country code removed.

    ``has_fr_plus`` is True when the digits came from a token written
    ``+33...``; the two leading digits are then the country code. A leading
    ``0033`` is treated the same way. Anything else is returned unchanged.
    """
    if has_fr_plus:
        return digits[2:]
    if digits.startswith('0033'):
        return digits[4:]
    return digits


def extract_french_like_numbers(text):
    """Extract French-looking phone numbers from free text.

    The text is reduced to digit blocks (a leading ``+`` is kept on a block),
    and consecutive blocks are glued together until they hold a complete
    number. Accepted spellings are the national form (``06 12 34 56 78``),
    the national form without its leading zero (``612345678``) and the
    international forms ``+33 ...`` / ``0033 ...``, with or without the
    optional ``(0)`` trunk digit.

    Parameters
    ----------
    text : object
        Cell value to scan; NaN/None yields an empty list, anything else is
        converted with ``str``.

    Returns
    -------
    list of str
        Numbers normalised to the 10-digit national form ``0[1-9]dddddddd``,
        in order of appearance.
    """
    if pd.isna(text):
        return []

    # Keep only digits and '+'; every other character becomes a separator.
    cleaned = re.sub(r'[^\d\+]', ' ', str(text))
    # Keep the '+' attached to its block so "+33" can be told apart from a
    # national number that merely starts with the digits 33.
    blocks = re.findall(r'\+?\d+', cleaned)

    normalized = []
    i = 0
    while i < len(blocks):
        first = blocks[i]
        if first.startswith('+') and not first.startswith('+33'):
            # Explicit non-French country code: not a French-like number.
            i += 1
            continue

        has_fr_plus = first.startswith('+')
        digits = first.lstrip('+')
        j = i + 1
        # A block carrying its own '+' starts a new number, so never glue
        # across it.
        while j < len(blocks) and not blocks[j].startswith('+'):
            national = _fr_national_part(digits, has_fr_plus)
            # 10 digits when the trunk '0' is written, 9 when it is omitted.
            wanted = 10 if national.startswith('0') else 9
            if len(national) >= wanted:
                break
            digits += blocks[j]
            j += 1

        phone = _fr_national_part(digits, has_fr_plus)
        if len(phone) == 9:
            phone = '0' + phone
        if re.fullmatch(r'0[1-9]\d{8}', phone):
            normalized.append(phone)
            i = j  # consume the blocks only when they formed a number
        else:
            i += 1  # otherwise retry starting from the next block

    return normalized

def analyze_numbers(text, default_region="FR"):
    """Extract, validate and classify the phone numbers found in ``text``.

    Each candidate returned by ``extract_french_like_numbers`` is parsed with
    ``phonenumbers`` using ``default_region``; only numbers reported valid by
    the library are kept.

    Parameters
    ----------
    text : object
        Raw cell value to analyse.
    default_region : str, default "FR"
        Region passed to ``phonenumbers.parse`` for numbers written without a
        country code.

    Returns
    -------
    tuple
        ``(numbers, status, locations)`` where ``numbers`` is the list of
        valid numbers in E.164 format, ``status`` is one of ``"incorrect"``,
        ``"valide_fr"`` or ``"valide_international"``, and ``locations`` is
        the list of geocoder descriptions (``"Inconnu"`` when empty) or the
        literal string ``"Incorrect"`` when the status is ``"incorrect"``.
    """
    valid_numbers = []
    countries = []

    for raw in extract_french_like_numbers(text):
        try:
            parsed = phonenumbers.parse(raw, default_region)
        except phonenumbers.NumberParseException:
            # Unparseable candidate: skip it. Any other exception is a real
            # problem and is deliberately left to propagate.
            continue
        if not phonenumbers.is_valid_number(parsed):
            continue

        e164 = phonenumbers.format_number(parsed, phonenumbers.PhoneNumberFormat.E164)
        # NOTE(review): description_for_number may return a location finer
        # than the country name; confirm that comparing against "France"
        # below classifies French landlines as intended.
        country = geocoder.description_for_number(parsed, "fr")
        valid_numbers.append(e164)
        countries.append(country if country else "Inconnu")

    if not valid_numbers:
        return [], "incorrect", "Incorrect"
    elif all(c == "France" for c in countries):
        return valid_numbers, "valide_fr", countries
    elif all(c == "Inconnu" for c in countries):
        return valid_numbers, "incorrect", "Incorrect"
    else:
        return valid_numbers, "valide_international", countries

# Load the CSV file.
# NOTE(review): absolute, machine-specific path; the notebook only runs where
# this exact file exists.
fichier = r"C:/Users/AED-BBR/AED EXPERTISES/CLOUD - GENERAL/BASE DIGITALE/INFORMATIQUE/00 - ELH-BBR/Extraction_Sofia_numéro_tel/SOFIA - Locataires - Téls non interprétables.csv"
# sep=None together with the python engine asks pandas to detect the delimiter.
df = pd.read_csv(fichier, encoding="utf-8", sep=None, engine="python")

# Analyse every value of the 'Erreur' column once, keeping the result triples.
analyses = [analyze_numbers(valeur) for valeur in df['Erreur']]

# Split the triples into one list per output column.
numeros_extraits = [triple[0] for triple in analyses]
statut_numeros = [triple[1] for triple in analyses]
pays = [triple[2] for triple in analyses]

# Attach the three result columns to the frame.
df["numeros_extraits"] = numeros_extraits
df["statut_numeros"] = statut_numeros
df["pays"] = pays

# Write the enriched table next to the notebook and report completion.
df.to_csv("numeros_telephones_analyses.csv", index=False)
print("✅ Résultat enregistré dans 'numeros_telephones_analyses.csv'")



✅ Résultat enregistré dans 'numeros_telephones_analyses.csv'


In [86]:
import pandas as pd
import phonenumbers
from phonenumbers import geocoder
import re
import html
from functools import lru_cache

# 🔹 Precompiled regular expressions
# Literal two-character sequences "//" and "--".
RE_SEQUENCES = re.compile(r'//|--')
# Any single punctuation/symbol character from the listed class; note that
# '+' is part of the class.
RE_SPECIAL_CHARS = re.compile(r'[.,;:/\\\-–—_+&(){}\[\]<>!?=~\'"`|]')
# Runs of whitespace.
RE_SPACES = re.compile(r'\s+')
# Runs of digits.
RE_DIGITS = re.compile(r'\d+')
# A '0', then one digit from 1 to 9, then eight more digits (10 digits when
# applied with fullmatch).
RE_VALID_PHONE = re.compile(r'0[1-9]\d{8}')

# 🔹 Text clean-up
def clean_text(text):
    """Return ``text`` as a trimmed string with punctuation turned into spaces.

    NaN/None gives an empty string. Otherwise the value is converted to
    ``str``, the literal marker ``"A/-_A/-_"`` is removed, HTML entities are
    decoded, and the ``RE_SEQUENCES``, ``RE_SPECIAL_CHARS`` and ``RE_SPACES``
    patterns are each replaced by a single space, in that order.
    """
    if pd.isna(text):
        return ""

    cleaned = html.unescape(str(text).replace("A/-_A/-_", ""))
    # Order matters: whitespace is collapsed last, after the other two
    # substitutions have introduced their own spaces.
    for pattern in (RE_SEQUENCES, RE_SPECIAL_CHARS, RE_SPACES):
        cleaned = pattern.sub(' ', cleaned)
    return cleaned.strip()

# 🔹 Extraction of digit groups that look like French phone numbers
def _expected_digit_count(group):
    """Return how many digits ``group`` needs before it can be complete.

    The count depends on the prefix accumulated so far: ``0033`` or ``33``
    (French country code) is followed by 9 national digits, or 10 when the
    trunk ``0`` is written as well. Any other group keeps the historical
    threshold of 9 digits.
    """
    if group.startswith("0033"):
        return 14 if group[4:5] == "0" else 13
    if group.startswith("33"):
        return 12 if group[2:3] == "0" else 11
    return 9


def extract_french_like_numbers(text):
    """Collect candidate phone numbers (digit strings) from ``text``.

    The text is cleaned with ``clean_text``, split into digit runs, and
    consecutive runs are concatenated until the group reaches the length
    expected for its prefix. Without this prefix awareness a spaced
    international number such as ``+33 6 12 34 56 78`` was cut after 9 digits
    and could never be normalised afterwards.

    Parameters
    ----------
    text : object
        Raw cell value; it is passed to ``clean_text`` as is.

    Returns
    -------
    list of str
        Digit-only candidates of 9 to 12 digits (up to 14 when they start
        with ``0033``), in order of appearance. Candidates are not validated
        here.
    """
    text = clean_text(text)
    digit_groups = RE_DIGITS.findall(text)

    candidates = []
    i = 0
    while i < len(digit_groups):
        group = digit_groups[i]
        j = i + 1
        # The expected length is re-evaluated on every pass because the
        # prefix may only become visible after a few short runs are joined.
        while len(group) < _expected_digit_count(group) and j < len(digit_groups):
            group += digit_groups[j]
            j += 1

        # Only a "0033" prefix justifies going beyond 12 digits; widening the
        # window for every group would swallow unrelated digits.
        max_len = 14 if group.startswith("0033") else 12
        if 9 <= len(group) <= max_len:
            candidates.append(group)
            i = j
        else:
            i += 1
    return candidates

# 🔹 Cached validation of phone numbers
@lru_cache(maxsize=10000)
def validate_number(phone, default_region="FR"):
    """Validate ``phone`` with ``phonenumbers`` and describe its location.

    Results are memoised per ``(phone, default_region)`` pair by ``lru_cache``.

    Parameters
    ----------
    phone : str
        Number to parse.
    default_region : str, default "FR"
        Region passed to ``phonenumbers.parse`` for numbers written without a
        country code.

    Returns
    -------
    tuple
        ``(e164, location)`` for a valid number, where ``location`` is the
        geocoder description requested in French or ``"Inconnu"`` when that
        description is empty; ``(None, None)`` when the number cannot be
        parsed or is not valid.
    """
    try:
        parsed = phonenumbers.parse(phone, default_region)
    except phonenumbers.NumberParseException:
        # Only the parse failure is expected; other exceptions indicate a
        # real problem and are left to propagate instead of being hidden.
        return None, None

    if not phonenumbers.is_valid_number(parsed):
        return None, None  # always hand back a 2-tuple

    e164 = phonenumbers.format_number(parsed, phonenumbers.PhoneNumberFormat.E164)
    country = geocoder.description_for_number(parsed, "fr")
    return e164, country if country else "Inconnu"

# 🔹 Number analysis
def analyze_numbers(text, default_region="FR"):
    """Normalise, validate and classify the phone numbers found in ``text``.

    Candidates from ``extract_french_like_numbers`` are rewritten to a
    leading-zero form, filtered with ``RE_VALID_PHONE`` and then checked with
    ``validate_number``.

    Returns a ``(numbers, status, locations)`` tuple: ``numbers`` is the list
    of validated E.164 numbers, ``status`` is ``"incorrect"``, ``"valide_fr"``
    or ``"valide_international"``, and ``locations`` is the list of location
    labels or the literal string ``"Incorrect"`` when the status is
    ``"incorrect"``.
    """
    normalized = []
    for candidate in extract_french_like_numbers(text):
        # Drop an international "00" access prefix first.
        phone = candidate[2:] if candidate.startswith("00") else candidate

        # The first matching prefix is replaced by a single leading "0";
        # the order of the tuple is the order of precedence.
        for prefix in ("330", "33", "3", "+33"):
            if phone.startswith(prefix):
                phone = "0" + phone[len(prefix):]
                break
        else:
            # No prefix matched: a 9-digit number is missing its leading zero.
            if len(phone) == 9:
                phone = "0" + phone

        # Reject strings made of one repeated character, then require the
        # expected 10-digit shape.
        if len(set(phone)) != 1 and RE_VALID_PHONE.fullmatch(phone):
            normalized.append(phone)

    checked = [validate_number(phone, default_region) for phone in normalized]
    accepted = [(e164, country) for e164, country in checked if e164]
    valid_numbers = [e164 for e164, _ in accepted]
    countries = [country for _, country in accepted]

    if not valid_numbers:
        return [], "incorrect", "Incorrect"
    if all(label == "France" for label in countries):
        return valid_numbers, "valide_fr", countries
    if all(label == "Inconnu" for label in countries):
        return valid_numbers, "incorrect", "Incorrect"
    return valid_numbers, "valide_international", countries

# 🔹 Load the CSV file
# NOTE(review): absolute, machine-specific path; the notebook only runs where
# this exact file exists.
fichier = r"C:/Users/AED-BBR/AED EXPERTISES/CLOUD - GENERAL/BASE DIGITALE/INFORMATIQUE/00 - ELH-BBR/Extraction_Sofia_numéro_tel/SOFIA - Locataires - Téls non interprétables.csv"
# sep=None together with the python engine asks pandas to detect the delimiter.
df = pd.read_csv(fichier, encoding="utf-8", sep=None, engine="python")

# 🔹 Row-wise analysis of the 'Erreur' column (one result triple per row)
results = df["Erreur"].apply(analyze_numbers)

# Spread each position of the triple into its own column.
for position, column in enumerate(("numeros_extraits", "statut_numeros", "pays")):
    df[column] = results.apply(lambda x, k=position: x[k])

# 🔹 Save the enriched table and report completion
df.to_csv("numeros_telephones_analyses.csv", index=False)
print("✅ Résultat enregistré dans 'numeros_telephones_analyses.csv'")


✅ Résultat enregistré dans 'numeros_telephones_analyses.csv'
