# Nettoyage du fichier de contacts presse

In [None]:
import re
import pandas as pd

from validate_email import validate_email
from config_corrections import ocr_corrections

In [None]:
# Validation strategy selector (1 or 2): later cells run the
# `is_valid_email` check on a sample when it equals 1, and the
# `is_valid_email_v2` check on the whole table when it equals 2.
valid_method = 2

In [None]:
# Read the contact list from disk, report its dimensions and preview
# the first rows.
df = pd.read_csv(filepath_or_buffer='data/Contacts_clean.csv')
print(df.shape)
df.head(n=5)

(7450, 2)


Unnamed: 0,Nom organisation,Email contact
0,UPRIGHT AGENCY,ivictor@lOOpourlOOgolf.com
1,UPRIGHT AGENCY,ivictor@lOOpourlOOgolf.com
2,VOCATIF,vocat1f@wanadoo.fr
3,VOCATIF,vocat1f@wanadoo.fr
4,WAKE UP EVENTS,conlacl@scapevenl.com


In [None]:
# Keep only the first occurrence of each fully identical row, then
# report the new dimensions and preview the result.
df = df.drop_duplicates(keep='first')
print(df.shape)
df.head(n=5)

(3718, 2)


Unnamed: 0,Nom organisation,Email contact
0,UPRIGHT AGENCY,ivictor@lOOpourlOOgolf.com
2,VOCATIF,vocat1f@wanadoo.fr
4,WAKE UP EVENTS,conlacl@scapevenl.com
6,WHAT'S HAP,maumus@whatshap.fr
8,WMH PROJECT,developpement@fc2events.fr


In [None]:
# E-mail address validation (format, blacklist and SMTP probe)
def is_valid_email(email):
    """Validate one address with the ``validate_email`` package.

    Args:
        email: The address to check. Anything that is not a non-blank
            string (``None``, NaN, ``""``...) is reported as invalid
            without contacting the library.

    Returns:
        ``False`` for missing/blank input, otherwise whatever the library
        returns for this address.
        NOTE(review): the library presumably returns ``None`` when the
        SMTP conversation is inconclusive - confirm against its docs.
    """
    # Missing values coming from a CSV are not strings; there is nothing
    # to probe, so answer directly instead of letting the library fail.
    if not isinstance(email, str) or not email.strip():
        return False

    # Bind the checker under a private local name: the module-level name
    # `validate_email` can be rebound by other cells of this notebook to a
    # different library's function with an incompatible signature.
    from validate_email import validate_email as _probe_address

    return _probe_address(
        email_address=email,
        check_format=True,
        check_blacklist=True,
        # NOTE(review): the SMTP probe needs the MX records anyway, so this
        # flag presumably does not skip the DNS lookup - confirm.
        check_dns=False,
        dns_timeout=10,
        check_smtp=True,
        smtp_timeout=10,
        # `None` (not `False`) selects the library defaults; `False` would
        # be forwarded as if it were a real host name / sender address.
        smtp_helo_host=None,
        smtp_from_address=None,
        smtp_skip_tls=True,
        smtp_tls_context=None,
        smtp_debug=False,
    )

In [None]:
# Common OCR corrections
def _replace_isolated_char(text, wrong, correct):
    """Replace `wrong` by `correct` where it is adjacent to ASCII letters only."""
    target = re.escape(wrong)

    def literal(_match):
        # A callable replacement keeps `correct` literal: a plain string
        # would be parsed as a regex template (backslashes, group refs).
        return correct

    # Same three passes, in the same order, as the historical code:
    # between two letters, at the start before a letter, at the end
    # after a letter.
    for pattern in (
        rf'(?<=[a-zA-Z]){target}(?=[a-zA-Z])',
        rf'^{target}(?=[a-zA-Z])',
        rf'(?<=[a-zA-Z]){target}$',
    ):
        text = re.sub(pattern, literal, text)
    return text


def correct_ocr_errors_advanced(email, corrections_dict=None):
    """Fix common OCR misreadings in an e-mail address.

    Args:
        email: Raw address. ``None`` and missing values (NaN, ``pd.NA``)
            yield ``""``; any other non-string value is returned as
            ``str(email)``.
        corrections_dict: Mapping ``wrong -> correct``. When omitted, the
            module-level ``ocr_corrections`` mapping is used (looked up at
            call time). Entries are applied in the mapping's iteration
            order, each one to the local part and to the domain.

    Returns:
        The stripped address with the corrections applied. A string that
        is blank or has no ``@`` is returned stripped but otherwise
        unchanged.
    """
    if not isinstance(email, str):
        # A missing cell must not become the literal text "nan".
        if email is None or (pd.api.types.is_scalar(email) and pd.isna(email)):
            return ""
        return str(email)

    email = email.strip()
    if not email or '@' not in email:
        return email

    if corrections_dict is None:
        corrections_dict = ocr_corrections

    # Split on the last '@' so both halves are corrected independently.
    local, domain = email.rsplit('@', 1)

    for wrong, correct in corrections_dict.items():
        if not wrong:
            # An empty key would insert `correct` between every character.
            continue
        if len(wrong) == 1 and len(correct) == 1:
            # Single characters (e.g. 1, 0, 5) are only replaced next to
            # letters, so genuine numbers are left alone.
            local = _replace_isolated_char(local, wrong, correct)
            domain = _replace_isolated_char(domain, wrong, correct)
        else:
            # Character groups (e.g. rn, vv) are replaced everywhere.
            local = local.replace(wrong, correct)
            domain = domain.replace(wrong, correct)

    return f"{local}@{domain}"

In [None]:
# Run the OCR clean-up on every raw address and keep the result in a
# separate column, next to the original value.
df['corrected_email'] = df["Email contact"].apply(correct_ocr_errors_advanced)
print(df.shape)
df.head(n=5)

(3718, 3)


Unnamed: 0,Nom organisation,Email contact,corrected_email
0,UPRIGHT AGENCY,ivictor@lOOpourlOOgolf.com,ivictor@lOOpourlOOgolf.com
2,VOCATIF,vocat1f@wanadoo.fr,vocatif@wanadoo.fr
4,WAKE UP EVENTS,conlacl@scapevenl.com,contact@scapevenl.com
6,WHAT'S HAP,maumus@whatshap.fr,maumus@whatshap.fr
8,WMH PROJECT,developpement@fc2events.fr,developpement@fc2events.fr


In [None]:
# Thorough validation (method 1), run on a sample only
if valid_method == 1:

    # Reproducible random sample; the size is capped so that a table with
    # fewer than 20 rows does not make `sample` raise.
    df_test = df.sample(n=min(20, len(df)), random_state=42)

    # Validate the OCR-corrected address (as the second method does)
    # rather than the raw one, so the clean-up step is actually used.
    df_test["Valid_Email"] = df_test["corrected_email"].apply(is_valid_email)
    print(df_test.shape)

    # Summary - printed explicitly, because a bare expression nested in an
    # `if` block is not displayed by the notebook.
    print(df_test["Valid_Email"].value_counts())

    # Full sample with its validation flag
    print(df_test)

In [None]:
# Alternative validation method (email_validator package)
def is_valid_email_v2(email, check_deliverability=True):
    """Validate one address with the ``email_validator`` package.

    Args:
        email: The address to check. Anything that is not a non-blank
            string is reported as invalid without calling the library.
        check_deliverability: Forwarded to the library's
            ``validate_email``; defaults to ``True`` as before. Pass
            ``False`` for a syntax-only check.

    Returns:
        A two-element ``pd.Series``: ``[is_valid, proposed_email]`` where
        ``proposed_email`` is the address as returned by the library when
        valid and ``None`` otherwise.
    """
    if not isinstance(email, str) or not email.strip():
        return pd.Series([False, None])

    # Imported here under private names so that the module-level name
    # `validate_email` (bound to the `validate_email` package at the top
    # of the notebook) is not overwritten by this library's function.
    from email_validator import EmailNotValidError
    from email_validator import validate_email as _validate_address

    try:
        validated = _validate_address(
            email, check_deliverability=check_deliverability
        )
    except EmailNotValidError:
        return pd.Series([False, None])

    # Prefer the `normalized` attribute over dict-style access; fall back
    # to the `email` attribute for releases that lack `normalized`.
    proposed_email = getattr(validated, "normalized", None)
    if proposed_email is None:
        proposed_email = validated.email

    return pd.Series([True, proposed_email])

In [None]:
# Faster validation (method 2), run on the whole table
if valid_method == 2:
    # Each distinct address is validated once: the check may involve a
    # network lookup, and the same address can appear on several rows.
    verdicts = {
        address: is_valid_email_v2(address)
        for address in df["corrected_email"].unique()
    }

    # One (is_valid, proposed address) pair per row, spread over two columns.
    df[['Valid_Email', 'proposed_email']] = df.apply(
        lambda row: verdicts[row["corrected_email"]], axis=1
    )
    print(df.shape)

    # Printed explicitly: a bare expression nested in an `if` block is not
    # displayed by the notebook.
    print(df.head())

  proposed_email = v["email"]


KeyboardInterrupt: 

In [None]:
# Summary: number of rows per validation outcome
df.loc[:, "Valid_Email"].value_counts()

Valid_Email
True     2567
False    1151
Name: count, dtype: int64

In [None]:
# Distinct organisation names per validation outcome; the `True` entry is
# the number of organisations with at least one valid e-mail.
unique_organizations = (
    df.groupby(by="Valid_Email")["Nom organisation"]
    .nunique()
)
unique_organizations

Valid_Email
False     852
True     1581
Name: Nom organisation, dtype: int64

In [None]:
# Per organisation, number of distinct values in each column, restricted
# to the rows whose address was flagged valid.
df.loc[df["Valid_Email"]].groupby(by="Nom organisation").nunique()

Unnamed: 0_level_0,Email contact,corrected_email,Valid_Email,proposed_email
Nom organisation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
30,3,3,1,3
ET,1,1,1,1
Miroir de l'Arl C9 Ier de chaque mois Mensuel,1,1,1,1
!FESTIVAL,1,1,1,1
0,170,168,1,168
...,...,...,...,...
tu,1,1,1,1
tü,22,22,1,22
z,2,2,1,2
Œ1',1,1,1,1


In [None]:
# Write the validated table to CSV, without the index column.
df.to_csv(path_or_buf='data/Contacts_validated.csv', index=False)

In [None]:
# Write the same table as an Excel workbook, without the index column.
df.to_excel(excel_writer='data/Contacts_validated.xlsx', index=False)