In [617]:
import re
import pandas
import unidecode
import os
from marketing_data_cleaning import file_from_download_folder, DATA_FOLDER_PATH

In [618]:
# Raw officer title spellings, kept verbatim: the list intentionally preserves
# duplicates and the trailing space carried by a couple of entries.
# NOTE(review): not referenced by the cells visible in this notebook.
TITLES = [
    'Capitaine',
    'Lieutenant de 1ère classe',
    'Commandant',
    'Colonel',
    'Lieutenant',
    'Capitaine',
    'Lieutenant-Colonel',
    'Médecin de classe normale',
    'Colonel Hors-Classe',
    'Lieutenant Hors-Classe',
    'Médecin - Lieutenant - Colonel',
    'Contrôleur Général',
    'Pharmacien Hors-Classe',
    'Infirmier',
    'Pharmacien-Commandant',
    'Médecin-Capitaine',
    'Pharmacien 1ère classe',
    'Médecin Hors-Classe',
    'Médecin de classe exceptionnelle',
    'Adjudant-Chef',
    'Cadre de santé de SPP de 2ème classe',
    'Médecin-Commandant',
    'Infirmier de SPP de classe normale',
    'Pharmacien de classe exceptionnelle',
    'Sapeur 1ere classe',
    'Infirmière Cheffe',
    'Lieutenant de 2ème classe',
    'Medecin-Colonel',
    "Infirmier d'encadrement",
    'Vétérinaire-Capitaine',
    'Vétérinaire - Commandant',
    'Cadre Supérieur de Santé de SPP',
    'Pharmacien Classe Normale',
    'Pharmacien-Capitaine',
    'Pharmacien Lieutenant-Colonel ',
    'Officier Expert',
    'Cadre de santé de SPP de 1ère classe',
    'Sergent-Chef',
    'Pharmacien Colonel',
    'Vétérinaire',
    "Infirmière d'encadrement",
    "Infirmier d'encadrement",
    'Infirmière Principale',
    'Caporal-Chef ',
    'Caporal',
]


In [619]:
def normalize(value):
    """Collapse runs of spaces in ``value`` and title-case the result.

    ``value`` is converted with ``str`` first, so any object is accepted.
    Only the space character is treated as a separator; other surrounding
    whitespace is removed by the final ``strip``.
    """
    words = [word for word in str(value).split(' ') if word]
    return ' '.join(words).title().strip()

In [620]:
# Load the semicolon-separated export located through the project helper.
source_csv = file_from_download_folder('sdis enrich.csv')
df = pandas.read_csv(source_csv, sep=';')


Get the official title of each SDIS officer. First, normalize the __Hors-Classe__ text, whose spelling varies from one position to another; then extract the officer's actual title, keeping the __Hors-Classe__ part when it is present.

In [621]:
# Regex -> replacement pairs used to harmonise the spelling of titles
# embedded in ``fullname`` (the mapping order is kept as originally written).
fullname_fixes = {
    r'[hH]ors(\s|\-)?[cC]lasse': 'Hors-Classe',
    r'Medecin': 'Médecin',
    r'colonel': 'Colonel',
    r'\s?-\s?': '-',
    r'chef': 'Chef',
    r'[oO]fficier\s[eE]xpert': 'Officier Expert',
}
df['fullname'] = df['fullname'].replace(regex=fullname_fixes)


In [622]:
# Flag the rows whose name contains the literal text "Classe".
# ``str.contains`` with ``na=False`` reports missing names as False instead of
# raising ``TypeError`` the way ``'Classe' in x`` does on a NaN float.
df['has_classe'] = df['fullname'].str.contains('Classe', regex=False, na=False)

Get the title regardless of the __Hors-Classe__ part, then run a second filter in case the first one missed the __Hors-Classe__.

In [623]:
# Alternation of the base ranks shared by the three rank-based patterns below.
_RANK_ALTERNATION = (
    r'(?:Adjudant|Lieutenant|Commandant|Capitaine|Sergent|M[eé]decin|Infirmi[eè]re?'
    r'|Vétérinaire|Contrôleur|Sapeur|Colonel|Caporal|Pharmacien|Général)'
)

# "<rank>-<word>", e.g. "Lieutenant-Colonel".
MATCH_SEPARATOR = re.compile(r'^(' + _RANK_ALTERNATION + r'\-\w+\s?)')

# "<rank> Hors-Classe" (the space is optional).
WITH_CLASSE = re.compile(r'^(' + _RANK_ALTERNATION + r'\s?Hors\-Classe)')

# Nurse grades such as "Infirmière Principale" or "Infirmier Chef".
IS_NURSE = re.compile(r'^(Infirmi[eè]re?\s(Principale?|Cheffe|Chef))')

IS_EXPERT = re.compile(r'(Officier\sExpert)')

# Anything up to the last lowercase "classe" that is followed by whitespace.
WITH_CLASSE2 = re.compile(r'^(\w.*classe)\s')

# A bare rank, used as the last resort.
MATCH_NO_SEPARATOR = re.compile(r'^(' + _RANK_ALTERNATION + r'\s?)')

SPECIAL_CASE = re.compile(r'^(Contrôleur\sGénéral|^[a-zA-Zé]+\s?Chef|^[a-zA-Zé]+\s?[dD]\'encadrement)')


def get_title(value):
    """Return the title found at the start of ``value``.

    Parameters
    ----------
    value : object
        Raw full name, normally a string beginning with the officer's title.
        ``None`` and pandas missing values (NaN, ``pandas.NA``) are accepted;
        any other non-string value is converted with ``str``.

    Returns
    -------
    str or None
        The matched title, or ``None`` when the value is missing or when no
        pattern matches.
    """
    # Missing cells read from a CSV are NaN floats rather than None, so both
    # have to be filtered out before any substring test is attempted.
    if value is None or (not isinstance(value, str) and pandas.isna(value)):
        return None

    text = str(value)

    # Keyword-driven patterns are tried first; their match is returned as is.
    if 'exceptionnelle' in text:
        result = re.match(r'(^.*\sexceptionnelle)', text)
        if result:
            return result.group(1)

    if 'de SPP' in text:
        result = re.match(r'(^[a-zA-Z]+\sde\sSPP\s.*Classe)', text)
        if result:
            return result.group(1)

    if 'classe normale' in text:
        result = re.match(r'^(.*\sclasse\snormale)', text)
        if result:
            return result.group(1)

    # Three hyphen-joined words, e.g. "Médecin-Lieutenant-Colonel".
    result = re.match(r'^([a-zA-Zé]+\-[a-zA-Zé]+\-[a-zA-Zé]+)', text)
    if result:
        return result.group(1)

    # Remaining patterns, in priority order; the first match wins and its
    # surrounding whitespace is stripped.
    ordered_patterns = (
        IS_NURSE,
        IS_EXPERT,
        MATCH_SEPARATOR,
        WITH_CLASSE,
        WITH_CLASSE2,
        SPECIAL_CASE,
        MATCH_NO_SEPARATOR,
    )
    for pattern in ordered_patterns:
        result = pattern.match(text)
        if result:
            return result.group(1).strip()
    return None

# Derive the title of every row from its raw full name.
df['title'] = df['fullname'].apply(get_title)


Use the title we guessed to remove the position from the _fullname_ column.

In [624]:
# Build ``clean_fullname`` in a single column assignment instead of row-wise
# ``iterrows`` + ``.loc`` writes.
# - The title is always matched at the start of the name, so only its first
#   occurrence is removed (``count=1``); a later repetition of the same text
#   inside the name is left untouched.
# - A missing title (None or NaN) falls back to the normalised full name;
#   the ``isinstance`` check prevents ``str.replace`` from receiving a NaN.
df['clean_fullname'] = [
    fullname.replace(title, '', 1).strip()
    if isinstance(title, str) and title
    else normalize(fullname).lower().capitalize()
    for fullname, title in zip(df['fullname'], df['title'])
]

From there we can split the officer's full name into a first name and a last name.

In [625]:
# Create the two name columns, filled with None until they are computed.
for column in ('firstname', 'lastname'):
    df[column] = None

def tokens(value, index=0):
    """Split ``value`` on its last space and return one part, title-cased.

    ``index=0`` yields everything before the last space, ``index=-1`` yields
    the final word. When there is no space, both indexes return the whole text.
    """
    parts = str(value).rsplit(' ', maxsplit=1)
    return parts[index].lower().title()

# Use the cleaned name when there is one, otherwise the raw full name; the
# last word becomes the first name and everything before it the last name.
name_sources = [
    cleaned if cleaned is not None else raw
    for cleaned, raw in zip(df['clean_fullname'], df['fullname'])
]
df['firstname'] = [tokens(name, index=-1) for name in name_sources]
df['lastname'] = [tokens(name) for name in name_sources]

Get the SDIS number

In [626]:
SDIS_NUMBER_REGEX = re.compile(r'(\d+)')


def get_sdis_number(value):
    """Return the first run of digits found in ``value``, as a string.

    ``None`` is returned unchanged, and a value containing no digit is
    returned as received.
    """
    if value is None:
        return None
    found = SDIS_NUMBER_REGEX.search(str(value))
    return found.group(1) if found else value

# Extract the SDIS number of every row from its ``sdis`` label.
df['sdis_number'] = df['sdis'].apply(get_sdis_number)

In [627]:
# Preview the last rows for which a title was found.
df[df['title'].notna()].tail()

Unnamed: 0,fullname,url,sdis,level,officer_page,telephone,has_classe,title,clean_fullname,firstname,lastname,sdis_number
9423,Commandant HAON Patrick,https://www.pompiercenter.com/annuaire-sdis/sd...,Organigramme SDIS 976 - Groupements fonctionne...,SPP,https://www.pompiercenter.com/fiche-pompier-ha...,02 69 63 94 73,False,Commandant,HAON Patrick,Patrick,Haon,976
9425,Contrôleur Général MARCHI-LECCIA Frédéric,https://www.pompiercenter.com/annuaire-sdis/sd...,Organigramme SDIS 988 - Groupements fonctionne...,SPP,https://www.pompiercenter.com/fiche-pompier-ma...,687-20.77.00,False,Contrôleur Général,MARCHI-LECCIA Frédéric,Frédéric,Marchi-Leccia,988
9426,Lieutenant BRASSEUR Manon,https://www.pompiercenter.com/annuaire-sdis/sd...,Organigramme SDIS 988 - Groupements fonctionne...,SPP,https://www.pompiercenter.com/fiche-pompier-br...,,False,Lieutenant,BRASSEUR Manon,Manon,Brasseur,988
9428,Capitaine DELWICHE Patrick,https://www.pompiercenter.com/annuaire-sdis/sd...,Organigramme SDIS 988 - Groupements fonctionne...,SPP,https://www.pompiercenter.com/fiche-pompier-de...,,False,Capitaine,DELWICHE Patrick,Patrick,Delwiche,988
9429,Capitaine ROSSIGNOL Alexandre,https://www.pompiercenter.com/annuaire-sdis/sd...,Organigramme SDIS 988 - Groupements fonctionne...,SPP,https://www.pompiercenter.com/fiche-pompier-ro...,00 687 78 72 28,False,Capitaine,ROSSIGNOL Alexandre,Alexandre,Rossignol,988


Some titles are not fully matched and part of them is left in the name; these leftovers often start with a "-".

In [628]:
# A cleaned name that still starts with "-<word>-<word>" carries the end of a
# hyphenated title: append that prefix to the title and drop it from the name.
leftover_title = re.compile(r'^\-(\w+\-\w+)+')

for index, s in df.iterrows():
    if not s.clean_fullname.startswith('-'):
        continue

    current_name = str(s.clean_fullname)
    found = leftover_title.search(current_name)
    if found is None:
        continue

    df.loc[index, 'title'] = df.loc[index, 'title'] + found.group(0)
    df.loc[index, 'clean_fullname'] = leftover_title.sub(' ', current_name).strip()

In [629]:
# Build "<firstname>.<lastname>@sdis<number>.fr" for every row: accents are
# transliterated with unidecode, spaces removed and the result lower-cased.
addresses = []
for firstname, lastname, sdis_number in zip(df['firstname'], df['lastname'], df['sdis_number']):
    local_part = f"{unidecode.unidecode(firstname)}.{unidecode.unidecode(lastname)}"
    email = f"{local_part}@sdis{sdis_number}.fr"
    addresses.append(email.replace(' ', '').lower())

df['emails'] = addresses


In [630]:
# Keep only the exported columns and expose the cleaned name as ``fullname``.
export_columns = [
    'clean_fullname', 'title', 'firstname', 'lastname', 'sdis',
    'sdis_number', 'officer_page', 'telephone', 'emails', 'level',
]
df = df[export_columns].rename(columns={'clean_fullname': 'fullname'})

In [631]:
# Collapse repeated spaces and title-case the final full name.
df['fullname'] = df['fullname'].map(normalize)

In [632]:
# Write the cleaned table to the project data folder, without the index.
output_path = DATA_FOLDER_PATH / 'sdis.csv'
df.to_csv(output_path, index=False, encoding='utf-8')
