## Create new Candidate entity file (single file - not three groups) with simpler synonyms and remove Pierre Maudet

**Nota Bene:**
In the chatbot objectives, the `Solution to handle spelling mistakes of candidates in chatbot users' questions` was defined as a `Should have`. Because near-identical synonyms caused issues in the chatbot, this file simplifies the synonyms to test whether a simpler list fixes the multiple-entity extraction problem.


In [127]:
# import libraries
import pandas as pd
import numpy as np

In [128]:
# Load the semicolon-separated candidate list and preview the first rows
df = pd.read_csv(
    '../DATA_format_JSON/candidateList_Final.csv',
    sep=";",
)
df.head()

Unnamed: 0,ELECTION.TYPE ELECTION,LEL.NUM DEPOT,LEL.SIGLE,LEL.LIBELLE COURT,LEL.LIBELLE LONG,CND.CIVILITE,CND.NOM,CND.PRENOM,CND.NOM PSEUDO,CND.PRENOM PSEUDO
0,GC,1,EAG,ENSEMBLE À GAUCHE,ENSEMBLE À GAUCHE,Monsieur,PAGANI,Rémy,-,-
1,GC,1,EAG,ENSEMBLE À GAUCHE,ENSEMBLE À GAUCHE,Madame,WENGER,Saliha,-,Salika
2,GC,1,EAG,ENSEMBLE À GAUCHE,ENSEMBLE À GAUCHE,Monsieur,ZAUGG,Christian,-,-
3,GC,1,EAG,ENSEMBLE À GAUCHE,ENSEMBLE À GAUCHE,Madame,HALLER,Jocelyne,-,-
4,GC,1,EAG,ENSEMBLE À GAUCHE,ENSEMBLE À GAUCHE,Monsieur,ENILINE,Alexander,-,-


In [129]:
# Discard the list/party and civility columns that are not used below
unused_columns = [
    'LEL.NUM DEPOT',
    'LEL.SIGLE',
    'LEL.LIBELLE COURT',
    'LEL.LIBELLE LONG',
    'CND.CIVILITE',
]
df = df.drop(columns=unused_columns)
df.head()

Unnamed: 0,ELECTION.TYPE ELECTION,CND.NOM,CND.PRENOM,CND.NOM PSEUDO,CND.PRENOM PSEUDO
0,GC,PAGANI,Rémy,-,-
1,GC,WENGER,Saliha,-,Salika
2,GC,ZAUGG,Christian,-,-
3,GC,HALLER,Jocelyne,-,-
4,GC,ENILINE,Alexander,-,-


In [130]:
# Convert the surname and surname-pseudonym columns to title case
df = df.assign(**{
    column: df[column].str.title()
    for column in ('CND.NOM', 'CND.NOM PSEUDO')
})
df.loc[74:85]

Unnamed: 0,ELECTION.TYPE ELECTION,CND.NOM,CND.PRENOM,CND.NOM PSEUDO,CND.PRENOM PSEUDO
74,GC,Leonelli,Katia,-,-
75,GC,Macchiavelli,Marta,-,Marta Julia
76,GC,Makosso,William,-,-
77,GC,Martin,David,-,-
78,GC,Müller Sontag,Corinne,-,-
79,GC,Nicolet-Dit-Felix,Julien,-,-
80,GC,Noël,Philippe,-,-
81,GC,Oriolo,Alessandra,-,-
82,GC,Oztürk,Dilara,-,-
83,GC,Pasquier,Isabelle,Pasquier-Eichenberger,-


In [131]:
# Rename the election-type column to "entity" (index labels are cast to str,
# as before) and append empty "value" / "synonyms" columns
df = df.rename(index=str, columns={"ELECTION.TYPE ELECTION": "entity"})
df = df.assign(value='', synonyms='')
df.head()

Unnamed: 0,entity,CND.NOM,CND.PRENOM,CND.NOM PSEUDO,CND.PRENOM PSEUDO,value,synonyms
0,GC,Pagani,Rémy,-,-,,
1,GC,Wenger,Saliha,-,Salika,,
2,GC,Zaugg,Christian,-,-,,
3,GC,Haller,Jocelyne,-,-,,
4,GC,Eniline,Alexander,-,-,,


In [132]:
# Entity value is "<surname> <first name>"
df['value'] = df['CND.NOM'].str.cat(df['CND.PRENOM'], sep=" ")

In [133]:
# Preview the first five rows with the new "value" column
df.head(5)

Unnamed: 0,entity,CND.NOM,CND.PRENOM,CND.NOM PSEUDO,CND.PRENOM PSEUDO,value,synonyms
0,GC,Pagani,Rémy,-,-,Pagani Rémy,
1,GC,Wenger,Saliha,-,Salika,Wenger Saliha,
2,GC,Zaugg,Christian,-,-,Zaugg Christian,
3,GC,Haller,Jocelyne,-,-,Haller Jocelyne,
4,GC,Eniline,Alexander,-,-,Eniline Alexander,


In [134]:
# add synonym column
# Format: "<first name> <surname>:" followed by every pseudonym spelling that
# exists, each separated by ':'. The trailing ':' left when a candidate has no
# pseudonym is kept on purpose (same intermediate format as before).
# A '-' (or a missing value) in a pseudonym column means "no pseudonym".
has_nom_pseudo = df['CND.NOM PSEUDO'].fillna('-') != '-'
has_prenom_pseudo = df['CND.PRENOM PSEUDO'].fillna('-') != '-'

pseudo_variants = [
    # first name + pseudonym surname
    (df['CND.PRENOM'] + " " + df['CND.NOM PSEUDO']).where(has_nom_pseudo, ''),
    # pseudonym first name + surname
    (df['CND.PRENOM PSEUDO'] + " " + df['CND.NOM']).where(has_prenom_pseudo, ''),
    # pseudonym first name + pseudonym surname
    (df['CND.PRENOM PSEUDO'] + " " + df['CND.NOM PSEUDO']).where(
        has_nom_pseudo & has_prenom_pseudo, ''),
]

# Join only the variants that are present, so that a candidate with BOTH
# pseudonyms gets ':'-separated synonyms instead of names glued together.
pseudo_part = pd.Series(
    [":".join(variant for variant in row if isinstance(variant, str) and variant)
     for row in zip(*pseudo_variants)],
    index=df.index,
    dtype=object,
)

df['synonyms'] = df['CND.PRENOM'] + " " + df['CND.NOM'] + ":" + pseudo_part
df.iloc[55:58]

Unnamed: 0,entity,CND.NOM,CND.PRENOM,CND.NOM PSEUDO,CND.PRENOM PSEUDO,value,synonyms
55,GC,Boujemaa,Sanae,-,-,Boujemaa Sanae,Sanae Boujemaa:
56,GC,Galland,Eduardo,-,-,Galland Eduardo,Eduardo Galland:
57,GC,Jelk,Andrée,Jelk-Peila,-,Jelk Andrée,Andrée Jelk:Andrée Jelk-Peila


In [135]:
# remove last ':' if present
df['synonyms'] = df['synonyms'].str.rstrip('\:')
df.iloc[55:58]

Unnamed: 0,entity,CND.NOM,CND.PRENOM,CND.NOM PSEUDO,CND.PRENOM PSEUDO,value,synonyms
55,GC,Boujemaa,Sanae,-,-,Boujemaa Sanae,Sanae Boujemaa
56,GC,Galland,Eduardo,-,-,Galland Eduardo,Eduardo Galland
57,GC,Jelk,Andrée,Jelk-Peila,-,Jelk Andrée,Andrée Jelk:Andrée Jelk-Peila


In [136]:
# remove dead columns
df.drop(['CND.NOM', 'CND.PRENOM', 'CND.NOM PSEUDO', 'CND.PRENOM PSEUDO'], axis=1, inplace=True)
df.head()

Unnamed: 0,entity,value,synonyms
0,GC,Pagani Rémy,Rémy Pagani
1,GC,Wenger Saliha,Saliha Wenger:Salika Wenger
2,GC,Zaugg Christian,Christian Zaugg
3,GC,Haller Jocelyne,Jocelyne Haller
4,GC,Eniline Alexander,Alexander Eniline


In [137]:
# Show the last row of the frame
df.iloc[-1:]

Unnamed: 0,entity,value,synonyms
653,CE,Amsler Susanne,Susanne Amsler


In [138]:
# Overwrite the election type so that every row uses the same entity name
df = df.assign(entity='Candidats_CE')
df.head()

Unnamed: 0,entity,value,synonyms
0,Candidats_CE,Pagani Rémy,Rémy Pagani
1,Candidats_CE,Wenger Saliha,Saliha Wenger:Salika Wenger
2,Candidats_CE,Zaugg Christian,Christian Zaugg
3,Candidats_CE,Haller Jocelyne,Jocelyne Haller
4,Candidats_CE,Eniline Alexander,Alexander Eniline


In [139]:
# drop duplicate rows
df.drop_duplicates(subset=None, keep='first', inplace=True)
len(df)

627

In [140]:
# Preview the first five rows after de-duplication
df.head(5)

Unnamed: 0,entity,value,synonyms
0,Candidats_CE,Pagani Rémy,Rémy Pagani
1,Candidats_CE,Wenger Saliha,Saliha Wenger:Salika Wenger
2,Candidats_CE,Zaugg Christian,Christian Zaugg
3,Candidats_CE,Haller Jocelyne,Jocelyne Haller
4,Candidats_CE,Eniline Alexander,Alexander Eniline


In [141]:
# add colum for synonyms with no accents - copy synonmys
df['syn-no-accents'] = df['synonyms']
df.head()

Unnamed: 0,entity,value,synonyms,syn-no-accents
0,Candidats_CE,Pagani Rémy,Rémy Pagani,Rémy Pagani
1,Candidats_CE,Wenger Saliha,Saliha Wenger:Salika Wenger,Saliha Wenger:Salika Wenger
2,Candidats_CE,Zaugg Christian,Christian Zaugg,Christian Zaugg
3,Candidats_CE,Haller Jocelyne,Jocelyne Haller,Jocelyne Haller
4,Candidats_CE,Eniline Alexander,Alexander Eniline,Alexander Eniline


In [142]:
# remove accents and spec characters
df['syn-no-accents'].replace(regex=True,inplace=True,to_replace='à',value='a')
df['syn-no-accents'].replace(regex=True,inplace=True,to_replace='â',value='a')
df['syn-no-accents'].replace(regex=True,inplace=True,to_replace='ä',value='a')
df['syn-no-accents'].replace(regex=True,inplace=True,to_replace='é',value='e')
df['syn-no-accents'].replace(regex=True,inplace=True,to_replace='è',value='e')
df['syn-no-accents'].replace(regex=True,inplace=True,to_replace='ê',value='e')
df['syn-no-accents'].replace(regex=True,inplace=True,to_replace='ë',value='e')
df['syn-no-accents'].replace(regex=True,inplace=True,to_replace='ì',value='i')
df['syn-no-accents'].replace(regex=True,inplace=True,to_replace='î',value='i')
df['syn-no-accents'].replace(regex=True,inplace=True,to_replace='ï',value='i')
df['syn-no-accents'].replace(regex=True,inplace=True,to_replace='í',value='i')
df['syn-no-accents'].replace(regex=True,inplace=True,to_replace='ò',value='o')
df['syn-no-accents'].replace(regex=True,inplace=True,to_replace='ô',value='o')
df['syn-no-accents'].replace(regex=True,inplace=True,to_replace='ö',value='o')
df['syn-no-accents'].replace(regex=True,inplace=True,to_replace='ù',value='u')
df['syn-no-accents'].replace(regex=True,inplace=True,to_replace='û',value='u')
df['syn-no-accents'].replace(regex=True,inplace=True,to_replace='ü',value='u')
df['syn-no-accents'].replace(regex=True,inplace=True,to_replace='ç',value='c')
df['syn-no-accents'].replace(regex=True,inplace=True,to_replace="-",value=' ')
df['syn-no-accents'].replace(regex=True,inplace=True,to_replace="'",value=' ')

In [143]:
# Preview the first five rows with the accent-free synonyms
df.head(5)

Unnamed: 0,entity,value,synonyms,syn-no-accents
0,Candidats_CE,Pagani Rémy,Rémy Pagani,Remy Pagani
1,Candidats_CE,Wenger Saliha,Saliha Wenger:Salika Wenger,Saliha Wenger:Salika Wenger
2,Candidats_CE,Zaugg Christian,Christian Zaugg,Christian Zaugg
3,Candidats_CE,Haller Jocelyne,Jocelyne Haller,Jocelyne Haller
4,Candidats_CE,Eniline Alexander,Alexander Eniline,Alexander Eniline


In [144]:
# merge with and without accents columns
df['synonyms'] = df['synonyms'] + ":" + df['syn-no-accents']

In [145]:
# Preview the first five rows after merging both synonym spellings
df.head(5)

Unnamed: 0,entity,value,synonyms,syn-no-accents
0,Candidats_CE,Pagani Rémy,Rémy Pagani:Remy Pagani,Remy Pagani
1,Candidats_CE,Wenger Saliha,Saliha Wenger:Salika Wenger:Saliha Wenger:Sali...,Saliha Wenger:Salika Wenger
2,Candidats_CE,Zaugg Christian,Christian Zaugg:Christian Zaugg,Christian Zaugg
3,Candidats_CE,Haller Jocelyne,Jocelyne Haller:Jocelyne Haller,Jocelyne Haller
4,Candidats_CE,Eniline Alexander,Alexander Eniline:Alexander Eniline,Alexander Eniline


In [146]:
# drop dead columns
df.drop(['syn-no-accents'], axis=1, inplace=True)
df.head()

Unnamed: 0,entity,value,synonyms
0,Candidats_CE,Pagani Rémy,Rémy Pagani:Remy Pagani
1,Candidats_CE,Wenger Saliha,Saliha Wenger:Salika Wenger:Saliha Wenger:Sali...
2,Candidats_CE,Zaugg Christian,Christian Zaugg:Christian Zaugg
3,Candidats_CE,Haller Jocelyne,Jocelyne Haller:Jocelyne Haller
4,Candidats_CE,Eniline Alexander,Alexander Eniline:Alexander Eniline


In [147]:
# remove Pierre Maudet
df[df.value != 'Maudet Pierre']
df.iloc[195:200]

Unnamed: 0,entity,value,synonyms
195,Candidats_CE,Burkhard Roland,Roland Burkhard:Roland Burkhard
196,Candidats_CE,Bosshard Pierre Yves,Pierre Yves Bosshard:Pierre Yves Bosshard
197,Candidats_CE,Kurteshi Ismet,Ismet Kurteshi:Ismet Kurteshi
198,Candidats_CE,Maudet Pierre,Pierre Maudet:Pierre Maudet
199,Candidats_CE,Fontanet Nathalie,Nathalie Fontanet:Nathalie Fontanet


In [148]:
# keep only selected candidates for 2nd round
df = df.loc[df['value'].isin(['Aymon Paul','Fontanet Nathalie'
                         ,'Cretegny Willy','Apothéloz Thierry'
                         ,'Poggia Mauro','Dal Busco Serge'
                         ,'Hodgers Antonio','Haller Jocelyne'
                         ,'Nidegger Yves','Barthassat Luc'
                         ,'Emery-Torracinta Anne'])]
df

Unnamed: 0,entity,value,synonyms
3,Candidats_CE,Haller Jocelyne,Jocelyne Haller:Jocelyne Haller
68,Candidats_CE,Hodgers Antonio,Antonio Hodgers:Antonio Hodgers
144,Candidats_CE,Emery-Torracinta Anne,Anne Emery-Torracinta:Anne Emery Torracinta
145,Candidats_CE,Apothéloz Thierry,Thierry Apothéloz:Thierry Apotheloz
199,Candidats_CE,Fontanet Nathalie,Nathalie Fontanet:Nathalie Fontanet
278,Candidats_CE,Barthassat Luc,Luc Barthassat:Luc Barthassat
279,Candidats_CE,Dal Busco Serge,Serge Dal Busco:Serge Dal Busco
354,Candidats_CE,Nidegger Yves,Yves Nidegger:Yves Nidegger
492,Candidats_CE,Poggia Mauro,Mauro Poggia:Mauro Poggia
641,Candidats_CE,Aymon Paul,Paul Aymon:dit Paul Sierre Aymon:Paul Aymon:di...


In [149]:
# Renumber the remaining rows from 0, discarding the old index labels
df = df.reset_index(drop=True)
df

Unnamed: 0,entity,value,synonyms
0,Candidats_CE,Haller Jocelyne,Jocelyne Haller:Jocelyne Haller
1,Candidats_CE,Hodgers Antonio,Antonio Hodgers:Antonio Hodgers
2,Candidats_CE,Emery-Torracinta Anne,Anne Emery-Torracinta:Anne Emery Torracinta
3,Candidats_CE,Apothéloz Thierry,Thierry Apothéloz:Thierry Apotheloz
4,Candidats_CE,Fontanet Nathalie,Nathalie Fontanet:Nathalie Fontanet
5,Candidats_CE,Barthassat Luc,Luc Barthassat:Luc Barthassat
6,Candidats_CE,Dal Busco Serge,Serge Dal Busco:Serge Dal Busco
7,Candidats_CE,Nidegger Yves,Yves Nidegger:Yves Nidegger
8,Candidats_CE,Poggia Mauro,Mauro Poggia:Mauro Poggia
9,Candidats_CE,Aymon Paul,Paul Aymon:dit Paul Sierre Aymon:Paul Aymon:di...


In [150]:
# Export the entity table (synonyms with and without accents) to CSV,
# leaving out the index column
df.to_csv(
    '../electionBot_sBox/entities/candidates-Entities_v12.csv',
    index=False,
)