## Create new Candidate entity file (single file - not three groups) with simpler synonyms

**Nota Bene:**
in the Chatbot objectives the `Solution to handle spelling mistakes of candidates in chatbot users' questions` was defined as a `Should have`. Given the issues in the chatbot caused by similar synonyms, this file simplifies them to see whether we can fix the multiple entity extraction with a simpler solution.


In [1]:
# import libraries
import pandas as pd
import numpy as np

In [2]:
# import files
df = pd.read_csv('./DATA_format_JSON/candidateList_Final.csv', delimiter=";")
df.head()

Unnamed: 0,ELECTION.TYPE ELECTION,LEL.NUM DEPOT,LEL.SIGLE,LEL.LIBELLE COURT,LEL.LIBELLE LONG,CND.CIVILITE,CND.NOM,CND.PRENOM,CND.NOM PSEUDO,CND.PRENOM PSEUDO
0,GC,1,EAG,ENSEMBLE À GAUCHE,ENSEMBLE À GAUCHE,Monsieur,PAGANI,Rémy,-,-
1,GC,1,EAG,ENSEMBLE À GAUCHE,ENSEMBLE À GAUCHE,Madame,WENGER,Saliha,-,Salika
2,GC,1,EAG,ENSEMBLE À GAUCHE,ENSEMBLE À GAUCHE,Monsieur,ZAUGG,Christian,-,-
3,GC,1,EAG,ENSEMBLE À GAUCHE,ENSEMBLE À GAUCHE,Madame,HALLER,Jocelyne,-,-
4,GC,1,EAG,ENSEMBLE À GAUCHE,ENSEMBLE À GAUCHE,Monsieur,ENILINE,Alexander,-,-


In [3]:
# remove dead columns
df.drop(['LEL.NUM DEPOT', 'LEL.SIGLE', 'LEL.LIBELLE COURT', 'LEL.LIBELLE LONG', 'CND.CIVILITE'], axis=1, inplace=True)
df.head()

Unnamed: 0,ELECTION.TYPE ELECTION,CND.NOM,CND.PRENOM,CND.NOM PSEUDO,CND.PRENOM PSEUDO
0,GC,PAGANI,Rémy,-,-
1,GC,WENGER,Saliha,-,Salika
2,GC,ZAUGG,Christian,-,-
3,GC,HALLER,Jocelyne,-,-
4,GC,ENILINE,Alexander,-,-


In [4]:
# to title case
df['CND.NOM'] = df['CND.NOM'].str.title()
df['CND.NOM PSEUDO'] = df['CND.NOM PSEUDO'].str.title()
df.loc[74:85]

Unnamed: 0,ELECTION.TYPE ELECTION,CND.NOM,CND.PRENOM,CND.NOM PSEUDO,CND.PRENOM PSEUDO
74,GC,Leonelli,Katia,-,-
75,GC,Macchiavelli,Marta,-,Marta Julia
76,GC,Makosso,William,-,-
77,GC,Martin,David,-,-
78,GC,Müller Sontag,Corinne,-,-
79,GC,Nicolet-Dit-Felix,Julien,-,-
80,GC,Noël,Philippe,-,-
81,GC,Oriolo,Alessandra,-,-
82,GC,Oztürk,Dilara,-,-
83,GC,Pasquier,Isabelle,Pasquier-Eichenberger,-


In [5]:
# rename colum and add empty columns
df.rename(index=str, columns={"ELECTION.TYPE ELECTION": "entity"}, inplace=True)
df['value'] = ''
df['synonyms'] = ''
df.head()

Unnamed: 0,entity,CND.NOM,CND.PRENOM,CND.NOM PSEUDO,CND.PRENOM PSEUDO,value,synonyms
0,GC,Pagani,Rémy,-,-,,
1,GC,Wenger,Saliha,-,Salika,,
2,GC,Zaugg,Christian,-,-,,
3,GC,Haller,Jocelyne,-,-,,
4,GC,Eniline,Alexander,-,-,,


In [6]:
# add value column
df['value'] = df['CND.NOM'] + " " + df['CND.PRENOM']

In [7]:
df.head()

Unnamed: 0,entity,CND.NOM,CND.PRENOM,CND.NOM PSEUDO,CND.PRENOM PSEUDO,value,synonyms
0,GC,Pagani,Rémy,-,-,Pagani Rémy,
1,GC,Wenger,Saliha,-,Salika,Wenger Saliha,
2,GC,Zaugg,Christian,-,-,Zaugg Christian,
3,GC,Haller,Jocelyne,-,-,Haller Jocelyne,
4,GC,Eniline,Alexander,-,-,Eniline Alexander,


In [8]:
# add synonym column
df['synonyms'] = (df['CND.PRENOM'] + " " + df['CND.NOM'] + ":" 
                  + (np.where(df['CND.NOM PSEUDO'] != '-'
                              , df['CND.PRENOM'] + " " + df['CND.NOM PSEUDO'], '')) 
                  + (np.where(df['CND.PRENOM PSEUDO'] != '-'
                              , df['CND.PRENOM PSEUDO'] + " " + df['CND.NOM'], '')) 
                  + (np.where((df['CND.NOM PSEUDO'] != '-') & (df['CND.PRENOM PSEUDO'] != '-')
                              , df['CND.PRENOM PSEUDO'] + " " + df['CND.NOM PSEUDO'], ''))) 
df.iloc[55:58]

Unnamed: 0,entity,CND.NOM,CND.PRENOM,CND.NOM PSEUDO,CND.PRENOM PSEUDO,value,synonyms
55,GC,Boujemaa,Sanae,-,-,Boujemaa Sanae,Sanae Boujemaa:
56,GC,Galland,Eduardo,-,-,Galland Eduardo,Eduardo Galland:
57,GC,Jelk,Andrée,Jelk-Peila,-,Jelk Andrée,Andrée Jelk:Andrée Jelk-Peila


In [9]:
# remove last ':' if present
df['synonyms'] = df['synonyms'].str.rstrip('\:')
df.iloc[55:58]

Unnamed: 0,entity,CND.NOM,CND.PRENOM,CND.NOM PSEUDO,CND.PRENOM PSEUDO,value,synonyms
55,GC,Boujemaa,Sanae,-,-,Boujemaa Sanae,Sanae Boujemaa
56,GC,Galland,Eduardo,-,-,Galland Eduardo,Eduardo Galland
57,GC,Jelk,Andrée,Jelk-Peila,-,Jelk Andrée,Andrée Jelk:Andrée Jelk-Peila


In [10]:
# remove dead columns
df.drop(['CND.NOM', 'CND.PRENOM', 'CND.NOM PSEUDO', 'CND.PRENOM PSEUDO'], axis=1, inplace=True)
df.head()

Unnamed: 0,entity,value,synonyms
0,GC,Pagani Rémy,Rémy Pagani
1,GC,Wenger Saliha,Saliha Wenger:Salika Wenger
2,GC,Zaugg Christian,Christian Zaugg
3,GC,Haller Jocelyne,Jocelyne Haller
4,GC,Eniline Alexander,Alexander Eniline


In [11]:
df.tail(1)

Unnamed: 0,entity,value,synonyms
653,CE,Amsler Susanne,Susanne Amsler


In [12]:
df['entity'] = 'Candidat'
df.head()

Unnamed: 0,entity,value,synonyms
0,Candidat,Pagani Rémy,Rémy Pagani
1,Candidat,Wenger Saliha,Saliha Wenger:Salika Wenger
2,Candidat,Zaugg Christian,Christian Zaugg
3,Candidat,Haller Jocelyne,Jocelyne Haller
4,Candidat,Eniline Alexander,Alexander Eniline


In [13]:
# drop duplicate rows
df.drop_duplicates(subset=None, keep='first', inplace=True)
len(df)

627

In [15]:
df.head()

Unnamed: 0,entity,value,synonyms
0,Candidat,Pagani Rémy,Rémy Pagani
1,Candidat,Wenger Saliha,Saliha Wenger:Salika Wenger
2,Candidat,Zaugg Christian,Christian Zaugg
3,Candidat,Haller Jocelyne,Jocelyne Haller
4,Candidat,Eniline Alexander,Alexander Eniline


In [16]:
# export csv files with accents
df.to_csv('./electionBot_sBox/GBarthelet_chatbot/entities/candidates-Entities_v10_wAccents.csv',index=False)

In [17]:
# add colum for synonyms with no accents - copy synonmys
df['syn-no-accents'] = df['synonyms']
df.head()

Unnamed: 0,entity,value,synonyms,syn-no-accents
0,Candidat,Pagani Rémy,Rémy Pagani,Rémy Pagani
1,Candidat,Wenger Saliha,Saliha Wenger:Salika Wenger,Saliha Wenger:Salika Wenger
2,Candidat,Zaugg Christian,Christian Zaugg,Christian Zaugg
3,Candidat,Haller Jocelyne,Jocelyne Haller,Jocelyne Haller
4,Candidat,Eniline Alexander,Alexander Eniline,Alexander Eniline


In [18]:
# remove accents and spec characters
df['syn-no-accents'].replace(regex=True,inplace=True,to_replace='à',value='a')
df['syn-no-accents'].replace(regex=True,inplace=True,to_replace='â',value='a')
df['syn-no-accents'].replace(regex=True,inplace=True,to_replace='ä',value='a')
df['syn-no-accents'].replace(regex=True,inplace=True,to_replace='é',value='e')
df['syn-no-accents'].replace(regex=True,inplace=True,to_replace='è',value='e')
df['syn-no-accents'].replace(regex=True,inplace=True,to_replace='ê',value='e')
df['syn-no-accents'].replace(regex=True,inplace=True,to_replace='ë',value='e')
df['syn-no-accents'].replace(regex=True,inplace=True,to_replace='ì',value='i')
df['syn-no-accents'].replace(regex=True,inplace=True,to_replace='î',value='i')
df['syn-no-accents'].replace(regex=True,inplace=True,to_replace='ï',value='i')
df['syn-no-accents'].replace(regex=True,inplace=True,to_replace='ò',value='o')
df['syn-no-accents'].replace(regex=True,inplace=True,to_replace='ô',value='o')
df['syn-no-accents'].replace(regex=True,inplace=True,to_replace='ö',value='o')
df['syn-no-accents'].replace(regex=True,inplace=True,to_replace='ù',value='u')
df['syn-no-accents'].replace(regex=True,inplace=True,to_replace='û',value='u')
df['syn-no-accents'].replace(regex=True,inplace=True,to_replace='ü',value='u')
df['syn-no-accents'].replace(regex=True,inplace=True,to_replace="-",value=' ')
df['syn-no-accents'].replace(regex=True,inplace=True,to_replace="'",value=' ')

In [19]:
df.head()

Unnamed: 0,entity,value,synonyms,syn-no-accents
0,Candidat,Pagani Rémy,Rémy Pagani,Remy Pagani
1,Candidat,Wenger Saliha,Saliha Wenger:Salika Wenger,Saliha Wenger:Salika Wenger
2,Candidat,Zaugg Christian,Christian Zaugg,Christian Zaugg
3,Candidat,Haller Jocelyne,Jocelyne Haller,Jocelyne Haller
4,Candidat,Eniline Alexander,Alexander Eniline,Alexander Eniline


In [20]:
# merge with and without accents columns
df['synonyms'] = df['synonyms'] + ":" + df['syn-no-accents']

In [21]:
df.head()

Unnamed: 0,entity,value,synonyms,syn-no-accents
0,Candidat,Pagani Rémy,Rémy Pagani:Remy Pagani,Remy Pagani
1,Candidat,Wenger Saliha,Saliha Wenger:Salika Wenger:Saliha Wenger:Sali...,Saliha Wenger:Salika Wenger
2,Candidat,Zaugg Christian,Christian Zaugg:Christian Zaugg,Christian Zaugg
3,Candidat,Haller Jocelyne,Jocelyne Haller:Jocelyne Haller,Jocelyne Haller
4,Candidat,Eniline Alexander,Alexander Eniline:Alexander Eniline,Alexander Eniline


In [22]:
# drop dead columns
df.drop(['syn-no-accents'], axis=1, inplace=True)
df.head()

Unnamed: 0,entity,value,synonyms
0,Candidat,Pagani Rémy,Rémy Pagani:Remy Pagani
1,Candidat,Wenger Saliha,Saliha Wenger:Salika Wenger:Saliha Wenger:Sali...
2,Candidat,Zaugg Christian,Christian Zaugg:Christian Zaugg
3,Candidat,Haller Jocelyne,Jocelyne Haller:Jocelyne Haller
4,Candidat,Eniline Alexander,Alexander Eniline:Alexander Eniline


In [40]:
# remove duplicate in string
df['synonyms'][1].

['Saliha', 'Wenger:Salika', 'Wenger:Saliha', 'Wenger:Salika', 'Wenger']

In [None]:
# remove last ":"


In [None]:
# export csv files with and without accents
df.to_csv('./electionBot_sBox/entities/candidates-Entities_v11.csv',index=False)