# Normalización de nombres de pila con lógica difusa

- Sobre [lógica difusa](https://es.wikipedia.org/wiki/Lógica_difusa)
- Sobre [distancia Levenshtein](https://es.wikipedia.org/wiki/Distancia_de_Levenshtein)
- Sobre la [Ley de Zipf](https://es.wikipedia.org/wiki/Ley_de_Zipf)
- Listado de [nombres de pila](https://www.ine.es/dyngs/INEbase/es/operacion.htm?c=Estadistica_C&cid=1254736177009&menu=resultados&idp=1254734710990#!tabs-1254736195454)

In [15]:
import re
import time
import unicodedata

import pandas as pd
from fuzzywuzzy import process

## Obtención de datos

In [16]:
# Load the tab-separated women's listing and preview its first five rows.
WOMEN = pd.read_csv('./data/women.txt', sep='\t')
WOMEN.head()

Unnamed: 0,Orden,Nombre,Frecuencia,Edad Media (*)
0,1,MARIA CARMEN,656276,570
1,2,MARIA,606048,486
2,3,CARMEN,391563,604
3,4,JOSEFA,276682,680
4,5,ANA MARIA,273319,512


In [17]:
# Load the tab-separated men's listing and preview its first ten rows.
MEN = pd.read_csv('./data/men.txt', sep='\t')
MEN.head(n=10)

Unnamed: 0,Orden,Nombre,Frecuencia,Edad Media (*)
0,1,ANTONIO,678425,559
1,2,JOSE,594144,611
2,3,MANUEL,590965,549
3,4,FRANCISCO,498934,574
4,5,DAVID,365196,305
5,6,JUAN,346867,557
6,7,JOSE ANTONIO,310134,496
7,8,JAVIER,306504,330
8,9,DANIEL,296281,270
9,10,JOSE LUIS,293564,533


## Normalización

In [18]:
# Fuzzy-match an abbreviated name against the women's names, keeping the
# three best-scoring candidates.
name = 'Mª JOSE'
suggestions = process.extract(name, WOMEN['Nombre'].array, limit=3)
suggestions

[('MIRIAM JOSEFINA', 90), ('MIRIAM JOSE', 90), ('MARIA JOSE', 86)]

## Heurística

In [19]:
# Keep only the candidate name from every (name, score) suggestion.
suggestion_names = [candidate for candidate, _score in suggestions]
suggestion_names

['MIRIAM JOSEFINA', 'MIRIAM JOSE', 'MARIA JOSE']

In [20]:
# Rows of the women's listing whose name is one of the suggestions,
# in the listing's own row order.
suggestion_rows = WOMEN.loc[WOMEN['Nombre'].isin(suggestion_names)]
suggestion_rows

Unnamed: 0,Orden,Nombre,Frecuencia,Edad Media (*)
17,18,MARIA JOSE,203283,461
13205,13206,MIRIAM JOSEFINA,42,540
16750,16751,MIRIAM JOSE,32,417


In [22]:
# Suggested names -> their rows in the listing -> name held by the first row.
suggestion_names = [candidate for candidate, _score in suggestions]
suggestion_rows = WOMEN.loc[WOMEN['Nombre'].isin(suggestion_names)]
suggestion_rows['Nombre'].array[0]

'MARIA JOSE'

In [24]:
def normalize(name, candidates_df, limit=3):
    """Return the candidate given name that best matches ``name``.

    The ``limit`` closest fuzzy matches are looked up in
    ``candidates_df.Nombre``; among them, the most frequent one wins.

    Args:
        name: Raw given name to normalize.
        candidates_df: DataFrame with a ``Nombre`` column of candidate names
            and, optionally, a ``Frecuencia`` column with their frequency.
            Without ``Frecuencia`` the first matching row in frame order is
            returned.
        limit: Number of fuzzy matches that take part in the frequency vote.

    Returns:
        The chosen candidate name.

    Raises:
        IndexError: If no candidate row is selected (e.g. an empty frame).
    """
    candidates = candidates_df.Nombre.array
    suggestions = process.extract(name, candidates, limit=limit)
    suggestion_names = [suggestion[0] for suggestion in suggestions]
    suggestion_rows = candidates_df[candidates_df.Nombre.isin(suggestion_names)]
    if 'Frecuencia' in suggestion_rows.columns:
        # Rank explicitly instead of trusting the frame to arrive pre-sorted
        # by frequency. mergesort is stable, so ties keep their row order.
        suggestion_rows = suggestion_rows.sort_values(
            'Frecuencia', ascending=False, kind='mergesort'
        )
    best_suggestion = suggestion_rows.Nombre.iloc[0]
    return best_suggestion

In [27]:
name = 'Mª JOSE'
normalize(name, WOMEN, limit=3)

'MARIA JOSE'

## Optimización

In [25]:
WOMEN.shape

(25658, 4)

In [26]:
name = 'JOSE ANTONO'
# perf_counter() is the high-resolution monotonic clock intended for timing
# short intervals; time.time() follows the adjustable wall clock.
start = time.perf_counter()
suggestions = process.extract(name, MEN.Nombre.array, limit=3)
end = time.perf_counter()
end - start

0.9954941272735596

In [27]:
# Drop the rare names (frequency of 99 or less) from both listings.
WOMEN = WOMEN.loc[WOMEN['Frecuencia'] > 99]
MEN = MEN.loc[MEN['Frecuencia'] > 99]

In [28]:
name = 'JOSE ANTONO'
# perf_counter() is the high-resolution monotonic clock intended for timing
# short intervals; time.time() follows the adjustable wall clock.
start = time.perf_counter()
suggestions = process.extract(name, MEN.Nombre.array, limit=3)
end = time.perf_counter()
end - start

0.2753720283508301

In [29]:
MEN.shape

(6795, 4)

In [30]:
MEN.tail(10)

Unnamed: 0,Orden,Nombre,Frecuencia,Edad Media (*)
6785,6786,SADDIK,100,437
6786,6787,SANTIAGO AGUSTIN,100,351
6787,6788,SANTIAGO JOAQUIN,100,431
6788,6789,SIDI AHMED,100,429
6789,6790,SOHAYB,100,76
6790,6791,TALAL,100,307
6791,6792,VICENT JOSEP,100,453
6792,6793,WILLIAM ALBERTO,100,398
6793,6794,YADEL,100,46
6794,6795,ZHIWEI,100,290


## Adaptación del input

In [31]:
# Accented input handed to the matcher exactly as typed.
name = 'María'
suggestions = process.extract(name, WOMEN['Nombre'].array, limit=3)
print(suggestions)

[('MARA', 100), ('TAMARA', 90), ('SAMARA', 90)]


In [32]:
# First ten rows whose name contains 'MARIA'; missing names count as no match.
WOMEN.loc[WOMEN['Nombre'].str.contains('MARIA', na=False)].head(n=10)

Unnamed: 0,Orden,Nombre,Frecuencia,Edad Media (*)
0,1,MARIA CARMEN,656276,570
1,2,MARIA,606048,486
4,5,ANA MARIA,273319,512
6,7,MARIA PILAR,263141,570
7,8,MARIA DOLORES,259216,566
9,10,MARIA TERESA,251492,571
12,13,MARIA ANGELES,226047,554
16,17,MARIA ISABEL,204354,528
17,18,MARIA JOSE,203283,461
23,24,MARIA LUISA,160565,609


In [36]:
def adapt(name):
    """Prepare a raw given name for fuzzy matching.

    The name is lower-cased, stripped of diacritics, and freed of the
    connectors "de", "del", "de la(s)" and "de lo(s)" when they sit between
    two words.

    Args:
        name: Given name as typed, e.g. ``'María del Carmen'``.

    Returns:
        The adapted name, e.g. ``'maria carmen'``.
    """
    name = name.lower()
    # Canonical decomposition splits each accented letter into its base
    # letter plus combining marks. Dropping the marks (category "Mn") covers
    # á/é/í/ó/ú/ñ as well as à, ü, ç and input that arrives pre-decomposed.
    decomposed = unicodedata.normalize('NFD', name)
    name = ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')
    # Recompose so characters without stripped marks keep their usual form.
    name = unicodedata.normalize('NFC', name)
    # Raw string: '\s' inside a plain literal is an invalid escape sequence.
    name = re.sub(r'\s+del?(\s+l[oa]s?)?\s+', ' ', name)
    return name

In [37]:
adapt('María')

'maria'

In [2]:
def normalize(name, candidates_df, limit=3):
    """Return the candidate given name that best matches ``name``.

    ``name`` is first adapted (lower-cased, diacritics and "de/del/de la"
    connectors removed). A perfect fuzzy score is returned straight away;
    otherwise the most frequent of the ``limit`` closest matches wins.

    Args:
        name: Raw given name to normalize.
        candidates_df: DataFrame with a ``Nombre`` column of candidate names
            and, optionally, a ``Frecuencia`` column with their frequency.
            Without ``Frecuencia`` the first matching row in frame order is
            returned.
        limit: Number of fuzzy matches that are considered.

    Returns:
        The chosen candidate name.

    Raises:
        IndexError: If the matcher produces no suggestion (e.g. an empty
            frame).
    """

    def adapt(name):
        """Lower-case ``name``, strip diacritics and drop "de/del/de la(s)/de lo(s)"."""
        name = name.lower()
        # Decompose, drop combining marks (category "Mn"), recompose: this
        # handles any accented letter, not just a fixed table of six.
        decomposed = unicodedata.normalize('NFD', name)
        name = ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')
        name = unicodedata.normalize('NFC', name)
        # Raw string: '\s' inside a plain literal is an invalid escape sequence.
        name = re.sub(r'\s+del?(\s+l[oa]s?)?\s+', ' ', name)
        return name

    # Honour the caller's ``limit`` instead of a hard-coded 3.
    suggestions = process.extract(adapt(name), candidates_df.Nombre.array, limit=limit)
    # A perfect score needs no tie-breaking by frequency.
    if suggestions[0][1] == 100:
        return suggestions[0][0]
    suggestion_names = [suggestion[0] for suggestion in suggestions]
    suggestion_rows = candidates_df[candidates_df.Nombre.isin(suggestion_names)]
    if 'Frecuencia' in suggestion_rows.columns:
        # Rank explicitly instead of trusting the frame to arrive pre-sorted
        # by frequency. mergesort is stable, so ties keep their row order.
        suggestion_rows = suggestion_rows.sort_values(
            'Frecuencia', ascending=False, kind='mergesort'
        )
    best_suggestion = suggestion_rows.Nombre.iloc[0]

    return best_suggestion
