In [1]:
from bs4 import BeautifulSoup
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import requests
import re
import pandas as pd
import csv
import difflib
pd.options.display.max_colwidth = 1000

### csv con los datos scrappeados de https://codigo-postal.co/argentina/

In [2]:
df = pd.read_csv('df-backup2.csv')
df = df.drop('index', axis=1)
df = df.drop('Unnamed: 0', axis=1)
df.shape

(2523464, 8)

#### limpio los datos

In [3]:
def removeURL(x, num):
    prov = x.split('/')
    if (len(prov)==1):
        return prov[0]
    return prov[num]

df['provincia'] = df['provincia'].apply(lambda x: removeURL(x, 4))
df['localidad'] = df['localidad'].apply(lambda x: removeURL(x, 5))
df['localidad'] = df['localidad'].str.replace('-', ' ')
df['localidad'] = df['localidad'].str.lower()
df['calle'] = df['calle'].str.lower()

df.tail(1)

Unnamed: 0,calle,desde,hasta,par,cp,cpa,provincia,localidad
2523463,calle zuloaga,902,1000,True,1824,B1824OHH,buenos aires,lanus


### csv con users de Lemon

In [4]:
df_lemon = pd.read_csv('lemoncash_ar_muestra.csv')
df_lemon.columns = ['calle', 'num', 'localidad', 'prov', 'cp']
df_lemon = df_lemon[df_lemon['calle'].notnull()]
df_lemon['calle'] = df_lemon['calle'].str.lower()
df_lemon['num'] = df_lemon['num'].fillna(0).astype(int)
df_lemon['par'] = (df_lemon['num'] % 2) == 0
df_lemon['cp'] = pd.to_numeric(df_lemon['cp'])
df_lemon.shape

(4152, 6)

#### normalizo todo para sacar tildes y ñ

In [5]:
def normalize(s):
    replacements = (
        ("á", "a"),
        ("é", "e"),
        ("í", "i"),
        ("ó", "o"),
        ("ú", "u"),
        ("ñ", "n"),
        ("ü", "u")
    )
    for a, b in replacements:
        s = s.replace(a, b).replace(a.upper(), b.upper())
    return s

df_lemon['calle'] = df_lemon['calle'].apply(lambda x: normalize(str(x)))
df['provincia'] = df['provincia'].apply(lambda x: normalize(str(x)))
df['localidad'] = df['localidad'].apply(lambda x: normalize(str(x)))
df['calle'] = df['calle'].apply(lambda x: normalize(str(x)))

#### Busco la letra de la prov (primer caracter del CPA)

In [6]:
def province_letter(y):
    x = y['prov']
    if x == 'Salta':
        return 'A'
    elif x == 'Provincia de Buenos Aires':
        return 'B'
    elif x == 'Buenos Aires':
        return 'C'
    elif x == 'San Luis':
        return 'D'
    elif x == 'Entre Ríos':
        return 'E'
    elif x == 'La Rioja':
        return 'F'
    elif x == 'Santiago del Estero':
        return 'G'
    elif x == 'Chaco':
        return 'H'
    elif x == 'San Juan':
        return 'J'
    elif x == 'Catamarca':
        return 'K'
    elif x == 'La Pampa':
        return 'L'
    elif x == 'Mendoza':
        return 'M'
    elif x == 'Misiones':
        return 'N'
    elif x == 'Formosa':
        return 'P'
    elif x == 'Neuquén':
        return 'Q'
    elif x == 'Río Negro':
        return 'R'
    elif x == 'Santa Fe':
        return 'S'
    elif x == 'Tucumán':
        return 'T'
    elif x == 'Chubut':
        return 'U'
    elif x == 'Tierra del Fuego':
        return 'V'
    elif x == 'Corrientes':
        return 'W'
    elif x == 'Córdoba':
        return 'X'
    elif x == 'Jujuy':
        return 'Y'
    elif x == 'Santa Cruz':
        return 'Z'
    return ''

In [7]:
df_lemon['letra'] = df_lemon.apply(lambda x: province_letter(x), axis=1)

In [8]:
## filtro los que no tengo una prov conocida (brasil)
df_lemon = df_lemon[df_lemon['letra'] != '']
df_lemon.shape

(4142, 7)

In [9]:
df['alt-cp'] = df['cpa'].str[1:5]
df['alt-cp'] = pd.to_numeric(df['alt-cp'])

### Funciones para buscar la mejor coincidencia de nombre de calle


In [10]:
### Busca la calle más parecida dentro del subconjunto con el mismo CP
def closest_match_difflib(x):
    try:
        df_new = df.loc[df['cp'] == x['cp']]
        bestmatch = difflib.get_close_matches(str(x['calle']), df_new['calle'], 1, 0.4)
        if len(bestmatch)>0:
            return bestmatch[0]
        return ''
    except:
        return ''

In [11]:
## Idem anterior pero con otro algoritmo
def closest_match_fuzzywuzzy(x):
    try:
        df_new = df.loc[x['letra']==df['cpa'].str[0:1]]
        df_new = df.loc[df['cp'] == x['cp']]
        bestmatch = process.extract(str(x['calle']), df_new['calle'], scorer=fuzz.token_sort_ratio, limit=1)
        if len(bestmatch)>0 and bestmatch[0][1]>60:
            return bestmatch[0][0]
        return ''
    except:
        return ''

## Idem anterior pero con otro algoritmo
def closest_match_fuzzywuzzy2(x):
    try:
        df_new = df.loc[x['letra']==df['cpa'].str[0:1]]
        df_new = df_new.loc[df_new['cp'] == x['cp']]
        bestmatch = process.extract(str(x['calle']), df_new['calle'], scorer=fuzz.token_set_ratio, limit=1)
        if len(bestmatch)>0 and bestmatch[0][1]>60:
            return bestmatch[0][0]
        return ''
    except:
        return ''


In [12]:
#df_lemon['bestmatch_difflib'] = df_lemon.apply(lambda x: closest_match_difflib(x), axis=1)
#df_lemon['bestmatch_fuzzywuzzy'] = df_lemon.apply(lambda x: closest_match_fuzzywuzzy(x), axis=1)
df_lemon['bestmatch_fuzzywuzzy2'] = df_lemon.apply(lambda x: closest_match_fuzzywuzzy2(x), axis=1)
df_lemon.head(5)



Unnamed: 0,calle,num,localidad,prov,cp,par,letra,bestmatch_fuzzywuzzy2
0,barrio luis vernet,82,Rawson,Chubut,9103.0,True,U,
1,86 pelagio b. luna,2640,San Andres,Provincia de Buenos Aires,1651.0,True,B,
2,lavalle,838,San Nicolás de Los Arroyos,Provincia de Buenos Aires,2900.0,True,B,calle lavalle
3,italia este,15,Corral de Bustos,Córdoba,2645.0,False,X,avenida italia
4,jose contin,1136,Puerto Madryn,Chubut,1136.0,True,U,


In [13]:
## a cuantos le encontré match
df_lemon[df_lemon['bestmatch_fuzzywuzzy2']!=''].shape

(2884, 8)

### Funcion para buscar el CPA

In [14]:
def find_CPA(x):
    if x['bestmatch_fuzzywuzzy2'] == '':
        return ''
    
    ## Busco dentro del mismo cp
    df_new = df[df['cp'] == x['cp']]
    df_new = df_new[df_new['calle'] == x['bestmatch_fuzzywuzzy2']]
    df_new = df_new[(df_new["desde"]<=x['num']) & (df_new["hasta"]>=x['num'])]
    df_new = df_new[df_new['par']==x['par']]
    if len(df_new) != 0:
        return df_new['cpa'].tolist()
    
    ## Busco dentro del mismo cpa
    df_new = df[df['cpa'].str[0:1] == x['cp']]
    df_new = df_new[df_new['calle'] == x['bestmatch_fuzzywuzzy2']]
    df_new = df_new[(df_new["desde"]<=x['num']) & (df_new["hasta"]>=x['num'])]
    df_new = df_new[df_new['par']==x['par']]
    if len(df_new) != 0:
        return df_new['cpa'].tolist()
    
    ## Busco dentro de la misma letra
    df_new = df[df['cpa'].str[0:1] == x['letra']]
    df_new = df_new[df_new['calle'] == x['bestmatch_fuzzywuzzy2']]
    df_new = df_new[(df_new["desde"]<=x['num']) & (df_new["hasta"]>=x['num'])]
    df_new = df_new[df_new['par']==x['par']]
    return df_new['cpa'].tolist()

In [15]:
pd.options.mode.chained_assignment = None
df_lemon['cpa'] = df_lemon.apply(lambda x: find_CPA(x), axis=1)

In [18]:
def to_unique_list(x):
    return list(dict.fromkeys(x['cpa']))

In [21]:
## de cuantos encontré el CPA 
df_lemon['cpa'] = df_lemon.apply(lambda x: to_unique_list(x), axis=1)
df_lemon[df_lemon['cpa'].apply(lambda x: len(x) == 1)].shape

(2283, 9)

### Busco potenciales CPAs

In [22]:
def find_CPAs(x):
    if len(x['cpa'])>0: # si ya encontré antes no hago nada.
        return x['cpa']
    
    df_new = df[(df['cp'] == x['cp']) | (df['alt-cp'] == x['cp'])]
    df_new = df_new[df_new['cpa'].str[0:1] == x['letra']]
    return df_new['cpa'].tolist()

In [23]:
pd.options.mode.chained_assignment = None
df_lemon['cpa'] = df_lemon.apply(lambda x: find_CPAs(x), axis=1)

In [24]:
df_lemon['cpa'] = df_lemon.apply(lambda x: to_unique_list(x), axis=1)

### analizo los que no encontré un CPA único

In [28]:
df_withou_cpa = df_lemon[df_lemon['cpa'].apply(lambda x: len(x) != 1)]
print('total sin CPA ' + str(df_withou_cpa.shape))

total sin CPA (1848, 9)


In [29]:
## Hago una segunda busqueda por calles en toda la provincia
def closest_match_fuzzywuzzy2_prov(x):
    if len(x['cpa'])==1: # si ya encontré antes no hago nada.
        return x['bestmatch_fuzzywuzzy2']
    if x['bestmatch_fuzzywuzzy2'] != '':
        return x['bestmatch_fuzzywuzzy2']
    try:
        df_new = df.loc[x['letra']==df['cpa'].str[0:1]]
        bestmatch = process.extract(str(x['calle']), df_new['calle'], scorer=fuzz.token_set_ratio, limit=1)
        if len(bestmatch)>0 and bestmatch[0][1]>60:
            return bestmatch[0][0]
        return ''
    except:
        return ''
    
df_lemon['bestmatch_fuzzywuzzy2'] = df_lemon.apply(lambda x: closest_match_fuzzywuzzy2_prov(x), axis=1)    




In [30]:
### Casos en los que tengo un match del nombre de la calle
df_withou_cpa = df_lemon[df_lemon['cpa'].apply(lambda x: len(x) != 1)]
print('total sin CPA ' + str(df_withou_cpa.shape))
df_lemon_new1 = df_withou_cpa[df_withou_cpa['bestmatch_fuzzywuzzy2']!='']
print('tiene num ' + str(df_lemon_new1[df_lemon_new1['num']!=0].shape))
print('no tiene num ' + str(df_lemon_new1[df_lemon_new1['num'] == 0].shape))


total sin CPA (1848, 9)
tiene num (1227, 9)
no tiene num (429, 9)


In [31]:
def search_by_street_and_cp(x):
    if len(x['cpa'])==1: # si ya encontré antes no hago nada.
        return x['bestmatch_fuzzywuzzy2']
    if x['bestmatch_fuzzywuzzy2'] == '':
        return x['bestmatch_fuzzywuzzy2']
    
    df_calle = df[df['calle'] == x['bestmatch_fuzzywuzzy2']]
    df_cp = df_calle[df_calle['cpa'].str[0:1] == x['letra']]
    return df_cp['cpa'].tolist()

In [32]:
df_lemon['cpa-v2'] = df_lemon.apply(lambda x: search_by_street_and_cp(x), axis=1)

In [33]:
## Si no tiene num -> le asigno el 1ero
## Si tiene num -> busco el más cercano
def get_cpa(x):
    if len(x['cpa'])==1: # si ya encontré antes no hago nada.
        return x['cpa']
    if x['bestmatch_fuzzywuzzy2'] == '':
        return x['cpa']
    
    if x['num'] == 0:
        return [x['cpa-v2'][0]]
    else:
        df_cp = df[df['calle'] == x['bestmatch_fuzzywuzzy2']]
        df_cp = df_cp[(df_cp['cp'] == x['cp']) | (df_cp['alt-cp'] == x['cp']) | (df_cp['cpa'].str[0:1] == x['letra'])]
        dist = (df_cp['desde'] - x['num']).abs()
        return [df_cp.loc[dist.idxmin()]['cpa']]
    
df_lemon['cpa'] = df_lemon.apply(lambda x: get_cpa(x), axis=1)


In [34]:
## 2d0 casos que aun no tengo nombre de calle similar
df_withou_cpa = df_lemon[df_lemon['cpa'].apply(lambda x: len(x) != 1)]
df_lemon_new2 = df_withou_cpa[df_withou_cpa['bestmatch_fuzzywuzzy2']=='']
print('total sin CPA ' + str(df_withou_cpa.shape))

print('tiene num ' + str(df_lemon_new2[df_lemon_new2['num'].notnull()].shape))
print('no tiene num ' + str(df_lemon_new2[df_lemon_new2['num'].isnull()].shape))


total sin CPA (192, 10)
tiene num (192, 10)
no tiene num (0, 10)


### Exporto a csv

In [35]:
df_lemon.to_csv('resultados-cpas-ultimo.csv', sep=',')