In [1]:
from bs4 import BeautifulSoup
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import requests
import re
import pandas as pd
import csv
import difflib
# Raise the per-cell display width limit to 1000 characters for rendered frames.
pd.options.display.max_colwidth = 1000

### CSV con los datos scrapeados de https://codigo-postal.co/argentina/

In [2]:
# Read the scraped postal-code table and discard the two leftover index columns.
df = pd.read_csv('codigos-postales-arg.csv').drop(columns=['index', 'Unnamed: 0'])
df.tail(1)

Unnamed: 0,calle,desde,hasta,par,cp,cpa,provincia,localidad
2515944,Calle Simona de Lopez,2,100,True,4124,T4127BHB,https://codigo-postal.co/argentina/tucuman/,https://codigo-postal.co/argentina/tucuman/villa-trancas/


#### limpio los datos

In [3]:
def removeURL(x, num):
    """Return the `num`-th '/'-separated piece of `x`.

    A value that contains no '/' at all is returned unchanged, so running
    this on an already-cleaned value is a no-op.
    """
    pieces = x.split('/')
    return pieces[0] if len(pieces) == 1 else pieces[num]

# Reduce each URL column to a single path segment:
# segment 4 for the province, segment 5 for the locality.
for column, segment in (('provincia', 4), ('localidad', 5)):
    df[column] = df[column].apply(removeURL, args=(segment,))

# Replace hyphens in the locality with spaces and lowercase the text columns.
df['localidad'] = df['localidad'].str.replace('-', ' ').str.lower()
df['calle'] = df['calle'].str.lower()

df.tail(1)

Unnamed: 0,calle,desde,hasta,par,cp,cpa,provincia,localidad
2515944,calle simona de lopez,2,100,True,4124,T4127BHB,tucuman,villa trancas


### csv con users de Lemon

In [4]:
# Read the Lemon accounts export and give its columns short working names.
lemon_columns = ['calle', 'num', 'localidad', 'prov', 'cp']
df_lemon = pd.read_csv('lemoncash_ar_Accounts.csv')
df_lemon.columns = lemon_columns
df_lemon['calle'] = df_lemon['calle'].str.lower()
# True when the street number is even; compared against `par` in the postal-code table.
df_lemon['par'] = df_lemon['num'].mod(2).eq(0)
df_lemon.head(1)

Unnamed: 0,calle,num,localidad,prov,cp,par
0,olleros,1800.0,Palermo,Buenos Aires,1426,True


#### normalizo todo para sacar tildes y ñ

In [5]:
def normalize(s):
    """Strip Spanish accents from `s`.

    The accented vowels á, é, í, ó, ú and the letter ñ, in lower and upper
    case, are replaced by their unaccented counterparts. Every other
    character is left as it is.
    """
    table = str.maketrans("áéíóúñÁÉÍÓÚÑ", "aeiounAEIOUN")
    return s.translate(table)

def _normalize_text(value):
    """Convert `value` to text with str() and strip its accents."""
    # str() runs first, so a missing value ends up as the literal text 'nan'.
    return normalize(str(value))


df_lemon['calle'] = df_lemon['calle'].apply(_normalize_text)
for column in ('provincia', 'localidad', 'calle'):
    df[column] = df[column].apply(_normalize_text)

#### Busco la letra de la provincia (primer carácter del CPA)

In [6]:
# Province name (as it appears in the `prov` column) -> first character of the CPA.
_PROVINCE_LETTERS = {
    'Salta': 'A',
    'Provincia de Buenos Aires': 'B',
    'Buenos Aires': 'C',
    'San Luis': 'D',
    'Entre Ríos': 'E',
    'La Rioja': 'F',
    'Santiago del Estero': 'G',
    'Chaco': 'H',
    'San Juan': 'J',
    'Catamarca': 'K',
    'La Pampa': 'L',
    'Mendoza': 'M',
    'Misiones': 'N',
    'Formosa': 'P',
    'Neuquén': 'Q',
    'Río Negro': 'R',
    'Santa Fe': 'S',
    'Tucumán': 'T',
    'Chubut': 'U',
    'Tierra del Fuego': 'V',
    'Corrientes': 'W',
    'Córdoba': 'X',
    'Jujuy': 'Y',
    'Santa Cruz': 'Z',
}


def province_letter(y):
    """Return the CPA letter for the province named in `y['prov']`.

    The match is exact (case- and accent-sensitive). An unrecognised
    province yields the empty string.
    """
    return _PROVINCE_LETTERS.get(y['prov'], '')

In [7]:
# One CPA province letter per account row ('' when the province is not recognised).
df_lemon['letra'] = df_lemon.apply(province_letter, axis=1)

In [8]:
# Characters 1-4 of the CPA (the part right after the leading letter), converted to a number.
df['alt-cp'] = pd.to_numeric(df['cpa'].str[1:5])

### Funciones para buscar la mejor coincidencia de nombre de calle


In [9]:
def closest_match_difflib(x):
    """Find the street most similar to the row's street among rows sharing its CP.

    Candidates are the `calle` values of the rows in the module-level `df`
    whose `cp` equals `x['cp']`. They are compared with
    `difflib.get_close_matches` using a similarity cutoff of 0.4.

    Args:
        x: a row (mapping) providing at least `cp` and `calle`.

    Returns:
        The best-matching street name, or '' when nothing reaches the cutoff
        or the lookup fails.
    """
    try:
        candidates = df.loc[df['cp'] == x['cp'], 'calle']
        bestmatch = difflib.get_close_matches(str(x['calle']), candidates, n=1, cutoff=0.4)
    except Exception:
        # Matching is best-effort: a bad row yields "no match" instead of
        # aborting the whole apply(). Catching Exception rather than using a
        # bare except lets KeyboardInterrupt/SystemExit propagate, so a long
        # run can still be interrupted.
        return ''
    return bestmatch[0] if bestmatch else ''

In [10]:
def _closest_match_fuzzy(x, scorer, min_score=60):
    """Shared implementation of the fuzzywuzzy-based street matchers.

    Candidates are the rows of the module-level `df` whose `cp` equals
    `x['cp']`. When the row has a province letter and at least one candidate's
    CPA starts with that letter, only those same-province candidates are kept;
    otherwise all candidates with the same CP are used. That fallback means
    the province filter never removes a match that the CP alone would find.

    Args:
        x: a row (mapping) providing `cp`, `calle` and `letra`.
        scorer: fuzzywuzzy scoring function passed to `process.extract`.
        min_score: a match is accepted only when its score is strictly
            greater than this value.

    Returns:
        The best-matching street name, or '' when there is no candidate, the
        best score is too low, or the lookup fails.
    """
    try:
        candidates = df.loc[df['cp'] == x['cp']]
        letter = x['letra']
        if letter:
            # The letter filter runs on the (small) CP subset rather than on
            # the whole table, so it stays cheap per row.
            same_province = candidates.loc[candidates['cpa'].str[0:1] == letter]
            if not same_province.empty:
                candidates = same_province
        if candidates.empty:
            return ''
        bestmatch = process.extract(str(x['calle']), candidates['calle'], scorer=scorer, limit=1)
    except Exception:
        # Matching is best-effort: a bad row yields "no match" instead of
        # aborting the whole apply(). Catching Exception rather than using a
        # bare except lets KeyboardInterrupt/SystemExit propagate.
        return ''
    if len(bestmatch) > 0 and bestmatch[0][1] > min_score:
        return bestmatch[0][0]
    return ''


def closest_match_fuzzywuzzy(x):
    """Closest street for row `x` using fuzzywuzzy's `token_sort_ratio` scorer."""
    return _closest_match_fuzzy(x, fuzz.token_sort_ratio)


def closest_match_fuzzywuzzy2(x):
    """Closest street for row `x` using fuzzywuzzy's `token_set_ratio` scorer."""
    return _closest_match_fuzzy(x, fuzz.token_set_ratio)


In [11]:
# Run the three street matchers row by row and keep each result in its own column.
df_lemon['bestmatch_difflib'] = df_lemon.apply(closest_match_difflib, axis=1)
df_lemon['bestmatch_fuzzywuzzy'] = df_lemon.apply(closest_match_fuzzywuzzy, axis=1)
df_lemon['bestmatch_fuzzywuzzy2'] = df_lemon.apply(closest_match_fuzzywuzzy2, axis=1)
df_lemon.head()

Unnamed: 0,calle,num,localidad,prov,cp,par,letra,bestmatch_difflib,bestmatch_fuzzywuzzy,bestmatch_fuzzywuzzy2
0,olleros,1800.0,Palermo,Buenos Aires,1426,True,C,calle olleros,calle olleros,calle olleros
1,jose cortejarena,3383.0,La Reja,Provincia de Buenos Aires,1437,False,B,calle jose a cortejarena,calle jose a cortejarena,calle jose a cortejarena
2,calle 48,4175.0,Necochea,Provincia de Buenos Aires,7630,False,B,calle 48,calle 48,calle 48
3,americo vespucio,5220.0,Corrientes,Corrientes,3400,True,W,calle americo vespucio,calle americo vespucio,calle americo vespucio
4,unitan,553.0,Desvio Tirol,Chaco,3505,False,H,,,calle unitan


### Función para buscar el CPA

In [12]:
def find_CPA(x):
    """Look up the CPA(s) for row `x` from its matched street, number and parity.

    Uses the street stored in `x['bestmatch_fuzzywuzzy2']`. Rows of the
    module-level `df` for that street are first restricted to those whose
    `cp` or `alt-cp` equals `x['cp']`. If none of them covers the street
    number with the right parity, the search is repeated on the rows whose
    CPA starts with `x['letra']`.

    Returns:
        '' when the row has no matched street; otherwise a list of CPA
        strings (possibly empty).
    """
    street = x['bestmatch_fuzzywuzzy2']
    if street == '':
        return ''

    def in_range_same_parity(rows):
        # Keep the rows whose [desde, hasta] interval contains the street
        # number and whose parity flag equals the row's.
        covers_num = (rows["desde"] <= x['num']) & (rows["hasta"] >= x['num'])
        in_range = rows[covers_num]
        return in_range[in_range['par'] == x['par']]

    same_street = df[df['calle'] == street]

    by_cp = same_street[(same_street['cp'] == x['cp']) | (same_street['alt-cp'] == x['cp'])]
    hits = in_range_same_parity(by_cp)
    if len(hits) != 0:
        return hits['cpa'].tolist()

    by_letter = same_street[same_street['cpa'].str[0:1] == x['letra']]
    return in_range_same_parity(by_letter)['cpa'].tolist()

In [13]:
# Turn off pandas' chained-assignment warning before filling in the column.
pd.options.mode.chained_assignment = None
df_lemon['cpa'] = df_lemon.apply(find_CPA, axis=1)

### Busco potenciales CPAs

In [14]:
def find_CPAs(x):
    """Return the CPA(s) already found for row `x`, or every candidate CPA otherwise.

    When `x['cpa']` is non-empty it is returned as is. Otherwise the result
    lists the CPA of every row in the module-level `df` whose `cp` or
    `alt-cp` equals `x['cp']` and whose CPA starts with `x['letra']`.
    """
    found = x['cpa']
    if len(found) > 0:
        return found

    same_cp = df[(df['cp'] == x['cp']) | (df['alt-cp'] == x['cp'])]
    in_province = same_cp['cpa'].str[0:1] == x['letra']
    return same_cp.loc[in_province, 'cpa'].tolist()

In [15]:
# Turn off pandas' chained-assignment warning before overwriting the column.
pd.options.mode.chained_assignment = None
df_lemon['cpa'] = df_lemon.apply(find_CPAs, axis=1)

In [16]:
def to_unique_list(x):
    """Return the items of `x['cpa']` without duplicates, in first-seen order."""
    seen = set()
    unique = []
    for code in x['cpa']:
        if code not in seen:
            seen.add(code)
            unique.append(code)
    return unique

In [17]:
# Collapse repeated CPAs in every row while keeping their original order.
df_lemon['cpa'] = df_lemon.apply(to_unique_list, axis=1)

### Exporto a csv

In [18]:
# Write the enriched accounts table (row index included) as a comma-separated file.
output_path = 'resultados-cpas.csv'
df_lemon.to_csv(output_path, sep=',')