# Location Dimension ETL

## Part 1- Raw Data Processing

**UNIX systems only**

This part uses pypostal, which are python bindings for the libpostal C library.

"Libpostal is a C library for parsing/normalizing street addresses around the world using statistical NLP and open data. The goal of this project is to understand location-based strings in every language, everywhere."

[Follow this instructions to install pypostal](https://github.com/openvenues/pypostal#installation).

### Library and Function Imports

In [1]:
# Import Libs

import re
import pandas as pd
from postal.parser import parse_address
import string
import numpy as np

In [2]:
# Load Functions

def street_parser(s):
    
    s_final = ''
    
    sch = s[:int(len(s)/2)]

    first_digit = re.search(r"\d", sch)
    
    try:
        s = s[:int(first_digit.start() + len(s)/2)]
        
    except AttributeError:
        pass

    s = s.split(' ')
    s = [x.replace(' ', '') for x in s if x != '']

    for ss in s:
        s_final += '{} '.format(ss)
        
    return s_final[:-1]

def is_int(n):
    
    try:
        int(n)
        return True
    
    except ValueError:
        return False
        

def get_street(st, street_str=''):
    
    forbiden = ['DENOMINADO', 'Nº', 'NºS', 'NÚMEROS', 'N.º']
    
    street = parse_address(st)[0][0]
    
    if len(street) <= len(st)/2:
        
        street = street_parser(st)
    
    street = street.split(' ')
    
    for i, s in enumerate(street):
        if s not in street_str:
            if s in forbiden or (i > 3 and any(is_int(si) for si in s)):
                break

            street_str += s + ' '
            
    street_str.translate(str.maketrans('', '', string.punctuation))
            
    return street_str[:-1].upper()

def validate_zip(sample, expr=r"(\b\d{4}-\d{3}\b)"):
    
    zipCode = re.compile(expr)

    try:
        if zipCode.match(sample):
            return sample
        else:
            return None
    except TypeError:
        return None
    
def delete_null_rows(df,columns):
    """Removes all records with any empty cells from input DataFrame"""
    processed_df = df[columns].copy()
    total_rows = processed_df.shape[0]
    delete_rows = []
    
    if processed_df.isnull().values.any(): # if there are any null values in DataFrame, process DataFrame
        for index, row in enumerate(processed_df.itertuples(), start = 0):
            if (pd.Series(row).isnull().values.any()): # if row has any null value
                delete_rows.append(index) # add row index to delete list
    
    final_df = df.copy()
    final_df.drop(df.index[delete_rows], inplace = True) # delete rows fr
    final_df = final_df.reset_index().drop('index', axis = 1)
    print("DataFrame contains {} rows. Deleted {} rows ({}% of total rows)".format(
        len(final_df), len(delete_rows), round(len(delete_rows)*100/total_rows, 2)))
    return final_df

def check_zeros(ext, lenght=3):
    
    ext = str(ext)
    
    if len(ext) != lenght:
        
        return '0' * (lenght - len(ext)) + str(ext)
    else:
        return ext
    
def coastal_area(boolean):
    if boolean == 'Sim':
        return 'Coastal Area'
    elif boolean == 'Não':
        return 'Not Coastal Area'
    else:
        return None
    
def map_duplicates(streets, zip_codes, ids):

    temps = []
    dup_id = []
    main_id = []

    for i, entry in enumerate(ids):


        flag = True
        for temp in temps:

            if (streets[i].issubset(temp[0]) or streets[i] == temp[0])  and zip_codes[i][:4] == temp[1][:4]:

                dup_id.append(entry)
                main_id.append(temp[2])

                flag = False
                break

        if flag is True:

            temps.append([streets[i], zip_codes[i], entry])
            dup_id.append(entry)
            main_id.append(entry)
            
    return(dup_id, main_id)

### Load Data

In [3]:
#Import Data

df = pd.read_csv("../data/listings_al.csv", low_memory=False)
df_cp = pd.read_csv("../data/codigos_postais.csv", low_memory=False)
df_c = pd.read_csv("../data/concelhos.csv", low_memory=False)
df_d = pd.read_csv("../data/distritos.csv", low_memory=False)
df_d = df_d[df_d['nome_distrito'] == 'Lisboa']

### Merge Data

In [4]:
# Merge CTT Data

df_cd = pd.merge(df_c, df_d, on='cod_distrito')
df_ctt = pd.merge(df_cp, df_cd, on=['cod_concelho', 'cod_distrito'])

df_ctt['cp'] = [str(df_ctt['num_cod_postal'][i]) + check_zeros(df_ctt['ext_cod_postal'][i]) 
                for i in range(len(df_ctt))]

delete = ['cod_distrito', 'cod_concelho', 'cod_localidade', 'cod_arteria', 'prep1', 'titulo_arteria', 
          'prep2', 'local_arteria', 'troco', 'porta', 'cliente', 'desig_postal', 'nome_arteria', 
          'ext_cod_postal', 'num_cod_postal']
df_ctt.drop(delete, axis=1, inplace=True)

In [5]:
df_ctt.head()

Unnamed: 0,nome_localidade,tipo_arteria,nome_concelho,nome_distrito,cp
0,Abrigada,Rua,Alenquer,Lisboa,2580010
1,Abrigada,Rua,Alenquer,Lisboa,2580001
2,Abrigada,Rua,Alenquer,Lisboa,2580011
3,Abrigada,Estrada Nacional,Alenquer,Lisboa,2580119
4,Abrigada,Rua,Alenquer,Lisboa,2580115


In [6]:
#Missing Data

df_ctt.isnull().sum()

nome_localidade       0
tipo_arteria       1618
nome_concelho         0
nome_distrito         0
cp                    0
dtype: int64

In [7]:
# Parse Zip Codes

df['CodigoPostal'] = [validate_zip(sample) for sample in df['CodigoPostal']]

# Delete Records with Invalid Zip Codes

df = delete_null_rows(df, ['CodigoPostal'])

DataFrame contains 17147 rows. Deleted 21 rows (0.12% of total rows)


In [8]:
# Prepare Final Merge
        
df['cp'] = [cp.replace('-', '')  for cp in df['CodigoPostal']]

before = len(df)

# Merge All Data

df = pd.merge(df, df_ctt, on='cp', how='inner')
df.drop_duplicates(subset=['id'], inplace=True)
print('{} rows after merge: {} rows deleted'.format(len(df), before-len(df)))

16647 rows after merge: 500 rows deleted


Some of the properties, are not in the Lisbon district so they where eliminated.

### Parse Dataframe

In [9]:
# Build Location DataFrame

df_loc = pd.DataFrame()

df_loc['location_id'] = [value for value in df['id']]
df_loc['street'] = [get_street(value) for value in df['Endereco']]
df_loc['street_type'] = [value for value in df['tipo_arteria']]
df_loc['zip_code'] = [value for value in df['CodigoPostal']]
df_loc['parish'] = [value for value in df['nome_localidade']]
df_loc['county'] = [value for value in df['nome_concelho']]
df_loc['coastal_area'] = [coastal_area(value) for value in df['FreguesiasCosteiras']]

# Sorting values for FK mapping

ind = df_loc.street.str.len().sort_values(ascending=False).index
df_loc = df_loc.reindex(ind)
df_loc.reset_index(drop=True, inplace=True)

In [14]:
df_loc.head()

Unnamed: 0,location_id,street,street_type,zip_code,parish,county,coastal_area
0,12194911,ROTUNDA JOÃO PAULO II E AVENIDA REI HUMBERTO D...,Rua,2750-641,Cascais,Cascais,Coastal Area
1,40962519,"URBANIZAÇÃO VALE DA AZENHA, RUA DOS DESCOBRIME...",Urbanização,2560-510,Santa Cruz,Torres Vedras,Coastal Area
2,39414401,TRAVESSA CAMINHO DO PINHAL CONDOMÍNIO PARQUE A...,,2560-051,Praia do Navio,Torres Vedras,Coastal Area
3,16883483,AVENIDA COMISSÃO DE MELHORAMENTOS MIL NOVECENT...,,2705-001,Azoia,Sintra,Coastal Area
4,12978864,RUA REI HUMBERTO II DE ITÁLIA CONDOMÍNIO CASAS...,Rua,2750-641,Cascais,Cascais,Coastal Area


Right now location as a one to one relationship with listings, since in reality they have a one to many relationship, this means we have duplicates. The last processing needed is to remove all duplicates, while mapping a fk to a respective listing.

In [11]:
# Map Duplicates

streets = [set(street.split(' ')) for street in df_loc['street']]
zip_codes = df_loc['zip_code']
ids = df_loc['location_id']

(dup_id, main_id) = map_duplicates(streets, zip_codes, ids)

# Remove Duplicates

loc = df_loc[df_loc.location_id.isin(set(main_id))]
loc.reset_index(drop=True, inplace=True)
loc.head()

Unnamed: 0,location_id,street,street_type,zip_code,parish,county,coastal_area
0,12194911,ROTUNDA JOÃO PAULO II E AVENIDA REI HUMBERTO D...,Rua,2750-641,Cascais,Cascais,Coastal Area
1,40962519,"URBANIZAÇÃO VALE DA AZENHA, RUA DOS DESCOBRIME...",Urbanização,2560-510,Santa Cruz,Torres Vedras,Coastal Area
2,39414401,TRAVESSA CAMINHO DO PINHAL CONDOMÍNIO PARQUE A...,,2560-051,Praia do Navio,Torres Vedras,Coastal Area
3,16883483,AVENIDA COMISSÃO DE MELHORAMENTOS MIL NOVECENT...,,2705-001,Azoia,Sintra,Coastal Area
4,12978864,RUA REI HUMBERTO II DE ITÁLIA CONDOMÍNIO CASAS...,Rua,2750-641,Cascais,Cascais,Coastal Area


In [12]:
# Build index dataframe
index = [i for i in loc.index]
location_id = [location for location in loc['location_id']]
columns = ['fk', 'main_id']

ind = pd.DataFrame(data=[index, location_id]).T
ind.columns = columns

# Build FK dataframe

columns = ['listings_id', 'main_id']
fk_map = pd.DataFrame(data=[dup_id, main_id]).T
fk_map.columns = columns

# Merge dataframes

fk_map = pd.merge(ind, fk_map, on='main_id', how='inner')
fk_map.drop(['main_id'], axis=1, inplace=True)
fk_map.head()

Unnamed: 0,fk,listings_id
0,0,12194911
1,1,40962519
2,2,39414401
3,3,16883483
4,4,12978864


In [13]:
loc.drop(['location_id'], axis=1, inplace=True)
loc.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,street,street_type,zip_code,parish,county,coastal_area
0,ROTUNDA JOÃO PAULO II E AVENIDA REI HUMBERTO D...,Rua,2750-641,Cascais,Cascais,Coastal Area
1,"URBANIZAÇÃO VALE DA AZENHA, RUA DOS DESCOBRIME...",Urbanização,2560-510,Santa Cruz,Torres Vedras,Coastal Area
2,TRAVESSA CAMINHO DO PINHAL CONDOMÍNIO PARQUE A...,,2560-051,Praia do Navio,Torres Vedras,Coastal Area
3,AVENIDA COMISSÃO DE MELHORAMENTOS MIL NOVECENT...,,2705-001,Azoia,Sintra,Coastal Area
4,RUA REI HUMBERTO II DE ITÁLIA CONDOMÍNIO CASAS...,Rua,2750-641,Cascais,Cascais,Coastal Area


In [15]:
# Last check for null values before exportation.

print(loc.isnull().sum())
print(fk_map.isnull().sum())

street            0
street_type     180
zip_code          0
parish            0
county            0
coastal_area      0
dtype: int64
fk             0
listings_id    0
dtype: int64


In [20]:
# Export datasets into csv

loc.to_csv('../data/processed_dt/location.csv', index=False)
fk_map.to_csv('../data/processed_dt/location_fk.csv', index=False)

## Part 2 - Injecting Database 

In [21]:
# Import Libs

import pandas as pd

In [22]:
df = pd.read_csv('../data/processed_dt/location.csv', low_memory=False)

In [23]:
df.head()

Unnamed: 0,street,street_type,zip_code,parish,county,coastal_area
0,ROTUNDA JOÃO PAULO II E AVENIDA REI HUMBERTO D...,Rua,2750-641,Cascais,Cascais,Coastal Area
1,"URBANIZAÇÃO VALE DA AZENHA, RUA DOS DESCOBRIME...",Urbanização,2560-510,Santa Cruz,Torres Vedras,Coastal Area
2,TRAVESSA CAMINHO DO PINHAL CONDOMÍNIO PARQUE A...,,2560-051,Praia do Navio,Torres Vedras,Coastal Area
3,AVENIDA COMISSÃO DE MELHORAMENTOS MIL NOVECENT...,,2705-001,Azoia,Sintra,Coastal Area
4,RUA REI HUMBERTO II DE ITÁLIA CONDOMÍNIO CASAS...,Rua,2750-641,Cascais,Cascais,Coastal Area
