In [2]:
import pandas as pd
from sklearn import preprocessing
# Shared label encoder; later cells use it to turn district names into integer codes.
le = preprocessing.LabelEncoder()

# Apartment (dfa) and house (dfh) datasets.
dfa, dfh = (
    pd.read_csv(csv_path)
    for csv_path in ('data/natal_apartments.csv', 'data/natal_houses.csv')
)

<h3>Labeling data</h3>

In [3]:
# Encode boolean values as integers in both frames: True -> 1, False -> 0.
bool_to_int = {False: 0, True: 1}
for frame in (dfa, dfh):
    frame.replace(bool_to_int, inplace=True)

<h5>Geospatial features</h5>

In [4]:
# Load the per-district geography table and discard the stray 'Unnamed: 0' column.
geo = pd.read_csv('data/geographies_bairros_natal.csv').drop(columns='Unnamed: 0')

In [5]:
# Number of rows available for each tag.
geo['tag'].value_counts()

amenity_pharmacy            35
leisure_*                   35
amenity_restaurant          35
shop_supermarket            35
amenity_clinic              35
amenity_school              35
tourism_*                   35
amenity_police              35
amenity_hospital            35
amenity_place_of_worship    35
shop_mall                   35
amenity_fast_food           35
Name: tag, dtype: int64

In [6]:
# Use the English column name 'district'; index labels are converted to str as well.
geo = geo.rename(columns={"bairro": "district"}, index=str)
geo.head(10)

Unnamed: 0,tag,district,count,REG_ADM,area,density
0,amenity_clinic,Alecrim,0,Leste,2.812663,0.0
1,amenity_clinic,Areia Preta,0,Leste,0.262493,0.0
2,amenity_clinic,Barro Vermelho,0,Leste,0.77346,0.0
3,amenity_clinic,Bom Pastor,0,Oeste,2.823754,0.0
4,amenity_clinic,Candelária,0,Sul,6.21309,0.0
5,amenity_clinic,Capim Macio,3,Sul,3.536323,0.848339
6,amenity_clinic,Cidade Alta,1,Leste,0.949833,1.052817
7,amenity_clinic,Cidade Nova,0,Oeste,2.13874,0.0
8,amenity_clinic,Cidade da Esperança,0,Oeste,1.492125,0.0
9,amenity_clinic,Dix-Sept Rosado,0,Oeste,0.894543,0.0


In [7]:
#am_worship = geo[geo['tag'] == 'amenity_place_of_worship']
#am_worship.drop(['tag'], axis = 1, inplace = True)
#am_worship.head(10)

In [8]:
# Fit the shared encoder on the district names, then keep a
# name -> integer code dictionary for later use.
le.fit(geo['district'])
district_codes = le.transform(le.classes_)
district_name_mapping = dict(zip(le.classes_, district_codes))
district_name_mapping

{'Alecrim': 0,
 'Areia Preta': 1,
 'Barro Vermelho': 2,
 'Bom Pastor': 3,
 'Candelária': 4,
 'Capim Macio': 5,
 'Cidade Alta': 6,
 'Cidade Nova': 7,
 'Cidade da Esperança': 8,
 'Dix-Sept Rosado': 9,
 'Felipe Camarão': 10,
 'Guarapes': 11,
 'Igapó': 12,
 'Lagoa Azul': 13,
 'Lagoa Nova': 14,
 'Lagoa Seca': 15,
 'Mãe Luiza': 16,
 'N. S. Apresentação': 17,
 'N. Sra. do Nazaré': 18,
 'Neópolis': 19,
 'Nova Descoberta': 20,
 'Pajuçara': 21,
 'Parque das Dunas': 22,
 'Petropólis': 23,
 'Pitimbu': 24,
 'Planalto': 25,
 'Ponta Negra': 26,
 'Potengi': 27,
 'Praia do Meio': 28,
 'Quintas': 29,
 'Redinha': 30,
 'Ribeira': 31,
 'Rocas': 32,
 'Santos Reis': 33,
 'Tirol': 34}

In [9]:
#Label encode district according to district_name_mapping (Generates a ValueError)
#le.transform(dfa['district'])

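<p>We get a ValueError because some district names in the apartments DataFrame are written without accents or in a shortened form (for example, 'Candelaria' instead of 'Candelária'), so they do not match the district names in our OSM data.</p>
Also, two of the districts (PARNAMIRIM and Morro Branco) are not found in OSM, so we drop them.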

In [10]:
# Bring apartment district names in line with the names the encoder was fitted on.
# Each (old, new) pair is applied in order as a substring replacement.
district_fixes = [
    ('Candelaria', 'Candelária'),
    ('Igapo', 'Igapó'),
    ('Neopolis', 'Neópolis'),
    ('Pajucara', 'Pajuçara'),
    ('Petropolis', 'Petropólis'),
    ('da Esperanca', 'Cidade da Esperança'),
    ('de Nazare', 'N. Sra. do Nazaré'),
    ('sept Rosado', 'Dix-Sept Rosado'),
]
for old_name, new_name in district_fixes:
    dfa['district'] = dfa['district'].str.replace(old_name, new_name)

# Remove the rows for PARNAMIRIM and Morro Branco.
unwanted_rows = dfa['district'].isin(['PARNAMIRIM', 'Morro Branco'])
dfa.drop(dfa[unwanted_rows].index, inplace=True)

In [11]:
# Encode each apartment's district with the shared encoder `le`.
dfa['district_num'] = le.transform(dfa['district'])

Repeat the same steps for the houses df

In [12]:
#Label encode district according to district_name_mapping (Generates a ValueError)
#le.transform(dfh['district'])

In [13]:
# Bring house district names in line with the names the encoder was fitted on.
# Each (old, new) pair is applied in order as a substring replacement.
district_fixes = [
    ('Candelaria', 'Candelária'),
    ('Igapo', 'Igapó'),
    ('Neopolis', 'Neópolis'),
    ('Pajucara', 'Pajuçara'),
    ('Petropolis', 'Petropólis'),
    ('Cidade da Esperanca', 'Cidade da Esperança'),
    ('de Nazare', 'N. Sra. do Nazaré'),
    ('sept Rosado', 'Dix-Sept Rosado'),
    ('Nossa Senhora da Apresentacao', 'N. S. Apresentação'),
    ('POTILANDIA', 'Lagoa Nova'),
]
for old_name, new_name in district_fixes:
    dfh['district'] = dfh['district'].str.replace(old_name, new_name)

# Remove the rows for districts that are dropped rather than renamed.
unwanted_rows = dfh['district'].isin(
    ['PARNAMIRIM', 'Cidade Jardim', 'Cidade Verde', 'Morro Branco', 'San Vale']
)
dfh.drop(dfh[unwanted_rows].index, inplace=True)

In [14]:
# Encode each house's district with the shared encoder `le`.
dfh['district_num'] = le.transform(dfh['district'])

<h3>Feature Engineering</h3>

In [15]:
# Number of rows available for each tag; each tag is sliced out of `geo` below.
geo['tag'].value_counts()

amenity_pharmacy            35
leisure_*                   35
amenity_restaurant          35
shop_supermarket            35
amenity_clinic              35
amenity_school              35
tourism_*                   35
amenity_police              35
amenity_hospital            35
amenity_place_of_worship    35
shop_mall                   35
amenity_fast_food           35
Name: tag, dtype: int64

In [16]:
# Silence pandas' chained-assignment warning: slicendice modifies, in place,
# frames that were obtained by filtering another frame.
pd.options.mode.chained_assignment = None

def slicendice(df_target, colname):
    """Prepare one tag slice for merging, modifying ``df_target`` in place.

    The 'tag' column is dropped, 'count' is renamed to ``colname`` (index
    labels are converted to str), and a 'district_num' column is added.

    District codes are produced with ``le.transform`` instead of re-fitting
    the shared encoder on every slice. Re-fitting would silently assign
    different codes if a slice were missing a district, which would misalign
    the later merges on 'district_num'. With ``transform`` the codes always
    agree with ``district_name_mapping``.

    Parameters
    ----------
    df_target : pandas.DataFrame
        Frame with 'tag', 'count' and 'district' columns.
    colname : str
        New name for the 'count' column.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``df_target['district']`` holds a name the encoder was not fitted on.
    """
    df_target.drop(columns=['tag'], inplace=True)
    df_target.rename(index=str, columns={"count": colname}, inplace=True)
    df_target['district_num'] = le.transform(df_target['district'])

def _tag_slice(tag, colname):
    """Return the rows of ``geo`` for ``tag`` after ``slicendice`` has prepared them."""
    subset = geo[geo['tag'] == tag]
    slicendice(subset, colname)
    return subset

# 1st merge: places of worship + pharmacies
am_worship = _tag_slice('amenity_place_of_worship', "am_worship")
am_pharma = _tag_slice('amenity_pharmacy', "am_pharma")
worship_pharma = am_worship.merge(am_pharma, on='district_num', suffixes=('_worship', '_pharma'))

# 2nd merge: tourism + leisure
tourism = _tag_slice('tourism_*', "am_tourism")
leisure = _tag_slice('leisure_*', "am_leisure")
tourism_leisure = tourism.merge(leisure, on='district_num', suffixes=('_tourism', '_leisure'))

# 3rd merge: clinics + fast food
clinic = _tag_slice('amenity_clinic', "am_clinic")
fast_food = _tag_slice('amenity_fast_food', "am_fast_food")
clinic_fast_food = clinic.merge(fast_food, on='district_num', suffixes=('_clinic', '_fast_food'))

# 4th merge: police + restaurants
police = _tag_slice('amenity_police', "am_police")
restaurant = _tag_slice('amenity_restaurant', "am_restaurant")
police_restaurant = police.merge(restaurant, on='district_num', suffixes=('_police', '_restaurant'))

# 5th merge: malls + supermarkets
shop_mall = _tag_slice('shop_mall', "am_shop_mall")
shop_supermarket = _tag_slice('shop_supermarket', "am_shop_supermarket")
shop_mall_shop_supermarket = shop_mall.merge(
    shop_supermarket, on='district_num', suffixes=('_shop_mall', '_shop_supermarket')
)

# 6th merge: schools + hospitals
school = _tag_slice('amenity_school', "am_school")
hospital = _tag_slice('amenity_hospital', "am_hospital")
school_hospital = school.merge(hospital, on='district_num', suffixes=('_school', '_hospital'))

In [17]:
# Combine the six pairwise frames into one table keyed by 'district_num'.
m1 = worship_pharma.merge(tourism_leisure, on='district_num')
m2 = clinic_fast_food.merge(police_restaurant, on='district_num')
m3 = shop_mall_shop_supermarket.merge(school_hospital, on='district_num')
m4 = m1.merge(m2, on='district_num')
m5 = m4.merge(m3, on='district_num')

In [18]:
# Attach the combined per-district features (m5) to every apartment row via 'district_num'.
fe_apartments = pd.merge(dfa, m5, on = 'district_num')

Drop repeated columns

In [19]:
# Each merged slice brought its own REG_ADM_* column; keep only the
# '_worship' one (renamed to 'region') and drop the others.
redundant_region_cols = ['REG_ADM_' + suffix for suffix in (
    'pharma', 'tourism', 'leisure', 'clinic', 'fast_food', 'police',
    'restaurant', 'shop_mall', 'shop_supermarket', 'school', 'hospital',
)]
fe_apartments.drop(columns=redundant_region_cols, inplace=True)
fe_apartments.rename(index=str, columns={"REG_ADM_worship": "region"}, inplace=True)

In [20]:
# Each merged slice also brought its own district_* column; drop all of them.
redundant_district_cols = ['district_' + suffix for suffix in (
    'worship', 'leisure', 'clinic', 'fast_food', 'police', 'restaurant',
    'shop_mall', 'shop_supermarket', 'school', 'hospital', 'tourism', 'pharma',
)]
fe_apartments.drop(columns=redundant_district_cols, inplace=True)

In [21]:
# Inspect the columns that remain after dropping the duplicated ones.
fe_apartments.columns

Index(['re_type', 'adv_bed', 'adv_bath', 'adv_park', 'pool', 'elevator', 'bbq',
       'balcony', 'leisure', 'adv_size', 'district', 'adv_sale',
       'district_num', 'am_worship', 'region', 'area_worship',
       'density_worship', 'am_pharma', 'area_pharma', 'density_pharma',
       'am_tourism', 'area_tourism', 'density_tourism', 'am_leisure',
       'area_leisure', 'density_leisure', 'am_clinic', 'area_clinic',
       'density_clinic', 'am_fast_food', 'area_fast_food', 'density_fast_food',
       'am_police', 'area_police', 'density_police', 'am_restaurant',
       'area_restaurant', 'density_restaurant', 'am_shop_mall',
       'area_shop_mall', 'density_shop_mall', 'am_shop_supermarket',
       'area_shop_supermarket', 'density_shop_supermarket', 'am_school',
       'area_school', 'density_school', 'am_hospital', 'area_hospital',
       'density_hospital'],
      dtype='object')

In [22]:
# Same merge and clean-up as for the apartments, applied to the houses frame.
redundant_region_cols = ['REG_ADM_' + suffix for suffix in (
    'pharma', 'tourism', 'leisure', 'clinic', 'fast_food', 'police',
    'restaurant', 'shop_mall', 'shop_supermarket', 'school', 'hospital',
)]
redundant_district_cols = ['district_' + suffix for suffix in (
    'worship', 'leisure', 'clinic', 'fast_food', 'police', 'restaurant',
    'shop_mall', 'shop_supermarket', 'school', 'hospital', 'tourism', 'pharma',
)]
fe_houses = (
    dfh.merge(m5, on='district_num')
    .drop(columns=redundant_region_cols)
    .rename(index=str, columns={"REG_ADM_worship": "region"})
    .drop(columns=redundant_district_cols)
)

In [23]:
# Write both engineered datasets to disk.
for frame, out_path in (
    (fe_apartments, 'data/NatalBR_Apartments.csv'),
    (fe_houses, 'data/NatalBR_Housing.csv'),
):
    frame.to_csv(out_path)