In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
import importlib
import clean_functions as cl
import geopy
import utm
%pylab inline
pd.set_option('display.max_column',None)

Populating the interactive namespace from numpy and matplotlib


## Data gathering
Data gathering:
- Activities files (from 2015 - 2019, Ayuntamiento Madrid)
- Madrid population database (1st january 2019, Ayuntamiento Madrid)
- Madrid floating population (16-22 april 2018, Private source)

In [2]:
# Show the working directory; the relative 'Data/...' paths used below resolve against it.
os.getcwd()

'/home/dsc/Python_notebooks/TFM/TBV/TBV_v1'

### Activities file: 
Data from madrid.es open data portal: https://datos.madrid.es/portal/site/egob/menuitem.c05c1f754a33a9fbe4b2e4b284f1a5a0/?vgnextoid=23160329ff639410VgnVCM2000000c205a0aRCRD&vgnextchannel=374512b9ace9f310VgnVCM100000171f5a0aRCRD&vgnextfmt=default   
We load the file into memory as a pandas dataframe. Files are uncompressed.   
Careful with separators: some cleaning through the command line has been necessary to remove "ambiguous separators"; the results are the clean versions of the files.   
Encoding is Latin9 (although in documentation UTF8 is mentioned).      
After checking all the files available on the Madrid web portal (locals, licences and activities (== epigrafes in Spanish)), I will work with the epigrafes file, which contains all the locals and their status.   
Since the study is being done on a yearly basis and 2019 data is only available till Sep'19, I will use the September versions for all the files   
I decided not to use the 2014 file since it doesn't have the same fields as the others and goes further back in time.  

In [3]:
# Yearly "epigrafes" (activities) snapshots, newest first. Every file is
# ';'-separated and Latin-9 encoded. The "_clean" files are the versions whose
# wrong separators were corrected beforehand via the command line; the 2016 and
# 2015 files are fine as published. For 2017 the November file is used because
# the September one was corrupted (many NaN).
_epi_csv_paths = (
    'Data/censolocales/OPEN DATA Locales-Epigrafes201909_clean.csv',  # 2019
    'Data/censolocales/OPEN DATA Locales-Epigrafes201809_clean.csv',  # 2018
    'Data/censolocales/OPEN DATA Locales-Epigrafes201711_clean.csv',  # 2017 (Nov.)
    'Data/censolocales/OPEN DATA Locales-Epigrafes201609.csv',        # 2016
    'Data/censolocales/OPEN DATA Locales-Epigrafes201509.csv',        # 2015
)
df_epi19, df_epi18, df_epi17, df_epi16, df_epi15 = [
    pd.read_csv(csv_path, sep=';', encoding='latin9') for csv_path in _epi_csv_paths
]

  interactivity=interactivity, compiler=compiler, result=result)


Each new yearly file contains the locals of the previous years plus the new ones of that year. All of them have the same columns

In [4]:
# (rows, columns) of every yearly snapshot, newest first.
tuple(year_frame.shape for year_frame in (df_epi19, df_epi18, df_epi17, df_epi16, df_epi15))

((163355, 46), (162467, 46), (161097, 46), (159604, 46), (157206, 46))

In [5]:
# List the column names of the 2019 snapshot.
df_epi19.columns

Index(['id_local', 'id_distrito_local', 'desc_distrito_local',
       'id_barrio_local', 'desc_barrio_local', 'cod_barrio_local',
       'id_seccion_censal_local', 'desc_seccion_censal_local',
       'coordenada_x_local', 'coordenada_y_local', 'id_tipo_acceso_local',
       'desc_tipo_acceso_local', 'id_situacion_local', 'desc_situacion_local',
       'id_vial_edificio', 'clase_vial_edificio', 'desc_vial_edificio',
       'id_ndp_edificio', 'id_clase_ndp_edificio', 'nom_edificio',
       'num_edificio', 'cal_edificio', 'secuencial_local_PC', 'id_vial_acceso',
       'clase_vial_acceso', 'desc_vial_acceso', 'id_ndp_acceso',
       'id_clase_ndp_acceso', 'nom_acceso', 'num_acceso', 'cal_acceso',
       'coordenada_x_agrupacion', 'coordenada_y_agrup', 'id_agrupacion',
       'nombre_agrupacion', 'id_tipo_agrup', 'desc_tipo_agrup',
       'id_planta_agrupado', 'id_local_agrupado', 'rotulo', 'id_seccion',
       'desc_seccion', 'id_division', 'desc_division', 'id_epigrafe',
       'desc_epi

In [6]:
# Preview the first two rows of the 2019 snapshot.
df_epi19.head(2)

Unnamed: 0,id_local,id_distrito_local,desc_distrito_local,id_barrio_local,desc_barrio_local,cod_barrio_local,id_seccion_censal_local,desc_seccion_censal_local,coordenada_x_local,coordenada_y_local,id_tipo_acceso_local,desc_tipo_acceso_local,id_situacion_local,desc_situacion_local,id_vial_edificio,clase_vial_edificio,desc_vial_edificio,id_ndp_edificio,id_clase_ndp_edificio,nom_edificio,num_edificio,cal_edificio,secuencial_local_PC,id_vial_acceso,clase_vial_acceso,desc_vial_acceso,id_ndp_acceso,id_clase_ndp_acceso,nom_acceso,num_acceso,cal_acceso,coordenada_x_agrupacion,coordenada_y_agrup,id_agrupacion,nombre_agrupacion,id_tipo_agrup,desc_tipo_agrup,id_planta_agrupado,id_local_agrupado,rotulo,id_seccion,desc_seccion,id_division,desc_division,id_epigrafe,desc_epigrafe
0,270440895,3,RETIRO,304,IBIZA,,,55,0,0,,Agrupado,,Abierto,370400,CALLE,IBIZA ...,11012388,1,NUM,8,,0,,CALLE,IBIZA ...,11012388,1,NUM,8,,44253358.0,447438951.0,99000191.0,MERCADO MUNICIPAL DE IBIZA,12.0,Mercado Municipal,PB,7.0,BAR HERMANOS BENAYAS,I,HOSTELERIA,56,SERVICIOS DE COMIDAS Y BEBIDAS,563005,BAR SIN COCINA
1,270440899,5,CHAMARTIN,502,PROSPERIDAD,,,23,44346059,44776045,,Puerta Calle,,Abierto,210200,CALLE,CORAZON DE MARIA ...,20138306,1,NUM,57,,20,,CALLE,CORAZON DE MARIA ...,20138306,1,NUM,57,,,,,,,,PB,,LA PARRILLA BERENGUER,I,HOSTELERIA,56,SERVICIOS DE COMIDAS Y BEBIDAS,561005,BAR CON COCINA


There are more rows in the files than unique locals. This is because the same local can be licenced with one or more activities

In [7]:
# Number of distinct locals per snapshot, newest first (NaN, if any, counts as
# one value, exactly like `len(Series.unique())`).
tuple(
    year_frame['id_local'].nunique(dropna=False)
    for year_frame in (df_epi19, df_epi18, df_epi17, df_epi16, df_epi15)
)

(147344, 146607, 145752, 144920, 144048)

### Madrid Population database
Data from madrid.es open data portal: https://datos.madrid.es/portal/site/egob/menuitem.c05c1f754a33a9fbe4b2e4b284f1a5a0/?vgnextoid=1d755cde99be2410VgnVCM1000000b205a0aRCRD&vgnextchannel=374512b9ace9f310VgnVCM100000171f5a0aRCRD&vgnextfmt=default

In [8]:
# Madrid population main KPIs.
df1 = pd.read_excel('Data/censopob/censo_Madrid_distrito.xlsx')
# Only the last expression of a cell is rendered, so the former bare
# `df1.head()` statement was a silent no-op and has been dropped.
df1.dtypes

id_distrito_local                                                           int64
distrito                                                                   object
Población                                                                   int64
Hombre                                                                      int64
Mujeres                                                                     int64
Densidad (Habitantes / Ha.)                                               float64
Edad promedio                                                             float64
Total Hogares                                                               int64
Españoles                                                                   int64
Extranjeros                                                                 int64
Mixtos                                                                      int64
Una mujer sola de 16 a 64 años                                              int64
Un hombre solo d

### Madrid floating population
Info from Kineo. Floating population in Madrid Districts from 14-22 of april 2018

In [9]:
# Floating population.
df2 = pd.read_excel('Data/PF/PF_diaria_norm.xlsx')
# Only the last expression of a cell is rendered, so the former bare
# `df2.head()` statement was a silent no-op and has been dropped.
df2.dtypes

id_distrito_local                      int64
España fuera barrio dia laboral        int64
Extranjero fuera barrio dia laboral    int64
Total fuera barrio dia laboral         int64
España fuera barrio fin semana         int64
Extranjero fuera barrio fin semana     int64
Total fuera barrio fin semana          int64
Total barrio dia laboral               int64
Total barrio fin semana                int64
Total trabajo dia laboral              int64
Total trabajo fin semana               int64
dtype: object

## Data cleaning
- NaN treatment
- Status normalization
- UTM Coordinates - geographic coordinates conversion
- Distance between locals calculation
- Merge in a single DataFrame
- Selection of activities based on target sample

I have created a modules script (clean_functions.py) with the main functions used

In [10]:
# Reload the helper module so that edits made to clean_functions.py after the
# first import are picked up without restarting the kernel. `importlib` and
# `cl` are already imported in the first cell, so they are not imported again.
importlib.reload(cl)

<module 'clean_functions' from '/home/dsc/Python_notebooks/TFM/TBV/TBV_v1/clean_functions.py'>

### NaN treatment
- NaN should be locals with no activity ('rotulo' == SIN ACTIVIDAD).  
- A couple of 'rotulo' values are wrong in origin file. I correct that in origin.
- I will drop activities == 'LOCAL SIN ACTIVIDAD' because I can't know the activity in advance for the study.   
- I regularize the locals that don't belong to a group (desc_tipo_acceso_local == NaN) with the right values (not group)
- I fill in the column id_situacion_local, which is empty in the activities file (although it is filled in in the locals file)
Most of the functions are in the script: clean_functions.py

In [11]:
# 2019: rows without an activity description, non-null count per column for each 'rotulo'.
df_epi19.loc[df_epi19['desc_epigrafe'].isna()].groupby('rotulo').count()

Unnamed: 0_level_0,id_local,id_distrito_local,desc_distrito_local,id_barrio_local,desc_barrio_local,cod_barrio_local,id_seccion_censal_local,desc_seccion_censal_local,coordenada_x_local,coordenada_y_local,id_tipo_acceso_local,desc_tipo_acceso_local,id_situacion_local,desc_situacion_local,id_vial_edificio,clase_vial_edificio,desc_vial_edificio,id_ndp_edificio,id_clase_ndp_edificio,nom_edificio,num_edificio,cal_edificio,secuencial_local_PC,id_vial_acceso,clase_vial_acceso,desc_vial_acceso,id_ndp_acceso,id_clase_ndp_acceso,nom_acceso,num_acceso,cal_acceso,coordenada_x_agrupacion,coordenada_y_agrup,id_agrupacion,nombre_agrupacion,id_tipo_agrup,desc_tipo_agrup,id_planta_agrupado,id_local_agrupado,id_seccion,desc_seccion,id_division,desc_division,id_epigrafe,desc_epigrafe
rotulo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1
SIN ACTIVIDAD,41276,41276,41276,41276,41276,0,0,41276,41276,41276,0,41276,0,41276,41276,41276,41276,41276,41276,41276,41276,41276,41276,0,41276,41276,41276,41276,41276,41276,41276,3017,3017,3017,3017,3017,3017,41072,2992,0,0,0,0,0,0


In [12]:
# 2018: rows without an activity description, non-null count per column for each 'rotulo'.
df_epi18.loc[df_epi18['desc_epigrafe'].isna()].groupby('rotulo').count()

Unnamed: 0_level_0,id_local,id_distrito_local,desc_distrito_local,id_barrio_local,desc_barrio_local,cod_barrio_local,id_seccion_censal_local,desc_seccion_censal_local,coordenada_x_local,coordenada_y_local,id_tipo_acceso_local,desc_tipo_acceso_local,id_situacion_local,desc_situacion_local,id_vial_edificio,clase_vial_edificio,desc_vial_edificio,id_ndp_edificio,id_clase_ndp_edificio,nom_edificio,num_edificio,cal_edificio,secuencial_local_PC,id_vial_acceso,clase_vial_acceso,desc_vial_acceso,id_ndp_acceso,id_clase_ndp_acceso,nom_acceso,num_acceso,cal_acceso,coordenada_x_agrupacion,coordenada_y_agrup,id_agrupacion,nombre_agrupacion,id_tipo_agrup,desc_tipo_agrup,id_planta_agrupado,id_local_agrupado,id_seccion,desc_seccion,id_division,desc_division,id_epigrafe,desc_epigrafe
rotulo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1
SIN ACTIVIDAD,42051,42051,42051,42051,42051,0,0,42051,42051,42051,0,42051,0,42051,42051,42051,42051,42051,42051,42051,42051,42051,42051,0,42051,42051,42051,42051,42051,42051,42051,3056,3056,3056,3056,3056,3056,41840,3031,0,0,0,0,0,0


In [13]:
# 2017: rows without an activity description, non-null count per column for each 'rotulo'.
df_epi17.loc[df_epi17['desc_epigrafe'].isna()].groupby('rotulo').count()

Unnamed: 0_level_0,id_local,id_distrito_local,desc_distrito_local,id_barrio_local,desc_barrio_local,cod_barrio_local,id_seccion_censal_local,desc_seccion_censal_local,coordenada_x_local,coordenada_y_local,id_tipo_acceso_local,desc_tipo_acceso_local,id_situacion_local,desc_situacion_local,id_vial_edificio,clase_vial_edificio,desc_vial_edificio,id_ndp_edificio,id_clase_ndp_edificio,nom_edificio,num_edificio,cal_edificio,secuencial_local_PC,id_vial_acceso,clase_vial_acceso,desc_vial_acceso,id_ndp_acceso,id_clase_ndp_acceso,nom_acceso,num_acceso,cal_acceso,coordenada_x_agrupacion,coordenada_y_agrup,id_agrupacion,nombre_agrupacion,id_tipo_agrup,desc_tipo_agrup,id_planta_agrupado,id_local_agrupado,id_seccion,desc_seccion,id_division,desc_division,id_epigrafe,desc_epigrafe
rotulo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1
SIN ACTIVIDAD,42527,42527,42527,42527,42527,0,0,42527,42527,42527,0,42527,0,42527,42527,42527,42527,42527,42527,42527,42527,42527,42527,0,42527,42527,42527,42527,42527,42527,42527,3060,3060,3060,3060,3060,3060,42298,3035,0,0,0,0,0,0


In [14]:
# 2016: rows without an activity description, non-null count per column for each 'rotulo'.
df_epi16.loc[df_epi16['desc_epigrafe'].isna()].groupby('rotulo').count()

Unnamed: 0_level_0,id_local,id_distrito_local,desc_distrito_local,id_barrio_local,desc_barrio_local,cod_barrio_local,id_seccion_censal_local,desc_seccion_censal_local,coordenada_x_local,coordenada_y_local,id_tipo_acceso_local,desc_tipo_acceso_local,id_situacion_local,desc_situacion_local,id_vial_edificio,clase_vial_edificio,desc_vial_edificio,id_ndp_edificio,id_clase_ndp_edificio,nom_edificio,num_edificio,cal_edificio,secuencial_local_PC,id_vial_acceso,clase_vial_acceso,desc_vial_acceso,id_ndp_acceso,id_clase_ndp_acceso,nom_acceso,num_acceso,cal_acceso,coordenada_x_agrupacion,coordenada_y_agrup,id_agrupacion,nombre_agrupacion,id_tipo_agrup,desc_tipo_agrup,id_planta_agrupado,id_local_agrupado,id_seccion,desc_seccion,id_division,desc_division,id_epigrafe,desc_epigrafe
rotulo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1
SIN ACTIVIDAD,43549,43549,43549,43549,43549,0,0,43549,43549,43549,0,43549,0,43549,43549,43549,43549,43549,43549,43549,43549,43549,43549,0,43549,43549,43549,43549,43549,43549,43549,3005,3005,3005,3005,3005,3005,2882,2981,0,0,0,0,0,0


In [15]:
# 2015: rows without an activity description, non-null count per column for each 'rotulo'.
df_epi15.loc[df_epi15['desc_epigrafe'].isna()].groupby('rotulo').count()

Unnamed: 0_level_0,id_local,id_distrito_local,desc_distrito_local,id_barrio_local,desc_barrio_local,cod_barrio_local,id_seccion_censal_local,desc_seccion_censal_local,coordenada_x_local,coordenada_y_local,id_tipo_acceso_local,desc_tipo_acceso_local,id_situacion_local,desc_situacion_local,id_vial_edificio,clase_vial_edificio,desc_vial_edificio,id_ndp_edificio,id_clase_ndp_edificio,nom_edificio,num_edificio,cal_edificio,secuencial_local_PC,id_vial_acceso,clase_vial_acceso,desc_vial_acceso,id_ndp_acceso,id_clase_ndp_acceso,nom_acceso,num_acceso,cal_acceso,coordenada_x_agrupacion,coordenada_y_agrup,id_agrupacion,nombre_agrupacion,id_tipo_agrup,desc_tipo_agrup,id_planta_agrupado,id_local_agrupado,id_seccion,desc_seccion,id_division,desc_division,id_epigrafe,desc_epigrafe
rotulo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1
SIN ACTIVIDAD,44915,44915,44915,44915,44915,0,0,44915,44915,44915,0,44915,0,44915,44915,44915,44915,44915,44915,44915,44915,44915,44915,0,44915,44915,44915,44915,44915,44915,44915,3066,3066,3066,3066,3066,3066,2940,3037,0,0,0,0,0,0


New column: 'conc': concat('rotulo','desc_vial_acceso') to identify unique locals
All this with function **cl.new_col**

In [16]:
# Run cl.new_col on every yearly frame (newest first) and rebind each name to
# the helper's return value.
df_epi19, df_epi18, df_epi17, df_epi16, df_epi15 = [
    cl.new_col(raw_frame)
    for raw_frame in (df_epi19, df_epi18, df_epi17, df_epi16, df_epi15)
]

In [17]:
# Inspect dtypes and non-null counts of the 2015 frame after cl.new_col.
df_epi15.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 157206 entries, 0 to 157205
Data columns (total 47 columns):
id_local                     157206 non-null int64
id_distrito_local            157206 non-null int64
desc_distrito_local          157206 non-null object
id_barrio_local              157206 non-null int64
desc_barrio_local            157206 non-null object
cod_barrio_local             0 non-null float64
id_seccion_censal_local      0 non-null float64
desc_seccion_censal_local    157206 non-null int64
coordenada_x_local           157206 non-null float64
coordenada_y_local           157206 non-null float64
id_tipo_acceso_local         0 non-null float64
desc_tipo_acceso_local       157206 non-null object
id_situacion_local           0 non-null float64
desc_situacion_local         157206 non-null object
id_vial_edificio             157206 non-null int64
clase_vial_edificio          157206 non-null object
desc_vial_edificio           157206 non-null object
id_ndp_edificio         

In [18]:
# Work on independent copies so the frames loaded above stay untouched by the
# cleaning steps that follow.
df_epi19_cl, df_epi18_cl, df_epi17_cl, df_epi16_cl, df_epi15_cl = [
    loaded_frame.copy()
    for loaded_frame in (df_epi19, df_epi18, df_epi17, df_epi16, df_epi15)
]

I unify all 'Baja*' and 'LOCAL SIN ACTIVIDAD' status with 'Cerrado' state for all the years DataFrames:
- status that contains 'Baja' 
- those locals with 'desc epigrafe' == 'LOCAL SIN ACTIVIDAD'   

For this I will use **cl.estado** function.

In [19]:
# Distribution of the raw status values in 2019, before normalisation.
df_epi19_cl['desc_situacion_local'].value_counts()

Abierto               117830
Cerrado                34738
Uso vivienda            6473
Baja Reunificacion      2444
Baja                    1018
En obras                 841
Baja PC Asociado          11
Name: desc_situacion_local, dtype: int64

In [20]:
# Add the normalised status column to every yearly copy: cl.estado receives the
# activity description and the raw status column of the same frame.
for _year_frame in (df_epi19_cl, df_epi18_cl, df_epi17_cl, df_epi16_cl, df_epi15_cl):
    _year_frame['desc_sit_loc_modif'] = cl.estado(
        _year_frame['desc_epigrafe'],
        _year_frame['desc_situacion_local'],
    )

In [21]:
# Distribution of the normalised status values in 2019 (compare with the raw counts above).
df_epi19_cl['desc_sit_loc_modif'].value_counts()

Abierto         116704
Cerrado          39337
Uso vivienda      6473
En obras           841
Name: desc_sit_loc_modif, dtype: int64

I fill NaN with module **cl.na** according to Epigrafes nomenclature (https://datos.madrid.es/FWProjects/egob/Catalogo/Economia/Ficheros/Estructura_DS_FicheroCLA.pdf):   
- Locales with 'rotulo' == 'SIN ACTIVIDAD'.
- I regularize the locals that do not belong to a group and are null
- I fill in the column id_situacion_local, which is null in the Activities file, with the **cl.id_sit** function

I do this for 2019 DataFrame since it is going to be the baseline to merge with the status of the previous years

In [22]:
# NaN treatment of the 2019 frame via cl.na (defined in clean_functions.py, not
# visible here). The result is the baseline frame for the merges below.
df_epi19_cl_d = cl.na(df_epi19_cl)

For all DataFrames (2015 to 2018) but 2019, I drop most of the columns and just keep columns_of_interest=['id_local','conc','desc_sit_loc_modif'] to merge in the final DataFrame . I do this with function: **cl.col_rest**.

In [23]:
# Reduce the 2015-2018 frames with cl.col_rest (newest first); per the note
# above, only 'id_local', 'conc' and 'desc_sit_loc_modif' are kept.
df_epi18_cl_d, df_epi17_cl_d, df_epi16_cl_d, df_epi15_cl_d = [
    cl.col_rest(status_frame)
    for status_frame in (df_epi18_cl, df_epi17_cl, df_epi16_cl, df_epi15_cl)
]

I check there are no duplicates and reset_index with function **cl.reset** before final merge

In [24]:
#drop duplicates
df_epi15_cl_d.drop_duplicates(inplace = True)
df_epi16_cl_d.drop_duplicates(inplace = True)
df_epi17_cl_d.drop_duplicates(inplace = True)
df_epi18_cl_d.drop_duplicates(inplace = True)
df_epi19_cl_d.drop_duplicates(inplace = True)

#id_local ordered and reset index
cl.reset(df_epi15_cl_d)
cl.reset(df_epi16_cl_d)
cl.reset(df_epi17_cl_d)
cl.reset(df_epi18_cl_d)
cl.reset(df_epi19_cl_d);

And check that there are no nulls

In [25]:
# Null check for the cleaned 2019 frame: every column should report the full row count.
df_epi19_cl_d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 163355 entries, 0 to 163354
Data columns (total 30 columns):
id_local                   163355 non-null int64
id_distrito_local          163355 non-null int64
desc_distrito_local        163355 non-null object
id_barrio_local            163355 non-null int64
desc_barrio_local          163355 non-null object
coordenada_x_local         163355 non-null object
coordenada_y_local         163355 non-null object
desc_tipo_acceso_local     163355 non-null object
id_situacion_local         163355 non-null float64
desc_situacion_local       163355 non-null object
clase_vial_acceso          163355 non-null object
desc_vial_acceso           163355 non-null object
nom_acceso                 163355 non-null object
num_acceso                 163355 non-null int64
cal_acceso                 163355 non-null object
coordenada_x_agrupacion    163355 non-null object
coordenada_y_agrup         163355 non-null object
id_agrupacion              163355 non-null

In [26]:
# Null check for the reduced 2018 frame.
df_epi18_cl_d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146623 entries, 0 to 146622
Data columns (total 3 columns):
id_local              146623 non-null int64
conc                  146623 non-null object
desc_sit_loc_modif    146623 non-null object
dtypes: int64(1), object(2)
memory usage: 3.4+ MB


In [27]:
# Null check for the reduced 2017 frame.
df_epi17_cl_d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145770 entries, 0 to 145769
Data columns (total 3 columns):
id_local              145770 non-null int64
conc                  145770 non-null object
desc_sit_loc_modif    145770 non-null object
dtypes: int64(1), object(2)
memory usage: 3.3+ MB


In [28]:
# Null check for the reduced 2016 frame.
df_epi16_cl_d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 144936 entries, 0 to 144935
Data columns (total 3 columns):
id_local              144936 non-null int64
conc                  144936 non-null object
desc_sit_loc_modif    144936 non-null object
dtypes: int64(1), object(2)
memory usage: 3.3+ MB


In [29]:
# Null check for the reduced 2015 frame.
df_epi15_cl_d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 144057 entries, 0 to 144056
Data columns (total 3 columns):
id_local              144057 non-null int64
conc                  144057 non-null object
desc_sit_loc_modif    144057 non-null object
dtypes: int64(1), object(2)
memory usage: 3.3+ MB


I join all the info in a single Dataframe

In [30]:
# Join keys used by every year-to-year merge below: the local id plus the 'conc' identifier.
columns_to_merge = ['id_local','conc']

In [31]:
# 2016 rows enriched with the 2015 status. Both inputs carry a
# 'desc_sit_loc_modif' column, so the overlap is disambiguated as '_16' / '_15'.
df_local16 = pd.merge(df_epi16_cl_d, df_epi15_cl_d,
                      on=columns_to_merge, how='left', suffixes=('_16', '_15'))
# Sanity check: merged shape, then distinct locals after and before the join.
(df_local16.shape,
 df_local16['id_local'].nunique(dropna=False),
 df_epi16_cl_d['id_local'].nunique(dropna=False))

((144954, 4), 144920, 144920)

In [32]:
# 2017 rows enriched with the 2016/2015 history. df_local16 only holds suffixed
# status columns, so no column names overlap and `suffixes` is not applied
# here: the 2017 status stays as 'desc_sit_loc_modif' and receives its '_17'
# suffix in the following merge.
df_local17 = pd.merge(df_epi17_cl_d, df_local16,
                      on=columns_to_merge, how='left', suffixes=('_17', '_16'))
# Sanity check: merged shape, then distinct locals after and before the join.
(df_local17.shape,
 df_local17['id_local'].nunique(dropna=False),
 df_epi17_cl_d['id_local'].nunique(dropna=False))

((145831, 5), 145752, 145752)

In [33]:
# 2018 rows enriched with the 2017-2015 history. 'desc_sit_loc_modif' exists on
# both sides (the 2018 frame and the still-unsuffixed 2017 column of
# df_local17), so the overlap becomes '_18' / '_17'.
df_local18 = pd.merge(df_epi18_cl_d, df_local17,
                      on=columns_to_merge, how='left', suffixes=('_18', '_17'))
# Sanity check: merged shape, then distinct locals after and before the join.
(df_local18.shape,
 df_local18['id_local'].nunique(dropna=False),
 df_epi18_cl_d['id_local'].nunique(dropna=False))

((146752, 6), 146607, 146607)

In [220]:
# Attach the 2015-2018 status history to the full 2019 frame (the left join
# keeps every 2019 row). df_local18 only carries already-suffixed status
# columns, so there is no name overlap and `suffixes` is not applied here; the
# 2019 status column is therefore renamed explicitly below.
df_local19 = df_epi19_cl_d.merge(df_local18, how='left',
                                 on=columns_to_merge,
                                 suffixes=('_19', '_18'))
df_local19 = df_local19.rename(columns={'desc_sit_loc_modif': 'desc_sit_loc_modif_19'})
# `rename` silently ignores a missing label, so fail loudly if the 2019 status
# column did not end up under its expected name.
assert 'desc_sit_loc_modif_19' in df_local19.columns
# NOTE(review): in the saved output the merge returns more rows than the left
# frame has (163652 vs 163355), i.e. some (id_local, conc) pairs match several
# rows of df_local18 - confirm this fan-out is intended.
df_local19.shape, len(df_local19['id_local'].unique()), len(df_epi19_cl_d['id_local'].unique())

((163652, 34), 147344, 147344)

In [35]:
# Confirm the merged frame exposes one status column per year ('_19' ... '_15').
df_local19.columns

Index(['id_local', 'id_distrito_local', 'desc_distrito_local',
       'id_barrio_local', 'desc_barrio_local', 'coordenada_x_local',
       'coordenada_y_local', 'desc_tipo_acceso_local', 'id_situacion_local',
       'desc_situacion_local', 'clase_vial_acceso', 'desc_vial_acceso',
       'nom_acceso', 'num_acceso', 'cal_acceso', 'coordenada_x_agrupacion',
       'coordenada_y_agrup', 'id_agrupacion', 'nombre_agrupacion',
       'id_tipo_agrup', 'desc_tipo_agrup', 'rotulo', 'id_seccion',
       'desc_seccion', 'id_division', 'desc_division', 'id_epigrafe',
       'desc_epigrafe', 'conc', 'desc_sit_loc_modif_19',
       'desc_sit_loc_modif_18', 'desc_sit_loc_modif_17',
       'desc_sit_loc_modif_16', 'desc_sit_loc_modif_15'],
      dtype='object')

In [218]:
# Inspect dtypes and non-null counts of the merged frame.
df_local19.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 163652 entries, 0 to 163651
Data columns (total 34 columns):
id_local                   163652 non-null int64
id_distrito_local          163652 non-null int64
desc_distrito_local        163652 non-null object
id_barrio_local            163652 non-null int64
desc_barrio_local          163652 non-null object
coordenada_x_local         163652 non-null object
coordenada_y_local         163652 non-null object
desc_tipo_acceso_local     163652 non-null object
id_situacion_local         163652 non-null float64
desc_situacion_local       163652 non-null object
clase_vial_acceso          163652 non-null object
desc_vial_acceso           163652 non-null object
nom_acceso                 163652 non-null object
num_acceso                 163652 non-null int64
cal_acceso                 163652 non-null object
coordenada_x_agrupacion    163652 non-null object
coordenada_y_agrup         163652 non-null object
id_agrupacion              163652 non-null

I see that some numeric columns have 'object' type ('id_epigrafe', 'id_division'). I drop non numeric values (PTECO1 == 'no activity pending to code') and unify the type

In [37]:
# Drop the rows whose activity code is the non-numeric placeholder 'PTECO1';
# `.copy()` makes the result an independent frame that is safe to modify.
df_local19_f = df_local19[df_local19.id_epigrafe != 'PTECO1'].copy()
cl.reset(df_local19_f)
# Cast the code columns to int64. Assigning whole columns replaces them (and
# therefore their dtype) in every pandas version, whereas
# `df.loc[:, cols] = ...` writes into the existing object columns in recent
# pandas and leaves the dtype as `object`.
cols = ['id_division', 'id_epigrafe']
df_local19_f[cols] = df_local19_f[cols].astype(np.int64)
df_local19_f.id_division.values

array([56, 56, 56, ..., 33, 64, 64])

## If I want to delete LOCAL SIN ACTIVIDAD
Finally, I drop locals with 'LOCAL SIN ACTIVIDAD' description

In [38]:
# Keep only the locals with a real activity. `.copy()` yields an independent
# frame, so columns added to it later are not written to a slice of the
# previous frame (same pattern as the 'PTECO1' filter above).
df_local19_f = df_local19_f[df_local19_f.desc_epigrafe != 'LOCAL SIN ACTIVIDAD'].copy()

In [39]:
# Y me guardo los locales sin actividad que tengo geoposicionados
df_no_act = df_local19[(df_local19.desc_epigrafe == 'LOCAL SIN ACTIVIDAD') & (df_local19.coordenada_x_local != 0)]
len(df_local19[(df_local19.desc_epigrafe == 'LOCAL SIN ACTIVIDAD') & (df_local19.coordenada_x_local != 0)])
df_no_act.head()

Unnamed: 0,id_local,id_distrito_local,desc_distrito_local,id_barrio_local,desc_barrio_local,coordenada_x_local,coordenada_y_local,desc_tipo_acceso_local,id_situacion_local,desc_situacion_local,clase_vial_acceso,desc_vial_acceso,nom_acceso,num_acceso,cal_acceso,coordenada_x_agrupacion,coordenada_y_agrup,id_agrupacion,nombre_agrupacion,id_tipo_agrup,desc_tipo_agrup,rotulo,id_seccion,desc_seccion,id_division,desc_division,id_epigrafe,desc_epigrafe,conc,desc_sit_loc_modif_19,desc_sit_loc_modif_18,desc_sit_loc_modif_17,desc_sit_loc_modif_16,desc_sit_loc_modif_15
185,10001291,1,CENTRO,104,JUSTICIA,44098959,447497452,Puerta Calle,1.0,Abierto,CALLE,BARQUILLO ...,NUM,33,,0,0,-1.0,SIN AGRUPACION,-1.0,SIN AGRUPACION,OFELIA,Z,SIN ACTIVIDAD,0,SIN ACTIVIDAD,0,LOCAL SIN ACTIVIDAD,OFELIA-BARQUILLO-33,Cerrado,,,,
205,10001407,1,CENTRO,103,CORTES,44057259,447419253,Puerta Calle,1.0,Abierto,CALLE,PRINCIPE ...,NUM,4,,0,0,-1.0,SIN AGRUPACION,-1.0,SIN AGRUPACION,GRIGNOLINO,Z,SIN ACTIVIDAD,0,SIN ACTIVIDAD,0,LOCAL SIN ACTIVIDAD,GRIGNOLINO-PRINCIPE-4,Cerrado,Abierto,Abierto,Abierto,Abierto
363,10001884,1,CENTRO,103,CORTES,44040759,447403353,Puerta Calle,1.0,Abierto,CALLE,ESPOZ Y MINA ...,NUM,32,,0,0,-1.0,SIN AGRUPACION,-1.0,SIN AGRUPACION,MAMA TERESA,Z,SIN ACTIVIDAD,0,SIN ACTIVIDAD,0,LOCAL SIN ACTIVIDAD,MAMA TERESA-ESPOZ Y MINA-32,Cerrado,Cerrado,Abierto,Abierto,Abierto
417,10002055,1,CENTRO,105,UNIVERSIDAD,44037959,447464053,Puerta Calle,1.0,Abierto,CALLE,GRAN VIA ...,NUM,30,B,0,0,-1.0,SIN AGRUPACION,-1.0,SIN AGRUPACION,STARBUCKS,Z,SIN ACTIVIDAD,0,SIN ACTIVIDAD,0,LOCAL SIN ACTIVIDAD,STARBUCKS-GRAN VIA-30,Cerrado,Cerrado,Cerrado,Abierto,Abierto
428,10002079,1,CENTRO,101,PALACIO,4399586,447468153,Puerta Calle,1.0,Abierto,CALLE,SILVA ...,NUM,4,,0,0,-1.0,SIN AGRUPACION,-1.0,SIN AGRUPACION,"TANKAS LA BOUTIQUE DEL CAFE""""",Z,SIN ACTIVIDAD,0,SIN ACTIVIDAD,0,LOCAL SIN ACTIVIDAD,"TANKAS LA BOUTIQUE DEL CAFE""""-SILVA-4",Cerrado,Cerrado,Cerrado,Abierto,Abierto


In [40]:
# I save the locals with 'No activity' and geolocalized in a file ('locals_na') for further usage
df_no_act.to_csv('Data/censolocales/locals_na.csv',index=None, header=True)

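The open/closed flags for each year are computed with **cl.sit_year**.  
The status of every local is compared with the previous year: 'Cerrado' means closed, while 'NaN' (no record) in the previous year followed by a non-null status means the local is counted as opened that year.   
Example: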

     cond1 = ((df_.desc_sit_loc_modif_19 == 'Cerrado') |
                                 (df_.desc_sit_loc_modif_19 == 'Uso vivienda'))     
                                 
     cond2 = ((df_.desc_sit_loc_modif_18 != 'Cerrado') |
                                 df_.desc_sit_loc_modif_18.isnull())   
                                 
     cond3 = ((df_.desc_sit_loc_modif_19.notnull()) &
                                 (df_.desc_sit_loc_modif_18.isnull()))   
 
    df_['cerrado_19'] = np.where(cond1 & cond2, 1 ,0)   
    df_['abierto_19'] = np.where(cond3, 1 ,0)

## Resume code from here

In [38]:
# Add the yearly cerrado_YY / abierto_YY flags (logic lives in clean_functions.sit_year).
# NOTE(review): the result overwrites the input name, so re-running this cell feeds the
# already-flagged frame back into sit_year — confirm the helper is idempotent.
df_local19_f = cl.sit_year(df_local19_f)

In [39]:
len(df_local19_f[df_local19_f.id_epigrafe == 474201])

716

In [40]:
len(df_local19_f[df_local19_f.desc_epigrafe == 'COMERCIO AL POR MENOR DE PRODUCTOS DE TELEFONIA Y TELECOMUNICACIONES'])

716

In [41]:
df_local19_f[df_local19_f.desc_epigrafe == 'COMERCIO AL POR MENOR DE PRODUCTOS DE TELEFONIA Y TELECOMUNICACIONES'].groupby('id_epigrafe').count()

Unnamed: 0_level_0,id_local,id_distrito_local,desc_distrito_local,id_barrio_local,desc_barrio_local,coordenada_x_local,coordenada_y_local,desc_tipo_acceso_local,id_situacion_local,desc_situacion_local,clase_vial_acceso,desc_vial_acceso,nom_acceso,num_acceso,cal_acceso,coordenada_x_agrupacion,coordenada_y_agrup,id_agrupacion,nombre_agrupacion,id_tipo_agrup,desc_tipo_agrup,rotulo,id_seccion,desc_seccion,id_division,desc_division,desc_epigrafe,conc,desc_sit_loc_modif_19,desc_sit_loc_modif_18,desc_sit_loc_modif_17,desc_sit_loc_modif_16,desc_sit_loc_modif_15,cerrado_19,abierto_19,cerrado_18,abierto_18,cerrado_17,abierto_17,cerrado_16,abierto_16
id_epigrafe,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1
474201,716,716,716,716,716,716,716,716,716,716,716,716,716,716,716,716,716,716,716,716,716,716,716,716,716,716,716,716,716,644,597,517,450,716,716,716,716,716,716,716,716


In [42]:
df_local19_f.columns

Index(['id_local', 'id_distrito_local', 'desc_distrito_local',
       'id_barrio_local', 'desc_barrio_local', 'coordenada_x_local',
       'coordenada_y_local', 'desc_tipo_acceso_local', 'id_situacion_local',
       'desc_situacion_local', 'clase_vial_acceso', 'desc_vial_acceso',
       'nom_acceso', 'num_acceso', 'cal_acceso', 'coordenada_x_agrupacion',
       'coordenada_y_agrup', 'id_agrupacion', 'nombre_agrupacion',
       'id_tipo_agrup', 'desc_tipo_agrup', 'rotulo', 'id_seccion',
       'desc_seccion', 'id_division', 'desc_division', 'id_epigrafe',
       'desc_epigrafe', 'conc', 'desc_sit_loc_modif_19',
       'desc_sit_loc_modif_18', 'desc_sit_loc_modif_17',
       'desc_sit_loc_modif_16', 'desc_sit_loc_modif_15', 'cerrado_19',
       'abierto_19', 'cerrado_18', 'abierto_18', 'cerrado_17', 'abierto_17',
       'cerrado_16', 'abierto_16'],
      dtype='object')

I generate a target variable: locals opened and closed in less than 2 years since 2016, but I don't have enough '1s' (only about 0.3%).

In [43]:
# First candidate target: locals opened and closed in less than 2 years, counting from 2016.
# The arguments (2, 2016) are forwarded as-is to clean_functions.target.
df_local19_ft2 = cl.target(df_local19_f, 2, 2016)

# Share of rows with target == 1 (the cell's displayed value); too few positives to model with.
# The former mid-cell `.head()` was removed: only the last expression of a cell is displayed.
positives_ft2 = df_local19_ft2[df_local19_ft2.target == 1]
len(positives_ft2) / len(df_local19_ft2)

2
2016


0.0032754427347502476

So I generate a new target variable: activities closed since 2017.   
The dataset is still very unbalanced; I will handle that in the modelling phase.

In [227]:
# Second target: activities closed since 2017, built by clean_functions.target2.
# NOTE(review): the second argument (3) is presumably the number of years covered — confirm in target2.
df_local19_ft = cl.target2(df_local19_f, 3)

# Share of rows with target == 1 (the cell's displayed value).
# The former mid-cell `.head()` was removed: only the last expression of a cell is displayed.
positives_ft = df_local19_ft[df_local19_ft.target == 1]
len(positives_ft) / len(df_local19_ft)

0.06726879407487076

Now I standardize the type of activity. Activities with fewer locals than median(#locals/activity) in Madrid are assigned the activity 'OTRAS ACTIVIDADES'. I generate new variables:   
- desc_act_norm: desc_epigrafe, replaced by 'OTRAS ACTIVIDADES' for activities below median(#locals/activity) == 82.5
- id_act_norm: id_epigrafe, replaced by 999999 for activities below median(#locals/activity) == 82.5

In [45]:
df_local19_ft.id_epigrafe.value_counts().describe(), df_local19_ft.id_epigrafe.value_counts().median()

(count      440.000000
 mean       371.913636
 std       2113.051147
 min          1.000000
 25%         19.000000
 50%         82.500000
 75%        254.000000
 max      42782.000000
 Name: id_epigrafe, dtype: float64, 82.5)

In [238]:
# Activities with fewer locals than the median number of locals per activity are
# handed to cl.norm_act / cl.norm_id_act as the list of activities to normalise.
# The counts are computed once instead of on every use.
locals_per_desc = df_local19_ft.desc_epigrafe.value_counts()
median_locals_per_act = df_local19_ft.id_epigrafe.value_counts().median()
act = locals_per_desc[locals_per_desc < median_locals_per_act].index

# Row-wise calls are kept because the helpers take scalar description / id values.
df_local19_ft['desc_act_norm'] = df_local19_ft.apply(
    lambda x: cl.norm_act(x['desc_epigrafe'], act), axis=1)
df_local19_ft['id_act_norm'] = df_local19_ft.apply(
    lambda x: cl.norm_id_act(x['desc_epigrafe'], x['id_epigrafe'], act), axis=1)


In [239]:
# Locals classified as 'OTRAS ACTIVIDADES': number of rows and share of them with target == 1
is_other_activity = df_local19_ft.desc_act_norm == 'OTRAS ACTIVIDADES'
df_oa = df_local19_ft[is_other_activity]
oa_positives = df_oa[df_oa.target == 1]
len(df_oa), len(oa_positives) / len(df_oa)

(5676, 0.023960535588442564)

Generate a new variable 'ab_17_19' that indicates whether a local was opened in the last 3 years (2017 to 2019): 'abierto_17' + 'abierto_18' + 'abierto_19'.

In [241]:
# Add the aggregate KPI columns (e.g. ab_17_19, loc_dist_act, loc_barrio, the *_rate columns)
# produced by clean_functions.kpis_total.
# NOTE(review): input and output share one name, so re-running this cell applies kpis_total
# to its own output — confirm the helper tolerates that.
df_local19_ft = cl.kpis_total(df_local19_ft)

In [243]:
df_local19_ft.head(2)

Unnamed: 0,id_local,id_distrito_local,desc_distrito_local,id_barrio_local,desc_barrio_local,coordenada_x_local,coordenada_y_local,desc_tipo_acceso_local,id_situacion_local,desc_situacion_local,clase_vial_acceso,desc_vial_acceso,nom_acceso,num_acceso,cal_acceso,coordenada_x_agrupacion,coordenada_y_agrup,id_agrupacion,nombre_agrupacion,id_tipo_agrup,desc_tipo_agrup,rotulo,id_seccion,desc_seccion,id_division,desc_division,id_epigrafe,desc_epigrafe,conc,desc_sit_loc_modif_19,desc_sit_loc_modif_18,desc_sit_loc_modif_17,desc_sit_loc_modif_16,desc_sit_loc_modif_15,cerrado_19,abierto_19,cerrado_18,abierto_18,cerrado_17,abierto_17,cerrado_16,abierto_16,target,desc_act_norm,id_act_norm,ab_17_19,loc_dist_act,ab_dist_act_17_19,total_loc_act,total_ab_act_17_19,loc_dist,ab_dist_17_19,loc_na_dist,ab_dist_act_17_19_rate,total_ab_act_17_19_rate,total_ab_dist_17_19_rate,total_na_dist_rate,loc_barrio_act,ab_barrio_act_17_19,loc_barrio,ab_barrio_17_19,loc_na_barrio,ab_barrio_act_17_19_rate,total_ab_barr_17_19_rate,total_na_barr_rate
0,10000003,1,CENTRO,104,JUSTICIA,44055459,447533853,Puerta Calle,1.0,Abierto,CALLE,BARCELO ...,NUM,5,,0,0,-1.0,SIN AGRUPACION,-1.0,SIN AGRUPACION,VITACA,I,HOSTELERIA,56,SERVICIOS DE COMIDAS Y BEBIDAS,561004,BAR RESTAURANTE,VITACA-BARCELO-5,Abierto,Abierto,Abierto,Abierto,Abierto,0,0,0,0,0,0,0,0,0,BAR RESTAURANTE,561004,0,726,233.0,4320,1298.0,14344,2926,2166,32.093664,30.046296,20.398773,15.10039,108,28.0,2086,528,131,25.925926,25.311601,6.279962
1,10000003,1,CENTRO,104,JUSTICIA,44055459,447533853,Puerta Calle,1.0,Abierto,CALLE,BARCELO ...,NUM,5,,0,0,-1.0,SIN AGRUPACION,-1.0,SIN AGRUPACION,VITACA,I,HOSTELERIA,56,SERVICIOS DE COMIDAS Y BEBIDAS,561006,CAFETERIA,VITACA-BARCELO-5,Abierto,Abierto,Abierto,Abierto,Abierto,0,0,0,0,0,0,0,0,0,CAFETERIA,561006,0,544,112.0,3247,935.0,14344,2926,2166,20.588235,28.795812,20.398773,15.10039,72,16.0,2086,528,131,22.222222,25.311601,6.279962


In [245]:
# Independent working copy of the feature frame.
# cl.reset is called for its effect on the frame (its return value is not used).
df_locals = df_local19_ft.copy()
cl.reset(df_locals)

# Frame shape next to the number of distinct locals
n_distinct_locals = df_locals['id_local'].unique().size
df_locals.shape, n_distinct_locals

((163642, 65), 147334)

In [246]:
df_locals.desc_epigrafe.value_counts()

LOCAL SIN ACTIVIDAD                                                                                                                       42782
SERVICIO DE PELUQUERIA                                                                                                                     5260
BAR CON COCINA                                                                                                                             4572
COMERCIO AL POR MENOR DE PRENDAS DE VESTIR EN ESTABLECIMIENTOS ESPECIALIZADOS                                                              4470
BAR RESTAURANTE                                                                                                                            4320
CAFETERIA                                                                                                                                  3247
RESTAURANTE                                                                                                                             

New variable 'num_act': number of different activities licensed for a single local 

In [247]:
# num_act = number of distinct activities (desc_epigrafe) registered for each local.
# A groupby/nunique gives the count directly; the previous version built a full
# locals x activities one-hot pivot just to sum its rows, and had to subtract
# id_local from the row sum because the id column was included in it.
df_locals_v3 = (
    df_locals
    .groupby('id_local')['desc_epigrafe']
    .nunique()
    .rename('num_act')
    .reset_index()
)
df_locals_v3.head()

desc_epigrafe,id_local,num_act
0,10000003,2
1,10000004,1
2,10000013,1
3,10000044,1
4,10000052,1


In [248]:
# Attach num_act to every activity row, then show the frame shape, the total row count
# and the number of rows that belong to locals with more than one activity
df_locals_v4 = pd.merge(df_locals, df_locals_v3, how='left', on='id_local')
cl.reset(df_locals_v4)
multi_activity_rows = df_locals_v4[df_locals_v4.num_act > 1]
df_locals_v4.shape, len(df_locals_v4), len(multi_activity_rows)

((163642, 66), 163642, 28038)

Now I convert UTM into geographical coordinates and count the number of locals with the same activity within a range (this function requires a lot of computing capacity, so I will apply it only to the analysis of one local at a time). 
First I have to unify the coordinates of collective groups (agrupaciones) and of single commerces in the same variable.

In [249]:
def _decimal_comma_to_float(values):
    """Parse a Series of numbers written with a decimal comma into floats.

    Values are cast to str first, so columns that pandas already read as
    numbers (no comma present) go through the same path as text columns.
    """
    return values.astype(str).str.replace(',', '.', regex=False).astype(float)


df_locals_v5 = df_locals_v4.copy()

# (raw column, parsed float column) — same four conversions as before, now through one
# helper so the local and grouped ('agrupacion') coordinates are treated consistently.
coord_columns = [
    ('coordenada_x_local', 'coord_x_f'),
    ('coordenada_y_local', 'coord_y_f'),
    ('coordenada_x_agrupacion', 'coord_x_af'),
    ('coordenada_y_agrup', 'coord_y_af'),
]
for raw_col, float_col in coord_columns:
    df_locals_v5[float_col] = _decimal_comma_to_float(df_locals_v5[raw_col])

# Locals accessed through a group ('Agrupado') take the group's coordinates;
# every other local keeps its own.
is_grouped = df_locals_v5['desc_tipo_acceso_local'] == 'Agrupado'
df_locals_v5['coord_x_final'] = np.where(is_grouped, df_locals_v5['coord_x_af'], df_locals_v5['coord_x_f'])
df_locals_v5['coord_y_final'] = np.where(is_grouped, df_locals_v5['coord_y_af'], df_locals_v5['coord_y_f'])

Now I generate lat and lon. I have to remove coordinates with coord_x_final (UTM easting) == 0.
Madrid is in UTM zone 30T.

In [250]:
import geopy
import utm

I have to drop rows with coord_x_final == 0 because they cannot be converted into geographic coordinates. They are less than 3% of the samples.

In [251]:
len(df_locals_v5[df_locals_v5['coord_x_final'] == 0])/len(df_locals_v5)

0.02818347368035101

In [252]:
# Keep only the rows whose final UTM easting is non-zero
has_easting = df_locals_v5.coord_x_final != 0
df_locals_v6 = df_locals_v5.loc[has_easting].copy()
cl.reset(df_locals_v6);

After the previous transformations, I convert UTM to lat/lon

In [253]:
# Convert the final UTM easting/northing columns to latitude/longitude (zone 30, band 'T')
eastings = df_locals_v6['coord_x_final']
northings = df_locals_v6['coord_y_final']
latitudes, longitudes = utm.to_latlon(eastings, northings, 30, 'T')
df_locals_v6['lat'] = latitudes
df_locals_v6['lon'] = longitudes

In [254]:
# Ratio of activity rows to distinct locals; the excess over 1 comes from locals
# registered with more than one activity
n_rows = df_locals_v6.shape[0]
n_locals = len(df_locals_v6.id_local.unique())
df_locals_v6.shape, n_rows / n_locals

((159030, 74), 1.110614493927691)

In [255]:
df_locals_v6.columns

Index(['id_local', 'id_distrito_local', 'desc_distrito_local',
       'id_barrio_local', 'desc_barrio_local', 'coordenada_x_local',
       'coordenada_y_local', 'desc_tipo_acceso_local', 'id_situacion_local',
       'desc_situacion_local', 'clase_vial_acceso', 'desc_vial_acceso',
       'nom_acceso', 'num_acceso', 'cal_acceso', 'coordenada_x_agrupacion',
       'coordenada_y_agrup', 'id_agrupacion', 'nombre_agrupacion',
       'id_tipo_agrup', 'desc_tipo_agrup', 'rotulo', 'id_seccion',
       'desc_seccion', 'id_division', 'desc_division', 'id_epigrafe',
       'desc_epigrafe', 'conc', 'desc_sit_loc_modif_19',
       'desc_sit_loc_modif_18', 'desc_sit_loc_modif_17',
       'desc_sit_loc_modif_16', 'desc_sit_loc_modif_15', 'cerrado_19',
       'abierto_19', 'cerrado_18', 'abierto_18', 'cerrado_17', 'abierto_17',
       'cerrado_16', 'abierto_16', 'target', 'desc_act_norm', 'id_act_norm',
       'ab_17_19', 'loc_dist_act', 'ab_dist_act_17_19', 'total_loc_act',
       'total_ab_act_17_19'

In [175]:
df_locals_v6.head(2)

Unnamed: 0,id_local,id_distrito_local,desc_distrito_local,id_barrio_local,desc_barrio_local,coordenada_x_local,coordenada_y_local,desc_tipo_acceso_local,id_situacion_local,desc_situacion_local,clase_vial_acceso,desc_vial_acceso,nom_acceso,num_acceso,cal_acceso,coordenada_x_agrupacion,coordenada_y_agrup,id_agrupacion,nombre_agrupacion,id_tipo_agrup,desc_tipo_agrup,rotulo,id_seccion,desc_seccion,id_division,desc_division,id_epigrafe,desc_epigrafe,conc,desc_sit_loc_modif_19,desc_sit_loc_modif_18,desc_sit_loc_modif_17,desc_sit_loc_modif_16,desc_sit_loc_modif_15,cerrado_19,abierto_19,cerrado_18,abierto_18,cerrado_17,abierto_17,cerrado_16,abierto_16,target,desc_act_norm,id_act_norm,ab_17_19,loc_dist_act,ab_dist_act_17_19,total_loc_act,total_ab_act_17_19,loc_dist,ab_dist_17_19,ab_dist_act_17_19_rate,total_ab_act_17_19_rate,total_ab_dist_17_19_rate,loc_barrio_act,ab_barrio_act_17_19,loc_barrio,ab_barrio_17_19,ab_barrio_act_17_19_rate,total_ab_barr_17_19_rate,num_act,prueba,coord_x_f,coord_y_f,coord_x_af,coord_y_af,coord_x_final,coord_y_final,lat,lon
0,10000003,1,CENTRO,104,JUSTICIA,44055459,447533853,Puerta Calle,1.0,Abierto,CALLE,BARCELO ...,NUM,5,,0,0,-1.0,SIN AGRUPACION,-1.0,SIN AGRUPACION,VITACA,I,HOSTELERIA,56,SERVICIOS DE COMIDAS Y BEBIDAS,561004,BAR RESTAURANTE,VITACA-BARCELO-5,Abierto,Abierto,Abierto,Abierto,Abierto,0,0,0,0,0,0,0,0,0,BAR RESTAURANTE,561004,0,726,233.0,4320,1298.0,14344,2926,32.093664,30.046296,20.398773,108,28.0,2086,528,25.925926,25.311601,2,prueba,440554.59,4475338.53,0.0,0.0,440554.59,4475338.53,40.426558,-3.700788
1,10000003,1,CENTRO,104,JUSTICIA,44055459,447533853,Puerta Calle,1.0,Abierto,CALLE,BARCELO ...,NUM,5,,0,0,-1.0,SIN AGRUPACION,-1.0,SIN AGRUPACION,VITACA,I,HOSTELERIA,56,SERVICIOS DE COMIDAS Y BEBIDAS,561006,CAFETERIA,VITACA-BARCELO-5,Abierto,Abierto,Abierto,Abierto,Abierto,0,0,0,0,0,0,0,0,0,CAFETERIA,561006,0,544,112.0,3247,935.0,14344,2926,20.588235,28.795812,20.398773,72,16.0,2086,528,22.222222,25.311601,2,prueba,440554.59,4475338.53,0.0,0.0,440554.59,4475338.53,40.426558,-3.700788


### Population info

District-level population information

In [256]:
# District-level table: df1 left-joined with df2 on the district id
censo = df1.merge(df2, how='left', on='id_distrito_local')

# Columns removed here: the district name plus the household-composition breakdown
household_cols = [
    'distrito',
    'Una mujer sola de 16 a 64 años ',
    'Un hombre solo de 16 a 64 años',
    'Una mujer sola de 65 o más años',
    'Un hombre solo de 65 o más años',
    'Una mujer adulta  con uno o más menores',
    'Un hombre adulto  con uno o más menores',
    'Dos adultos de 16 a 64 años, sin menores',
    'Dos adultos, uno al menos de 65 o más años, sin menores',
    'Dos adultos y un menor',
    'Dos adultos y dos menores',
    'Dos adultos y tres o más menores',
    'Dos adultos de 35 años o más, otro de 16 a 34 años, sin menores',
    'Dos adultos de 35 años o más, otro de 16 a 34 años y un menor',
    'Dos adultos de 35 años o más, otro de 16 a 34 años y dos o más menores',
    'Otro hogar de tres adultos, con o sin menores',
    'Dos adultos de 35 años o más, dos de 16 a 34 años, sin menores',
    'Dos adultos de 35 años o más, dos de 16 a 34 años y un menor',
    'Dos adultos de 35 años o más, dos  de 16 a 34 años y dos o más menores',
    'Otro hogar de cuatro adultos, con o sin menores',
    'Cinco o más adultos, con o sin menores',
    'Hogar con 15 ó más habitantes',
    'Hogares con menores solos',
]
censo = censo.drop(columns=household_cols)
censo.columns

Index(['id_distrito_local', 'Población', 'Hombre', 'Mujeres',
       'Densidad (Habitantes / Ha.)', 'Edad promedio', 'Total Hogares',
       'Españoles', 'Extranjeros', 'Mixtos', 'España fuera barrio dia laboral',
       'Extranjero fuera barrio dia laboral', 'Total fuera barrio dia laboral',
       'España fuera barrio fin semana', 'Extranjero fuera barrio fin semana',
       'Total fuera barrio fin semana', 'Total barrio dia laboral',
       'Total barrio fin semana', 'Total trabajo dia laboral',
       'Total trabajo fin semana'],
      dtype='object')

In [257]:
# Weekend + working-day totals for each kind of presence:
# 'trabajo' (work), 'fuera barrio' (outside the neighbourhood) and 'barrio' (neighbourhood)
work_total = censo['Total trabajo fin semana'] + censo['Total trabajo dia laboral']
outside_total = censo['Total fuera barrio fin semana'] + censo['Total fuera barrio dia laboral']
neighbourhood_total = censo['Total barrio fin semana'] + censo['Total barrio dia laboral']

# Overall weekly total, then the share of each kind over that total
censo['total_TF_week'] = work_total + outside_total + neighbourhood_total

censo['ratio_t_total'] = work_total / censo['total_TF_week']
censo['ratio_fb_total'] = outside_total / censo['total_TF_week']
censo['ratio_b_total'] = neighbourhood_total / censo['total_TF_week']

I create a new variable 'dist_type' to codify the type of district:
- 'Commercial' == 1
- 'Residential' == 2
- 'Mix' == 3 

In [258]:
# 75th-percentile thresholds for the 'fuera barrio' and 'barrio' ratios
fb_75 = np.percentile(censo['ratio_fb_total'], 75)
b_75 = np.percentile(censo['ratio_b_total'], 75)

def dist_type(ratio_fb, ratio_b):
    """Return the district type code for a pair of ratios.

    1 ('Commercial')  when ratio_fb is above the fb_75 threshold,
    2 ('Residential') otherwise, when ratio_b is above the b_75 threshold,
    3 ('Mix')         in every other case.
    """
    if ratio_fb > fb_75:
        return 1
    if ratio_b > b_75:
        return 2
    return 3

censo['dist_type'] = [
    dist_type(ratio_fb, ratio_b)
    for ratio_fb, ratio_b in zip(censo['ratio_fb_total'], censo['ratio_b_total'])
]

In [259]:
censo.head()

Unnamed: 0,id_distrito_local,Población,Hombre,Mujeres,Densidad (Habitantes / Ha.),Edad promedio,Total Hogares,Españoles,Extranjeros,Mixtos,España fuera barrio dia laboral,Extranjero fuera barrio dia laboral,Total fuera barrio dia laboral,España fuera barrio fin semana,Extranjero fuera barrio fin semana,Total fuera barrio fin semana,Total barrio dia laboral,Total barrio fin semana,Total trabajo dia laboral,Total trabajo fin semana,total_TF_week,ratio_t_total,ratio_fb_total,ratio_b_total,dist_type
0,1,115198,58027,57171,257.985172,44.026383,67296,50164,8979,8153,1956512,348482,2304994,960027,169516,1129543,623007,241830,363775,110026,4773175,0.099263,0.71955,0.181187,1
1,2,123013,56064,66949,238.046748,44.545401,65005,56759,2798,5448,900033,45873,945906,359245,18421,377666,655916,254049,190696,55740,2479973,0.09937,0.533704,0.366925,3
2,3,95784,42275,53509,218.394417,47.041636,48673,43211,1881,3581,1085546,115080,1200626,423688,52559,476247,512972,197762,222149,64114,2673870,0.107059,0.627133,0.265807,1
3,4,117988,50420,67568,271.025704,46.241563,62951,51490,5405,6056,1299023,122121,1421144,517838,55491,573329,625927,241789,377950,111528,3351667,0.14604,0.595069,0.258891,3
4,5,114571,49918,64653,158.972579,45.38898,58240,49825,3021,5394,1482648,112338,1594986,530814,41799,572613,629355,242013,389699,106530,3535196,0.140368,0.613148,0.246484,1


In [260]:
censo.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21 entries, 0 to 20
Data columns (total 25 columns):
id_distrito_local                      21 non-null int64
Población                              21 non-null int64
Hombre                                 21 non-null int64
Mujeres                                21 non-null int64
Densidad (Habitantes / Ha.)            21 non-null float64
Edad promedio                          21 non-null float64
Total Hogares                          21 non-null int64
Españoles                              21 non-null int64
Extranjeros                            21 non-null int64
Mixtos                                 21 non-null int64
España fuera barrio dia laboral        21 non-null int64
Extranjero fuera barrio dia laboral    21 non-null int64
Total fuera barrio dia laboral         21 non-null int64
España fuera barrio fin semana         21 non-null int64
Extranjero fuera barrio fin semana     21 non-null int64
Total fuera barrio fin semana         

I check the statistics. The extreme values appear mainly in the floating-traffic figures, which differ widely between districts.

In [261]:
censo.describe()

Unnamed: 0,id_distrito_local,Población,Hombre,Mujeres,Densidad (Habitantes / Ha.),Edad promedio,Total Hogares,Españoles,Extranjeros,Mixtos,España fuera barrio dia laboral,Extranjero fuera barrio dia laboral,Total fuera barrio dia laboral,España fuera barrio fin semana,Extranjero fuera barrio fin semana,Total fuera barrio fin semana,Total barrio dia laboral,Total barrio fin semana,Total trabajo dia laboral,Total trabajo fin semana,total_TF_week,ratio_t_total,ratio_fb_total,ratio_b_total,dist_type
count,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0
mean,11.0,120238.52381,54560.952381,65677.571429,142.549481,43.907969,61436.380952,51351.142857,3957.52381,6127.714286,913955.3,69671.047619,983626.3,363881.047619,29698.857143,393579.9,668051.2,259142.714286,234471.761905,66906.761905,2605779.0,0.113124,0.514946,0.37193,2.285714
std,6.204837,43415.307389,19672.3933,23920.86213,98.207354,2.243709,21613.443271,17825.203798,2117.751228,2728.174191,396771.2,77584.49718,459672.6,181078.054614,37716.764627,213907.5,235460.0,91519.726899,100183.33312,27777.790499,860151.8,0.02109,0.10875,0.120509,0.845154
min,1.0,35949.0,16869.0,19080.0,10.344042,38.825609,18269.0,15912.0,815.0,1542.0,306586.0,4929.0,311515.0,122010.0,2035.0,124045.0,202060.0,78679.0,80461.0,24059.0,1124201.0,0.083173,0.356969,0.172583,1.0
25%,6.0,95784.0,42275.0,53509.0,68.661063,42.279377,48673.0,39013.0,2346.0,4341.0,630155.0,17628.0,647617.0,246339.0,7104.0,252667.0,533730.0,205283.0,153482.0,42229.0,1901033.0,0.092973,0.430957,0.264063,2.0
50%,11.0,115198.0,51143.0,64653.0,156.841593,44.026383,62013.0,51347.0,3361.0,5817.0,865594.0,53443.0,899526.0,326400.0,19866.0,343933.0,629355.0,242013.0,222149.0,64114.0,2643653.0,0.114327,0.499644,0.377971,3.0
75%,16.0,139147.0,64183.0,74964.0,218.394417,45.783141,70099.0,60925.0,5405.0,8153.0,1156001.0,86168.0,1216536.0,423688.0,39454.0,476247.0,765169.0,293485.0,317615.0,90215.0,3152694.0,0.129359,0.606578,0.473789,3.0
max,21.0,192933.0,87414.0,105519.0,298.017711,47.545059,95680.0,80186.0,8979.0,11865.0,1956512.0,348482.0,2304994.0,960027.0,169516.0,1129543.0,1093869.0,427359.0,389699.0,111528.0,4773175.0,0.146505,0.71955,0.553191,3.0


I merge locals with censo

In [270]:
# Raw and intermediate coordinate columns; only coord_x_final / coord_y_final / lat / lon are kept
coord_cols_to_drop = ['coordenada_x_local', 'coordenada_y_local',
                      'coordenada_x_agrupacion', 'coordenada_y_agrup',
                      'coord_x_f', 'coord_y_f', 'coord_x_af', 'coord_y_af']

# Join the district-level censo data onto every local, order by id_local and fill gaps.
# NOTE(review): fillna('No disponible') is applied to the whole frame, so any numeric
# column containing NaN becomes a mixed object column — confirm this is intended.
df_locals_v7 = (
    df_locals_v6
    .merge(censo, how='left', on='id_distrito_local')
    .sort_values('id_local')
    .reset_index(drop=True)
    .fillna('No disponible')
    .drop(columns=coord_cols_to_drop)
)
cl.reset(df_locals_v7)
df_locals_v7.columns, df_locals_v7.shape

(Index(['id_local', 'id_distrito_local', 'desc_distrito_local',
        'id_barrio_local', 'desc_barrio_local', 'desc_tipo_acceso_local',
        'id_situacion_local', 'desc_situacion_local', 'clase_vial_acceso',
        'desc_vial_acceso', 'nom_acceso', 'num_acceso', 'cal_acceso',
        'id_agrupacion', 'nombre_agrupacion', 'id_tipo_agrup',
        'desc_tipo_agrup', 'rotulo', 'id_seccion', 'desc_seccion',
        'id_division', 'desc_division', 'id_epigrafe', 'desc_epigrafe', 'conc',
        'desc_sit_loc_modif_19', 'desc_sit_loc_modif_18',
        'desc_sit_loc_modif_17', 'desc_sit_loc_modif_16',
        'desc_sit_loc_modif_15', 'cerrado_19', 'abierto_19', 'cerrado_18',
        'abierto_18', 'cerrado_17', 'abierto_17', 'cerrado_16', 'abierto_16',
        'target', 'desc_act_norm', 'id_act_norm', 'ab_17_19', 'loc_dist_act',
        'ab_dist_act_17_19', 'total_loc_act', 'total_ab_act_17_19', 'loc_dist',
        'ab_dist_17_19', 'loc_na_dist', 'ab_dist_act_17_19_rate',
        'total

In [269]:
df_locals_v7.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159030 entries, 0 to 159029
Data columns (total 98 columns):
id_local                               159030 non-null int64
id_distrito_local                      159030 non-null int64
desc_distrito_local                    159030 non-null object
id_barrio_local                        159030 non-null int64
desc_barrio_local                      159030 non-null object
coordenada_x_local                     159030 non-null object
coordenada_y_local                     159030 non-null object
desc_tipo_acceso_local                 159030 non-null object
id_situacion_local                     159030 non-null float64
desc_situacion_local                   159030 non-null object
clase_vial_acceso                      159030 non-null object
desc_vial_acceso                       159030 non-null object
nom_acceso                             159030 non-null object
num_acceso                             159030 non-null int64
cal_acceso              

## File with the locals for the study
I save this DataFrame to a CSV: I am going to test locals with no activity and with activity

In [271]:
# Save the full set of locals (with and without activity) for the study
locals_total_path = 'Data/censolocales/locals_total.csv'
df_locals_v7.to_csv(locals_total_path, index=None, header=True)

## Filter to increase the 1s

I keep the activities that have the most target values for modelling: % of ones vs total > 4%

In [291]:
# Leave out the rows with id_epigrafe == 0 before filtering activities.
df_locals_v8 = df_locals_v7[df_locals_v7.id_epigrafe != 0]

# NOTE(review): 100 and 0.04 are presumably the minimum number of locals per activity and
# the minimum share of ones — confirm against clean_functions.
id_act_fil = cl.act_filter_id_norm(df_locals_v8, 100, 0.04)
desc_act_fil = cl.act_filter_desc_norm(df_locals_v8, 100, 0.04)

# Map every activity description to the first id_epigrafe found for it (same value the
# previous per-activity loop picked with unique()[0]), built in a single pass.
desc_to_id = (
    df_locals_v8
    .drop_duplicates('desc_epigrafe')
    .set_index('desc_epigrafe')['id_epigrafe']
)
# The builtin int replaces np.int, which was deprecated in NumPy 1.20 and removed in 1.24.
desc_act_fil['id'] = desc_act_fil.index.map(desc_to_id).astype(int)
desc_act_fil

Unnamed: 0,act_count,ones,perc,id
TRATAMIENTO HIGIENICO DE ANIMALES (PELUQUERIAS),215,30.0,0.139535,960902
COMERCIO AL POR MENOR DE CASQUERIA,137,13.0,0.094891,472207
CENTROS DE JUEGOS O CELEBRACIONES INFANTILES SIN COCINA,120,11.0,0.091667,932002
CLINICA VETERINARIA SIN TRATAMIENTO HIGIENICO,102,8.0,0.078431,750003
COMERCIO AL POR MENOR DE CARNICERIA-CHARCUTERIA,219,16.0,0.073059,472203
COMERCIO AL POR MENOR DE ARTICULOS DEPORTIVOS,315,23.0,0.073016,476403
REPARACION DE OTROS EFECTOS PERSONALES Y ARTICULOS DE USO DOMESTICO N.C.O.P.,424,30.0,0.070755,952005
COMERCIO AL POR MENOR DE COMPLEMENTOS Y ALIMENTOS PARA ANIMALES DE COMPAÑIA,236,16.0,0.067797,477602
COMERCIO AL POR MENOR DE PESCADOS Y MARISCOS SIN OBRADOR,864,58.0,0.06713,472302
ESTABLECIMIENTOS DE VENTA DE PLATOS PREPARADOS CON OBRADOR,772,50.0,0.064767,472407


In [292]:
len(desc_act_fil),desc_act_fil.ones.sum(),desc_act_fil.act_count.sum()

(25, 641.0, 10709)

There are 25 activities + locals with no activity. I have improved the % of ones vs zeros in the DataFrame (from 2% to 6%). I will start with this.

In [293]:
# V7 includes 'LOCALES SIN ACTIVDAD' and v8 removes 'LOCALES SIN ACTIVIDAD'
len(df_locals_v7[df_locals_v7.target==1])/len(df_locals_v7),len(df_locals_v8[df_locals_v8.target==1])/len(df_locals_v8)

(0.06572344840596114, 0.020744553915385402)

In [294]:
desc_act_fil.ones.sum()/desc_act_fil.act_count.sum()

0.059856195723223454

I will do the first modelling with these activities (desc_act_fil.id.values)

In [295]:
id_epig_keep = desc_act_fil.id.values

In [296]:
# Keep only the rows whose normalised activity id is in the selected set.
df_locals_short = df_locals_v8[df_locals_v8.id_act_norm.isin(id_epig_keep)].copy()
# Reset the frame that was just created. The call previously targeted the older
# `df_locals` frame, unlike every other cell, which resets its own new frame.
cl.reset(df_locals_short)
df_locals_short.shape, len(df_locals_short.id_local.unique()), len(id_epig_keep)

((10709, 90), 9000, 25)

I calculate the number of locals with the same activity within a radius (500 m)

In [301]:
from geopy.distance import geodesic

def points_in_radius(point1, point2,radius):
    """Tell whether two points are strictly closer than `radius` metres.

    The separation is the geodesic distance between `point1` and `point2`
    as computed by geopy's `geodesic`, expressed in metres.

    Returns True when that distance is below `radius`, False otherwise.
    """
    separation_m = geodesic(point1, point2).meters
    return separation_m < radius

# Work on a copy and store each row's (lat, lon) pair as a single tuple column.
df_ = df_locals_short.copy()
df_['point'] = list(zip(df_['lat'], df_['lon']))

# For every activity description, gather the points of all its rows into a list
# and join that list back onto each row sharing the same description.
df_ = pd.merge(
    df_.groupby('desc_epigrafe')['point'].agg(list).reset_index(name='tipo_puntos'),
    df_,
    on='desc_epigrafe',
    how='inner',
)

# Count the points of the same activity closer than 500 m; the row's own point is
# always within range of itself, hence the -1.
df_['points_in_radius'] = [
    sum(points_in_radius(propio, punto, radius=500) for punto in mismos_tipo) - 1
    for propio, mismos_tipo in zip(df_['point'], df_['tipo_puntos'])
]

# The helper columns are no longer needed once the count is stored.
df_ = df_.drop(columns=['tipo_puntos', 'point'])
cl.reset(df_)
df__ = df_.copy()

In [302]:
# Preview the first rows, including the new `points_in_radius` column.
df__.head()

Unnamed: 0,desc_epigrafe,id_local,id_distrito_local,desc_distrito_local,id_barrio_local,desc_barrio_local,desc_tipo_acceso_local,id_situacion_local,desc_situacion_local,clase_vial_acceso,desc_vial_acceso,nom_acceso,num_acceso,cal_acceso,id_agrupacion,nombre_agrupacion,id_tipo_agrup,desc_tipo_agrup,rotulo,id_seccion,desc_seccion,id_division,desc_division,id_epigrafe,conc,desc_sit_loc_modif_19,desc_sit_loc_modif_18,desc_sit_loc_modif_17,desc_sit_loc_modif_16,desc_sit_loc_modif_15,cerrado_19,abierto_19,cerrado_18,abierto_18,cerrado_17,abierto_17,cerrado_16,abierto_16,target,desc_act_norm,id_act_norm,ab_17_19,loc_dist_act,ab_dist_act_17_19,total_loc_act,total_ab_act_17_19,loc_dist,ab_dist_17_19,loc_na_dist,ab_dist_act_17_19_rate,total_ab_act_17_19_rate,total_ab_dist_17_19_rate,total_na_dist_rate,loc_barrio_act,ab_barrio_act_17_19,loc_barrio,ab_barrio_17_19,loc_na_barrio,ab_barrio_act_17_19_rate,total_ab_barr_17_19_rate,total_na_barr_rate,num_act,coord_x_final,coord_y_final,lat,lon,Población,Hombre,Mujeres,Densidad (Habitantes / Ha.),Edad promedio,Total Hogares,Españoles,Extranjeros,Mixtos,España fuera barrio dia laboral,Extranjero fuera barrio dia laboral,Total fuera barrio dia laboral,España fuera barrio fin semana,Extranjero fuera barrio fin semana,Total fuera barrio fin semana,Total barrio dia laboral,Total barrio fin semana,Total trabajo dia laboral,Total trabajo fin semana,total_TF_week,ratio_t_total,ratio_fb_total,ratio_b_total,dist_type,points_in_radius
0,"COMERCIO AL POR MENOR DE PASTELERIA, CONFITERI...",10000102,1,CENTRO,106,SOL,Puerta Calle,1.0,Abierto,PLAZA,PUERTA DEL SOL ...,NUM,8,,-1.0,SIN AGRUPACION,-1.0,SIN AGRUPACION,LA MALLORQUINA,G,COMERCIO AL POR MAYOR Y AL POR MENOR; REPARACI...,47,"COMERCIO AL POR MENOR, EXCEPTO DE VEHICULOS DE...",472403,LA MALLORQUINA-PUERTA DEL SOL-8,Abierto,Abierto,Abierto,Abierto,Abierto,0,0,0,0,0,0,0,0,0,"COMERCIO AL POR MENOR DE PASTELERIA, CONFITERI...",472403,0,85,41.0,357,143.0,14344,2926,2166,48.235294,40.056022,20.398773,15.10039,8,3.0,1755,381,128,37.5,21.709402,7.293447,2,440208.59,4474240.53,40.416642,-3.704763,115198,58027,57171,257.985172,44.026383,67296,50164,8979,8153,1956512,348482,2304994,960027,169516,1129543,623007,241830,363775,110026,4773175,0.099263,0.71955,0.181187,1,16
1,"COMERCIO AL POR MENOR DE AVES, HUEVOS Y CAZA S...",10000162,1,CENTRO,104,JUSTICIA,Agrupado,1.0,Abierto,CALLE,BARCELO ...,NUM,6,,99000056.0,MERCADO MUNICIPAL DE BARCELO,12.0,Mercado Municipal,POLLERIAS RODRIGUEZ & MARTINEZ,G,COMERCIO AL POR MAYOR Y AL POR MENOR; REPARACI...,47,"COMERCIO AL POR MENOR, EXCEPTO DE VEHICULOS DE...",472206,POLLERIAS RODRIGUEZ & MARTINEZ-BARCELO-6,Abierto,Abierto,Abierto,Abierto,No disponible,0,0,0,0,0,0,0,1,0,"COMERCIO AL POR MENOR DE AVES, HUEVOS Y CAZA S...",472206,0,48,14.0,826,147.0,14344,2926,2166,29.166667,17.79661,20.398773,15.10039,6,0.0,2086,528,131,0.0,25.311601,6.279962,1,440692.59,4475369.53,40.426848,-3.699164,115198,58027,57171,257.985172,44.026383,67296,50164,8979,8153,1956512,348482,2304994,960027,169516,1129543,623007,241830,363775,110026,4773175,0.099263,0.71955,0.181187,1,5
2,COMERCIO AL POR MENOR DE CARNICERIA,10000224,1,CENTRO,102,EMBAJADORES,Agrupado,1.0,Abierto,CALLE,EMBAJADORES ...,NUM,41,,99000057.0,MERCADO MUNICIPAL DE SAN FERNANDO,12.0,Mercado Municipal,SIN ROTULO,G,COMERCIO AL POR MAYOR Y AL POR MENOR; REPARACI...,47,"COMERCIO AL POR MENOR, EXCEPTO DE VEHICULOS DE...",472201,SIN ROTULO-EMBAJADORES-41,Abierto,No disponible,No disponible,No disponible,No disponible,0,1,0,0,0,0,0,0,0,COMERCIO AL POR MENOR DE CARNICERIA,472201,1,80,26.0,1125,224.0,14344,2926,2166,32.5,19.911111,20.398773,15.10039,33,10.0,3977,697,810,30.30303,17.525773,20.367111,1,440295.59,4473246.53,40.407694,-3.703645,115198,58027,57171,257.985172,44.026383,67296,50164,8979,8153,1956512,348482,2304994,960027,169516,1129543,623007,241830,363775,110026,4773175,0.099263,0.71955,0.181187,1,20
3,COMERCIO AL POR MENOR DE CARNICERIA,10000226,1,CENTRO,102,EMBAJADORES,Agrupado,1.0,Abierto,CALLE,DUQUE DE FERNAN NUÑEZ ...,NUM,4,,99000065.0,MERCADO MUNICIPAL ANTON MARTIN,12.0,Mercado Municipal,CARNICERIA M LOPEZ,G,COMERCIO AL POR MAYOR Y AL POR MENOR; REPARACI...,47,"COMERCIO AL POR MENOR, EXCEPTO DE VEHICULOS DE...",472201,CARNICERIA M LOPEZ-DUQUE DE FERNAN NUÑEZ-4,Abierto,Abierto,Abierto,Abierto,Abierto,0,0,0,0,0,0,0,0,0,COMERCIO AL POR MENOR DE CARNICERIA,472201,0,80,26.0,1125,224.0,14344,2926,2166,32.5,19.911111,20.398773,15.10039,33,10.0,3977,697,810,30.30303,17.525773,20.367111,1,440706.59,4473659.52,40.411444,-3.69884,115198,58027,57171,257.985172,44.026383,67296,50164,8979,8153,1956512,348482,2304994,960027,169516,1129543,623007,241830,363775,110026,4773175,0.099263,0.71955,0.181187,1,18
4,COMERCIO AL POR MENOR DE CARNICERIA,10000238,1,CENTRO,102,EMBAJADORES,Agrupado,1.0,Abierto,CALLE,DUQUE DE FERNAN NUÑEZ ...,NUM,4,,99000065.0,MERCADO MUNICIPAL ANTON MARTIN,12.0,Mercado Municipal,CARNICERIA MARIANO ARAUJO,G,COMERCIO AL POR MAYOR Y AL POR MENOR; REPARACI...,47,"COMERCIO AL POR MENOR, EXCEPTO DE VEHICULOS DE...",472201,CARNICERIA MARIANO ARAUJO-DUQUE DE FERNAN NUÑEZ-4,Abierto,Abierto,Abierto,No disponible,No disponible,0,0,0,0,0,1,0,0,0,COMERCIO AL POR MENOR DE CARNICERIA,472201,1,80,26.0,1125,224.0,14344,2926,2166,32.5,19.911111,20.398773,15.10039,33,10.0,3977,697,810,30.30303,17.525773,20.367111,1,440706.59,4473659.52,40.411444,-3.69884,115198,58027,57171,257.985172,44.026383,67296,50164,8979,8153,1956512,348482,2304994,960027,169516,1129543,623007,241830,363775,110026,4773175,0.099263,0.71955,0.181187,1,18


In [303]:
# Persist the short dataset with a header row and without the index column.
# `index` is documented as a boolean; `None` only worked because it is falsy.
df__.to_csv('Data/censolocales/locals_short.csv', index=False, header=True)

In [304]:
# Column-by-column summary (non-null counts and dtypes) of the saved frame.
df__.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10709 entries, 0 to 10708
Data columns (total 91 columns):
desc_epigrafe                          10709 non-null object
id_local                               10709 non-null int64
id_distrito_local                      10709 non-null int64
desc_distrito_local                    10709 non-null object
id_barrio_local                        10709 non-null int64
desc_barrio_local                      10709 non-null object
desc_tipo_acceso_local                 10709 non-null object
id_situacion_local                     10709 non-null float64
desc_situacion_local                   10709 non-null object
clase_vial_acceso                      10709 non-null object
desc_vial_acceso                       10709 non-null object
nom_acceso                             10709 non-null object
num_acceso                             10709 non-null int64
cal_acceso                             10709 non-null object
id_agrupacion                          1