## Padron Comunidad de Madrid

In [1]:
#Create a spark context
import findspark
findspark.init()

import pyspark

from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("CSV to Dataset").getOrCreate()

In [2]:
spark

In [3]:
#Read csv file
df = spark.read.options(header=True, delimiter=';', quota='', inferSchema=True) \
    .csv('padron_madrid/Rango_Edades_Seccion_202011.csv')
df.printSchema()
df.show(5)

root
 |-- COD_DISTRITO: integer (nullable = true)
 |-- DESC_DISTRITO: string (nullable = true)
 |-- COD_DIST_BARRIO: integer (nullable = true)
 |-- DESC_BARRIO: string (nullable = true)
 |-- COD_BARRIO: integer (nullable = true)
 |-- COD_DIST_SECCION: integer (nullable = true)
 |-- COD_SECCION: integer (nullable = true)
 |-- COD_EDAD_INT: integer (nullable = true)
 |-- EspanolesHombres: integer (nullable = true)
 |-- EspanolesMujeres: integer (nullable = true)
 |-- ExtranjerosHombres: integer (nullable = true)
 |-- ExtranjerosMujeres: integer (nullable = true)

+------------+--------------------+---------------+--------------------+----------+----------------+-----------+------------+----------------+----------------+------------------+------------------+
|COD_DISTRITO|       DESC_DISTRITO|COD_DIST_BARRIO|         DESC_BARRIO|COD_BARRIO|COD_DIST_SECCION|COD_SECCION|COD_EDAD_INT|EspanolesHombres|EspanolesMujeres|ExtranjerosHombres|ExtranjerosMujeres|
+------------+--------------------+-

In [4]:
#Convert pyspark dataframe to pandas
df_pandas = df.toPandas()
df_pandas.head()

Unnamed: 0,COD_DISTRITO,DESC_DISTRITO,COD_DIST_BARRIO,DESC_BARRIO,COD_BARRIO,COD_DIST_SECCION,COD_SECCION,COD_EDAD_INT,EspanolesHombres,EspanolesMujeres,ExtranjerosHombres,ExtranjerosMujeres
0,1,CENTRO,101,PALACIO,1,1001,1,0,5.0,,1.0,
1,1,CENTRO,101,PALACIO,1,1001,1,1,2.0,3.0,,5.0
2,1,CENTRO,101,PALACIO,1,1001,1,2,6.0,1.0,1.0,
3,1,CENTRO,101,PALACIO,1,1001,1,3,2.0,,,3.0
4,1,CENTRO,101,PALACIO,1,1001,1,4,3.0,3.0,,


### Analysis from pandas df

In [5]:
import pandas as pd
import numpy as np

In [6]:
df_pandas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 237593 entries, 0 to 237592
Data columns (total 12 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   COD_DISTRITO        237593 non-null  int32  
 1   DESC_DISTRITO       237593 non-null  object 
 2   COD_DIST_BARRIO     237593 non-null  int32  
 3   DESC_BARRIO         237593 non-null  object 
 4   COD_BARRIO          237593 non-null  int32  
 5   COD_DIST_SECCION    237593 non-null  int32  
 6   COD_SECCION         237593 non-null  int32  
 7   COD_EDAD_INT        237593 non-null  int32  
 8   EspanolesHombres    221278 non-null  float64
 9   EspanolesMujeres    231725 non-null  float64
 10  ExtranjerosHombres  104382 non-null  float64
 11  ExtranjerosMujeres  115391 non-null  float64
dtypes: float64(4), int32(6), object(2)
memory usage: 16.3+ MB


In [7]:
#Replace nan values
df_pandas.fillna(0, inplace=True)

### Group by district the information and get some metrics

In [8]:
#Group by district and for checking the metrics, transform float numbers to integer

cols = ['EspanolesHombres', 'EspanolesMujeres', 'ExtranjerosHombres', 'ExtranjerosMujeres']
df_distrito = df_pandas.groupby('DESC_DISTRITO')[cols].sum().astype(np.int32)
df_distrito

Unnamed: 0_level_0,EspanolesHombres,EspanolesMujeres,ExtranjerosHombres,ExtranjerosMujeres
DESC_DISTRITO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ARGANZUELA,64741,74135,7846,8822
BARAJAS,21958,23181,2330,2855
CARABANCHEL,94477,109155,26908,29896
CENTRO,52792,52888,19213,16641
CHAMARTIN,60142,71775,6079,8984
CHAMBERI,54547,68141,7384,9946
CIUDAD LINEAL,83900,100556,15520,18754
FUENCARRAL-EL PARDO,107416,118927,10018,13068
HORTALEZA,82401,89914,9590,12125
LATINA,92403,106689,19650,23079


In [9]:
print(df_distrito.info())

<class 'pandas.core.frame.DataFrame'>
Index: 21 entries, ARGANZUELA           to VILLAVERDE          
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype
---  ------              --------------  -----
 0   EspanolesHombres    21 non-null     int32
 1   EspanolesMujeres    21 non-null     int32
 2   ExtranjerosHombres  21 non-null     int32
 3   ExtranjerosMujeres  21 non-null     int32
dtypes: int32(4)
memory usage: 504.0+ bytes
None


In [10]:
df_distrito.agg([np.sum, np.mean]).astype(np.int32)


Unnamed: 0,EspanolesHombres,EspanolesMujeres,ExtranjerosHombres,ExtranjerosMujeres
sum,1311449,1496960,243910,281632
mean,62449,71283,11614,13411


In [11]:
df_distrito.describe()

Unnamed: 0,EspanolesHombres,EspanolesMujeres,ExtranjerosHombres,ExtranjerosMujeres
count,21.0,21.0,21.0,21.0
mean,62449.952381,71283.809524,11614.761905,13411.047619
std,21964.720719,25266.259811,6869.291622,7132.250672
min,21958.0,23181.0,2330.0,2855.0
25%,50019.0,57626.0,6079.0,8573.0
50%,57613.0,69291.0,9590.0,12264.0
75%,82401.0,89914.0,16625.0,17997.0
max,107416.0,118927.0,26908.0,29896.0


In [12]:
#New columns for total españoles and total extranjeros

df_distrito['Total_espanoles'] = df_distrito[['EspanolesHombres', 'EspanolesMujeres']].sum(axis=1)
df_distrito['Total_extranjeros'] = df_distrito[['ExtranjerosHombres', 'ExtranjerosMujeres']].sum(axis=1)

In [13]:
df_distrito

Unnamed: 0_level_0,EspanolesHombres,EspanolesMujeres,ExtranjerosHombres,ExtranjerosMujeres,Total_espanoles,Total_extranjeros
DESC_DISTRITO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ARGANZUELA,64741,74135,7846,8822,138876,16668
BARAJAS,21958,23181,2330,2855,45139,5185
CARABANCHEL,94477,109155,26908,29896,203632,56804
CENTRO,52792,52888,19213,16641,105680,35854
CHAMARTIN,60142,71775,6079,8984,131917,15063
CHAMBERI,54547,68141,7384,9946,122688,17330
CIUDAD LINEAL,83900,100556,15520,18754,184456,34274
FUENCARRAL-EL PARDO,107416,118927,10018,13068,226343,23086
HORTALEZA,82401,89914,9590,12125,172315,21715
LATINA,92403,106689,19650,23079,199092,42729


In [14]:
#Distrtibution of total population by district
df_pandas['DESC_DISTRITO'].value_counts(normalize=True)

LATINA                  0.081993
FUENCARRAL-EL PARDO     0.074371
CARABANCHEL             0.073538
PUENTE DE VALLECAS      0.072069
CIUDAD LINEAL           0.070154
SALAMANCA               0.052316
CHAMBERI                0.051281
HORTALEZA               0.049379
TETUAN                  0.048629
SAN BLAS-CANILLEJAS     0.046070
CENTRO                  0.045199
ARGANZUELA              0.044875
VILLAVERDE              0.042434
CHAMARTIN               0.042042
RETIRO                  0.038983
USERA                   0.037589
MORATALAZ               0.035898
MONCLOA-ARAVACA         0.035182
VILLA DE VALLECAS       0.026655
VICALVARO               0.018776
BARAJAS                 0.012568
Name: DESC_DISTRITO, dtype: float64

In [15]:
#Number of "población extranjera" by district
df_distrito['Total_extranjeros'].sort_values(ascending=False)

DESC_DISTRITO
CARABANCHEL             56804
PUENTE DE VALLECAS      49352
LATINA                  42729
CENTRO                  35854
USERA                   35238
CIUDAD LINEAL           34274
VILLAVERDE              34047
TETUAN                  32753
FUENCARRAL-EL PARDO     23086
SAN BLAS-CANILLEJAS     23051
SALAMANCA               21737
HORTALEZA               21715
CHAMBERI                17330
ARGANZUELA              16668
VILLA DE VALLECAS       16225
CHAMARTIN               15063
MONCLOA-ARAVACA         13943
RETIRO                  10344
MORATALAZ               10120
VICALVARO               10024
BARAJAS                  5185
Name: Total_extranjeros, dtype: int64

.-Distrito Latina es que mas población agrupa de todos con un 8%.

.-Distrito Carabanchel el que tiene mas población extranjera.