# GEOCOVID - Geocode tests

In [23]:
#LIBRARIES
#Basic
import pandas as pd
import unicodedata
import numpy as np
import difflib
from pandarallel import pandarallel
import os
#Spatial
import geopandas as gpd
from shapely.geometry import Point

## Functions

In [24]:
#Function to remove accents on string
def strip_accents(text):
    try:
        text = unicode(text, 'utf-8')
    except NameError: # unicode is a default on python 3 
        pass
    text = unicodedata.normalize('NFD', text)\
           .encode('ascii', 'ignore')\
           .decode("utf-8")
    return str(text)

In [25]:
#Function to prepare RegBL to geocoding
def regbl_wrangling(regbl_dat):
    
    #Remove missing addresses
    print('Number of missing addresses: ',regbl_dat[regbl_dat.STRNAME.isna()].shape[0],'(',round(regbl_dat[regbl_dat.STRNAME.isna()].shape[0]*100/regbl_dat.shape[0],2),'%)')
    regbl_dat=regbl_dat[~regbl_dat.STRNAME.isna()]
    print('Missing addresses were removed from the dataset.')
    
    #Remove accents
    regbl_dat['STRNAME']=regbl_dat.STRNAME.map(strip_accents)
    
    #Convert to street and municipality upper case
    regbl_dat['STRNAME']=regbl_dat.STRNAME.map(str.upper)
    regbl_dat['GDENAME']=regbl_dat.GDENAME.map(str.upper)
    
    #Select only essential columns
    regbl_dat=regbl_dat[['EGID','STRNAME','DEINR','PLZ4','GDENAME','GKODE','GKODN']]
    
    return regbl_dat

In [26]:
#Function to extract the street number from an address
    #The algorithm splits with Regex the address field when the first digit appears.
    #Thus, Chemin de Montelly 1 -> ['Chemin de Montelly','1'] and Avenue de Morges 9b -> ['Avenue de Morges','9b']
    #Addresses that are not in the standard format (e.g. 24, grande rue) will not be catched however, but the proportion is small
    #If there is no digit (e.g. EMS de l'Ours) or if the address is not in a standard format (see above), the split will return only one element (i.e. the entire address)
    #In this case, the algorithm will return a NaN value.
    #On the other hand, the algorithnm will return the 2nd part of the split (i.e. street number)
def extract_deinr(x):
    split=x.rue.str.split(pat="(?<=[a-zA-Z])\\s*(?=[0-9])",expand=True)
    if split.shape[1]>1:
        return split[1]
    else:
        return np.nan 

In [27]:
#Function to return centroids of the npa
def npa_centroid(ville,cp):
    try: #match with name + PLZ
        e=npa[(npa.Ortschaftsname==ville) & (npa.PLZ==cp)].E.values[0]
        n=npa[(npa.Ortschaftsname==ville) & (npa.PLZ==cp)].N.values[0]
    except: #take the first PLZ of the list (reason: no match with the name)
        e=npa[npa.PLZ==cp].E.values[0]
        n=npa[npa.PLZ==cp].N.values[0]  
    return e,n 

## Import adresses

### VD

In [28]:
vd_addr=pd.read_csv('../data/RegBL/VD.csv', delimiter=';')
vd_addr.shape
vd_addr.head(5)

Unnamed: 0,EGID,EDID,GDEKT,GDENR,GDENAME,STRNAME,DEINR,PLZ4,PLZZ,PLZNAME,GKODE,GKODN,STRSP
0,780001,0,VD,5582,Cheseaux-sur-Lausanne,Rue de la Mèbre,1,1033,0,Cheseaux-sur-Lausanne,2536244.885,1159433.015,fr
1,780002,0,VD,5582,Cheseaux-sur-Lausanne,Rue de la Mèbre,3,1033,0,Cheseaux-sur-Lausanne,2536254.735,1159419.614,fr
2,780003,0,VD,5582,Cheseaux-sur-Lausanne,Rue de la Mèbre,2,1033,0,Cheseaux-sur-Lausanne,2536177.835,1159417.614,fr
3,780004,0,VD,5582,Cheseaux-sur-Lausanne,Rue de la Mèbre,4,1033,0,Cheseaux-sur-Lausanne,2536207.835,1159387.813,fr
4,780005,0,VD,5582,Cheseaux-sur-Lausanne,Rue de la Mèbre,6,1033,0,Cheseaux-sur-Lausanne,2536228.236,1159366.513,fr


In [29]:
#wrangling
vd_addr=regbl_wrangling(vd_addr)

Number of missing addresses:  527 ( 0.32 %)
Missing addresses were removed from the dataset.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  regbl_dat['STRNAME']=regbl_dat.STRNAME.map(strip_accents)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  regbl_dat['STRNAME']=regbl_dat.STRNAME.map(str.upper)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  regbl_dat['GDENAME']=regbl_dat.GDENAME.map(str.upper)


### NPA

In [30]:
npa=pd.read_csv('../data/NPA/PLZO_CSV_LV95.csv', delimiter=';',encoding='iso-8859-1')
npa.shape
npa.head(5)

Unnamed: 0,Ortschaftsname,PLZ,Zusatzziffer,Gemeindename,BFS-Nr,Kantonskürzel,E,N,Sprache
0,Aeugst am Albis,8914,0,Aeugst am Albis,1,ZH,2679435.817,1235653.185,de
1,Aeugstertal,8914,2,Aeugst am Albis,1,ZH,2679815.475,1237410.215,de
2,Zwillikon,8909,0,Affoltern am Albis,2,ZH,2675104.519,1237890.488,de
3,Affoltern am Albis,8910,0,Affoltern am Albis,2,ZH,2676439.358,1236966.371,de
4,Bonstetten,8906,0,Bonstetten,3,ZH,2677798.571,1241024.843,de


In [31]:
#Remove accents in municipalities
npa['Ortschaftsname']=npa.Ortschaftsname.map(strip_accents)
#Convert municipalities to upper case
npa['Ortschaftsname']=npa.Ortschaftsname.map(str.upper)
#Select only essential columns
npa=npa[['Ortschaftsname','PLZ','E','N']]

### CANTONS

In [32]:
cantons=gpd.read_file("../data/SWISSBOUNDARIES2018/swissBOUNDARIES2D_KANTONSGEBIET.shp")
#Rename columns with appropriate names (lowercase)
cantons.columns=['uuid','date_modif','date_creat','data_yr_creat','data_mth_creat','data_yr_verif','data_mth_verif','modif','source','data_yr_upd','data_mth_upd','admin_level','quality','country_code','num','lake_area','area','part','name','nb_hab','geometry']
#Keep only Swiss features
cantons=cantons[cantons.country_code=='CH']

## Institutions

In [33]:
#Create file with addresses of insitutions
institutions_dat=[['ECOLE HOTELIERE','ROUTE DE COJONNEX','18','1000','LAUSANNE'],
                 ['EPO','CHEMIN DES PAQUERETS','9','1350','ORBE'], 
                 ['P.A. FOYER EVAM','ROUTE DE CHAVANNES','33','1007','LAUSANNE'],
                 ['P.A. FOYER EVAM','ROUTE DE RECULAN','8','1024','ECUBLENS'],
                 ['P.A. FOYER EVAM','CHEMIN CHARMEUR','8','1023','CRISSIER'],
                 ['P.A. FOYER EVAM',"CHEMIN DE L'ECLUSE",'3','1880','BEX'],
                 ['P.A. FOYER EVAM',"ROUTE DE CORNEAUX",'13','1832','CHAMBY'],
                 ['P.A. FOYER EVAM',"CHEMIN DE GRAND-VENNES",'6','1066','EPALINGES'],
                 ['P.A. FOYER EVAM',"AVENUE DU CHABLAIS",'49','1000','LAUSANNE'],
                 ['P.A. FOYER EVAM',"CHEMIN D'ENTRE-BOIS",'2b','1018','LAUSANNE'],
                 ['P.A. FOYER EVAM',"AVENUE DE VALMONT",'32','1010','LAUSANNE'],
                 ['P.A. FOYER EVAM',"ROUTE DES QUATRE CHALETS",'11','1854','LEYSIN'],
                 ['P.A. FOYER EVAM',"RUE DE L'INDUSTRIE",'11','1450','SAINTE-CROIX'],
                 ['P.A. FOYER EVAM',"AVENUE DU GENERAL-GUISAN",'62','1800','VEVEY'],
                 ['P.A. FOYER EVAM',"RUE DE MONTAGNY",'27','1400','YVERDON-LES-BAINS'],
                 ['ECOLE LE CHATELARD','ROUTE DES NARCISSES','80','1833','MONTREUX'],
                 ["FONDATION DE L'ORME",'ROUTE DES PLAINES-DU-LOUP','4a','1018','LAUSANNE'],
                 ['FONDATION CLEMENCE','AVENUE DE MORGES','64','1004','LAUSANNE'],
                 ['FONDATION LA ROZAVERE','CHEMIN DE ROVEREAZ','23','1012','LAUSANNE'],
                 ['FONDATION MONT-CALME','RUE DU BUGNON','15','1005','LAUSANNE'],
                 ['FONDATION EBEN-HEZER','CHEMIN DE ROVEREAZ','25','1012','LAUSANNE'],
                 ['EMS LA FAVERGE','ROUTE DE BULLE','10','1610','ORON-LA-VILLE'],
                 ['FONDATION LES BAUMETTES','AVENUE DES BAUMETTES','120','1020','RENENS'],
                 ['EMS LA PAIX DU SOIR','CHEMIN DE LONGERAIE','9','1052','LE-MONT-SUR-LAUSANNE'],
                 ['DOMAINE DE LA GRACIEUSE','CHEMIN DES VIGNES','14','1027','LONAY'],
                 ['EMS LE MARRONNIER','ROUTE DE LAVAUX','20','1095','LUTRY'],
                 ['EMS CHANTEMERLE','CHEMIN DE CHANTEMERLE','3','1010','LAUSANNE'],
                 ['EMS CLAIR-SOLEIL','ROUTE DE LA PIERRE','7','1024','ECUBLENS'],
                 ['EMS BOIS-GENTIL','CHEMIN DU PETIT FLON','49','1018','LAUSANNE'],
                 ['FONDATION BOIS-GENTIL','CHEMIN DU PETIT FLON','49','1018','LAUSANNE'],
                 ["EMS L'ESCAPADE",'RUE DU MARTINET','2','1188','GIMEL'],
                 ['EMS LA CLAIRIERE','ROUTE DE SAINT-CERGUE','11','1295','MIES'],
                 ['EMS LE HOME LES PINS SA','AVENUE DES PEUPLIERS','22a','1009','PULLY'],
                 ['EMS LE SIGNAL','ROUTE DU SIGNAL','6','1080','LES CULLAYES'],
                 ['EMS LA CHOCOLATIERE','RUE DE LA CHOCOLATIERE','30','1026','ECHANDENS'],
                 ['EMS LES LYS','RUE DES METIERS','3','1008','PRILLY'], 
                 ['EMS PRIMEROCHE','RUE DES METIERS','3','1008','PRILLY'], 
                 ['EMS PARC DE BEAUSOBRE','AVENUE DE CHANEL','26','1110','MORGES'],
                 ['FONDATION BOISSONNET','CHEMIN DE BOISSONNET','51','1010','LAUSANNE'],
                 ['FONDATION DE VERNAND','CHEMIN DE PRAZ-LAU','5','1033','CHESEAUX-SUR-LAUSANNE'],
                 ['FONDATION DU MIDI','CHEMIN DU MIDI','2','1260','NYON'],
                 ['FONDATION DU LEVANT','CHEMIN DU LEVANT','159','1005','LAUSANNE'],
                 ['FOYER DES AUBÉPINES','CHEMIN DES AUBEPINES','4','1004','LAUSANNE']]
institutions=pd.DataFrame(institutions_dat,columns=['name','strname','deinr','cp','ville'])
institutions['cp']=institutions.cp.astype(int)

## Import COVID data

In [34]:
#Load file
covid_tests=pd.read_csv('../data/COVID/200909_Covid-19_VD.csv',delimiter=';',encoding='iso-8859-1')
covid_tests.shape

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


(41354, 18)

In [35]:
#Rename columns
covid_tests.columns=['id_demande','id_patient','sexe','date_naissance','age','id_sejour','date_prelevement','date_reception','rue','cp','ville','canton_pays','prel','res_cov','res_inf_A','res_inf_B','inf','res_rsv']

In [36]:
covid_tests.dtypes

id_demande           object
id_patient           object
sexe                 object
date_naissance       object
age                  object
id_sejour            object
date_prelevement     object
date_reception       object
rue                  object
cp                   object
ville                object
canton_pays          object
prel                 object
res_cov               int64
res_inf_A            object
res_inf_B            object
inf                 float64
res_rsv              object
dtype: object

### Data cleaning

In [37]:
#Remove incorrect data (cf. mail with G.Greub)
print('Number of incorrect tests: (cf. mail with G.Greub) ', covid_tests[covid_tests.res_inf_B=='I'].shape[0])
covid_tests=covid_tests[covid_tests.res_inf_B!='I']
print('Rows were removed from the dataset')

Number of incorrect tests: (cf. mail with G.Greub)  5
Rows were removed from the dataset


In [38]:
#Remove patients that did not indicate their address
print('Number of patients that did not give their address: ', covid_tests[covid_tests.ville=='*** ADRESSE PAS FOURNIE'].shape[0], '(',round((covid_tests[covid_tests.ville=='*** ADRESSE PAS FOURNIE'].shape[0]*100)/covid_tests.shape[0],2),'%)')
print('We also checked that these people had not given their address during a previous stay.')
covid_tests=covid_tests[covid_tests.ville!='*** ADRESSE PAS FOURNIE']
print('Rows were removed from the dataset')

Number of patients that did not give their address:  1381 ( 3.34 %)
We also checked that these people had not given their address during a previous stay.
Rows were removed from the dataset


In [39]:
#Fill manually missing canton_pays info.
#covid_tests.canton_pays.unique()
covid_tests.loc[34,'canton_pays']='VD-CH'
covid_tests.loc[2271,'canton_pays']='VD-CH'
covid_tests.loc[9719,'canton_pays']='-US'
covid_tests.loc[15629,'canton_pays']='VD-CH'
covid_tests.loc[18232,'canton_pays']='VD-CH'
covid_tests.loc[25584,'canton_pays']='NE-CH'
#Number of missing canton_pays (should be equal to 0)
print('Missing country/canton information (should be equal to 0, if not, fill manually missing values): ',covid_tests[covid_tests.canton_pays=='-'].shape[0])

Missing country/canton information (should be equal to 0, if not, fill manually missing values):  0


In [40]:
#Number of tests per canton_pays
print(covid_tests.groupby(by='canton_pays').count()['id_demande'].to_string())

canton_pays
-00         19
-AE          6
-AL          2
-AR          1
-AT          2
-BA          1
-BE         12
-BF          2
-BG          9
-BJ          1
-BR          6
-BY          1
-CA          5
-CH      16482
-CN          5
-DE          8
-DO          1
-EC          2
-EE          1
-ES         10
-FR        526
-GB          3
-GR         14
-HK          2
-HR          1
-IT         56
-KZ          4
-LB          8
-LT         11
-LU          1
-LV          1
-MA          1
-MC          1
-MD          4
-MK          1
-MX          2
-NL          1
-NO          2
-PL          1
-PT         12
-RO         18
-RS          2
-RU         35
-SG          1
-SI          3
-TR          2
-UA          7
-US          7
-XZ          3
AG-CH        5
AR-CH        1
BE-CH      184
BL-CH        4
BS-CH        6
FR-CH      743
GE-CH      162
GR-CH        3
JU-CH       34
LU-CH        1
NE-CH      927
NW-CH        1
SG-CH        4
SH-CH        1
SO-CH        3
SZ-CH        7
TI-CH       2

In [41]:
#Keep only patients living in Switzerland
print('Number of patients that did not live in Switzerland: ', covid_tests[~covid_tests.canton_pays.str.endswith('-CH')].shape[0], '(',round((covid_tests[~covid_tests.canton_pays.str.endswith('-CH')].shape[0]*100)/covid_tests.shape[0],2),'%)')
covid_tests=covid_tests[covid_tests.canton_pays.str.endswith('-CH')]
print('Rows were removed from the dataset')

Number of patients that did not live in Switzerland:  824 ( 2.06 %)
Rows were removed from the dataset


In [42]:
#Remove homeless patients
print('Number of homeless patients: ', covid_tests[covid_tests.cp=='1'].shape[0], '(',round((covid_tests[covid_tests.cp=='1'].shape[0]*100)/covid_tests.shape[0],2),'%)')
covid_tests=covid_tests[covid_tests.cp!='1']
print('Rows were removed from the dataset')

Number of homeless patients:  10 ( 0.03 %)
Rows were removed from the dataset


### Data wrangling

In [43]:
#Convert postal code to integer
covid_tests['cp']=covid_tests.cp.astype(int)

In [44]:
#Replace heterogeneity of a few street names
covid_tests.loc[covid_tests.rue=='P.A FOYER EVAM','rue']='P.A. FOYER EVAM'
covid_tests.loc[covid_tests.rue=='EMS MONT CALME','rue']='EMS MONT-CALME'
covid_tests.loc[covid_tests.rue=='FONDATION ROZAVERE','rue']='FONDATION LA ROZAVERE'
covid_tests.loc[(covid_tests.rue=='.')|(covid_tests.rue=='-'),'rue']=np.nan
covid_tests.loc[(covid_tests.rue.str.contains('EMS CLAIR-SOLEIL')==True) | (covid_tests.rue.str.contains('EMS CLAIR SOLEIL')==True),['rue','cp','ville']]=['EMS CLAIR-SOLEIL',1024,'ECUBLENS']

In [45]:
def prepare_for_geocoding(df):
    
    #Extract address number in new column
    df=df.assign(deinr=lambda x: extract_deinr(x))
    
    #Remove space between letter and digit and convert to lower case (41 A -> 41a)
    df['deinr']=df.deinr.str.replace(" ","").str.lower()
    print('Number of rows without deinr: ', df[df.deinr.isna()].shape[0])
    
    #Extract the street name from the address. 
    #Using the Regex, it corresponds to all non digits characters
    df=df.assign(strname=lambda x: x.rue.str.extract('(\D+)',expand=True))
    print('Number of rows without strname: ', df[df.strname.isna()].shape[0])

    #Remove leading and trailing characters (whitespace)
    df=df.assign(strname=lambda x: x.strname.str.strip())
    
    return df

In [46]:
covid_tests=prepare_for_geocoding(covid_tests)

Number of rows without deinr:  3149
Number of rows without strname:  67


In [47]:
#Modify a few addresses
covid_tests.loc[covid_tests.strname.str.contains('RUMINE') & (covid_tests.cp==1005),'strname']='AVENUE GABRIEL-DE-RUMINE'
covid_tests.loc[covid_tests.strname.str.contains('HARPE') & (covid_tests.cp==1007),'strname']='AVENUE FREDERIC-CESAR-DE-LA-HARPE'
covid_tests.loc[covid_tests.strname.str.contains('CHANDIEU') & (covid_tests.cp==1006),'strname']='CHEMIN ANTOINE-DE-CHANDIEU'
covid_tests.loc[covid_tests.strname.str.contains('RECORDON') & (covid_tests.cp==1004),'strname']='AVENUE FREDERIC-RECORDON'
covid_tests.loc[covid_tests.strname.str.contains('DRUEY') & (covid_tests.cp==1018),'strname']='AVENUE HENRI-DRUEY'
covid_tests.loc[covid_tests.strname.str.contains('SERVAN') & (covid_tests.cp==1006),'strname']='AVENUE ANTOINE-MICHEL-SERVAN'
covid_tests.loc[(covid_tests.strname=='CHEMIN DES DIABLERETS') & (covid_tests.deinr=='3bis') & (covid_tests.cp==1012),'deinr']='3b'
covid_tests.loc[(covid_tests.strname=='CHEMIN DU DEVIN') & (covid_tests.deinr=='78bis') & (covid_tests.cp==1012),'deinr']='78b'
covid_tests.loc[(covid_tests.strname=='CHEMIN DU DEVIN') & (covid_tests.deinr=='57bis') & (covid_tests.cp==1012),'deinr']='57b'
covid_tests.loc[(covid_tests.strname=='AVENUE DE BEAUMONT') & (covid_tests.deinr=='60bis') & (covid_tests.cp==1010),'deinr']='60b'

### Extract subsets for patients located in the canton of Vaud

In [71]:
#Covid tests for the canton of Vaud
print('Number of Covid tests for the Canton of Vaud: ',covid_tests[covid_tests.cp.isin(vd_addr.PLZ4)].shape[0],'(',round((covid_tests[covid_tests.cp.isin(vd_addr.PLZ4)].shape[0]*100)/covid_tests.shape[0],2),')')
covid_vd=covid_tests[covid_tests.cp.isin(vd_addr.PLZ4)]
print('Subset saved in new dataframe covid_vd')

Number of Covid tests for the Canton of Vaud:  33753 ( 86.25 )
Subset saved in new dataframe covid_vd


## Geocoding

In [72]:
def covid_coord(row,addr):
    
    if pd.isnull(row.rue):
        e,n=npa_centroid(row.ville,row.cp)
        note_geocoding='Geocoded at NPA centroid. No street address.'
        
    elif pd.isnull(row.deinr):
        #Remove accents
        row.rue=strip_accents(row.rue)
        
        if ((institutions['name']==row.rue) & (institutions['cp']==row.cp)).any()==True:
            try:
                institution_addr=institutions[institutions.name==row.rue]
                e=addr[(addr.STRNAME==institution_addr.strname.values[0]) & (addr.DEINR==institution_addr.deinr.values[0]) & (addr.PLZ4==institution_addr.cp.values[0])].GKODE.values[0]
                n=addr[(addr.STRNAME==institution_addr.strname.values[0]) & (addr.DEINR==institution_addr.deinr.values[0]) & (addr.PLZ4==institution_addr.cp.values[0])].GKODN.values[0]
                note_geocoding='Geocoded at building. Institution.'
            except:
                e,n=npa_centroid(row.ville,row.cp)
                note_geocoding='Geocoded at NPA centroid. No match on RegBL for this institution.'
                
        elif row.rue.find('C/')!=-1:
            e,n=npa_centroid(row.ville,row.cp)
            note_geocoding='Geocoded at NPA centroid. C/O address.'
        
        else:
            e,n=npa_centroid(row.ville,row.cp)
            note_geocoding='Geocoded at NPA centroid. No street number (possible improvement: search for institution or identify street).'
    
    else:
        #Remove accents
        row.strname=strip_accents(row.strname)
        row.strname=row.strname.upper()
        
        try:
            e=addr[(addr.STRNAME==row.strname) & (addr.DEINR==row.deinr) & (addr.PLZ4==row.cp)].GKODE.values[0]
            n=addr[(addr.STRNAME==row.strname) & (addr.DEINR==row.deinr) & (addr.PLZ4==row.cp)].GKODN.values[0]
            note_geocoding='Geocoded at building.'
        
        except:
            try:
                fuzzy_rue=difflib.get_close_matches(row.strname, addr[addr.PLZ4==row.cp].STRNAME,1,0.5)[0]
                #print(fuzzy_rue)
                e=addr[(addr.STRNAME==fuzzy_rue) & (addr.DEINR==row.deinr) & (addr.PLZ4==row.cp)].GKODE.values[0]
                n=addr[(addr.STRNAME==fuzzy_rue) & (addr.DEINR==row.deinr) & (addr.PLZ4==row.cp)].GKODN.values[0]
                note_geocoding='Geocoded at building. Fuzzy matching.'
                
            except:
                if len(row.deinr)>6: #max is 520bis
                    try:
                        addr['full_strname']=addr.STRNAME + " " + addr.DEINR
                        e=addr[(addr.full_strname==row.rue) & (addr.PLZ4==row.cp)].GKODE.values[0]
                        n=addr[(addr.full_strname==row.rue) & (addr.PLZ4==row.cp)].GKODN.values[0]
                        note_geocoding='Geocoded at building.'
                    except:
                        e,n=npa_centroid(row.ville,row.cp)
                        note_geocoding='Geocoded at NPA centroid. No match found.'
                else:
                    e,n=npa_centroid(row.ville,row.cp)
                    note_geocoding='Geocoded at NPA centroid. No match found.'
                
    
    return e,n,note_geocoding

In [73]:
#Initialize parallel processing
pandarallel.initialize(nb_workers=11)

INFO: Pandarallel will run on 11 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [74]:
#Run function
%time covid_vd['gkode'],covid_vd['gkodn'],covid_vd['note_geocoding']=zip(*covid_vd.parallel_apply(lambda row: covid_coord(row,vd_addr),axis=1))

CPU times: user 142 ms, sys: 80.5 ms, total: 222 ms
Wall time: 5min 24s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [75]:
covid_vd.shape

(33753, 23)

## Finalize geocoding

In [79]:
#Convert dataframe to GeoDataframe
#Create a geometry column using Shapely
covid_vd=covid_vd.assign(geometry=covid_vd.apply(lambda row: Point(row.gkode, row.gkodn),axis=1))

#Convert to geodataframe
covid_vd=gpd.GeoDataFrame(covid_vd, geometry=covid_vd.geometry, crs={'init': 'epsg:2056'})

  return _prepare_from_string(" ".join(pjargs))


In [80]:
#Find tests geocoded oustide the cantons
vd=cantons[cantons.name=='Vaud'].geometry.unary_union
outside_vd=covid_vd[~covid_vd.geometry.within(vd)]

In [81]:
#Remove tests that are outside VD
# when we filtered covid tests, we used PLZ4 info, but several postal codes are overlapping two cantons (e.g. in Couvent, Versoix)
covid_vd=covid_vd[~covid_vd.id_demande.isin(outside_vd.id_demande)]

In [83]:
#Add lat/lon coordinates in new columns
covid_vd['lon']=covid_vd.to_crs({'init': 'epsg:4326'}).geometry.x
covid_vd['lat']=covid_vd.to_crs({'init': 'epsg:4326'}).geometry.y
    
path='../outputs/covid_vd.gpkg'
    
try:
    if os.path.exists(path):
        os.remove(path)
    covid_vd.to_file(path,layer='covid_vd',driver='GPKG')
    print('Sucess')
except:
    print('Error while saving data on disk')

Sucess


In [84]:
#Statistics about geocoding
covid_vd.groupby(by='note_geocoding').count()['id_demande']

note_geocoding
Geocoded at NPA centroid. C/O address.                                                                             802
Geocoded at NPA centroid. No match found.                                                                         1375
Geocoded at NPA centroid. No match on RegBL for this institution.                                                   32
Geocoded at NPA centroid. No street address.                                                                        48
Geocoded at NPA centroid. No street number (possible improvement: search for institution or identify street).     1414
Geocoded at building.                                                                                            19229
Geocoded at building. Fuzzy matching.                                                                            10352
Geocoded at building. Institution.                                                                                 399
Name: id_demande, dtype: int64

In [85]:
covid_vd.shape

(33651, 26)

In [56]:
33651-29980

3671

In [57]:
100-89.09

10.909999999999997

In [50]:
19229+399+10352

29980

In [55]:
(399*100)/29980

1.3308872581721147

In [60]:
(1414*100)/3671

38.51811495505312