In [2]:
import xarray as xr
import numpy as np
import geopandas as gpd
import pandas as pd
import re

This notebook (script) is to geolocate the EM-DAT events from 2021 to 2023 based on the cleaned GAUL geometries from script 1 (clean_gaul.ipynb(py)).

The first part is locating the events that have the geocodes to the GAUL geometries. 

The second part is locating the events  based on the names of the geometries. For this, the names of the locations and geometries will be matched by a fuzzy algorithm, and then only the geometries that pass a certain threshold and which names exist fully in the location column of the event are kept.
In the case where geometries with similar names at admin levels 1 and 2 are found, the geometries of level 1 are favored and kept over the level 2.

At the end, all geodataframes are concatenated together and columns are created to enable their integration with the GDIS database.


In [3]:
emdat = pd.read_excel('/net/projects/xaida/raw_data/emdat_data/public_emdat_2021_2023.xlsx')


In [4]:
import pandas as pd

# Read the Excel file into emdat DataFrame and set 'DisNo.' as index
emdat = pd.read_excel('/net/projects/xaida/raw_data/emdat_data/public_emdat_2021_2023.xlsx')
emdat = emdat.set_index('DisNo.')

# Select only 'Location' and 'Admin Units' columns and drop rows where both are NaN
emdat = emdat[['Location', 'Admin Units']].dropna(how='all')

# Sort 'Location' column alphabetically
emdat['Location'] = emdat['Location'].apply(lambda x: ', '.join(sorted(str(x).split(', '))))

# Process 'Admin Units' column to get Admin1 Code, Admin2 Code, and Geo Locations
def process_admin_units(units):
    if pd.isna(units):
        return pd.Series([None, None, None])
    
    admin1_units = []
    admin2_units = []

    # Extract Admin1 and Admin2 information
    for unit in eval(units):
        if 'adm1_code' in unit:
            admin1_units.append((unit['adm1_code'], unit['adm1_name']))
        if 'adm2_code' in unit:
            admin2_units.append((unit['adm2_code'], unit['adm2_name']))
    
    # Sort admin units alphabetically
    admin1_units.sort(key=lambda x: x[1])  # Sort by adm1_name
    admin2_units.sort(key=lambda x: x[1])  # Sort by adm2_name

    admin1_codes = [str(unit[0]) for unit in admin1_units]
    admin1_names_list = [unit[1] for unit in admin1_units]
    
    admin2_codes = [str(unit[0]) for unit in admin2_units]
    admin2_names_list = [unit[1] for unit in admin2_units]
    
    # Construct Geo Locations string
    geo_location_str = ', '.join(admin1_names_list) + (' (Adm1).' if admin1_names_list else '')
    if admin2_names_list:
        geo_location_str += ' ' + ', '.join(admin2_names_list) + ' (Adm2).'
    
    return pd.Series([';'.join(admin1_codes), ';'.join(admin2_codes), geo_location_str])

# Apply the process_admin_units function to 'Admin Units' column and assign the result to Emdata
Emdata = emdat['Admin Units'].apply(process_admin_units)
Emdata.columns = ['Admin1 Code', 'Admin2 Code', 'Geo Locations']

# Merge 'Location' column from emdat into Emdata DataFrame
Emdata['Location'] = emdat['Location']

# Print the DataFrame
print(Emdata)

# Access a specific DisNo.
print(Emdata.loc['2021-0003-IDN'])


                           Admin1 Code Admin2 Code  \
DisNo.                                               
2020-0051-IRL                1592;1603               
2020-9609-SOM                     None        None   
2021-0001-BOL  40443;40445;40446;40450               
2021-0001-PRY                                23869   
2021-0002-ARG                      446               
...                                ...         ...   
2023-9589-URY                     None        None   
2023-9651-BRA                     None        None   
2023-9706-BOL                     None        None   
2023-9868-USA                     None        None   
2023-9873-FSM                     None        None   

                                          Geo Locations  \
DisNo.                                                    
2020-0051-IRL                      Laois, Sligo (Adm1).   
2020-9609-SOM                                      None   
2021-0001-BOL  Beni, Cochabamba, La Paz, Tarija (Adm1).   
20

In [254]:
print(Emdata.loc['2021-0036-MDG'])

Admin1 Code                                                       
Admin2 Code                     154596;154527;154556;154557;154530
Geo Locations     Antalaha, Maroantsetra, Toamasina I, Toamasin...
Location         Antalaha, Maroantsetra, Vavatenina and Toamasi...
Name: 2021-0036-MDG, dtype: object


In [255]:
print(Emdata.loc['2021-0016-ESP'])

Admin1 Code                                       
Admin2 Code                      25820;25782;25786
Geo Locations     Madrid, Málaga, Zaragoza (Adm2).
Location                  Madrid, Malaga, Zaragoza
Name: 2021-0016-ESP, dtype: object


In [256]:
print(Emdata.loc['2023-9167-HND'])

Admin1 Code                                    1420;1421;1433;1438
Admin2 Code                          17413;17439;17335;17287;17313
Geo Locations    Atlantida, Choluteca, Ocotepeque, Yoro (Adm1)....
Location         Catacamas, Choculeta, Flores, Jacaleapa, La Es...
Name: 2023-9167-HND, dtype: object


In [185]:
common_nans = emdat[ emdat['Location'].notna() &  emdat['Admin Units'].isna()]

# Print the result
print("Rows with common NaNs in 'Location' and 'Admin Units':")
print(common_nans)

Rows with common NaNs in 'Location' and 'Admin Units':
                                                        Location Admin Units
DisNo.                                                                      
2020-9609-SOM  Eilafwen (Sanaag), Gedo; Las Anod / Hudun and ...         NaN
2021-0012-BGD                                        Cox's Bazar         NaN
2021-0029-IND                     Surat district (Gujarat state)         NaN
2021-0030-ECU                                   Bolivar province         NaN
2021-0032-IDN                                           Java Sea         NaN
...                                                          ...         ...
2023-9589-URY  Cerro Largo, Florida, Lavalleja, San Jose, Tac...         NaN
2023-9651-BRA                                     Amazonas state         NaN
2023-9706-BOL  Chuquisaca, Cochabamba, La Paz, Oruro, Santa Cruz         NaN
2023-9868-USA  Illinois, Kansas, Louisiana, Missouri, Montana...         NaN
2023-9873-FSM        

In [5]:
#get the entires that only have location name indiction (not the geolocation name and code)
Emdata_loconly = Emdata[Emdata['Admin1 Code'].isna() & Emdata['Admin2 Code'].isna()]
#get the geocoded entries
Emdata_geocode = Emdata.drop(Emdata_loconly.index)

## 1) Matching events with geolocations (from 2021 onwards)

### geometries at Admin 1 level

In [6]:
####Administrative level 1 geometries
#Extract Adm1 locations
emdat_geocode_adm1 = Emdata_geocode.loc[Emdata_geocode['Geo Locations'].str.contains("(Adm1)")]
emdat_geocode_adm1["Geo Locations"] = emdat_geocode_adm1["Geo Locations"].str.split(" \(Adm1",expand=True)[0]
#Shape Adm1 locations: 1 row = 1 geolocation
emdat_geocode_adm1_stacked = pd.DataFrame(emdat_geocode_adm1["Admin1 Code"].str.split(';', expand=True).stack()).rename(columns={0:'ADM1_CODE'})
emdat_geocode_adm1_stacked['location'] = pd.DataFrame(emdat_geocode_adm1["Location"].str.split(',', expand=True).stack())
emdat_geocode_adm1_stacked['geolocation'] = pd.DataFrame(emdat_geocode_adm1["Geo Locations"].str.split(',', expand=True).stack())
emdat_geocode_adm1_identified = emdat_geocode_adm1_stacked.reset_index().drop(columns={'level_1'})

emdat_geocode_adm1_identified

  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0,DisNo.,ADM1_CODE,location,geolocation
0,2020-0051-IRL,1592,County Laois.,Laois
1,2020-0051-IRL,1603,Geevagh (County Sligo),Sligo
2,2021-0001-BOL,40443,Beni,Beni
3,2021-0001-BOL,40445,Cochabamba,Cochabamba
4,2021-0001-BOL,40446,La Paz Departments,La Paz
...,...,...,...,...
2103,2023-9167-HND,1438,Jacaleapa,Yoro
2104,2023-9234-ESP,2716,Andalusia,Andalucía
2105,2023-9234-ESP,2724,Asturias,Cataluña/Catalunya
2106,2023-9234-ESP,2718,Catalognia,Principado de Asturias


In [7]:
emdat_geocode_adm1

Unnamed: 0_level_0,Admin1 Code,Admin2 Code,Geo Locations,Location
DisNo.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-0051-IRL,1592;1603,,"Laois, Sligo","County Laois., Geevagh (County Sligo), Riverst..."
2021-0001-BOL,40443;40445;40446;40450,,"Beni, Cochabamba, La Paz, Tarija","Beni, Cochabamba, La Paz Departments, Tarija"
2021-0002-ARG,446,,San Juan,San Juan Province
2021-0004-IDN,1520,,Jawa Tengah,Central Java
2021-0005-IDN,1522;1523;1524;1525;73615;73616;73619;73620;15...,,"Kalimantan Barat, Kalimantan Selatan, Kalimant...","Kalimantan, Maluku Islands, Sulawesi Island"
...,...,...,...,...
2023-0539-CHL,889;892;149630,12971;12947,"Libertador Gral. Bernardo O'Higgins, Maule, Va...","Biobío Regions, Maule, O’Higgins, Santiago Met..."
2023-0548-IND,1502,,Mizoram,Mizoram State
2023-9167-HND,1420;1421;1433;1438,17413;17439;17335;17287;17313,"Atlantida, Choluteca, Ocotepeque, Yoro","Catacamas, Choculeta, Flores, Jacaleapa, La Es..."
2023-9234-ESP,2716;2724;2718,,"Andalucía, Cataluña/Catalunya, Principado de A...","Andalusia, Asturias, Catalognia"


In [8]:
emdat_geocode_adm1['Location'].astype(str).str.split(',').apply(len).sum()

2396

In [9]:
(emdat_geocode_adm1['Admin1 Code'].astype(str).str.split(';').apply(len) + (emdat_geocode_adm1['Admin2 Code'].astype(str).str.split(';').apply(len) - emdat_geocode_adm1['Admin2 Code'].isna())).sum()

2930

In [10]:
emdat_geocode_adm1_identified

Unnamed: 0,DisNo.,ADM1_CODE,location,geolocation
0,2020-0051-IRL,1592,County Laois.,Laois
1,2020-0051-IRL,1603,Geevagh (County Sligo),Sligo
2,2021-0001-BOL,40443,Beni,Beni
3,2021-0001-BOL,40445,Cochabamba,Cochabamba
4,2021-0001-BOL,40446,La Paz Departments,La Paz
...,...,...,...,...
2103,2023-9167-HND,1438,Jacaleapa,Yoro
2104,2023-9234-ESP,2716,Andalusia,Andalucía
2105,2023-9234-ESP,2724,Asturias,Cataluña/Catalunya
2106,2023-9234-ESP,2718,Catalognia,Principado de Asturias


In [11]:
gaul_1 = gpd.read_file('/net/projects/xaida/raw_data/gaul_maps/gaul_admin1_clean.gpkg')

In [109]:
gaul_1.columns

Index(['ADM0_CODE', 'ADM0_NAME', 'ADM1_CODE', 'ADM1_NAME', 'DISP_AREA',
       'EXP1_YEAR', 'STATUS', 'STR1_YEAR', 'Shape_Area', 'Shape_Leng',
       'official_name', 'iso3', 'geometry'],
      dtype='object')

In [12]:
gaul_1_loc = gaul_1[['ADM1_CODE','ADM1_NAME']]

In [194]:
gaul_1_loc.dtypes

ADM1_CODE     int64
ADM1_NAME    object
dtype: object

In [195]:
emdat_geocode_adm1_identified.dtypes

DisNo.         object
ADM1_CODE      object
location       object
geolocation    object
dtype: object

In [13]:
emdat_geocode_adm1_identified['ADM1_CODE'] = emdat_geocode_adm1_identified['ADM1_CODE'].astype(int)

In [14]:
emdat_geocode_adm1_identified_merged = emdat_geocode_adm1_identified.merge(gaul_1_loc, on = 'ADM1_CODE',how = 'left')

In [15]:
emdat_geocode_adm1_identified_merged

Unnamed: 0,DisNo.,ADM1_CODE,location,geolocation,ADM1_NAME
0,2020-0051-IRL,1592,County Laois.,Laois,Laois
1,2020-0051-IRL,1603,Geevagh (County Sligo),Sligo,Sligo
2,2021-0001-BOL,40443,Beni,Beni,Beni
3,2021-0001-BOL,40445,Cochabamba,Cochabamba,Cochabamba
4,2021-0001-BOL,40446,La Paz Departments,La Paz,La Paz
...,...,...,...,...,...
2107,2023-9167-HND,1438,Jacaleapa,Yoro,Yoro
2108,2023-9234-ESP,2716,Andalusia,Andalucía,Andaluc�a
2109,2023-9234-ESP,2724,Asturias,Cataluña/Catalunya,Catalu�a/Catalunya
2110,2023-9234-ESP,2718,Catalognia,Principado de Asturias,Principado de Asturias


In [18]:
emdat_geocode_adm1_identified_merged["geolocation"] = emdat_geocode_adm1_identified_merged["geolocation"].str.lstrip()
emdat_geocode_adm1_identified_merged["geolocation"] = emdat_geocode_adm1_identified_merged["geolocation"].str.rstrip()

In [19]:
emdat_geocode_adm1_identified_merged["ADM1_NAME"] = emdat_geocode_adm1_identified_merged["ADM1_NAME"].str.lstrip()
emdat_geocode_adm1_identified_merged["ADM1_NAME"] = emdat_geocode_adm1_identified_merged["ADM1_NAME"].str.rstrip()

In [20]:
emdat_geocode_adm1_identified_merged['comparison'] = emdat_geocode_adm1_identified_merged.apply(lambda x: x['geolocation'] == x['ADM1_NAME'], axis=1)

In [21]:
emdat_geocode_adm1_identified_merged[emdat_geocode_adm1_identified_merged.comparison == False]

Unnamed: 0,DisNo.,ADM1_CODE,location,geolocation,ADM1_NAME,comparison
86,2021-0071-MAR,147337,Tanger,Tanger - Tétouan,Tanger - T�touan,False
122,2021-0098-PER,2339,Junín,Junín,Jun�n,False
158,2021-0170-PER,2349,Yurimaguas District (Alto Amazonas Province,San Martín,San Mart�n,False
236,2021-0212-PER,2349,Nauta District (Loreto Province); San Martin r...,San Martín,San Mart�n,False
242,2021-0230-GTM,64823,Uspantán and Chicamán Municipalities (Quiché D...,Sacatepéquez,Sacatep�quez,False
394,2021-0300-GTM,64837,Quetzaltenango,Petén,Pet�n,False
398,2021-0300-GTM,64827,Suchitepéquez departments,Sololá,Solol�,False
399,2021-0300-GTM,64830,,Suchitepéquez,Suchitep�quez,False
422,2021-0326-GUY,1399,4,Cuyuni/mazaruni (region N°7),Cuyuni/mazaruni (region N�7),False
423,2021-0326-GUY,1400,5,Demerara Mahaica (region N°4),Demerara Mahaica (region N�4),False


In [120]:
gaul_1[gaul_1.ADM1_CODE == 2754]

Unnamed: 0,ADM0_CODE,ADM0_NAME,ADM1_CODE,ADM1_NAME,DISP_AREA,EXP1_YEAR,STATUS,STR1_YEAR,Shape_Area,Shape_Leng,official_name,iso3,geometry
22,74,South Sudan,2754,Northern Bahr El Ghazal,NO,3000,Member State,1994,2.494846,7.052428,Republic of South Sudan,SSD,"POLYGON ((26.13292 9.08244, 26.13530 9.10099, ..."


In [203]:
gaul_1[gaul_1.ADM0_NAME == 'South Sudan']

Unnamed: 0,ADM0_CODE,ADM0_NAME,ADM1_CODE,ADM1_NAME,DISP_AREA,EXP1_YEAR,STATUS,STR1_YEAR,Shape_Area,Shape_Leng,official_name,iso3,geometry
17,74,South Sudan,2746,El Buheyrat,NO,3000,Member State,1994,3.50362,11.235001,Republic of South Sudan,SSD,"POLYGON ((28.46691 6.47376, 28.46786 6.47947, ..."
18,74,South Sudan,2747,Unity,NO,3000,Member State,1994,3.011566,11.096973,Republic of South Sudan,SSD,"POLYGON ((28.71832 9.04250, 28.71832 9.04375, ..."
19,74,South Sudan,2748,Central Equatoria,NO,3000,Member State,1994,3.595255,11.441561,Republic of South Sudan,SSD,"POLYGON ((29.79232 4.50914, 29.79785 4.51991, ..."
20,74,South Sudan,2750,Eastern Equatoria,NO,3000,Member State,1994,5.769396,13.566082,Republic of South Sudan,SSD,"POLYGON ((31.69433 3.95444, 31.70147 3.96158, ..."
21,74,South Sudan,2751,Jonglei,NO,3000,Member State,1994,9.90779,17.315025,Republic of South Sudan,SSD,"POLYGON ((30.21263 8.91872, 30.21501 8.92348, ..."
22,74,South Sudan,2754,Northern Bahr El Ghazal,NO,3000,Member State,1994,2.494846,7.052428,Republic of South Sudan,SSD,"POLYGON ((26.13292 9.08244, 26.13530 9.10099, ..."
23,74,South Sudan,2765,Warab,NO,3000,Member State,1994,3.105859,9.67583,Republic of South Sudan,SSD,"POLYGON ((27.62626 8.28199, 27.62715 8.28258, ..."
24,74,South Sudan,2766,Western Bahr El Ghazal,NO,3000,Member State,1994,7.362645,15.653986,Republic of South Sudan,SSD,"POLYGON ((24.15070 8.39634, 24.15072 8.39642, ..."
25,74,South Sudan,2768,Western Equatoria,NO,3000,Member State,1994,6.400287,15.338082,Republic of South Sudan,SSD,"POLYGON ((26.28462 6.45896, 26.29016 6.47718, ..."
26,74,South Sudan,37021,Upper Nile,NO,3000,Member State,1994,6.447897,15.496584,Republic of South Sudan,SSD,"POLYGON ((30.74803 9.46688, 30.74923 9.53971, ..."


### geometries at Admin 2 level

In [22]:
####Administrative level 2 geometries
#Extract Adm2 locations
emdat_geocode_adm2 = Emdata_geocode[Emdata_geocode['Geo Locations'].str.contains("(Adm2)")]
#In the Geo Locations column, some rows still have admin1 geometry locations that need to be removed
#clean Adm1 mention for the admin2 locations
emdat_geocode_adm2_cadm1 = emdat_geocode_adm2[emdat_geocode_adm2['Geo Locations'].str.contains("(Adm1)")]#location 2 with location 1 mentions still
#emdat_geocode_adm2_cadm1 = emdat_geocode_adm2_cadm1.reset_index().iloc[:,1::]
emdat_geocode_adm2_cadm1["Geo Locations"] = emdat_geocode_adm2_cadm1["Geo Locations"].str.split(" \(Adm1\).",expand=True)[1].str.replace(" \(Adm2\).","")
emdat_geocode_adm2_wtadm1 = emdat_geocode_adm2[~emdat_geocode_adm2['Geo Locations'].str.contains("(Adm1)")]
emdat_geocode_adm2_wtadm1["Geo Locations"] = emdat_geocode_adm2_wtadm1["Geo Locations"].str.replace(" \(Adm2\).","")

emdat_geocode_adm2 = pd.concat([emdat_geocode_adm2_cadm1,emdat_geocode_adm2_wtadm1])

  This is separate from the ipykernel package so we can avoid doing imports until
  
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
  if __name__ == "__main__":
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


In [23]:
emdat_geocode_adm2

Unnamed: 0_level_0,Admin1 Code,Admin2 Code,Geo Locations,Location
DisNo.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-0020-BGR,713,11862,Strumjani,"Batanovtsi, Strumiani, Tran"
2021-0022-PHL,67165,24251;24259,"Negros Occidental, Northern Samar",Negros Occidental and Northern Samar Provinces...
2021-0035-BOL,40445;40446;40448,40458,Oropeza,"Cochabamba Department, Paz Department, Potosì ..."
2021-0059-USA,3233;3244;3246;3252;3260;3261,29907,Suffolk,"Boston (Suffolk, Maine, Massachusetts), New Je..."
2021-0069-CHL,884;887;889;892;149630,12971,Santiago,"Araucanía, Maule, O’Higgins, Santiago Metropol..."
...,...,...,...,...
2023-0391-VEN,,31914,El Callao,El Callao City (Bolivar State)
2023-0451-COL,,13906,Quetame,Quetame Municipality (Cundinamarca Department)
2023-0458-ITA,,18415;18417;18420,"Padova, Treviso, Vicenza","Padoue, Vicence et Trévise regions (Venise)"
2023-0459-IND,,17777,Raigarh,"Irshalwadi Village (Raigad District, western M..."


In [24]:
emdat_geocode_adm2['Geo Locations'].astype(str).str.split(',').apply(len).sum()

1218

In [25]:
(emdat_geocode_adm2['Admin2 Code'].astype(str).str.split(';')).apply(len).sum()

1218

## 2) Matching events without geolocations, only location names (2021-2023)

In [26]:
#Shape Adm1 locations: 1 row = 1 geolocation
emdat_geocode_adm2_stacked = pd.DataFrame(emdat_geocode_adm2["Admin2 Code"].str.split(';', expand=True).stack()).rename(columns={0:'ADM2_CODE'})
emdat_geocode_adm2_stacked['location'] = pd.DataFrame(emdat_geocode_adm2["Location"].str.split(',', expand=True).stack())
emdat_geocode_adm2_stacked['geolocation'] = pd.DataFrame(emdat_geocode_adm2["Geo Locations"].str.split(',', expand=True).stack())
emdat_geocode_adm2_identified = emdat_geocode_adm2_stacked.reset_index()


##Load administrative boundries objects to merge locations with events

gaul1 = gpd.read_file('/net/projects/xaida/raw_data/gaul_maps/gaul_admin1_clean.gpkg')
gaul2 = gpd.read_file('/net/projects/xaida/raw_data/gaul_maps/gaul_admin2_clean.gpkg')

#get the admin1 geolocated events geometries #emdat_geocode_adm1_identified
emdat_geocode_adm1_identified['ADM1_CODE'] = emdat_geocode_adm1_identified['ADM1_CODE'].astype(int)
events_adm1  = emdat_geocode_adm1_identified.merge(gaul1, on='ADM1_CODE', how='left')

#get the admin2 geolocated events geometries #emdat_geocode_adm2_identified
emdat_geocode_adm2_identified['ADM2_CODE'] = emdat_geocode_adm2_identified['ADM2_CODE'].astype(int)
events_adm2  = emdat_geocode_adm2_identified.merge(gaul2, on='ADM2_CODE', how='left')

#Find the geolocations for the events of which we only have the location name #Emdata_locations_only
#this needs to be done in different steps, as we have no indication about the administrative level of the location
#also, looking up the geometries needs to take place for each country separately, to limit mismatch



In [28]:
#loading data again to work on data without geolocation

Emdata = pd.read_excel('/net/projects/xaida/raw_data/emdat_data/public_emdat_2021_2023.xlsx')
Emdata_loconly = Emdata[Emdata['Admin Units'].isna() & Emdata['Location'].notna()]


In [29]:


def split_and_clean_locations(location):
    entries = re.split(r';', location)
    cleaned_entries = []
    for entry in entries:
        if '(' in entry and ')' in entry:
            cleaned_entries.append(entry.strip())
        else:
            sub_entries = re.split(r',\s*(?![^()]*\))', entry)
            cleaned_entries.extend([sub_entry.strip() for sub_entry in sub_entries if sub_entry.strip()])
    return cleaned_entries

# Expand the DataFrame
expanded_rows = [
    [row['DisNo.'], row['Location'], loc]
    for _, row in Emdata_loconly.iterrows()
    for loc in split_and_clean_locations(row['Location'])
]

expanded_df = pd.DataFrame(expanded_rows, columns=['DisNo.', 'Full_Location_List', 'Individual_Location'])

# Clean the 'Individual_Location' column
replace_terms = ["Near", "Province", "province", "state", "City", "Regency", "provinces", "region", "Region", 
                 "district", "districts", "District", "Municipality", "municipality"]

for term in replace_terms:
    expanded_df['Individual_Location'] = expanded_df['Individual_Location'].str.replace(term, "", regex=False)

def split_text(text):
    if text.count('(') > 1:
        return re.split(r',\s*(?=\S)', re.sub(r'\),\s*(?=\S)', '),\n', text))
    return re.split(r'\),\s*(?=\S)', text)

expanded_df['Individual_Location'] = expanded_df['Individual_Location'].apply(split_text)
expanded_df = expanded_df.explode('Individual_Location', ignore_index=True)

expanded_df['Individual_Location'] = expanded_df['Individual_Location'].str.replace(' and ', ', ')

def extract_locations(row):
    locations = []
    if '(' in row:
        parts = row.split('(')
        locs = parts[0].strip().split(',')
        regions = parts[1].replace(')', '').split(',')
        for loc in locs:
            for sub_loc in loc.strip().split('/'):
                for region in regions:
                    locations.append([sub_loc.strip(), region.strip()])
    elif ',' in row:
        for loc in row.split(','):
            locations.append([loc.strip(), None])
    else:
        for loc in row.strip().split('/'):
            locations.append([loc.strip(), None])
    return locations

new_data = [
    [row['DisNo.'], row['Individual_Location'], loc[0], loc[1]]
    for _, row in expanded_df.iterrows()
    for loc in extract_locations(row['Individual_Location'])
]

new_df = pd.DataFrame(new_data, columns=['DisNo.', 'Individual_Location', 'Location_Before', 'Bracketed'])
new_df['Appended'] = new_df['Location_Before'] + new_df['Bracketed'].apply(lambda x: f" {x}" if x else "")

print(new_df)


             DisNo.             Individual_Location Location_Before Bracketed  \
0     2020-9609-SOM                            Gedo            Gedo      None   
1     2020-9609-SOM  Las Anod / Hudun, Taleb (Sool)        Las Anod      Sool   
2     2020-9609-SOM  Las Anod / Hudun, Taleb (Sool)           Hudun      Sool   
3     2020-9609-SOM  Las Anod / Hudun, Taleb (Sool)           Taleb      Sool   
4     2020-9609-SOM               Eilafwen (Sanaag)        Eilafwen    Sanaag   
...             ...                             ...             ...       ...   
2633  2023-9868-USA                         Montana         Montana      None   
2634  2023-9873-FSM                           Chuuk           Chuuk      None   
2635  2023-9873-FSM                          Kosrae          Kosrae      None   
2636  2023-9873-FSM                         Pohnpei         Pohnpei      None   
2637  2023-9873-FSM                             Yap             Yap      None   

             Appended  
0  

##Now Data with locations , no geocoded data is further divided into three columns Location_Before - Location that is present before (Bracket Admin1 location) and third column appending both location before and Brackted data 

In [30]:
#now data is further divided into two data sets ,first  dataset with  bracket data,
#second dataset is with no bracket data.


new_df_1 = new_df[new_df['Bracketed'].isnull()]
new_df_2 = new_df[~new_df['Bracketed'].isnull()]

In [37]:
new_df_1

Unnamed: 0,DisNo.,Individual_Location,Location_Before,Bracketed,Appended
0,2020-9609-SOM,Gedo,Gedo,,Gedo
11,2020-9609-SOM,Puntland,Puntland,,Puntland
12,2020-9609-SOM,Nugaal,Nugaal,,Nugaal
13,2020-9609-SOM,Galmudug,Galmudug,,Galmudug
14,2020-9609-SOM,Jubaland,Jubaland,,Jubaland
...,...,...,...,...,...
2633,2023-9868-USA,Montana,Montana,,Montana
2634,2023-9873-FSM,Chuuk,Chuuk,,Chuuk
2635,2023-9873-FSM,Kosrae,Kosrae,,Kosrae
2636,2023-9873-FSM,Pohnpei,Pohnpei,,Pohnpei


In [31]:
############################
## Functions needed for fuzzy matching of geometries by name
############################

from thefuzz import process
#function for fuzzy names matching
def fuzzy_match(
    df_left, df_right, column_left, column_right, threshold=90, limit=1):
    # Create a series
    series_matches = df_left[column_left].apply(
        lambda x: process.extract(x, df_right[column_right], limit=limit)            # Creates a series with id from df_left and column name _column_left_, with _limit_ matches per item
    )

    # Convert matches to a tidy dataframe
    df_matches = series_matches.to_frame()
    df_matches = df_matches.explode(column_left)     # Convert list of matches to rows
    df_matches[
        ['match_string', 'match_score', 'df_right_id']
    ] = pd.DataFrame(df_matches[column_left].tolist(), index=df_matches.index)       # Convert match tuple to columns
    df_matches.drop(column_left, axis=1, inplace=True)      # Drop column of match tuples

    # Reset index, as in creating a tidy dataframe we've introduced multiple rows per id, so that no longer functions well as the index
    if df_matches.index.name:
        index_name = df_matches.index.name     # Stash index name
    else:
        index_name = 'index'        # Default used by pandas
    df_matches.reset_index(inplace=True)
    df_matches.rename(columns={index_name: 'df_left_id'}, inplace=True)       # The previous index has now become a column: rename for ease of reference

    # Drop matches below threshold
    df_matches.drop(
        df_matches.loc[df_matches['match_score'] < threshold].index,
        inplace=True
    )

    return df_matches

In [32]:
#function for matching datasets uses fuzzy names matching
def match_datasets(df1, df2, col_a, col_b):
    
    df_matches = fuzzy_match(
    df1,
    df2,
    col_a,
    col_b,
    threshold=90,
    limit=1
    )

    df_output = df1.merge(
        df_matches,
        how='left',
        left_index=True,
        right_on='df_left_id'
    ).merge(
        df2,
        how='left',
        left_on='df_right_id',
        right_index=True,
        suffixes=['_df1', '_df2']
    )
    return(df_output)

In [33]:
#function for matching names with corresponding vector geometries
def match_emdat_gdis(emdat_df, gdam_df, iso,varname):
    
    ## EM-DAT preprocess
    emdat_iso = emdat_df[emdat_df["DisNo."].str.contains(iso)]
    emdat_iso = emdat_iso.rename(columns={varname:"name_join"})
    
    gdam_df = gdam_df[gdam_df["iso3"] == iso]
    #join name and alternative names to enhance accuracy
    gdam_df["name_join"] = gdam_df[[varname]].astype(str).agg(' - '.join, axis=1)
    
    if emdat_iso.empty | gdam_df.empty:
        return(gpd.GeoDataFrame())
    else:
        ####double the string to match in EM-DAT for maximal accuracy
        #emdat_iso["name_join"] = emdat_iso[['name_join','name_join']].astype(str).agg(' - '.join, axis=1)
        #Eliminate generic words related to Russian political entities that cause errors in matches
        emdat_iso["name_join"] = emdat_iso["name_join"].str.replace("Rep.","")
        emdat_iso["name_join"] = emdat_iso["name_join"].str.replace("Kray","")
        emdat_iso["name_join"] = emdat_iso["name_join"].str.replace("Oblast","")
        #Names of Russian regions that need to be changed
        emdat_iso["name_join"] = emdat_iso["name_join"].str.replace("Dagestan","Daghestan")
        emdat_iso["name_join"] = emdat_iso["name_join"].str.replace("Karelya","Karelia")
        emdat_iso["name_join"] = emdat_iso["name_join"].str.replace("Yevreyskaya A","Yevrey - Yevreyskaya")
        #Eliminate generic words related to Philippino political entities that cause errors in matches
        emdat_iso["name_join"] = emdat_iso["name_join"].str.replace("Region XIII","")
        emdat_iso["name_join"] = emdat_iso["name_join"].str.replace("Region XII","")
        emdat_iso["name_join"] = emdat_iso["name_join"].str.replace("Region XI","")
        emdat_iso["name_join"] = emdat_iso["name_join"].str.replace("Region X","")
        emdat_iso["name_join"] = emdat_iso["name_join"].str.replace("Region IX","")
        emdat_iso["name_join"] = emdat_iso["name_join"].str.replace("Region VIII","")
        emdat_iso["name_join"] = emdat_iso["name_join"].str.replace("Region VII","")
        emdat_iso["name_join"] = emdat_iso["name_join"].str.replace("Region VI","")
        emdat_iso["name_join"] = emdat_iso["name_join"].str.replace("Region VI","")
        emdat_iso["name_join"] = emdat_iso["name_join"].str.replace("Region V","")
        emdat_iso["name_join"] = emdat_iso["name_join"].str.replace("Region III","")
        emdat_iso["name_join"] = emdat_iso["name_join"].str.replace("Region II","")
        emdat_iso["name_join"] = emdat_iso["name_join"].str.replace("Region I","")   
        emdat_iso["name_join"] = emdat_iso["name_join"].str.replace("region ","")
        
        
        ##GADM preprocess
        
        #Eliminate generic words related to Russian political entities that cause errors in matches
        gdam_df["name_join"] = gdam_df["name_join"].str.replace("Republic of","")
        gdam_df["name_join"] = gdam_df["name_join"].str.replace("Republic","")
        gdam_df["name_join"] = gdam_df["name_join"].str.replace("Respublika","")
        gdam_df["name_join"] = gdam_df["name_join"].str.replace("Kray","")
        gdam_df["name_join"] = gdam_df["name_join"].str.replace("Oblast","")
        
        #Elimination of string in case of no entry
        gdam_df["name_join"] = gdam_df["name_join"].str.replace("- None","")
        
        #Sort both datasets alphabetically with the name join column
        emdat_iso = emdat_iso.sort_values("name_join")
        gdam_df = gdam_df.sort_values("name_join")
        
        #Merge datasets
        merged = match_datasets(emdat_iso, gdam_df, "name_join", "name_join")
        merged = gpd.GeoDataFrame(merged)
        return(merged)

In [34]:
#function for concatenating geopandas dataframes
def concat_db(db1, db2):
    return(pd.concat([db1, db2], axis=0))

In [220]:
#Emdata_locations_only.Location = Emdata_locations_only.Location.str.replace(" Near","").replace(" and ",",").replace(" 'province)","").replace(" 'province ","").replace("Between","").replace('Isl.',"").replace(' districts',"").replace(' region',"")

In [38]:
Emdata_locations_only = new_df_1[['DisNo.','Location_Before']]

Emdata_locations_only.reset_index(drop=True, inplace=True)


iso_locations = Emdata_locations_only['DisNo.'].str.split('-',expand=True)[2]
Emdata_locations_only['iso3'] = iso_locations #add iso3 codes of every location/ event

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


## 3) Merging all geolocated events

In [39]:
#Administrative level 1
isos = np.unique(Emdata_locations_only["iso3"])#iso codes for the loop
final_db = gpd.GeoDataFrame()#empty geoDF for results

Emdata_locations_only = Emdata_locations_only.rename(columns={'Location_Before':"ADM1_NAME"})#change name of column to merge on

for iso in isos:
    db = match_emdat_gdis(Emdata_locations_only, gaul1, iso,"ADM1_NAME")
    if db.empty:
        pass
    else:
        final_db = pd.concat([final_db,db],axis=0)
        
locations_only_events_admin1 = final_db[~final_db['geometry'].isna()].reset_index().drop(columns={'index'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = 

In [40]:
locations_only_events_admin1

Unnamed: 0,DisNo.,name_join_df1,iso3_df1,df_left_id,match_string,match_score,df_right_id,ADM0_CODE,ADM0_NAME,ADM1_CODE,...,DISP_AREA,EXP1_YEAR,STATUS,STR1_YEAR,Shape_Area,Shape_Leng,official_name,iso3_df2,geometry,name_join_df2
0,2022-0264-AFG,Badakhshan,AFG,346,Badakhshan,100.0,516.0,1.0,Afghanistan,272.0,...,NO,3000.0,Member State,1000.0,4.398368,20.784230,Islamic Republic of Afghanistan,AFG,"POLYGON ((69.98554 36.78234, 69.98748 36.78352...",Badakhshan
1,2022-0264-AFG,Badghis,AFG,354,Badghis,100.0,517.0,1.0,Afghanistan,273.0,...,NO,3000.0,Member State,1000.0,2.038149,8.212745,Islamic Republic of Afghanistan,AFG,"POLYGON ((62.66683 34.99415, 62.66690 34.99676...",Badghis
2,2022-0264-AFG,Baghlan,AFG,351,Baghlan,100.0,518.0,1.0,Afghanistan,274.0,...,NO,3000.0,Member State,1000.0,1.776595,7.786293,Islamic Republic of Afghanistan,AFG,"POLYGON ((68.00412 35.13949, 68.00416 35.14001...",Baghlan
3,2022-0264-AFG,Bamyan,AFG,355,Bamyan,100.0,520.0,1.0,Afghanistan,276.0,...,NO,3000.0,Member State,1000.0,1.762777,8.587412,Islamic Republic of Afghanistan,AFG,"POLYGON ((66.29666 35.13259, 66.29699 35.13380...",Bamyan
4,2022-0514-AFG,Farah,AFG,513,Farah,100.0,521.0,1.0,Afghanistan,277.0,...,NO,3000.0,Member State,1000.0,4.718270,15.358933,Islamic Republic of Afghanistan,AFG,"POLYGON ((60.58315 33.12982, 60.58746 33.13812...",Farah
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
891,2022-0569-ZWE,Manicaland,ZWE,563,Manicaland,100.0,134.0,271.0,Zimbabwe,3437.0,...,NO,3000.0,Member State,1000.0,3.062811,13.462677,Republic of Zimbabwe,ZWE,"POLYGON ((31.18835 -19.23481, 31.22039 -19.231...",Manicaland
892,2023-0719-ZWE,Masvingo,ZWE,1512,Masvingo,100.0,137.0,271.0,Zimbabwe,3441.0,...,NO,3000.0,Member State,1000.0,4.898562,12.419403,Republic of Zimbabwe,ZWE,"POLYGON ((29.73057 -21.02383, 29.73119 -21.024...",Masvingo
893,2022-0569-ZWE,Masvingo,ZWE,562,Masvingo,100.0,137.0,271.0,Zimbabwe,3441.0,...,NO,3000.0,Member State,1000.0,4.898562,12.419403,Republic of Zimbabwe,ZWE,"POLYGON ((29.73057 -21.02383, 29.73119 -21.024...",Masvingo
894,2023-0719-ZWE,Matabeleland South,ZWE,1511,Matabeleland South,100.0,138.0,271.0,Zimbabwe,3443.0,...,NO,3000.0,Member State,1000.0,4.720131,16.330050,Republic of Zimbabwe,ZWE,"POLYGON ((26.69159 -19.89356, 26.69162 -19.893...",Matabeleland South


#consider only geometries where the location string exists in the geometry string name of the gaul object
#events_admin1_locations_only_valid = locations_only_events_admin1[locations_only_events_admin1.apply(lambda x: x['name_join_df2'] in x['name_join_df1'], axis=1)]

In [41]:
events_admin1_locations_only_valid = locations_only_events_admin1[locations_only_events_admin1.match_score >= 90]

In [42]:
events_admin1_locations_only_valid

Unnamed: 0,DisNo.,name_join_df1,iso3_df1,df_left_id,match_string,match_score,df_right_id,ADM0_CODE,ADM0_NAME,ADM1_CODE,...,DISP_AREA,EXP1_YEAR,STATUS,STR1_YEAR,Shape_Area,Shape_Leng,official_name,iso3_df2,geometry,name_join_df2
0,2022-0264-AFG,Badakhshan,AFG,346,Badakhshan,100.0,516.0,1.0,Afghanistan,272.0,...,NO,3000.0,Member State,1000.0,4.398368,20.784230,Islamic Republic of Afghanistan,AFG,"POLYGON ((69.98554 36.78234, 69.98748 36.78352...",Badakhshan
1,2022-0264-AFG,Badghis,AFG,354,Badghis,100.0,517.0,1.0,Afghanistan,273.0,...,NO,3000.0,Member State,1000.0,2.038149,8.212745,Islamic Republic of Afghanistan,AFG,"POLYGON ((62.66683 34.99415, 62.66690 34.99676...",Badghis
2,2022-0264-AFG,Baghlan,AFG,351,Baghlan,100.0,518.0,1.0,Afghanistan,274.0,...,NO,3000.0,Member State,1000.0,1.776595,7.786293,Islamic Republic of Afghanistan,AFG,"POLYGON ((68.00412 35.13949, 68.00416 35.14001...",Baghlan
3,2022-0264-AFG,Bamyan,AFG,355,Bamyan,100.0,520.0,1.0,Afghanistan,276.0,...,NO,3000.0,Member State,1000.0,1.762777,8.587412,Islamic Republic of Afghanistan,AFG,"POLYGON ((66.29666 35.13259, 66.29699 35.13380...",Bamyan
4,2022-0514-AFG,Farah,AFG,513,Farah,100.0,521.0,1.0,Afghanistan,277.0,...,NO,3000.0,Member State,1000.0,4.718270,15.358933,Islamic Republic of Afghanistan,AFG,"POLYGON ((60.58315 33.12982, 60.58746 33.13812...",Farah
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
891,2022-0569-ZWE,Manicaland,ZWE,563,Manicaland,100.0,134.0,271.0,Zimbabwe,3437.0,...,NO,3000.0,Member State,1000.0,3.062811,13.462677,Republic of Zimbabwe,ZWE,"POLYGON ((31.18835 -19.23481, 31.22039 -19.231...",Manicaland
892,2023-0719-ZWE,Masvingo,ZWE,1512,Masvingo,100.0,137.0,271.0,Zimbabwe,3441.0,...,NO,3000.0,Member State,1000.0,4.898562,12.419403,Republic of Zimbabwe,ZWE,"POLYGON ((29.73057 -21.02383, 29.73119 -21.024...",Masvingo
893,2022-0569-ZWE,Masvingo,ZWE,562,Masvingo,100.0,137.0,271.0,Zimbabwe,3441.0,...,NO,3000.0,Member State,1000.0,4.898562,12.419403,Republic of Zimbabwe,ZWE,"POLYGON ((29.73057 -21.02383, 29.73119 -21.024...",Masvingo
894,2023-0719-ZWE,Matabeleland South,ZWE,1511,Matabeleland South,100.0,138.0,271.0,Zimbabwe,3443.0,...,NO,3000.0,Member State,1000.0,4.720131,16.330050,Republic of Zimbabwe,ZWE,"POLYGON ((26.69159 -19.89356, 26.69162 -19.893...",Matabeleland South


In [39]:
events_admin1_locations_only_valid[['name_join_df1','name_join_df2']].iloc[100:150]

Unnamed: 0,name_join_df1,name_join_df2
100,Henan,Henan Sheng
101,Hunan,Hunan Sheng
102,Hunan,Hunan Sheng
103,Hunan,Hunan Sheng
104,Jiangsu,Jiangsu Sheng
105,Jiangxi,Jiangxi Sheng
106,Ningxia,Ningxia Huizu Zizhiqu
107,QingHai,Qinghai Sheng
108,Qinghai,Qinghai Sheng
109,Qinghai,Qinghai Sheng


In [227]:
events_admin1_locations_only_valid[['name_join_df1','name_join_df2']].iloc[147]

name_join_df1    San Juan Municipalities (Bolivar Department); ...
name_join_df2                                   Norte De Santander
Name: 147, dtype: object

In [228]:
#gaul2['ADM2_NAME_join'] = gaul2[['ADM1_NAME', 'ADM2_NAME']].astype(str).agg('  '.join, axis=1)

In [43]:
gaul2['ADM2_NAME_join'] = gaul2[['ADM2_NAME']]

In [44]:
#Administrative level 2
Emdata_locations_only['ADM2_NAME_join'] = Emdata_locations_only['ADM1_NAME']
final_db2 = gpd.GeoDataFrame()#empty geoDF for results

for iso in isos:
    db2 = match_emdat_gdis(Emdata_locations_only, gaul2, iso,"ADM2_NAME_join")
    if db2.empty:
        pass
    else:
        final_db2 = pd.concat([final_db2,db2],axis=0)
        
locations_only_events_admin2 = final_db2[~final_db2['geometry'].isna()].reset_index().drop(columns={'index'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = 

In [45]:
events_admin2_locations_only_valid = locations_only_events_admin2[locations_only_events_admin2.match_score >= 90]

In [53]:
#consider only geometries where the location string exists in the geometry string name of the gaul object
#events_admin2_locations_only_valid = locations_only_events_admin2[locations_only_events_admin2.apply(lambda x: x['name_join_df2'] in x['name_join_df1'], axis=1)]

In [46]:
events_admin2_locations_only_valid

Unnamed: 0,DisNo.,ADM1_NAME_df1,iso3_df1,name_join_df1,df_left_id,match_string,match_score,df_right_id,ADM0_CODE,ADM0_NAME,...,EXP2_YEAR,STATUS,STR2_YEAR,Shape_Area,Shape_Leng,official_name,iso3_df2,geometry,ADM2_NAME_join,name_join_df2
0,2022-0264-AFG,Baghlan,AFG,Baghlan,351,Baghlan-e-Jadid,90.0,16297.0,1.0,Afghanistan,...,3000.0,Member State,2004.0,0.260793,2.642237,Islamic Republic of Afghanistan,AFG,"POLYGON ((68.53223 36.15923, 68.53070 36.16115...",Baghlan-e-Jadid,Baghlan-e-Jadid
1,2022-0264-AFG,Bamyan,AFG,Bamyan,355,Bamyan,100.0,16404.0,1.0,Afghanistan,...,3000.0,Member State,1000.0,0.177110,2.128533,Islamic Republic of Afghanistan,AFG,"POLYGON ((67.36119 34.80934, 67.36119 34.81523...",Bamyan,Bamyan
2,2022-0514-AFG,Farah,AFG,Farah,513,Farah,100.0,17376.0,1.0,Afghanistan,...,3000.0,Member State,1000.0,0.326498,3.870752,Islamic Republic of Afghanistan,AFG,"POLYGON ((61.88133 32.30833, 61.88319 32.31950...",Farah,Farah
3,2022-0514-AFG,Ghazni,AFG,Ghazni,506,Ghazni,100.0,16352.0,1.0,Afghanistan,...,3000.0,Member State,1000.0,0.034933,0.988983,Islamic Republic of Afghanistan,AFG,"POLYGON ((68.51923 33.41982, 68.51812 33.41982...",Ghazni,Ghazni
4,2023-0636-AFG,Herat,AFG,Herat,1439,Herat,100.0,17426.0,1.0,Afghanistan,...,3000.0,Member State,1000.0,0.008094,0.414472,Islamic Republic of Afghanistan,AFG,"POLYGON ((62.11880 34.37322, 62.11942 34.37507...",Herat,Herat
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
664,2023-0719-ZWE,Mwenezi,ZWE,Mwenezi,1507,Mwenezi,100.0,10422.0,271.0,Zimbabwe,...,3000.0,Member State,1000.0,1.146266,6.310072,Republic of Zimbabwe,ZWE,"POLYGON ((30.94386 -22.11357, 30.93747 -22.110...",Mwenezi,Mwenezi
665,2023-0719-ZWE,Seke,ZWE,Seke,1508,Seke,100.0,2531.0,271.0,Zimbabwe,...,3000.0,Member State,1000.0,0.225164,3.924185,Republic of Zimbabwe,ZWE,"POLYGON ((30.76508 -18.50712, 30.76493 -18.504...",Seke,Seke
666,2023-0719-ZWE,Shamva,ZWE,Shamva,1504,Shamva,100.0,2509.0,271.0,Zimbabwe,...,3000.0,Member State,1000.0,0.228929,3.284518,Republic of Zimbabwe,ZWE,"POLYGON ((31.50866 -17.50133, 31.50852 -17.501...",Shamva,Shamva
667,2023-0719-ZWE,Wedza,ZWE,Wedza,1509,Hwedza,91.0,2526.0,271.0,Zimbabwe,...,3000.0,Member State,1000.0,0.219286,2.792234,Republic of Zimbabwe,ZWE,"POLYGON ((31.98308 -19.11863, 31.98306 -19.118...",Hwedza,Hwedza


In [52]:
events_admin2_locations_only_valid[['name_join_df1','name_join_df2']].iloc[300:350]

Unnamed: 0,name_join_df1,name_join_df2
300,Puttlam,Puttalam
301,Ratnapura s,Ratnapura
302,and Matara s,Matara
303,Leribe,Leribe
304,Al Haouz,Al Haouz
305,Azilal,Azilal
306,Chichaoua,Chichaoua
307,Guelmim,Guelmim
308,Marrakech,Marrakech
309,Ouarzazate,Ouarzazate


In [47]:
# Merge the two DataFrames based on the columns 'name_join_df1' and 'DisNo.'
merged_df = pd.merge(events_admin2_locations_only_valid,
                     events_admin1_locations_only_valid,
                     on=['name_join_df1', 'DisNo.'],
                     how='left',
                     suffixes=('', '_right'),
                     indicator=True)

# Keep only the rows from events_admin2_locations_only_valid that are not present in events_admin1_locations_only_valid
events_admin2_locations_only_valid = merged_df.loc[merged_df['_merge'] == 'left_only', events_admin2_locations_only_valid.columns]


In [48]:
events_admin2_locations_only_valid 

Unnamed: 0,DisNo.,ADM1_NAME_df1,iso3_df1,name_join_df1,df_left_id,match_string,match_score,df_right_id,ADM0_CODE,ADM0_NAME,...,EXP2_YEAR,STATUS,STR2_YEAR,Shape_Area,Shape_Leng,official_name,iso3_df2,geometry,ADM2_NAME_join,name_join_df2
4,2023-0636-AFG,Herat,AFG,Herat,1439,Herat,100.0,17426.0,1.0,Afghanistan,...,3000.0,Member State,1000.0,0.008094,0.414472,Islamic Republic of Afghanistan,AFG,"POLYGON ((62.11880 34.37322, 62.11942 34.37507...",Herat,Herat
5,2022-0264-AFG,Herat,AFG,Herat,345,Herat,100.0,17426.0,1.0,Afghanistan,...,3000.0,Member State,1000.0,0.008094,0.414472,Islamic Republic of Afghanistan,AFG,"POLYGON ((62.11880 34.37322, 62.11942 34.37507...",Herat,Herat
12,2022-0806-AFG,Salang,AFG,Salang,717,Salang,100.0,16455.0,1.0,Afghanistan,...,3000.0,Member State,1000.0,0.051562,1.139820,Islamic Republic of Afghanistan,AFG,"POLYGON ((69.24382 35.16703, 69.24326 35.16527...",Salang,Salang
21,2023-0644-BEN,Seme Podji,BEN,Seme Podji,1443,Seme-kpodji,95.0,2595.0,29.0,Benin,...,3000.0,Member State,1000.0,0.015388,0.736261,Republic of Benin,BEN,"POLYGON ((2.49788 6.36010, 2.49787 6.36097, 2....",Seme-kpodji,Seme-kpodji
22,2023-0512-BGD,Bandarban,BGD,Bandarban,1298,Bandarban,100.0,15729.0,23.0,Bangladesh,...,3000.0,Member State,1000.0,0.403544,5.417817,People's Republic of Bangladesh,BGD,"POLYGON ((92.05993 21.90284, 92.05996 21.90319...",Bandarban,Bandarban
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
668,2023-0719-ZWE,Mwenezi,ZWE,Mwenezi,1507,Mwenezi,100.0,10422.0,271.0,Zimbabwe,...,3000.0,Member State,1000.0,1.146266,6.310072,Republic of Zimbabwe,ZWE,"POLYGON ((30.94386 -22.11357, 30.93747 -22.110...",Mwenezi,Mwenezi
669,2023-0719-ZWE,Seke,ZWE,Seke,1508,Seke,100.0,2531.0,271.0,Zimbabwe,...,3000.0,Member State,1000.0,0.225164,3.924185,Republic of Zimbabwe,ZWE,"POLYGON ((30.76508 -18.50712, 30.76493 -18.504...",Seke,Seke
670,2023-0719-ZWE,Shamva,ZWE,Shamva,1504,Shamva,100.0,2509.0,271.0,Zimbabwe,...,3000.0,Member State,1000.0,0.228929,3.284518,Republic of Zimbabwe,ZWE,"POLYGON ((31.50866 -17.50133, 31.50852 -17.501...",Shamva,Shamva
671,2023-0719-ZWE,Wedza,ZWE,Wedza,1509,Hwedza,91.0,2526.0,271.0,Zimbabwe,...,3000.0,Member State,1000.0,0.219286,2.792234,Republic of Zimbabwe,ZWE,"POLYGON ((31.98308 -19.11863, 31.98306 -19.118...",Hwedza,Hwedza


In [49]:
#for dataset with bracket , repeating the same process , but only in admin 2


Emdata_locations_only = new_df_2[['DisNo.','Appended']]

Emdata_locations_only.reset_index(drop=True, inplace=True)


iso_locations = Emdata_locations_only['DisNo.'].str.split('-',expand=True)[2]
Emdata_locations_only['iso3'] = iso_locations #add iso3 codes of every location/ event

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


In [50]:
isos = np.unique(Emdata_locations_only["iso3"])#iso codes for the loop
final_db = gpd.GeoDataFrame()#empty geoDF for results

Emdata_locations_only = Emdata_locations_only.rename(columns={'Appended':"ADM1_NAME"})#change name of column to merge on

for iso in isos:
    db = match_emdat_gdis(Emdata_locations_only, gaul1, iso,"ADM1_NAME")
    if db.empty:
        pass
    else:
        final_db = pd.concat([final_db,db],axis=0)
        
locations_only_events_admin1 = final_db[~final_db['geometry'].isna()].reset_index().drop(columns={'index'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = 

In [51]:
events_admin1_locations_only_valid1 = locations_only_events_admin1[locations_only_events_admin1.match_score >= 90]


In [52]:
events_admin1_locations_only_valid1

Unnamed: 0,DisNo.,name_join_df1,iso3_df1,df_left_id,match_string,match_score,df_right_id,ADM0_CODE,ADM0_NAME,ADM1_CODE,...,DISP_AREA,EXP1_YEAR,STATUS,STR1_YEAR,Shape_Area,Shape_Leng,official_name,iso3_df2,geometry,name_join_df2
0,2022-0514-AFG,Shenwari s Parwan,AFG,224,Parwan,90.0,514.0,1.0,Afghanistan,99880.0,...,NO,3000.0,Member State,2004.0,0.552083,5.265414,Islamic Republic of Afghanistan,AFG,"POLYGON ((68.18910 35.01769, 68.18911 35.01787...",Parwan
1,2022-0514-AFG,Siagard Parwan,AFG,223,Parwan,90.0,514.0,1.0,Afghanistan,99880.0,...,NO,3000.0,Member State,2004.0,0.552083,5.265414,Islamic Republic of Afghanistan,AFG,"POLYGON ((68.18910 35.01769, 68.18911 35.01787...",Parwan
2,2023-0831-ARG,Bahía Blanca Buenos Aires,ARG,812,Buenos Aires,90.0,3351.0,12.0,Argentina,429.0,...,NO,3000.0,Member State,1000.0,30.833566,64.151403,Argentine Republic,ARG,"MULTIPOLYGON (((-62.73597 -41.04164, -62.73970...",Buenos Aires
3,2023-0831-ARG,Moreno Cities Buenos Aires,ARG,813,Buenos Aires,90.0,3351.0,12.0,Argentina,429.0,...,NO,3000.0,Member State,1000.0,30.833566,64.151403,Argentine Republic,ARG,"MULTIPOLYGON (((-62.73597 -41.04164, -62.73970...",Buenos Aires
4,2023-0138-AUS,northern Queensland,AUS,421,Queensland,90.0,2883.0,17.0,Australia,473.0,...,NO,3000.0,Member State,1000.0,152.276773,150.144387,Australia,AUS,"MULTIPOLYGON (((139.55596 -17.11589, 139.55796...",Queensland
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
435,2022-0052-ZWE,Masvingo rural Masvingo,ZWE,63,Masvingo,90.0,137.0,271.0,Zimbabwe,3441.0,...,NO,3000.0,Member State,1000.0,4.898562,12.419403,Republic of Zimbabwe,ZWE,"POLYGON ((29.73057 -21.02383, 29.73119 -21.024...",Masvingo
436,2022-0052-ZWE,Mudzi Mashonaland East,ZWE,61,Mashonaland East,95.0,35.0,271.0,Zimbabwe,69550.0,...,NO,3000.0,Member State,1997.0,2.736920,12.574162,Republic of Zimbabwe,ZWE,"POLYGON ((30.55430 -18.92903, 30.55430 -18.928...",Mashonaland East
437,2022-0052-ZWE,Mutasa Manicaland,ZWE,65,Manicaland,90.0,134.0,271.0,Zimbabwe,3437.0,...,NO,3000.0,Member State,1000.0,3.062811,13.462677,Republic of Zimbabwe,ZWE,"POLYGON ((31.18835 -19.23481, 31.22039 -19.231...",Manicaland
438,2022-0052-ZWE,Muzarabani tMashonaland Central,ZWE,60,Mashonaland Central,90.0,135.0,271.0,Zimbabwe,3438.0,...,NO,3000.0,Member State,1000.0,2.382755,10.800846,Republic of Zimbabwe,ZWE,"POLYGON ((30.04615 -16.00357, 30.04615 -16.003...",Mashonaland Central


In [53]:
gaul2['ADM2_NAME_join'] = gaul2[['ADM1_NAME', 'ADM2_NAME']].astype(str).agg('  '.join, axis=1)

In [54]:
Emdata_locations_only['ADM2_NAME_join'] = Emdata_locations_only['ADM1_NAME']
final_db2 = gpd.GeoDataFrame()#empty geoDF for results

for iso in isos:
    db2 = match_emdat_gdis(Emdata_locations_only, gaul2, iso,"ADM2_NAME_join")
    if db2.empty:
        pass
    else:
        final_db2 = pd.concat([final_db2,db2],axis=0)
        
locations_only_events_admin2 = final_db2[~final_db2['geometry'].isna()].reset_index().drop(columns={'index'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = 

In [55]:
events_admin2_locations_only_valid1 = locations_only_events_admin2[locations_only_events_admin2.match_score >= 90]


In [56]:
events_admin2_locations_only_valid1

Unnamed: 0,DisNo.,ADM1_NAME_df1,iso3_df1,name_join_df1,df_left_id,match_string,match_score,df_right_id,ADM0_CODE,ADM0_NAME,...,EXP2_YEAR,STATUS,STR2_YEAR,Shape_Area,Shape_Leng,official_name,iso3_df2,geometry,ADM2_NAME_join,name_join_df2
0,2023-0831-ARG,Bahía Blanca Buenos Aires,ARG,Bahía Blanca Buenos Aires,812,Buenos Aires Bahia Blanca,93.0,37751.0,12.0,Argentina,...,3000.0,Member State,1000.0,0.232311,3.292568,Argentine Republic,ARG,"MULTIPOLYGON (((-62.42643 -38.75657, -62.42643...",Buenos Aires Bahia Blanca,Buenos Aires Bahia Blanca
1,2023-0831-ARG,Moreno Cities Buenos Aires,ARG,Moreno Cities Buenos Aires,813,Buenos Aires Moreno,95.0,37817.0,12.0,Argentina,...,3000.0,Member State,1000.0,0.018316,0.602528,Argentine Republic,ARG,"POLYGON ((-58.86567 -34.68238, -58.88124 -34.6...",Buenos Aires Moreno,Buenos Aires Moreno
2,2023-0828-AUS,Cairns Queensland,AUS,Cairns Queensland,806,Queensland Cairns (R),95.0,24857.0,17.0,Australia,...,3000.0,Member State,2011.0,0.348500,6.728153,Australia,AUS,"MULTIPOLYGON (((145.69512 -16.73106, 145.69528...",Queensland Cairns (R),Queensland Cairns (R)
3,2023-0138-AUS,Doomadgee northern Queensland,AUS,Doomadgee northern Queensland,420,Queensland Doomadgee (S),90.0,24862.0,17.0,Australia,...,3000.0,Member State,2011.0,0.156206,2.406009,Australia,AUS,"MULTIPOLYGON (((138.58381 -17.87517, 138.58594...",Queensland Doomadgee (S),Queensland Doomadgee (S)
4,2023-0828-AUS,Hope Vale Queensland,AUS,Hope Vale Queensland,811,Queensland Hope Vale (S),95.0,24864.0,17.0,Australia,...,3000.0,Member State,2011.0,0.092905,2.955616,Australia,AUS,"POLYGON ((145.23842 -15.44147, 145.21484 -15.4...",Queensland Hope Vale (S),Queensland Hope Vale (S)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
219,2022-0052-ZWE,Kariba Mashonaland West,ZWE,Kariba Mashonaland West,58,Mashonaland West Kariba,95.0,2513.0,271.0,Zimbabwe,...,3000.0,Member State,1000.0,0.691808,4.535196,Republic of Zimbabwe,ZWE,"POLYGON ((28.01492 -16.87896, 28.01525 -16.878...",Mashonaland West Kariba,Mashonaland West Kariba
220,2022-0052-ZWE,Masvingo rural Masvingo,ZWE,Masvingo rural Masvingo,63,Masvingo Masvingo,95.0,10421.0,271.0,Zimbabwe,...,3000.0,Member State,1000.0,0.598281,5.458565,Republic of Zimbabwe,ZWE,"POLYGON ((31.40310 -20.00350, 31.39930 -20.009...",Masvingo Masvingo,Masvingo Masvingo
221,2022-0052-ZWE,Mudzi Mashonaland East,ZWE,Mudzi Mashonaland East,61,Mashonaland East Mudzi,95.0,2528.0,271.0,Zimbabwe,...,3000.0,Member State,1000.0,0.354802,3.043542,Republic of Zimbabwe,ZWE,"POLYGON ((32.99403 -17.24688, 32.99388 -17.246...",Mashonaland East Mudzi,Mashonaland East Mudzi
222,2022-0052-ZWE,Mutasa Manicaland,ZWE,Mutasa Manicaland,65,Manicaland Mutasa,95.0,2502.0,271.0,Zimbabwe,...,3000.0,Member State,1000.0,0.212884,3.362333,Republic of Zimbabwe,ZWE,"POLYGON ((32.48668 -18.76334, 32.48670 -18.762...",Manicaland Mutasa,Manicaland Mutasa


In [57]:
events_admin2_locations_only_valid = pd.concat([events_admin2_locations_only_valid1, events_admin2_locations_only_valid])


In [58]:
events_admin2_locations_only_valid

Unnamed: 0,DisNo.,ADM1_NAME_df1,iso3_df1,name_join_df1,df_left_id,match_string,match_score,df_right_id,ADM0_CODE,ADM0_NAME,...,EXP2_YEAR,STATUS,STR2_YEAR,Shape_Area,Shape_Leng,official_name,iso3_df2,geometry,ADM2_NAME_join,name_join_df2
0,2023-0831-ARG,Bahía Blanca Buenos Aires,ARG,Bahía Blanca Buenos Aires,812,Buenos Aires Bahia Blanca,93.0,37751.0,12.0,Argentina,...,3000.0,Member State,1000.0,0.232311,3.292568,Argentine Republic,ARG,"MULTIPOLYGON (((-62.42643 -38.75657, -62.42643...",Buenos Aires Bahia Blanca,Buenos Aires Bahia Blanca
1,2023-0831-ARG,Moreno Cities Buenos Aires,ARG,Moreno Cities Buenos Aires,813,Buenos Aires Moreno,95.0,37817.0,12.0,Argentina,...,3000.0,Member State,1000.0,0.018316,0.602528,Argentine Republic,ARG,"POLYGON ((-58.86567 -34.68238, -58.88124 -34.6...",Buenos Aires Moreno,Buenos Aires Moreno
2,2023-0828-AUS,Cairns Queensland,AUS,Cairns Queensland,806,Queensland Cairns (R),95.0,24857.0,17.0,Australia,...,3000.0,Member State,2011.0,0.348500,6.728153,Australia,AUS,"MULTIPOLYGON (((145.69512 -16.73106, 145.69528...",Queensland Cairns (R),Queensland Cairns (R)
3,2023-0138-AUS,Doomadgee northern Queensland,AUS,Doomadgee northern Queensland,420,Queensland Doomadgee (S),90.0,24862.0,17.0,Australia,...,3000.0,Member State,2011.0,0.156206,2.406009,Australia,AUS,"MULTIPOLYGON (((138.58381 -17.87517, 138.58594...",Queensland Doomadgee (S),Queensland Doomadgee (S)
4,2023-0828-AUS,Hope Vale Queensland,AUS,Hope Vale Queensland,811,Queensland Hope Vale (S),95.0,24864.0,17.0,Australia,...,3000.0,Member State,2011.0,0.092905,2.955616,Australia,AUS,"POLYGON ((145.23842 -15.44147, 145.21484 -15.4...",Queensland Hope Vale (S),Queensland Hope Vale (S)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
668,2023-0719-ZWE,Mwenezi,ZWE,Mwenezi,1507,Mwenezi,100.0,10422.0,271.0,Zimbabwe,...,3000.0,Member State,1000.0,1.146266,6.310072,Republic of Zimbabwe,ZWE,"POLYGON ((30.94386 -22.11357, 30.93747 -22.110...",Mwenezi,Mwenezi
669,2023-0719-ZWE,Seke,ZWE,Seke,1508,Seke,100.0,2531.0,271.0,Zimbabwe,...,3000.0,Member State,1000.0,0.225164,3.924185,Republic of Zimbabwe,ZWE,"POLYGON ((30.76508 -18.50712, 30.76493 -18.504...",Seke,Seke
670,2023-0719-ZWE,Shamva,ZWE,Shamva,1504,Shamva,100.0,2509.0,271.0,Zimbabwe,...,3000.0,Member State,1000.0,0.228929,3.284518,Republic of Zimbabwe,ZWE,"POLYGON ((31.50866 -17.50133, 31.50852 -17.501...",Shamva,Shamva
671,2023-0719-ZWE,Wedza,ZWE,Wedza,1509,Hwedza,91.0,2526.0,271.0,Zimbabwe,...,3000.0,Member State,1000.0,0.219286,2.792234,Republic of Zimbabwe,ZWE,"POLYGON ((31.98308 -19.11863, 31.98306 -19.118...",Hwedza,Hwedza


### shape geodataframe to match GDIS database

In [59]:
####Shape all geodataframes and set same columns

events_adm1_clean = events_adm1[['DisNo.','iso3','official_name','ADM0_CODE','ADM0_NAME','ADM1_CODE','ADM1_NAME','location', 'geolocation','geometry']]
events_adm1_clean[['ADM2_CODE','ADM2_NAME']] = "None"
events_adm1_clean[['level']] = 1

events_adm2_clean = events_adm2[['DisNo.','iso3','official_name','ADM0_CODE','ADM0_NAME','ADM1_CODE','ADM1_NAME','ADM2_CODE','ADM2_NAME','location', 'geolocation','geometry']]
events_adm2_clean[['level']] = 2

events_adm1_loconly_clean = events_admin1_locations_only_valid.reset_index()[['DisNo.','iso3_df1','official_name','ADM0_CODE','ADM0_NAME','ADM1_CODE','ADM1_NAME','match_string','geometry']]
events_adm1_loconly_clean = events_adm1_loconly_clean.rename(columns={'iso3_df1':'iso3','match_string':'location'})
events_adm1_loconly_clean[['ADM2_CODE','ADM2_NAME','geolocation']] = "None"
events_adm1_loconly_clean[['level']] = 1

events_adm2_loconly_clean = events_admin2_locations_only_valid.reset_index()[['DisNo.','iso3_df1','official_name','ADM0_CODE','ADM0_NAME','ADM1_CODE','ADM1_NAME_df1','ADM2_CODE','ADM2_NAME','match_string','geometry']]
events_adm2_loconly_clean = events_adm2_loconly_clean.rename(columns={'iso3_df1':'iso3','match_string':'location'})
events_adm2_loconly_clean[['geolocation']] = "None"
events_adm2_loconly_clean[['level']] = 2



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = value


In [60]:
events_adm1_loconly_clean

Unnamed: 0,DisNo.,iso3,official_name,ADM0_CODE,ADM0_NAME,ADM1_CODE,ADM1_NAME,location,geometry,ADM2_CODE,ADM2_NAME,geolocation,level
0,2022-0264-AFG,AFG,Islamic Republic of Afghanistan,1.0,Afghanistan,272.0,Badakhshan,Badakhshan,"POLYGON ((69.98554 36.78234, 69.98748 36.78352...",,,,1
1,2022-0264-AFG,AFG,Islamic Republic of Afghanistan,1.0,Afghanistan,273.0,Badghis,Badghis,"POLYGON ((62.66683 34.99415, 62.66690 34.99676...",,,,1
2,2022-0264-AFG,AFG,Islamic Republic of Afghanistan,1.0,Afghanistan,274.0,Baghlan,Baghlan,"POLYGON ((68.00412 35.13949, 68.00416 35.14001...",,,,1
3,2022-0264-AFG,AFG,Islamic Republic of Afghanistan,1.0,Afghanistan,276.0,Bamyan,Bamyan,"POLYGON ((66.29666 35.13259, 66.29699 35.13380...",,,,1
4,2022-0514-AFG,AFG,Islamic Republic of Afghanistan,1.0,Afghanistan,277.0,Farah,Farah,"POLYGON ((60.58315 33.12982, 60.58746 33.13812...",,,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
891,2022-0569-ZWE,ZWE,Republic of Zimbabwe,271.0,Zimbabwe,3437.0,Manicaland,Manicaland,"POLYGON ((31.18835 -19.23481, 31.22039 -19.231...",,,,1
892,2023-0719-ZWE,ZWE,Republic of Zimbabwe,271.0,Zimbabwe,3441.0,Masvingo,Masvingo,"POLYGON ((29.73057 -21.02383, 29.73119 -21.024...",,,,1
893,2022-0569-ZWE,ZWE,Republic of Zimbabwe,271.0,Zimbabwe,3441.0,Masvingo,Masvingo,"POLYGON ((29.73057 -21.02383, 29.73119 -21.024...",,,,1
894,2023-0719-ZWE,ZWE,Republic of Zimbabwe,271.0,Zimbabwe,3443.0,Matabeleland South,Matabeleland South,"POLYGON ((26.69159 -19.89356, 26.69162 -19.893...",,,,1


In [66]:
events_adm2_loconly_clean

Unnamed: 0,DisNo.,iso3,official_name,ADM0_CODE,ADM0_NAME,ADM1_CODE,ADM1_NAME_df1,ADM2_CODE,ADM2_NAME,location,geometry,geolocation,level
0,2023-0831-ARG,ARG,Argentine Republic,12.0,Argentina,429.0,Bahía Blanca Buenos Aires,4389.0,Bahia Blanca,Buenos Aires Bahia Blanca,"MULTIPOLYGON (((-62.42643 -38.75657, -62.42643...",,2
1,2023-0831-ARG,ARG,Argentine Republic,12.0,Argentina,429.0,Moreno Cities Buenos Aires,4462.0,Moreno,Buenos Aires Moreno,"POLYGON ((-58.86567 -34.68238, -58.88124 -34.6...",,2
2,2023-0828-AUS,AUS,Australia,17.0,Australia,473.0,Cairns Queensland,154657.0,Cairns (R),Queensland Cairns (R),"MULTIPOLYGON (((145.69512 -16.73106, 145.69528...",,2
3,2023-0138-AUS,AUS,Australia,17.0,Australia,473.0,Doomadgee northern Queensland,154664.0,Doomadgee (S),Queensland Doomadgee (S),"MULTIPOLYGON (((138.58381 -17.87517, 138.58594...",,2
4,2023-0828-AUS,AUS,Australia,17.0,Australia,473.0,Hope Vale Queensland,154671.0,Hope Vale (S),Queensland Hope Vale (S),"POLYGON ((145.23842 -15.44147, 145.21484 -15.4...",,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
510,2023-0719-ZWE,ZWE,Republic of Zimbabwe,271.0,Zimbabwe,3441.0,Mwenezi,33086.0,Mwenezi,Mwenezi,"POLYGON ((30.94386 -22.11357, 30.93747 -22.110...",,2
511,2023-0719-ZWE,ZWE,Republic of Zimbabwe,271.0,Zimbabwe,69550.0,Seke,33073.0,Seke,Seke,"POLYGON ((30.76508 -18.50712, 30.76493 -18.504...",,2
512,2023-0719-ZWE,ZWE,Republic of Zimbabwe,271.0,Zimbabwe,3438.0,Shamva,33066.0,Shamva,Shamva,"POLYGON ((31.50866 -17.50133, 31.50852 -17.501...",,2
513,2023-0719-ZWE,ZWE,Republic of Zimbabwe,271.0,Zimbabwe,69550.0,Wedza,33068.0,Hwedza,Hwedza,"POLYGON ((31.98308 -19.11863, 31.98306 -19.118...",,2


In [None]:
new_df

In [62]:
freq_emdata = new_df['DisNo.'].value_counts()

# Count the frequency of each DisNo. in locations_only_events_admin1
freq_admin1 = events_adm1_loconly_clean['DisNo.'].value_counts()

# Count the frequency of each DisNo. in vents_adm2_loconly_clean
freq_admin2 = events_adm2_loconly_clean['DisNo.'].value_counts()

# Combine the frequencies into a single DataFrame
combined_freq = pd.DataFrame({
    'Emdata': freq_emdata, 
    'Admin1': freq_admin1, 
    'Admin2': freq_admin2
}).fillna(0)

# Calculate the ratio for each DisNo.
combined_freq['Ratio_Admin1'] = combined_freq['Admin1'] / combined_freq['Emdata']
combined_freq['Ratio_Admin2'] = combined_freq['Admin2'] / combined_freq['Emdata']
combined_freq['total_ratio'] =  combined_freq['Ratio_Admin1'] + combined_freq['Ratio_Admin2'] 
# Display the result
print(combined_freq)


               Emdata  Admin1  Admin2  Ratio_Admin1  Ratio_Admin2  total_ratio
2020-9609-SOM      17     3.0     1.0      0.176471      0.058824     0.235294
2021-0012-BGD       1     0.0     1.0      0.000000      1.000000     1.000000
2021-0029-IND       1     0.0     1.0      0.000000      1.000000     1.000000
2021-0030-ECU       1     1.0     0.0      1.000000      0.000000     1.000000
2021-0032-IDN       1     0.0     0.0      0.000000      0.000000     0.000000
...               ...     ...     ...           ...           ...          ...
2023-9589-URY       5     5.0     0.0      1.000000      0.000000     1.000000
2023-9651-BRA       1     1.0     0.0      1.000000      0.000000     1.000000
2023-9706-BOL       5     5.0     0.0      1.000000      0.000000     1.000000
2023-9868-USA      10    10.0     0.0      1.000000      0.000000     1.000000
2023-9873-FSM       4     0.0     0.0      0.000000      0.000000     0.000000

[835 rows x 6 columns]


In [67]:
disno_to_remove = combined_freq[combined_freq['total_ratio'] < 0.6].index

# Filter out these DisNo. from events_adm1_loconly_clean and events_adm2_loconly_clean
events_adm1_loconly_clean = events_adm1_loconly_clean[~events_adm1_loconly_clean['DisNo.'].isin(disno_to_remove)].reset_index(drop=True)
events_adm2_loconly_clean = events_adm2_loconly_clean[~events_adm2_loconly_clean['DisNo.'].isin(disno_to_remove)].reset_index(drop=True)

# Display the result
print("Filtered events_adm1_loconly_clean:")
print(events_adm1_loconly_clean)
print("\nFiltered events_adm2_loconly_clean:")
print(events_adm2_loconly_clean)

Filtered events_adm1_loconly_clean:
            DisNo. iso3                    official_name  ADM0_CODE  \
0    2022-0264-AFG  AFG  Islamic Republic of Afghanistan        1.0   
1    2022-0264-AFG  AFG  Islamic Republic of Afghanistan        1.0   
2    2022-0264-AFG  AFG  Islamic Republic of Afghanistan        1.0   
3    2022-0264-AFG  AFG  Islamic Republic of Afghanistan        1.0   
4    2022-0514-AFG  AFG  Islamic Republic of Afghanistan        1.0   
..             ...  ...                              ...        ...   
757  2023-0619-ZAF  ZAF         Republic of South Africa      227.0   
758  2023-0719-ZWE  ZWE             Republic of Zimbabwe      271.0   
759  2023-0719-ZWE  ZWE             Republic of Zimbabwe      271.0   
760  2023-0719-ZWE  ZWE             Republic of Zimbabwe      271.0   
761  2023-0719-ZWE  ZWE             Republic of Zimbabwe      271.0   

        ADM0_NAME  ADM1_CODE           ADM1_NAME            location  \
0     Afghanistan      272.0          B

In [68]:
geolocated_events = pd.concat([events_adm1_clean, events_adm2_clean,events_adm1_loconly_clean,events_adm2_loconly_clean])
geolocated_events['disaster_number_country'] = geolocated_events['DisNo.']
geolocated_events = geolocated_events.set_index('DisNo.').sort_index()

In [69]:
geolocated_events[['ADM0_CODE','ADM1_CODE']] = geolocated_events[['ADM0_CODE','ADM1_CODE']].astype(int)

In [70]:
gaul1_subset = gaul1[['ADM1_CODE','ADM1_NAME']]

In [71]:
def find_location(x):
    return gaul1_subset[gaul1_subset.ADM1_CODE == x]['ADM1_NAME'].values[0]

In [77]:
ADM1_NAMES_data = geolocated_events['ADM1_CODE'].apply(lambda x: find_location(x))

In [78]:
ADM1_NAMES_data

DisNo.
2020-0051-IRL         Laois
2020-0051-IRL         Sligo
2021-0001-BOL          Beni
2021-0001-BOL    Cochabamba
2021-0001-BOL        La Paz
                    ...    
2023-9868-USA      Oklahoma
2023-9868-USA      Nebraska
2023-9868-USA       Montana
2023-9868-USA    Washington
2023-9868-USA      Missouri
Name: ADM1_CODE, Length: 4508, dtype: object

In [79]:
geolocated_events['ADM1_NAME'] = ADM1_NAMES_data

In [80]:
geolocated_events = geolocated_events.drop(columns='ADM1_NAME_df1')

In [81]:
len(np.unique(geolocated_events.index))

1155

In [82]:
print(geolocated_events)

              iso3                   official_name  ADM0_CODE  \
DisNo.                                                          
2020-0051-IRL  IRL                         Ireland        119   
2020-0051-IRL  IRL                         Ireland        119   
2021-0001-BOL  BOL  Plurinational State of Bolivia         33   
2021-0001-BOL  BOL  Plurinational State of Bolivia         33   
2021-0001-BOL  BOL  Plurinational State of Bolivia         33   
...            ...                             ...        ...   
2023-9868-USA  USA        United States of America        259   
2023-9868-USA  USA        United States of America        259   
2023-9868-USA  USA        United States of America        259   
2023-9868-USA  USA        United States of America        259   
2023-9868-USA  USA        United States of America        259   

                              ADM0_NAME  ADM1_CODE   ADM1_NAME  \
DisNo.                                                           
2020-0051-IRL         

In [75]:
geolocated_events.columns

Index(['iso3', 'official_name', 'ADM0_CODE', 'ADM0_NAME', 'ADM1_CODE',
       'ADM1_NAME', 'location', 'geolocation', 'geometry', 'ADM2_CODE',
       'ADM2_NAME', 'level', 'ADM1_NAME_df1', 'disaster_number_country'],
      dtype='object')

In [83]:
geolocated_events = gpd.GeoDataFrame(geolocated_events)
geolocated_events.to_file('geolocated_events_clean_2021-2023_final.gpkg', driver="GPKG")

In [170]:
!ls -lh



total 2.1G
-rw-rw-r-- 1 dchinthaparthy dchinthaparthy  573K May 18 16:12 02_geolocate_events-2.ipynb
-rw-rw-r-- 1 dchinthaparthy dchinthaparthy  593K Apr 24 23:12 02_geolocate_events-Copy1.ipynb
-rw-rw-r-- 1 dchinthaparthy dchinthaparthy  663K May 19 18:29 02_geolocate_events-mod.ipynb
-rw-rw-r-- 1 dchinthaparthy dchinthaparthy  678K May 19 15:05 02_geolocate_events.ipynb
-rw-rw-r-- 1 dchinthaparthy dchinthaparthy   442 Feb 28 17:14 README.md
-rw-rw-r-- 1 dchinthaparthy dchinthaparthy   983 Feb 29 15:28 Untitled.ipynb
-rw-rw-r-- 1 dchinthaparthy dchinthaparthy  2.0K Apr 24 22:54 Untitled1.ipynb
-rw-rw-r-- 1 dchinthaparthy dchinthaparthy  3.9K May  5 16:35 agg_data.csv
-rw-rw-r-- 1 dchinthaparthy dchinthaparthy   930 Feb 28 17:14 aggreg_country_script.py
-rw-rw-r-- 1 dchinthaparthy dchinthaparthy   21K May  5 20:50 aggregation_functions.py
-rw-rw-r-- 1 dchinthaparthy dchinthaparthy  866K May  4 16:37 code_analysis.ipynb
-rw-rw-r-- 1 dchinthaparthy dchinthaparthy  3.0K May  4 16:43 count

In [95]:
import os

# Specify the path to the folder you want to check
folder_path = '/net/projects/xaida/intermediate_data/shk_data/'

# Try to create a temporary file in the folder
temp_file_path = os.path.join(folder_path, 'temp_file.txt')
try:
    # Create the temporary file
    with open(temp_file_path, 'w'):
        pass  # Do nothing, just create the file
    print("You have access to the folder.")
except Exception as e:
    print(f"Error: {e}")
    print("You do not have access to the folder or there was an error creating the file.")
finally:
    # Remove the temporary file
    if os.path.exists(temp_file_path):
        os.remove(temp_file_path)


Error: [Errno 13] Permission denied: '/net/projects/xaida/intermediate_data/shk_data/temp_file.txt'
You do not have access to the folder or there was an error creating the file.


In [1]:
cp geolocated_events_clean.gpkg /net/scratch/XAIDA/share_folder


In [86]:
cp geolocated_events_clean_2021-2023_final.gpkg /net/scratch/XAIDA/share_folder


In [90]:
data = gpd.read_file('geolocated_events_clean.gpkg')

In [91]:
data[data.iso3 == 'BOL']

Unnamed: 0,DisNo.,iso3,official_name,ADM0_CODE,ADM0_NAME,ADM1_CODE,ADM1_NAME,location,geolocation,ADM2_CODE,ADM2_NAME,level,disaster_number_country,geometry
5,2021-0001-BOL,BOL,Plurinational State of Bolivia,33,Bolivia,40443,Beni,Cochabamba,Tarija,,,1,2021-0001-BOL,"POLYGON ((-67.56518 -14.04564, -67.56469 -14.0..."
6,2021-0001-BOL,BOL,Plurinational State of Bolivia,33,Bolivia,40445,Cochabamba,Tarija,Cochabamba,,,1,2021-0001-BOL,"POLYGON ((-67.00473 -16.95181, -67.00468 -16.9..."
7,2021-0001-BOL,BOL,Plurinational State of Bolivia,33,Bolivia,40446,La Paz,Beni,Beni,,,1,2021-0001-BOL,"POLYGON ((-69.64076 -17.28202, -69.63731 -17.2..."
8,2021-0001-BOL,BOL,Plurinational State of Bolivia,33,Bolivia,40450,Tarija,La Paz Departments,La Paz,,,1,2021-0001-BOL,"POLYGON ((-65.41817 -21.89328, -65.41816 -21.8..."
62,2021-0027-BOL,BOL,Plurinational State of Bolivia,33,Bolivia,40444,Chuquisaca,Sucre City (Chuquisaca Department),. Oropeza (Adm2),40458.0,Oropeza,2,2021-0027-BOL,"POLYGON ((-65.69396 -18.35985, -65.69388 -18.3..."
74,2021-0035-BOL,BOL,Plurinational State of Bolivia,33,Bolivia,40444,Chuquisaca,Sucre City (Chuquisaca Department),Oropeza (Adm2),40458.0,Oropeza,2,2021-0035-BOL,"POLYGON ((-65.69396 -18.35985, -65.69388 -18.3..."
75,2021-0035-BOL,BOL,Plurinational State of Bolivia,33,Bolivia,40445,Cochabamba,Sucre City (Chuquisaca Department),Cochabamba,,,1,2021-0035-BOL,"POLYGON ((-67.00473 -16.95181, -67.00468 -16.9..."
76,2021-0035-BOL,BOL,Plurinational State of Bolivia,33,Bolivia,40446,La Paz,Paz Department,Potosi,,,1,2021-0035-BOL,"POLYGON ((-69.64076 -17.28202, -69.63731 -17.2..."
77,2021-0035-BOL,BOL,Plurinational State of Bolivia,33,Bolivia,40448,Potosi,Potosì Department,La Paz,,,1,2021-0035-BOL,"POLYGON ((-68.78247 -20.08777, -68.78247 -20.0..."
183,2021-0138-BOL,BOL,Plurinational State of Bolivia,33,Bolivia,40445,Cochabamba,Cochabamba,,,,1,2021-0138-BOL,"POLYGON ((-67.00473 -16.95181, -67.00468 -16.9..."


In [93]:
data1 = gpd.read_file('geolocated_events_clean_2021-2023.gpkg')

In [94]:
data1[data1.iso3 == 'BOL']

Unnamed: 0,DisNo.,iso3,official_name,ADM0_CODE,ADM0_NAME,ADM1_CODE,ADM1_NAME,location,geolocation,ADM2_CODE,ADM2_NAME,level,disaster_number_country,geometry
5,2021-0001-BOL,BOL,Plurinational State of Bolivia,33,Bolivia,40443,Beni,Beni,Beni,,,1,2021-0001-BOL,"POLYGON ((-67.56518 -14.04564, -67.56469 -14.0..."
6,2021-0001-BOL,BOL,Plurinational State of Bolivia,33,Bolivia,40445,Cochabamba,Cochabamba,Cochabamba,,,1,2021-0001-BOL,"POLYGON ((-67.00473 -16.95181, -67.00468 -16.9..."
7,2021-0001-BOL,BOL,Plurinational State of Bolivia,33,Bolivia,40446,La Paz,La Paz Departments,La Paz,,,1,2021-0001-BOL,"POLYGON ((-69.64076 -17.28202, -69.63731 -17.2..."
8,2021-0001-BOL,BOL,Plurinational State of Bolivia,33,Bolivia,40450,Tarija,Tarija,Tarija,,,1,2021-0001-BOL,"POLYGON ((-65.41817 -21.89328, -65.41816 -21.8..."
77,2021-0027-BOL,BOL,Plurinational State of Bolivia,33,Bolivia,40444,Chuquisaca,Sucre City (Chuquisaca Department),Oropeza,40458.0,Oropeza,2,2021-0027-BOL,"POLYGON ((-65.69396 -18.35985, -65.69388 -18.3..."
89,2021-0035-BOL,BOL,Plurinational State of Bolivia,33,Bolivia,40444,Chuquisaca,Cochabamba Department,Oropeza,40458.0,Oropeza,2,2021-0035-BOL,"POLYGON ((-65.69396 -18.35985, -65.69388 -18.3..."
90,2021-0035-BOL,BOL,Plurinational State of Bolivia,33,Bolivia,40445,Cochabamba,Cochabamba Department,Cochabamba,,,1,2021-0035-BOL,"POLYGON ((-67.00473 -16.95181, -67.00468 -16.9..."
91,2021-0035-BOL,BOL,Plurinational State of Bolivia,33,Bolivia,40446,La Paz,Paz Department,La Paz,,,1,2021-0035-BOL,"POLYGON ((-69.64076 -17.28202, -69.63731 -17.2..."
92,2021-0035-BOL,BOL,Plurinational State of Bolivia,33,Bolivia,40448,Potosi,Potosì Department,Potosi,,,1,2021-0035-BOL,"POLYGON ((-68.78247 -20.08777, -68.78247 -20.0..."
212,2021-0109-BOL,BOL,Plurinational State of Bolivia,33,Bolivia,40449,Santa Cruz,Beni Departments,Santa Cruz,,,1,2021-0109-BOL,"POLYGON ((-64.83822 -16.73392, -64.83808 -16.7..."
