In [1]:
import pandas as pd 
import os, glob
import datetime
import fiona
from shapely.geometry import Point, shape, Polygon
import geopandas as gpd
import shapely.speedups
import tqdm

  shapely_geos_version, geos_capi_version_string


# US-GS dataset

We use the [US-GS dataset](https://www.usgs.gov/programs/earthquake-hazards/earthquakes) to acquire overall information about earthquakes. In order to determine a country for this dataset, which only contains coordinates of events, we merge it to another dataset which contains country codes and polygons for each country. The data only contains earthquakes registered past 1959, in order to match the dates of the other datasets used in this project.

In [2]:
# Dataset with countries geometry and code

countries_df = gpd.read_file("ne_10m_admin_0_countries/ne_10m_admin_0_countries.shp")

In [3]:
countries_df=countries_df.rename(columns={'SOV_A3':'country code'})

In [4]:
countries_df.head()

Unnamed: 0,featurecla,scalerank,LABELRANK,SOVEREIGNT,country code,ADM0_DIF,LEVEL,TYPE,ADMIN,ADM0_A3,...,FCLASS_TR,FCLASS_ID,FCLASS_PL,FCLASS_GR,FCLASS_IT,FCLASS_NL,FCLASS_SE,FCLASS_BD,FCLASS_UA,geometry
0,Admin-0 country,0,2,Indonesia,IDN,0,2,Sovereign country,Indonesia,IDN,...,,,,,,,,,,"MULTIPOLYGON (((117.70361 4.16341, 117.70361 4..."
1,Admin-0 country,0,3,Malaysia,MYS,0,2,Sovereign country,Malaysia,MYS,...,,,,,,,,,,"MULTIPOLYGON (((117.70361 4.16341, 117.69711 4..."
2,Admin-0 country,0,2,Chile,CHL,0,2,Sovereign country,Chile,CHL,...,,,,,,,,,,"MULTIPOLYGON (((-69.51009 -17.50659, -69.50611..."
3,Admin-0 country,0,3,Bolivia,BOL,0,2,Sovereign country,Bolivia,BOL,...,,,,,,,,,,"POLYGON ((-69.51009 -17.50659, -69.51009 -17.5..."
4,Admin-0 country,0,2,Peru,PER,0,2,Sovereign country,Peru,PER,...,,,,,,,,,,"MULTIPOLYGON (((-69.51009 -17.50659, -69.63832..."


In [5]:
df_countries_usgs=pd.read_csv('merged_countries.csv',sep=',')

In [6]:

df_geo_usgs= gpd.GeoDataFrame(df_countries_usgs, geometry=gpd.points_from_xy(df_countries_usgs.longitude, df_countries_usgs.latitude))

In [7]:
df_geo_usgs.head()

Unnamed: 0.1,Unnamed: 0,geometry,souvereign,admin,index_right,time,latitude,longitude,depth,mag,id,place,Date
0,0,POINT (-74.46600 -8.49800),Peru,Peru,65172,2010-01-25 00:00:00+00:00,-8.498,-74.466,146.7,5.9,usp000h6md,"13 km SE of San Fernando, Peru",2010-01-25
1,13753,POINT (-74.46600 -8.49800),Peru,Peru,65172,2010-01-25 00:00:00+00:00,-8.498,-74.466,146.7,5.9,usp000h6md,"13 km SE of San Fernando, Peru",2010-01-25
2,0,POINT (-74.46600 -8.49800),Peru,Peru,1,2010-01-25 00:00:00+00:00,-8.498,-74.466,146.7,5.9,usp000h6md,"13 km SE of San Fernando, Peru",2010-01-25
3,13753,POINT (-74.46600 -8.49800),Peru,Peru,1,2010-01-25 00:00:00+00:00,-8.498,-74.466,146.7,5.9,usp000h6md,"13 km SE of San Fernando, Peru",2010-01-25
4,1,POINT (110.73600 35.51500),China,China,12,2010-01-24 00:00:00+00:00,35.515,110.736,28.7,5.0,usp000h6j0,"28 km ENE of Hancheng, China",2010-01-24


In [8]:
df_geo_usgs=df_geo_usgs.drop(columns=['Unnamed: 0', 'index_right'])

drop_countries = list(set(countries_df.columns)-set(['SOVEREIGNT','country code', 'geometry']))

countries_df=countries_df.drop(columns=drop_countries)


In [9]:
#df_country_codes = gpd.sjoin(usgs_country_code, df_geo_usgs, how="inner")
df_usgs_codes = gpd.sjoin(df_geo_usgs, countries_df, how="inner")

Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: None
Right CRS: EPSG:4326

  


In [10]:
df_usgs_codes.head()

Unnamed: 0,geometry,souvereign,admin,time,latitude,longitude,depth,mag,id,place,Date,index_right,SOVEREIGNT,country code
0,POINT (-74.46600 -8.49800),Peru,Peru,2010-01-25 00:00:00+00:00,-8.498,-74.466,146.7,5.9,usp000h6md,"13 km SE of San Fernando, Peru",2010-01-25,4,Peru,PER
1,POINT (-74.46600 -8.49800),Peru,Peru,2010-01-25 00:00:00+00:00,-8.498,-74.466,146.7,5.9,usp000h6md,"13 km SE of San Fernando, Peru",2010-01-25,4,Peru,PER
2,POINT (-74.46600 -8.49800),Peru,Peru,2010-01-25 00:00:00+00:00,-8.498,-74.466,146.7,5.9,usp000h6md,"13 km SE of San Fernando, Peru",2010-01-25,4,Peru,PER
3,POINT (-74.46600 -8.49800),Peru,Peru,2010-01-25 00:00:00+00:00,-8.498,-74.466,146.7,5.9,usp000h6md,"13 km SE of San Fernando, Peru",2010-01-25,4,Peru,PER
97,POINT (-77.71800 -8.80200),Peru,Peru,2010-01-03 00:00:00+00:00,-8.802,-77.718,116.8,5.7,usp000h5na,"16 km E of Huallanca, Peru",2010-01-03,4,Peru,PER


In [11]:
#drop earthquake duplicates US-GS dataset
print(len(df_usgs_codes))
df_usgs_codes.drop_duplicates(inplace=True)
print(len(df_usgs_codes))

17090
17006


In [22]:
df_usgs_codes =df_usgs_codes.rename(columns={'SOVEREIGNT':'country', 'Date':'date'})
df_usgs_clean=df_usgs_codes.drop(columns=['index_right','geometry', 'souvereign', 'admin', 'latitude', 'longitude', 'id', 'place', 'index_right', 'time'])

df_usgs_clean.head()

Unnamed: 0,depth,mag,date,country,country code
0,146.7,5.9,2010-01-25,Peru,PER
97,116.8,5.7,2010-01-03,Peru,PER
110,59.1,5.6,2009-12-24,Peru,PER
175,62.2,5.0,2009-10-15,Peru,PER
212,210.2,5.8,2009-09-05,Peru,PER


In [13]:
df_usgs_clean.to_csv(r'df_usgs_clean.csv')

# EMDAT Dataset

Now we merge the data to the [EMDAT dataset](https://www.emdat.be/) for international disasters which was already cleaned to only have Earthquake data. 


In [14]:
df_emdat=pd.read_csv('emdat_date.csv',sep=',')
df_emdat.head(100)

Unnamed: 0.1,Unnamed: 0,Dis No,Year,Month,Day,Disaster Type,Disaster Subtype,Country,Country Code,Region,...,No Homeless,Total Affected,Reconstruction Costs ('000 US$),"Reconstruction Costs, Adjusted ('000 US$)",Insured Damages ('000 US$),"Insured Damages, Adjusted ('000 US$)",Total Damages ('000 US$),"Total Damages, Adjusted ('000 US$)",CPI,myDt
0,63,1960-0013-CHL,1960,5.0,22.0,Earthquake,Tsunami,Chile,CHL,South America,...,,2003000.0,,,,,550000.0,4813056.0,11.427251,1960-05-22
1,64,1960-0033-DZA,1960,2.0,21.0,Earthquake,Ground movement,Algeria,DZA,Northern Africa,...,1250.0,1250.0,,,,,,,11.427251,1960-02-21
2,65,1961-0016-ETH,1961,6.0,2.0,Earthquake,Ground movement,Ethiopia,ETH,Eastern Africa,...,,,,,,,,,11.549601,1961-06-02
3,66,1962-0044-ALB,1962,3.0,18.0,Earthquake,Ground movement,Albania,ALB,Southern Europe,...,,154.0,,,,,,,11.688060,1962-03-18
4,67,1962-0030-COL,1962,7.0,30.0,Earthquake,Ground movement,Colombia,COL,South America,...,,300.0,,,,,,,11.688060,1962-07-30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,265,1971-0034-TUR,1971,5.0,22.0,Earthquake,Ground movement,Turkey,TUR,Western Asia,...,27465.0,88665.0,,,,,5000.0,31959.0,15.645257,1971-05-22
96,266,1971-0016-USA,1971,2.0,9.0,Earthquake,Ground movement,United States of America (the),USA,Northern America,...,,2000.0,,,35000.0,223710.0,553000.0,3534617.0,15.645257,1971-02-09
97,267,1972-0027-IRN,1972,4.0,10.0,Earthquake,Ground movement,Iran (Islamic Republic of),IRN,Southern Asia,...,,23458.0,,,,,1000.0,6189.0,16.157212,1972-04-10
98,268,1972-0084-ITA,1972,2.0,4.0,Earthquake,Ground movement,Italy,ITA,Southern Europe,...,,450.0,,,,,,,16.157212,1972-02-04


In [15]:
df_emdat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1235 entries, 0 to 1234
Data columns (total 35 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   Unnamed: 0                                 1235 non-null   int64  
 1   Dis No                                     1235 non-null   object 
 2   Year                                       1235 non-null   int64  
 3   Month                                      1235 non-null   float64
 4   Day                                        1235 non-null   float64
 5   Disaster Type                              1235 non-null   object 
 6   Disaster Subtype                           1233 non-null   object 
 7   Country                                    1235 non-null   object 
 8   Country Code                               1235 non-null   object 
 9   Region                                     1235 non-null   object 
 10  Continent               

In [16]:
# copy dataset for preprocessing

df_emdat_processing = df_emdat.copy()

In [17]:
df_emdat_processing.columns

Index(['Unnamed: 0', 'Dis No', 'Year', 'Month', 'Day', 'Disaster Type',
       'Disaster Subtype', 'Country', 'Country Code', 'Region', 'Continent',
       'Location', 'Associated Dis', 'Associated Dis2', 'OFDA Response',
       'Appeal', 'Declaration', 'Aid Contribution', 'Dis Mag Value',
       'Latitude', 'Longitude', 'Local Time', 'Total Deaths', 'No Injured',
       'No Affected', 'No Homeless', 'Total Affected',
       'Reconstruction Costs ('000 US$)',
       'Reconstruction Costs, Adjusted ('000 US$)',
       'Insured Damages ('000 US$)', 'Insured Damages, Adjusted ('000 US$)',
       'Total Damages ('000 US$)', 'Total Damages, Adjusted ('000 US$)', 'CPI',
       'myDt'],
      dtype='object')

In [18]:
drop_emdat = list(set(df_emdat_processing.columns)-
                  set(['Year','Country', 'Country Code', 'Continent', 
                       'Total Affected','CPI','myDt']))

df_emdat_processing = df_emdat_processing.drop(columns=drop_emdat)
df_emdat_processing.head()

Unnamed: 0,Year,Country,Country Code,Continent,Total Affected,CPI,myDt
0,1960,Chile,CHL,Americas,2003000.0,11.427251,1960-05-22
1,1960,Algeria,DZA,Africa,1250.0,11.427251,1960-02-21
2,1961,Ethiopia,ETH,Africa,,11.549601,1961-06-02
3,1962,Albania,ALB,Europe,154.0,11.68806,1962-03-18
4,1962,Colombia,COL,Americas,300.0,11.68806,1962-07-30


In [19]:
# Dropping missing vaues for Total affected

print(len(df_emdat_processing))
df_emdat_processing.dropna(subset=['Total Affected'], inplace=True)
print(len(df_emdat_processing))

1235
1136


In [25]:
df_emdat_processing =df_emdat_processing.rename(columns={'myDt':'date', 'Country Code':'country code'})

df_emdat_clean = df_emdat_processing
df_emdat_clean.head()

Unnamed: 0,Year,Country,country code,Continent,Total Affected,CPI,date
0,1960,Chile,CHL,Americas,2003000.0,11.427251,1960-05-22
1,1960,Algeria,DZA,Africa,1250.0,11.427251,1960-02-21
3,1962,Albania,ALB,Europe,154.0,11.68806,1962-03-18
4,1962,Colombia,COL,Americas,300.0,11.68806,1962-07-30
5,1964,Azores Islands,AZO,Europe,1000.0,11.984281,1964-02-18


In [31]:
df_merged_emdat_usgs= df_emdat_clean.merge(df_usgs_clean, how= 'inner', on=['date', 'country code'])


In [36]:
df_merged_emdat_usgs.head()

Unnamed: 0,Year,Country,country code,Continent,Total Affected,CPI,date,depth,mag,country
0,1960,Chile,CHL,Americas,2003000.0,11.427251,1960-05-22,25.0,9.5,Chile
1,1960,Chile,CHL,Americas,2003000.0,11.427251,1960-05-22,25.0,7.8,Chile
2,1960,Chile,CHL,Americas,2003000.0,11.427251,1960-05-22,25.0,6.8,Chile
3,1960,Chile,CHL,Americas,2003000.0,11.427251,1960-05-22,25.0,7.1,Chile
4,1960,Chile,CHL,Americas,2003000.0,11.427251,1960-05-22,25.0,5.8,Chile


In [39]:
df_merged_emdat_usgs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1113 entries, 0 to 1112
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Year            1113 non-null   int64  
 1   Country         1113 non-null   object 
 2   country code    1113 non-null   object 
 3   Continent       1113 non-null   object 
 4   Total Affected  1113 non-null   float64
 5   CPI             1113 non-null   float64
 6   date            1113 non-null   object 
 7   depth           1113 non-null   float64
 8   mag             1113 non-null   float64
 9   country         1113 non-null   object 
dtypes: float64(4), int64(1), object(5)
memory usage: 95.6+ KB


In [40]:
df_usgs_clean.to_csv('merged_usgs_emdat.csv')

# World Bank Indicators

Using the [Word Bank Indicators (WBI) dataset](https://databank.worldbank.org/source/world-development-indicators), we explore different socioeconomic aspects of countries. For the sake of having only a prototype, we explore only a few indicators which can refer to more general aspects of the country infrastructure.

In [43]:
countries = list(df_usgs_clean.country.unique())
print(countries)

['Peru', 'China', 'Costa Rica', 'Afghanistan', 'Bolivia', 'Panama', 'Greece', 'New Zealand', 'Haiti', 'Mexico', 'Argentina', 'Kyrgyzstan', 'Iran', 'Venezuela', 'Philippines', 'Japan', 'Indonesia', 'Guatemala', 'Papua New Guinea', 'Solomon Islands', 'Tajikistan', 'Bhutan', 'Chile', 'Myanmar', 'Ethiopia', 'Malawi', 'Colombia', 'Russia', 'Democratic Republic of the Congo', 'Taiwan', 'Ecuador', 'Vanuatu', 'United States of America', 'United Republic of Tanzania', 'India', 'Georgia', 'Albania', 'Namibia', 'Turkey', 'Pakistan', 'Iraq', 'Honduras', 'Kazakhstan', 'Brazil', 'Turkmenistan', 'North Macedonia', 'Saudi Arabia', 'Uzbekistan', 'Romania', 'Italy', 'Canada', 'South Africa', 'Dominican Republic', 'Nepal', 'East Timor', 'Antarctica', 'Iceland', 'Malaysia', 'Lebanon', 'Rwanda', 'Mozambique', 'Mongolia', 'Nicaragua', 'Bangladesh', 'France', 'Azerbaijan', 'Laos', 'El Salvador', 'Zambia', 'Uganda', 'Algeria', 'Bosnia and Herzegovina', 'Jamaica', 'Dominica', 'Morocco', 'Trinidad and Tobago', 

In [None]:
tbc