# ETL of secondary datasets

# "Volcano Eruptions"

In [2]:
import numpy as np
import pandas as pd

In [20]:
# Load the raw eruptions table from the input folder.
# Forward slashes keep the relative path portable (they also work on Windows)
# and avoid backslash sequences such as "\I" and "\e", which Python treats as
# invalid escape sequences and warns about.
erup = pd.read_csv('Datasets/Input/eruptions.csv', sep=',')

In [21]:
# Display the freshly loaded eruptions frame (pandas truncates the middle rows).
erup

Unnamed: 0,volcano_number,volcano_name,eruption_number,eruption_category,area_of_activity,vei,start_year,start_month,start_day,evidence_method_dating,end_year,end_month,end_day,latitude,longitude
0,266030,Soputan,22354,Confirmed Eruption,,,2020.0,3.0,23.0,Historical Observations,2020.0,4.0,2.0,1.112,124.737
1,343100,San Miguel,22355,Confirmed Eruption,,,2020.0,2.0,22.0,Historical Observations,2020.0,2.0,22.0,13.434,-88.269
2,233020,"Fournaise, Piton de la",22343,Confirmed Eruption,,,2020.0,2.0,10.0,Historical Observations,2020.0,4.0,6.0,-21.244,55.708
3,345020,Rincon de la Vieja,22346,Confirmed Eruption,,,2020.0,1.0,31.0,Historical Observations,2020.0,4.0,17.0,10.830,-85.324
4,353010,Fernandina,22347,Confirmed Eruption,,,2020.0,1.0,12.0,Historical Observations,2020.0,1.0,12.0,-0.370,-91.550
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11173,324020,Craters of the Moon,21101,Confirmed Eruption,Sunset cone,0.0,-10060.0,,,Radiocarbon (uncorrected),,,,43.420,-113.500
11174,222161,Igwisi Hills,22141,Confirmed Eruption,NE Volcano,1.0,-10450.0,,,Surface Exposure,,,,-4.889,31.933
11175,357121,Quetrupillan,22351,Confirmed Eruption,,3.0,-10658.0,,,Radiocarbon (corrected),,,,-39.496,-71.722
11176,357121,Quetrupillan,22352,Confirmed Eruption,,3.0,-11345.0,,,Radiocarbon (corrected),,,,-39.496,-71.722


In [22]:
# Count how many rows fall into each value of 'eruption_category'.
erup['eruption_category'].value_counts()

Confirmed Eruption      9900
Uncertain Eruption      1112
Discredited Eruption     166
Name: eruption_category, dtype: int64

In [23]:
# Print column dtypes and non-null counts to see which date parts are missing.
erup.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11178 entries, 0 to 11177
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   volcano_number          11178 non-null  int64  
 1   volcano_name            11178 non-null  object 
 2   eruption_number         11178 non-null  int64  
 3   eruption_category       11178 non-null  object 
 4   area_of_activity        4694 non-null   object 
 5   vei                     8272 non-null   float64
 6   start_year              11177 non-null  float64
 7   start_month             10985 non-null  float64
 8   start_day               10982 non-null  float64
 9   evidence_method_dating  9898 non-null   object 
 10  end_year                4332 non-null   float64
 11  end_month               4329 non-null   float64
 12  end_day                 4326 non-null   float64
 13  latitude                11178 non-null  float64
 14  longitude               11178 non-null

In [24]:
# Build a datetime column 'Start Date' from the three start_* parts.
# The parts are renamed to Year/Month/Day so pd.to_datetime can assemble them
# into one timestamp; the volcano key is renamed to 'Volcano ID' in the same pass.
erup = erup.rename(
    columns={
        'volcano_number': 'Volcano ID',
        'start_year': 'Year',
        'start_month': 'Month',
        'start_day': 'Day',
    }
)

# errors='coerce' turns every row whose parts are missing or cannot be
# represented as a timestamp into NaT instead of raising.
erup = erup.assign(
    **{'Start Date': pd.to_datetime(erup[['Year', 'Month', 'Day']], errors='coerce')}
)

In [25]:
# Same treatment for 'End Date'.
# The start parts are dropped first so the end_* columns can take over the
# Year/Month/Day names that pd.to_datetime assembles from.
erup = (
    erup
    .drop(columns=['Year', 'Month', 'Day'])
    .rename(
        columns={
            'end_year': 'Year',
            'end_month': 'Month',
            'end_day': 'Day',
        }
    )
)

# Rows without a complete, representable end date become NaT (errors='coerce').
erup = erup.assign(
    **{'End Date': pd.to_datetime(erup[['Year', 'Month', 'Day']], errors='coerce')}
)

In [26]:
# The end-date parts are now captured in 'End Date', so remove them.
erup = erup.drop(columns=['Year', 'Month', 'Day'])

In [27]:
# Check which columns remain after the date columns were assembled.
erup.columns

Index(['Volcano ID', 'volcano_name', 'eruption_number', 'eruption_category',
       'area_of_activity', 'vei', 'evidence_method_dating', 'latitude',
       'longitude', 'Start Date', 'End Date'],
      dtype='object')

In [29]:
# Reorder the columns so the two date columns sit right after the identifiers.
erup = erup.loc[
    :,
    [
        'Volcano ID',
        'volcano_name',
        'eruption_number',
        'eruption_category',
        'Start Date',
        'End Date',
        'area_of_activity',
        'vei',
        'evidence_method_dating',
        'latitude',
        'longitude',
    ],
]

In [30]:
# Drop the unnecessary columns.
erup = erup.drop(columns=['area_of_activity', 'evidence_method_dating'])

In [31]:
# Final view of the finished dataframe before it is written out.

erup

Unnamed: 0,Volcano ID,volcano_name,eruption_number,eruption_category,Start Date,End Date,vei,latitude,longitude
0,266030,Soputan,22354,Confirmed Eruption,2020-03-23,2020-04-02,,1.112,124.737
1,343100,San Miguel,22355,Confirmed Eruption,2020-02-22,2020-02-22,,13.434,-88.269
2,233020,"Fournaise, Piton de la",22343,Confirmed Eruption,2020-02-10,2020-04-06,,-21.244,55.708
3,345020,Rincon de la Vieja,22346,Confirmed Eruption,2020-01-31,2020-04-17,,10.830,-85.324
4,353010,Fernandina,22347,Confirmed Eruption,2020-01-12,2020-01-12,,-0.370,-91.550
...,...,...,...,...,...,...,...,...,...
11173,324020,Craters of the Moon,21101,Confirmed Eruption,NaT,NaT,0.0,43.420,-113.500
11174,222161,Igwisi Hills,22141,Confirmed Eruption,NaT,NaT,1.0,-4.889,31.933
11175,357121,Quetrupillan,22351,Confirmed Eruption,NaT,NaT,3.0,-39.496,-71.722
11176,357121,Quetrupillan,22352,Confirmed Eruption,NaT,NaT,3.0,-39.496,-71.722


In [32]:
# Write the cleaned eruptions table to the output folder.
# Forward slashes keep the path portable and avoid the invalid "\O" / "\E"
# escape sequences that the backslash form contained.
# NOTE(review): with the default index=True the row index is written as an
# unnamed first column; pass index=False if consumers do not need it.
erup.to_csv('Datasets/Output/Eruptions-Database.csv')

## Volcano Dataset:

In [35]:
# Load the raw volcano table from the input folder and display it.
# Forward slashes replace the mixed "\I" / "\\v" backslash form: the path stays
# portable and no separator has to be escaped by hand ("\v" would otherwise be
# read as a vertical-tab character).
volc = pd.read_csv('Datasets/Input/volcano.csv', sep=',')
volc

Unnamed: 0,volcano_number,volcano_name,primary_volcano_type,last_eruption_year,country,region,subregion,latitude,longitude,elevation,...,major_rock_5,minor_rock_1,minor_rock_2,minor_rock_3,minor_rock_4,minor_rock_5,population_within_5_km,population_within_10_km,population_within_30_km,population_within_100_km
0,283001,Abu,Shield(s),-6850,Japan,"Japan, Taiwan, Marianas",Honshu,34.500,131.600,641,...,,,,,,,3597,9594,117805,4071152
1,355096,Acamarachi,Stratovolcano,Unknown,Chile,South America,"Northern Chile, Bolivia and Argentina",-23.292,-67.618,6023,...,,,,,,,0,7,294,9092
2,342080,Acatenango,Stratovolcano(es),1972,Guatemala,México and Central America,Guatemala,14.501,-90.876,3976,...,,Basalt / Picro-Basalt,,,,,4329,60730,1042836,7634778
3,213004,Acigol-Nevsehir,Caldera,-2080,Turkey,Mediterranean and Western Asia,Turkey,38.537,34.621,1683,...,,,,,,,127863,127863,218469,2253483
4,321040,Adams,Stratovolcano,950,United States,Canada and Western USA,USA (Washington),46.206,-121.490,3742,...,,Dacite,,,,,0,70,4019,393303
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
953,300242,Zimina,Stratovolcano(es),Unknown,Russia,Kamchatka and Mainland Asia,Kamchatka Peninsula,55.862,160.603,3057,...,,,,,,,0,0,77,11184
954,341061,Zitacuaro-Valle de Bravo,Caldera,-3050,Mexico,México and Central America,Mexico,19.400,-100.250,3500,...,,Trachyandesite / Basaltic Trachyandesite,Trachybasalt / Tephrite Basanite,,,,231977,231977,490440,4838069
955,221020,Zubair Group,Shield,2013,Yemen,Africa and Red Sea,Africa (northeastern) and Red Sea,15.050,42.180,191,...,,,,,,,7,7,7,671171
956,221021,Zukur,Shield,Unknown,Yemen,Africa and Red Sea,Africa (northeastern) and Red Sea,14.020,42.750,624,...,,,,,,,508,922,1075,1472800


In [36]:
# List every column of the volcano frame (the preview above elides the middle ones).
volc.columns

Index(['volcano_number', 'volcano_name', 'primary_volcano_type',
       'last_eruption_year', 'country', 'region', 'subregion', 'latitude',
       'longitude', 'elevation', 'tectonic_settings', 'evidence_category',
       'major_rock_1', 'major_rock_2', 'major_rock_3', 'major_rock_4',
       'major_rock_5', 'minor_rock_1', 'minor_rock_2', 'minor_rock_3',
       'minor_rock_4', 'minor_rock_5', 'population_within_5_km',
       'population_within_10_km', 'population_within_30_km',
       'population_within_100_km'],
      dtype='object')

In [39]:
# Count how many volcanoes fall into each value of 'evidence_category'.
volc['evidence_category'].value_counts()

Eruption Observed     428
Eruption Dated        226
Evidence Credible     193
Evidence Uncertain     84
Unrest / Holocene      27
Name: evidence_category, dtype: int64

In [41]:
# Drop the unnecessary columns: tectonic setting, evidence category and all
# major/minor rock-type columns.
volc = volc.drop(
    columns=[
        'tectonic_settings',
        'evidence_category',
        'major_rock_1',
        'major_rock_2',
        'major_rock_3',
        'major_rock_4',
        'major_rock_5',
        'minor_rock_1',
        'minor_rock_2',
        'minor_rock_3',
        'minor_rock_4',
        'minor_rock_5',
    ]
)

In [43]:
# Rename the key column to 'Volcano ID', the same name the eruptions table uses.
volc = volc.rename(columns={'volcano_number': 'Volcano ID'})

In [44]:
# Display the trimmed volcano frame before it is written out.
volc

Unnamed: 0,Volcano ID,volcano_name,primary_volcano_type,last_eruption_year,country,region,subregion,latitude,longitude,elevation,population_within_5_km,population_within_10_km,population_within_30_km,population_within_100_km
0,283001,Abu,Shield(s),-6850,Japan,"Japan, Taiwan, Marianas",Honshu,34.500,131.600,641,3597,9594,117805,4071152
1,355096,Acamarachi,Stratovolcano,Unknown,Chile,South America,"Northern Chile, Bolivia and Argentina",-23.292,-67.618,6023,0,7,294,9092
2,342080,Acatenango,Stratovolcano(es),1972,Guatemala,México and Central America,Guatemala,14.501,-90.876,3976,4329,60730,1042836,7634778
3,213004,Acigol-Nevsehir,Caldera,-2080,Turkey,Mediterranean and Western Asia,Turkey,38.537,34.621,1683,127863,127863,218469,2253483
4,321040,Adams,Stratovolcano,950,United States,Canada and Western USA,USA (Washington),46.206,-121.490,3742,0,70,4019,393303
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
953,300242,Zimina,Stratovolcano(es),Unknown,Russia,Kamchatka and Mainland Asia,Kamchatka Peninsula,55.862,160.603,3057,0,0,77,11184
954,341061,Zitacuaro-Valle de Bravo,Caldera,-3050,Mexico,México and Central America,Mexico,19.400,-100.250,3500,231977,231977,490440,4838069
955,221020,Zubair Group,Shield,2013,Yemen,Africa and Red Sea,Africa (northeastern) and Red Sea,15.050,42.180,191,7,7,7,671171
956,221021,Zukur,Shield,Unknown,Yemen,Africa and Red Sea,Africa (northeastern) and Red Sea,14.020,42.750,624,508,922,1075,1472800


In [45]:
# Write the trimmed volcano table to the output folder.
# Forward slashes keep the path portable and avoid the invalid "\O" escape
# sequence (and the hand-escaped "\\V") that the backslash form contained.
# NOTE(review): with the default index=True the row index is written as an
# unnamed first column; pass index=False if consumers do not need it.
volc.to_csv('Datasets/Output/Volcano-Database.csv')