In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

# Data Ingestion and Exploration

In [3]:
# Load the raw eruption records from the CSV file and preview the first rows.
sigvol = pd.read_csv(filepath_or_buffer='significantvolcanoeruptions.csv')
print(sigvol.head(n=5))

     Year  Month  Day Associated Tsunami? Associated Earthquake?  \
0     NaN    NaN  NaN                 NaN                    NaN   
1 -4360.0    NaN  NaN                 NaN                    NaN   
2 -4350.0    NaN  NaN                 NaN                    NaN   
3 -4050.0    NaN  NaN                 NaN                    NaN   
4 -4000.0    NaN  NaN                 NaN                    NaN   

              Name            Location           Country  Latitude  Longitude  \
0              NaN                 NaN               NaN       NaN        NaN   
1  Macauley Island         Kermadec Is       New Zealand   -30.200   -178.470   
2            Kikai           Ryukyu Is             Japan    30.780    130.280   
3           Masaya           Nicaragua         Nicaragua    11.984    -86.161   
4             Pago  New Britain-SW Pac  Papua New Guinea    -5.580    150.520   

   ...  TOTAL_DEATHS TOTAL_DEATHS_DESCRIPTION TOTAL_MISSING  \
0  ...           NaN                     

In [215]:
# Drop the first row because it is blank. errors='ignore' keeps the cell
# re-runnable once row 0 has already been removed.
sigvol = sigvol.drop(index=0, errors='ignore')

# Keep only the columns the rest of the notebook works with. The saved outputs
# of this and the following cells show exactly these 14 columns, so the
# selection is made explicit here instead of depending on earlier kernel state.
sigvol = sigvol[[
    'Year', 'Associated Tsunami?', 'Associated Earthquake?', 'Name',
    'Location', 'Country', 'Latitude', 'Longitude', 'Elevation', 'Type',
    'Status', 'Time', 'Volcano Explosivity Index (VEI)', 'DEATHS',
]].copy()
print(sigvol.head())

     Year Associated Tsunami? Associated Earthquake?             Name  \
1 -4360.0                 NaN                    NaN  Macauley Island   
2 -4350.0                 NaN                    NaN            Kikai   
3 -4050.0                 NaN                    NaN           Masaya   
4 -4000.0                 NaN                    NaN             Pago   
5 -3580.0                 NaN                    NaN             Taal   

             Location           Country  Latitude  Longitude  Elevation  \
1         Kermadec Is       New Zealand   -30.200   -178.470      238.0   
2           Ryukyu Is             Japan    30.780    130.280      717.0   
3           Nicaragua         Nicaragua    11.984    -86.161      635.0   
4  New Britain-SW Pac  Papua New Guinea    -5.580    150.520      742.0   
5   Luzon-Philippines       Philippines    14.002    120.993      400.0   

            Type      Status Time  Volcano Explosivity Index (VEI)  DEATHS  
1        Caldera    Holocene    U

In [216]:
# Summarise the frame: row count, per-column non-null counts and dtypes.
sigvol.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 658 entries, 1 to 658
Data columns (total 14 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Year                             658 non-null    float64
 1   Associated Tsunami?              133 non-null    object 
 2   Associated Earthquake?           55 non-null     object 
 3   Name                             658 non-null    object 
 4   Location                         658 non-null    object 
 5   Country                          658 non-null    object 
 6   Latitude                         658 non-null    float64
 7   Longitude                        658 non-null    float64
 8   Elevation                        658 non-null    float64
 9   Type                             658 non-null    object 
 10  Status                           658 non-null    object 
 11  Time                             658 non-null    object 
 12  Volcano Explosivity In

In [217]:
# Count the missing entries in every column.
sigvol.isna().sum()

Year                                 0
Associated Tsunami?                525
Associated Earthquake?             603
Name                                 0
Location                             0
Country                              0
Latitude                             0
Longitude                            0
Elevation                            0
Type                                 0
Status                               0
Time                                 0
Volcano Explosivity Index (VEI)     88
DEATHS                             348
dtype: int64

# Data Cleaning

## Replace missing VEI values with mean

In [218]:
# Impute missing VEI entries with the rounded mean of the known VEI values.
VEImean = round(sigvol.loc[:, 'Volcano Explosivity Index (VEI)'].mean())
print(VEImean)
# Fill only the VEI column; every other column is left untouched.
sigvol = sigvol.fillna({'Volcano Explosivity Index (VEI)': VEImean})
print(sigvol['Volcano Explosivity Index (VEI)'].value_counts())


3
3.0    264
2.0    197
4.0     85
6.0     37
1.0     36
5.0     21
0.0     15
7.0      3
Name: Volcano Explosivity Index (VEI), dtype: int64


## Replace Tsunami and Earthquake data

In [219]:
# Encode the Tsunami and Earthquake flags as 0/1 integers: a missing entry
# becomes 0 and the markers "TSU" / "EQ" become 1.
#
# The encoded Series is assigned back to the frame. Calling
# Series.replace(..., inplace=True) on a column selection mutates a temporary
# object, which is deprecated and does not update `sigvol` when pandas
# Copy-on-Write is active. The explicit astype makes the integer dtype
# deterministic instead of relying on implicit downcasting.
sigvol['Associated Tsunami?'] = (
    sigvol['Associated Tsunami?'].fillna(0).replace({"TSU": 1}).astype('int64')
)

sigvol['Associated Earthquake?'] = (
    sigvol['Associated Earthquake?'].fillna(0).replace({"EQ": 1}).astype('int64')
)

In [220]:
# Show the value distribution of the two flag columns, Tsunami first.
for flag_column in ('Associated Tsunami?', 'Associated Earthquake?'):
    print(sigvol[flag_column].value_counts())

0    525
1    133
Name: Associated Tsunami?, dtype: int64
0    603
1     55
Name: Associated Earthquake?, dtype: int64


## Discover correlation to pick columns to do prediction with

In [221]:
# Determine which columns correlate with the DEATHS column.
# Only numeric columns can be correlated. Selecting them explicitly keeps the
# call from failing on text columns such as Name or Country in pandas versions
# where corr() no longer drops non-numeric columns automatically.
sigvol.select_dtypes(include='number').corr()

Unnamed: 0,Year,Associated Tsunami?,Associated Earthquake?,Latitude,Longitude,Elevation,Volcano Explosivity Index (VEI),DEATHS
Year,1.0,0.043428,0.03048,-0.068468,0.100186,0.134471,-0.499346,-0.274986
Associated Tsunami?,0.043428,1.0,0.340238,0.012807,-0.007064,-0.31439,-0.030593,0.151225
Associated Earthquake?,0.03048,0.340238,1.0,0.028158,-0.021976,-0.039626,0.028007,0.016537
Latitude,-0.068468,0.012807,0.028158,1.0,-0.186688,-0.077874,0.069808,-0.028197
Longitude,0.100186,-0.007064,-0.021976,-0.186688,1.0,-0.24548,-0.107419,-0.136751
Elevation,0.134471,-0.31439,-0.039626,-0.077874,-0.24548,1.0,-0.052623,0.008834
Volcano Explosivity Index (VEI),-0.499346,-0.030593,0.028007,0.069808,-0.107419,-0.052623,1.0,0.297494
DEATHS,-0.274986,0.151225,0.016537,-0.028197,-0.136751,0.008834,0.297494,1.0


Columns with correlation:

Tsunami, Longitude and VEI were chosen as the columns to keep. Year was considered, but its correlation with DEATHS most likely reflects the development of record keeping over time rather than anything about the volcanic eruptions themselves.   
Longitude may be correlated because it helps define the location on Earth, and VEI is definitely a good option. I was on the fence about Tsunami because of its imbalance, but decided to keep it.


In [222]:
# Build the modelling frame: the three chosen predictors plus the DEATHS
# target, copied so later edits do not touch `sigvol`.
msigvol = sigvol.loc[
    :,
    [
        'Associated Tsunami?',
        'Longitude',
        'Volcano Explosivity Index (VEI)',
        'DEATHS',
    ],
].copy()

In [223]:
# Preview the first rows of the modelling frame.
print(msigvol.head(n=5))

   Associated Tsunami?  Longitude  Volcano Explosivity Index (VEI)  DEATHS
1                    0   -178.470                              6.0     NaN
2                    0    130.280                              7.0     NaN
3                    0    -86.161                              6.0     NaN
4                    0    150.520                              6.0     NaN
5                    0    120.993                              6.0     NaN


## Prep data for Linear Regression

In [224]:
# Hold out the rows whose DEATHS value is missing; these are the rows the
# model will predict for.
testd = msigvol.loc[msigvol['DEATHS'].isna()]
print(testd.head())
print(testd.shape)

   Associated Tsunami?  Longitude  Volcano Explosivity Index (VEI)  DEATHS
1                    0   -178.470                              6.0     NaN
2                    0    130.280                              7.0     NaN
3                    0    -86.161                              6.0     NaN
4                    0    150.520                              6.0     NaN
5                    0    120.993                              6.0     NaN
(348, 4)


In [225]:
# Keep only the rows of the modelling frame that have no missing values.
msigvol = msigvol.dropna(axis=0, how='any')
print(msigvol)
print(msigvol.shape)

     Associated Tsunami?  Longitude  Volcano Explosivity Index (VEI)   DEATHS
21                     0     15.004                              3.0     40.0
27                     0     14.426                              5.0   3500.0
33                     0    -89.053                              6.0  30000.0
39                     0    130.670                              4.0     80.0
70                     0    -16.650                              5.0    220.0
..                   ...        ...                              ...      ...
649                    0    123.685                              3.0      5.0
650                    0    121.708                              3.0      5.0
654                    0     98.392                              3.0     17.0
655                    0    112.308                              3.0      7.0
656                    0    137.480                              3.0     55.0

[310 rows x 4 columns]
(310, 4)


In [226]:
# DEATHS is the target; every remaining column is a feature.
X_train = msigvol.drop(columns="DEATHS")
y_train = msigvol.loc[:, 'DEATHS']
# The held-out rows get the same feature columns.
X_test = testd.drop(columns="DEATHS")
print(X_train.head())


    Associated Tsunami?  Longitude  Volcano Explosivity Index (VEI)
21                    0     15.004                              3.0
27                    0     14.426                              5.0
33                    0    -89.053                              6.0
39                    0    130.670                              4.0
70                    0    -16.650                              5.0


## Model training and results

In [227]:
# Fit an ordinary least-squares model of DEATHS on the selected features.
lr = LinearRegression()
lr.fit(X=X_train, y=y_train)

LinearRegression()

In [228]:
# Predict DEATHS for the held-out rows and write the predictions back by row
# label. X_test keeps the index labels of the rows it was taken from, so each
# prediction lands on its own row without relying on the order of a freshly
# computed null mask. The cell can also be re-run after the missing values
# have been filled.
y_pred = lr.predict(X_test)
sigvol.loc[X_test.index, 'DEATHS'] = y_pred

In [229]:
# Inspect the DEATHS column and its smallest value.
print(sigvol.loc[:, 'DEATHS'])
print(min(sigvol.loc[:, 'DEATHS']))

1      3425.666945
2      3210.825263
3      3158.446917
4      2473.292671
5      2558.768689
          ...     
654      17.000000
655       7.000000
656      55.000000
657    1321.747349
658     942.690866
Name: DEATHS, Length: 658, dtype: float64
-1489.7354701161385


## Clean up resulting DEATHS data to replace negatives with 0

In [230]:
# A death count cannot be negative, so raise any negative value to 0 and then
# convert the column to integers.
# clip() returns a new Series that is assigned back to the frame. This avoids
# the private _get_numeric_data() helper and in-place edits through a
# temporary object, which do not reach `sigvol` under pandas Copy-on-Write.
sigvol['DEATHS'] = sigvol['DEATHS'].clip(lower=0).astype(int)
print(sigvol['DEATHS'])
print(min(sigvol['DEATHS']))


1      3425
2      3210
3      3158
4      2473
5      2558
       ... 
654      17
655       7
656      55
657    1321
658     942
Name: DEATHS, Length: 658, dtype: int64
0


In [231]:
# Preview the first rows of the cleaned frame.
print(sigvol.head(n=5))

     Year  Associated Tsunami?  Associated Earthquake?             Name  \
1 -4360.0                    0                       0  Macauley Island   
2 -4350.0                    0                       0            Kikai   
3 -4050.0                    0                       0           Masaya   
4 -4000.0                    0                       0             Pago   
5 -3580.0                    0                       0             Taal   

             Location           Country  Latitude  Longitude  Elevation  \
1         Kermadec Is       New Zealand   -30.200   -178.470      238.0   
2           Ryukyu Is             Japan    30.780    130.280      717.0   
3           Nicaragua         Nicaragua    11.984    -86.161      635.0   
4  New Britain-SW Pac  Papua New Guinea    -5.580    150.520      742.0   
5   Luzon-Philippines       Philippines    14.002    120.993      400.0   

            Type      Status Time  Volcano Explosivity Index (VEI)  DEATHS  
1        Caldera    H

## Create new clean CSV

In [234]:
# Write the cleaned frame to a new CSV file, leaving out the index column.
sigvol.to_csv(path_or_buf='sigvol-clean.csv', index=False)

In [None]:
# Second train/test split, stored under separate names.
y_train2 = msigvol['DEATHS']
X_train2 = msigvol.drop("DEATHS", axis=1)
X_test2 = testd.drop("DEATHS", axis=1)
# Preview the feature frame defined in this cell.
print(X_train2.head())