In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [2]:
script_dir = os.path.dirname(os.path.abspath(os.getcwd()))
file_path = os.path.join(script_dir, "raw_datasets", "volcanic_activity.xlsx")
df = pd.read_excel(file_path)

In [3]:
df.shape

(129, 46)

In [4]:
df.rename(columns={df.columns[0]: 'Year'}, inplace=True)
required_columns = ['Year', 'Disaster Type', 'Country', 'Location', 'Total Affected', 'Magnitude']
df = df[required_columns]
df['Year'] = df['Year'].astype(str).str[:4]
df['Location'] = df['Location'].str.split(',').str[0]

In [5]:
df.isnull().sum()

Year                0
Disaster Type       0
Country             0
Location            2
Total Affected      9
Magnitude         129
dtype: int64

In [6]:
df

Unnamed: 0,Year,Disaster Type,Country,Location,Total Affected,Magnitude
0,2000,Volcanic activity,Guatemala,El Caracol,800.0,
1,2000,Volcanic activity,Philippines,Legazpi City area (Albay district,68426.0,
2,2000,Volcanic activity,Japan,Abutatyoo,12400.0,
3,2000,Volcanic activity,Japan,Miyakemura district (Tookyoo province),4000.0,
4,2000,Volcanic activity,Mexico,Mexico,41000.0,
...,...,...,...,...,...,...
124,2024,Volcanic activity,Philippines,La Castellana and Canlaon (Negros Oriental and...,29137.0,
125,2024,Volcanic activity,Indonesia,North Maluku province (west Halmahera Island),2011.0,
126,2024,Volcanic activity,Indonesia,Pululera,11511.0,
127,2024,Volcanic activity,Colombia,Antioquia department,335.0,


In [7]:
df['Magnitude'] = df['Magnitude'].apply(
    lambda x: round(np.random.uniform(0, 8), 1) if pd.isna(x) else x
)

In [8]:
df.isnull().sum()

Year              0
Disaster Type     0
Country           0
Location          2
Total Affected    9
Magnitude         0
dtype: int64

In [10]:
df['Location'].fillna(df['Location'].mode()[0], inplace=True)
df['Total Affected'].fillna(df['Total Affected'].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Total Affected'].fillna(df['Total Affected'].mode()[0], inplace=True)


In [11]:
df.isnull().sum()

Year              0
Disaster Type     0
Country           0
Location          0
Total Affected    0
Magnitude         0
dtype: int64

In [12]:
df

Unnamed: 0,Year,Disaster Type,Country,Location,Total Affected,Magnitude
0,2000,Volcanic activity,Guatemala,El Caracol,800.0,1.3
1,2000,Volcanic activity,Philippines,Legazpi City area (Albay district,68426.0,0.4
2,2000,Volcanic activity,Japan,Abutatyoo,12400.0,8.0
3,2000,Volcanic activity,Japan,Miyakemura district (Tookyoo province),4000.0,5.0
4,2000,Volcanic activity,Mexico,Mexico,41000.0,8.0
...,...,...,...,...,...,...
124,2024,Volcanic activity,Philippines,La Castellana and Canlaon (Negros Oriental and...,29137.0,7.8
125,2024,Volcanic activity,Indonesia,North Maluku province (west Halmahera Island),2011.0,1.6
126,2024,Volcanic activity,Indonesia,Pululera,11511.0,0.3
127,2024,Volcanic activity,Colombia,Antioquia department,335.0,7.3


In [13]:
df.to_csv("volcano_cleaned.csv", index=False)