In [None]:
cd ..

In [None]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from scipy.spatial import cKDTree
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.feature_selection import f_regression, mutual_info_regression
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Render every figure at high resolution.
mpl.rcParams['figure.dpi'] = 300

# Large bold text for all figures. 'sans-serif' is a valid generic font family;
# the previous value 'normal' is not a family name, which makes matplotlib emit
# "font family not found" warnings and fall back to its default font.
font = {'family': 'sans-serif',
        'weight': 'bold',
        'size': 20}

mpl.rc('font', **font)

In [None]:
# Read data
quake_data_df = pd.read_csv('Data/blended_quake_data.csv')

# Keep events with fewer than 100,000 deaths and a primary magnitude above 3.
# Comparisons against NaN are False, so rows missing either value are dropped too.
# .copy() detaches the filtered frame from the raw one so the column assignments
# below do not raise pandas' SettingWithCopyWarning.
quake_data_df = quake_data_df[(quake_data_df['DEATHS'] < 100000) & (quake_data_df['EQ_PRIMARY'] > 3)].copy()

# The +1 keeps zero-death events finite under the log (log10(1) == 0).
quake_data_df['Deaths_Logged'] = np.log10(quake_data_df['DEATHS'] + 1)
# NOTE(review): this is a plain ratio (not multiplied by 100) and becomes inf
# wherever sum_density is 0 - confirm that is acceptable downstream.
quake_data_df['Pct_Pop_Fatalities'] = quake_data_df['DEATHS'] / quake_data_df['sum_density']

# Create our target variable: the logged death toll rounded up to the next integer
# (vectorised np.ceil instead of a per-element apply).
quake_data_df['Deaths_Logged_Category'] = np.ceil(quake_data_df['Deaths_Logged'])

# Disabled landslide reshaping experiment, retained for reference:
# landslide_vars = quake_data_df.iloc[:, 16:25].columns
# print(landslide_vars)
# landslide_df = pd.melt(quake_data_df, 
#         id_vars=[col for col in quake_data_df if col not in landslide_vars], 
#         value_vars=landslide_vars,
#         var_name='Landslide_Cat').reset_index()
quake_data_df.dtypes

In [None]:
# landslide_df['Deaths_Logged'] = landslide_df['Deaths_Logged'].apply(lambda x: np.ceil(x))
# landslide_df = landslide_df[landslide_df['value'] > 0]
# landslide_df['Deaths_Logged'] = np.log10(landslide_df['DEATHS'] + 1)
# ldsl_max_cat = landslide_df.groupby([col for col in quake_data_df if col not in landslide_vars]).agg(largest_cat=('Landslide_Cat', 'max')).reset_index()
# ldsl_max_cat['HasDeath'] = np.where(ldsl_max_cat['DEATHS'] > 0, 1, 0)


### EDA

In [None]:
# Histogram of the continuous (unrounded) log10 death toll
deaths_hist_ax = quake_data_df['Deaths_Logged'].hist(bins=25, figsize=(15, 12))

# Label through the returned axes rather than the pyplot state machine
deaths_hist_ax.set_ylabel('Number of EarthQuakes [Count]')
deaths_hist_ax.set_xlabel('Death Toll Magnitude [log10(Deaths)]')
deaths_hist_ax.set_title('Distribution of Earthquakes by their Death Tolls: 1980 - 2020')

# plt.savefig('Figures\Quake_Death_Hist_cont')

plt.show()

In [None]:
# Bar chart of quake counts per death-toll category, where the category is
# ceil(log10(deaths + 1)): 0 deaths -> 0, 1-9 -> 1, 10-99 -> 2, 100-999 -> 3, etc.

# Count the quakes in each category once; the same table drives both the bars
# and the numeric labels so they can never disagree.
death_cat_counts = (
    quake_data_df.groupby(['Deaths_Logged_Category'])
    .agg({'DEATHS': 'count'})
    .rename({'DEATHS': 'Count'}, axis=1)
    .reset_index()
)

ax = death_cat_counts.plot.bar(x='Deaths_Logged_Category', y='Count', figsize=(14, 14), rot=0)

ax.set_ylabel('Number of EarthQuakes [Count]')
ax.set_xlabel('Death Toll Magnitude [Ceil(log10(Deaths))]')
ax.set_title('Distribution of Earthquakes by their Death Tolls: 1980 - 2020')

# Bars are drawn at positions 0..k-1 in table order, so label by position.
# (Previously the loop index was also used as the category value, which
# mislabels bars whenever a category is absent from the data.)
for bar_pos, count in enumerate(death_cat_counts['Count']):
    ax.text(bar_pos - 0.25, count + 10, str(count))
# plt.savefig('Figures\Quake_Death_Hist')
plt.show()

In [None]:
# Distribution of earthquake magnitude within each death-toll category

# Thick black median line so it stays visible at this figure size
median_style = {'linewidth': 4.0, 'color': 'black'}

mag_by_death_cat = quake_data_df[['Deaths_Logged_Category', 'EQ_PRIMARY']]
mag_by_death_cat.boxplot(by='Deaths_Logged_Category', figsize=(12, 10), medianprops=median_style)

# The boxplot's axes are the current axes; label them explicitly
mag_box_ax = plt.gca()
mag_box_ax.set_xlabel('Magnitude of Death Toll [Ceil(log10(Deaths))]')
mag_box_ax.set_ylabel('Earthquake Magnitude [Richter Scale]')
mag_box_ax.set_title('Earthquake Magnitude Distribution by Death Toll Magnitude')
# plt.savefig('Figures\Mag_Avg_By_Death_Scale')
plt.show()

In [None]:
# Distribution of HDI within each death-toll category

# Thick black median line so it stays visible at this figure size
median_style = {'linewidth': 4.0, 'color': 'black'}

hdi_by_death_cat = quake_data_df[['Deaths_Logged_Category', 'HDI']]
hdi_by_death_cat.boxplot(by='Deaths_Logged_Category', figsize=(12, 10), medianprops=median_style)

# The boxplot's axes are the current axes; label them explicitly
hdi_box_ax = plt.gca()
hdi_box_ax.set_xlabel('Magnitude of Death Toll [Ceil(log10(Deaths))]')
hdi_box_ax.set_ylabel('HDI')
hdi_box_ax.set_title('HDI Distribution by Death Toll Magnitude')
# plt.savefig('Figures\HDI_Avg_By_Death_Scale')
plt.show()

In [None]:
# Scatterplots of Deaths and Magnitude with Population and HDI Colors
#
# Each point is one earthquake (magnitude on x, logged death toll on y). Colours
# are computed by hand from a colormap + normaliser, so the colorbar is built from
# a standalone ScalarMappable. Such a mappable is not attached to any axes, so
# fig.colorbar must be told which axes to take space from via `ax=`; without it
# recent Matplotlib versions cannot determine where to place the colorbar.

cmap = plt.cm.rainbow

# --- Coloured by log10 of population density ---
norm = mpl.colors.Normalize(vmin=np.log10(quake_data_df.sum_density.min()), vmax=np.log10(quake_data_df.sum_density.max()))

fig, ax = plt.subplots()
fig.set_size_inches(18.5, 10.5)
ax.scatter(quake_data_df['EQ_PRIMARY'], 
            quake_data_df['Deaths_Logged'],
            color=cmap(norm(np.log10(quake_data_df.sum_density.values))), s=60)

sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
fig.colorbar(sm, ax=ax, label='Population Density [log10 Quantity]')
ax.set_title('Magnitude vs Deaths by Population Density')
ax.set_ylabel('Death Toll [log10 Quantity]')
ax.set_xlabel('Magnitude [Richter Scale]')
# plt.savefig('Figures\Mag_Vs_Deaths_wrt_Pop')
plt.show()

# --- Coloured by HDI (linear scale) ---
norm = mpl.colors.Normalize(vmin=quake_data_df.HDI.min(), vmax=quake_data_df.HDI.max())

fig, ax = plt.subplots()
fig.set_size_inches(18.5, 10.5)
# NOTE(review): no marker size is set here, unlike the density plot above
# (s=60) - confirm whether that difference is intended.
ax.scatter(quake_data_df['EQ_PRIMARY'], 
            quake_data_df['Deaths_Logged'],
            color=cmap(norm(quake_data_df.HDI.values)))

sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
fig.colorbar(sm, ax=ax, label='Local HDI')
ax.set_title('Magnitude vs Deaths by HDI')
ax.set_ylabel('Death Toll [log10 Quantity]')
ax.set_xlabel('Magnitude [Richter Scale]')
# plt.savefig('Figures\Mag_Vs_Deaths_wrt_hdi')
plt.show()


In [None]:
# Another view of the population-deaths-magnitude scatter plot. Take the median population of all quakes that have the 
# same magnitude and death toll (ie, all points stacked on each other)

# One row per distinct (magnitude, logged death toll) pair with the median sum_density
by_death_mag_df = quake_data_df.groupby(['EQ_PRIMARY', 'Deaths_Logged']).agg(med_pop=('sum_density', 'median')).reset_index()
cmap = plt.cm.rainbow
norm = mpl.colors.Normalize(vmin=np.log10(by_death_mag_df.med_pop.min()), vmax=np.log10(by_death_mag_df.med_pop.max()))


fig, ax = plt.subplots()
fig.set_size_inches(18.5, 10.5)
ax.scatter(by_death_mag_df['EQ_PRIMARY'], 
            by_death_mag_df['Deaths_Logged'],
            color=cmap(norm(np.log10(by_death_mag_df.med_pop.values))), s=60)

ax.set_title('Magnitude vs Deaths by Population Density')
ax.set_ylabel('Death Toll [log10 Quantity]')
ax.set_xlabel('Magnitude [Richter Scale]')

# The ScalarMappable is not attached to any axes, so the colorbar must be given
# `ax=` explicitly; recent Matplotlib versions cannot place it otherwise.
sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
fig.colorbar(sm, ax=ax, label='Median Logged Population')
# plt.savefig('Figures\Mag_Vs_Deaths_wrt_pop_no_point_stack')
plt.show()

### Feature Importance

In [None]:
# Target variable is continuous, so we use f_regression and mutual_info_regression

top_n = 10
mi_seed = 112  # seed for the (randomised) mutual-information estimator

# Candidate features: everything except identifiers, location/label columns and
# the death-toll columns the target is derived from.
feature_cols = quake_data_df.drop(
    columns=['Deaths_Logged_Category', 'Pct_Pop_Fatalities', 'LATITUDE', 'LONGITUDE',
             'INTENSITY', 'FLAG_TSUNAMI', 'COUNTRY', 'Unnamed: 0', 'I_D',
             'DEATHS', 'Deaths_Logged', 'REGION_CODE'])
target = quake_data_df['Deaths_Logged']


# Linear correlations

# Select the top_n features by F-statistic. The scores are read back from the
# fitted selector so the plotted values are exactly the ones used for selection.
lin_f_selector = SelectKBest(f_regression, k=top_n)
best_lin_fs = lin_f_selector.fit_transform(X=feature_cols, y=target)
f_scores, p_values = lin_f_selector.scores_, lin_f_selector.pvalues_
lin_feature_mask = lin_f_selector.get_support()
lin_feature_labels = lin_f_selector.get_feature_names_out()
lin_feature_data = pd.Series(f_scores[lin_feature_mask], index=lin_feature_labels).sort_values(ascending=False)

# Plot top_n features
plt.barh(y=lin_feature_data.index, width=lin_feature_data)
plt.title(f'Top {top_n} Most Important Features Based on Linear Correlation')
plt.xlabel('F-Statistic')
plt.ylabel('Feature Name')
plt.show()


# Non-linear correlations

# mutual_info_regression is randomised. Previously the selector ran it unseeded
# while the plotted scores came from a separate seeded call, so the selected
# features could differ from the top_n of the plotted scores and changed between
# runs. Seed the selector's scorer and reuse its scores instead.
nonlin_f_selector = SelectKBest(
    lambda X, y: mutual_info_regression(X, y, random_state=mi_seed), k=top_n)
best_nonlin_fs = nonlin_f_selector.fit_transform(X=feature_cols, y=target)
mi = nonlin_f_selector.scores_
nonlin_feature_mask = nonlin_f_selector.get_support()
nonlin_feature_labels = nonlin_f_selector.get_feature_names_out()
nonlin_feature_data = pd.Series(mi[nonlin_feature_mask], index=nonlin_feature_labels).sort_values(ascending=False)


# Plot top_n features
plt.barh(y=nonlin_feature_data.index, width=nonlin_feature_data)
plt.title(f'Top {top_n} Most Important Features Based on NonLinear Correlation')
plt.xlabel('Mutual Information Measure')
plt.ylabel('Feature Name')
plt.show()


### Correlation Matrix

In [None]:
# Drop identifiers, codes and coordinates that are not meaningful to correlate
temp_df = quake_data_df.drop(
    columns=['LandUseCode', 'I_D', 'Unnamed: 0', 'REGION_CODE', 'MONTH', 'dist_to_land_code',
             'LATITUDE', 'LONGITUDE', 'INTENSITY', 'YEAR'])
cor_type = 'spearman'

# Correlate numeric/boolean columns only. The frame still carries columns that
# appear to hold strings (e.g. COUNTRY, FLAG_TSUNAMI); older pandas dropped those
# silently, but pandas >= 2.0 raises instead.
correl_matrix = temp_df.select_dtypes(include=['number', 'bool']).corr(method=cor_type)

fig, ax = plt.subplots()
fig.set_size_inches(25, 25)
mesh = ax.pcolormesh(correl_matrix, vmin=-1, vmax=1, cmap='seismic')
# Ticks sit at cell centres (offset by 0.5)
ax.set_yticks(np.arange(0.5, len(correl_matrix.index), 1))
ax.set_yticklabels(correl_matrix.index)
ax.set_xticks(np.arange(0.5, len(correl_matrix.columns), 1))
ax.set_xticklabels(correl_matrix.columns, rotation='vertical')

# Credit to stackoverflow for this method of adding text captions: 
# https://stackoverflow.com/questions/11917547/how-to-annotate-heatmap-with-text-in-matplotlib
n_rows, n_cols = correl_matrix.shape
for row in range(n_rows):
    for col in range(n_cols):
        coef = correl_matrix.iloc[row, col]
        # White text on the cells below -0.35 or at/above 0.89; the default
        # text colour everywhere else.
        text_kwargs = {'color': 'white'} if (coef < -0.35 or coef >= 0.89) else {}
        ax.text(col + 0.5, row + 0.5, '%.2f' % coef,
                horizontalalignment='center',
                verticalalignment='center', **text_kwargs)

fig.colorbar(mesh, ax=ax, label=f'{cor_type} Correlation Coefficient')
ax.set_title('Correlation Matrix of All Features in Quake Dataset')
# plt.savefig(f'Figures\Feature_Correlation_Matrix_{cor_type}')
plt.show()

### Splitting

In [None]:
def basic_split(X, y, train_size, val_size, test_size, random_state, stratify: bool = False):
    """Split ``X`` and ``y`` into train, validation and test subsets.

    The data is split twice with ``train_test_split``: first into the training
    portion and a remainder, then the remainder into test and validation.

    Parameters
    ----------
    X, y : array-like with a ``shape`` attribute
        Features and target; split row-wise and kept aligned.
    train_size, val_size, test_size : float
        Proportions of the full data for each subset. Each must lie strictly
        between 0 and 1 and together they must sum to 1 (within floating-point
        tolerance).
    random_state : int
        Seed passed to both splits.
    stratify : bool, default False
        If True, both splits are stratified on the target.

    Returns
    -------
    X_train, y_train, X_val, y_val, X_test, y_test

    Raises
    ------
    ValueError
        If the proportions are invalid, ``random_state`` is not an integer, or
        a resulting subset deviates from its requested proportion by more than
        rounding can explain.
    """
    # test the inputs
    sizes = (train_size, val_size, test_size)

    # Compare with a tolerance: e.g. 0.7 + 0.2 + 0.1 evaluates to
    # 0.9999999999999999, which an exact `!= 1` check would wrongly reject.
    if not np.isclose(sum(sizes), 1.0):
        raise ValueError('Train, test, and validation proportions do not equal 1!')

    # A proportion of 0 or 1 would make the second split degenerate
    # (train_size == 1 would also divide by zero below).
    if any(size <= 0 or size >= 1 for size in sizes):
        raise ValueError('Train, test, and validation proportions must each be strictly between 0 and 1!')

    # Accept NumPy integers as well as Python ints; bool is excluded because
    # it is technically an int subclass but never a meaningful seed here.
    if isinstance(random_state, bool) or not isinstance(random_state, (int, np.integer)):
        raise ValueError('random_state is not an integer!')

    # Share of the remainder (everything that is not training data) that
    # becomes the test set.
    rem_size = 1 - train_size
    new_test_size = test_size / rem_size

    # First split: training data vs. everything else.
    X_train, X_other, y_train, y_other = train_test_split(
        X, y, train_size=train_size, random_state=random_state,
        stratify=y if stratify else None)

    # Second split: the "train" side of this call is the test set because
    # train_size is given as the test share of the remainder.
    X_test, X_val, y_test, y_val = train_test_split(
        X_other, y_other, train_size=new_test_size, random_state=random_state,
        stratify=y_other if stratify else None)

    # test the outputs
    # Row counts are integers, so each subset can be off by a row or two from
    # its exact proportion; allow that, and never be stricter than the
    # two-decimal rounding (+/- 0.005) used previously.
    n_total = X.shape[0]
    tolerance = max(0.005, 2 / n_total)
    checks = ((X_train, train_size, 'training'),
              (X_test, test_size, 'testing'),
              (X_val, val_size, 'validation'))
    for subset, expected, label in checks:
        if abs(subset.shape[0] / n_total - expected) > tolerance:
            raise ValueError(f'Unexpected data quantity for {label} set!')

    return X_train, y_train, X_val, y_val, X_test, y_test

In [None]:
# Specify features and target variable

feature_matrix = quake_data_df[['REGION_CODE', 'COUNTRY', 'YEAR', 'MONTH', 'LATITUDE', 'LONGITUDE', 
                                'FOCAL_DEPTH', 'EQ_PRIMARY', 'avg_pop_distance', 'HDI', 'sum_density', 
                                'dist_to_closest_hdi', 'unlogged_mag_val', 'avg_hdi_dist', 'FLAG_TSUNAMI']].copy()
feature_matrix['FLAG_TSUNAMI'] = np.where(feature_matrix['FLAG_TSUNAMI'] == 'Yes', 1, 0)
target_var = quake_data_df.iloc[:, -1]

In [None]:
# Stratified 60 / 20 / 20 train / validation / test split with a fixed seed
split_options = dict(train_size=0.6, val_size=0.2, test_size=0.2,
                     random_state=112, stratify=True)

X_train, y_train, X_val, y_val, X_test, y_test = basic_split(
    X=feature_matrix, y=target_var, **split_options)

In [None]:
# Confirm Stratification: show the class counts of each subset
# (a leading newline in the heading separates it from the previous block)
subset_targets = (('Val', y_val), ('\nTest', y_test), ('\nTrain', y_train))
for heading, subset_y in subset_targets:
    print(f'{heading} classes: \n{subset_y.value_counts()}')


### Preprocessing

In [None]:
# Categorical columns to one-hot encode and numeric columns to standardise.
# FLAG_TSUNAMI is already 0/1, so it bypasses the transformer and is appended as-is.
onehot_ftrs = ['REGION_CODE', 'COUNTRY']
std_ftrs = ['YEAR', 'MONTH', 'LATITUDE', 'LONGITUDE', 'sum_density', 'FOCAL_DEPTH', 
            'EQ_PRIMARY', 'avg_pop_distance', 'HDI', 'dist_to_closest_hdi', 'unlogged_mag_val', 'avg_hdi_dist']

# collect all the encoders
# Dense output is requested through the ColumnTransformer (sparse_threshold=0)
# instead of OneHotEncoder(sparse=False): the `sparse` argument was renamed to
# `sparse_output` and later removed in newer scikit-learn releases, whereas
# sparse_threshold works on both old and new versions. Categories unseen during
# fitting are encoded as all zeros (handle_unknown='ignore').
preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'), onehot_ftrs),
        ('std', StandardScaler(), std_ftrs)],
    sparse_threshold=0)

clf = Pipeline(steps=[('preprocessor', preprocessor)])

# Fit on the training data only, then apply the fitted transform to the
# validation and test sets; the tsunami flag is appended as the last column.
X_train_prep = np.hstack((clf.fit_transform(X_train), X_train[['FLAG_TSUNAMI']].to_numpy()))
X_val_prep = np.hstack((clf.transform(X_val), X_val[['FLAG_TSUNAMI']].to_numpy()))
X_test_prep = np.hstack((clf.transform(X_test), X_test[['FLAG_TSUNAMI']].to_numpy()))
