# Imports

In [1]:
# install conda (this restarts the colab kernel -- wait until restart before running subsequent blocks)

!pip install condacolab
import condacolab
condacolab.install_miniconda()



RuntimeError: This module must ONLY run as part of a Colab notebook!

In [None]:
!pip install geopandas
!pip install hyperopt

In [2]:
import numpy as np
import pandas as pd
import geopandas as gpd
import seaborn as sns
import xgboost
from pathlib import Path
from functools import reduce

# from google.colab import drive
from matplotlib import pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from xgboost import XGBClassifier

# Data Setup

In [3]:
drive.mount("/content/drive/")

NameError: name 'drive' is not defined

In [9]:
### TODO: replace when putting on drive
google_drive_personal_key = '/Users/orlandotimmerman/Library/CloudStorage/GoogleDrive-rt582@cam.ac.uk/.shortcut-targets-by-id/132Xl9yWOGKPM7ybLH0oa9c3dJGYrXkjC/'
# noaa six-hourly closest to each xbd point
df_noaa_xbd_pkl_path = google_drive_personal_key + 'datasets/EFs/weather_data/xbd_obs_noaa_six_hourly.pkl'
# xbd observation points
df_xbd_points_path = google_drive_personal_key + 'datasets/xBD_data/xbd_points_posthurr_reformatted.pkl'
# topographic (flood and storm surge risk, soil properties)
df_topographic_efs_path = google_drive_personal_key + 'datasets/processed_data/df_points_posthurr_flood_risk_storm_surge_soil_properties.pkl'
# terrain efs
df_terrain_efs_path = google_drive_personal_key + 'datasets/processed_data/Terrian_EFs.pkl'

In [64]:
def check_files_in_list_exist(
	file_list: list[str] | list[Path]
	):
	"""State which files don't exist and remove from list"""
	files_found = []
	for fl in file_list:
		# attempt conversion to Path object if necessary
		if type(fl) != Path:
			try:
				fl = Path(fl)
			except TypeError:
				print(f'{fl} could not be converted to Path object')
		
		if fl.is_file():
			files_found += fl,
		else:
			print(f'{fl} not found. Removing from list.')

	return files_found


def read_and_merge_pkls(
	pkl_paths: list[str] | list[Path]
) -> pd.DataFrame:
	"""Read in pkl files from list of file paths and merge on index"""
	# check all files exist
	pkl_paths_present = check_files_in_list_exist(pkl_paths)
	df_list = [pd.read_pickle(pkl) for pkl in pkl_paths_present]

	return reduce(lambda df1,df2: pd.merge(df1,df2,left_index=True,right_index=True), df_list)


def rename_and_drop_duplicated_cols(
    df: pd.DataFrame
) -> pd.DataFrame:
    """Drop columns which are copies of others and rename the 'asdf_x' headers which would have resulted"""
    # need to ensure no bad types first
    df = drop_cols_containing_lists(df)
    # remove duplicated columns
    dropped_df = df.T.drop_duplicates().T
    # rename columns for clarity (especially those which are shared between dfs). Will be able to remove most with better
    # column naming further up the process
    new_col_names = {col: col.replace('_x', '') for col in dropped_df.columns if col.endswith('_x')}
    
    return dropped_df.rename(columns=new_col_names)


def drop_cols_containing_lists(
    df: pd.DataFrame
) -> pd.DataFrame:
    """It seemed like the best solution at the time: and to be fair, I can't really think of better...
    N.B. for speed, only looks at values in first row – if there is a multi-type column, this would be the least of
    our worries...
    """
    df = df.loc[:, df.iloc[0].apply(lambda x: type(x) != list)]

    return df


def assign_predictor(
    df: pd.DataFrame,
    col_name: str,
    drop_classes: list[int],
    binary_classification: bool = True
) -> pd.DataFrame:
    """Assign column as predictor value, and choose whether binary or multi-class classification. Can choose to drop
    classes."""
    df["y"] = df[col_name].astype(int)

    if binary_classification:
        df.loc[df["y"] > 0, "y"] = 1

    # drop any classes in 
    df = df.loc[~df['y'].isin(drop_classes)]

    return df


def replace_cols_with_mean(
    df: pd.DataFrame, 
    col_names: list[str]
) -> pd.DataFrame:
    """Replace values in a column with the mean value"""
    for col in col_names:
        df.loc[df[col] == 0, col] = df[col][df[col] > 0].mean()

    return df


def train_test_display_model(
    df: pd.DataFrame,
    var_col_names: list[str],
    model_name: str = 'LogisticRegression',
    y_col: str = 'y',
    test_size: float = 0.25,
    random_state: int = 1
) -> list:
    """Specify columns in a df to use to train and test model. Currently available models: 'LogisticRegression', 
    'RandomForest'

    TODO: should I put this in a class?
    """

    x_train, x_test, y_train, y_test = train_test_split(
        df[var_col_names], df[y_col], test_size=test_size, random_state=random_state)

    # select chosen model
    if model_name == 'LogisticRegression':
        model = LogisticRegression()
        model = train_test_model(model, [x_train, y_train], [x_test, y_test])
        importance = model.coef_[0]
    elif model_name == 'RandomForest':
        model = RandomForestClassifier()
        model = train_test_model(model, [x_train, y_train], [x_test, y_test])
        importance = model.feature_importances_  

    predictions = model.predict(x_test)
    # TODO: plot nicely
    display(importance)
    plot_confusion_matrix(y_test, predictions)


def plot_confusion_matrix(
    y_test: list,
    predictions: list,
    score: float,
    ax=None
):
    """Plot confusion matrix from y_test and inferred values"""
    damage_labels = {0: 'undamaged', 1: 'minor damage', 2: 'major damage', 
                      3: 'destroyed', 4: 'unclassified'}

    confusion_matrix = metrics.confusion_matrix(y_test, predictions)
    # initialise axes if necessary
    ax = ax or plt.gca()
    sns.heatmap(confusion_matrix/np.sum(confusion_matrix), ax=ax, annot=True, fmt=".2%", linewidths=.5, square = True, cmap = 'Blues_r')
    # formatting
    ax.set_ylabel('Actual label')
    ax.set_xlabel('Predicted label')
    # assign integer damage classes to labels
    xtick_labels = [damage_labels[i] for i in range(len(confusion_matrix))]
    ax.set_xticks(ax.get_xticks(),xtick_labels,rotation=45)
    ax.set_yticks(ax.get_yticks(),xtick_labels,rotation=45)
    ax.xaxis.set_label_position('top') 
    ax.xaxis.tick_top()

    if len(confusion_matrix) == 2:  # binary classification
      ax.set_title(f'Confusion matrix for binary classification \n Score: {score:.4f}')
    else: # multiclass classification
      ax.set_title(f'Confusion matrix for multiclass classification \n Score: {score:.4f}')

    return ax


def plot_importances(
    var_col_names: list[str],
    importances: list[float],
    ax=None
):
    """Visualise feature importance"""
    # initialise axes if necessary
    ax = ax or plt.gca()
    # TODO: add numbers onto bars
    ax.barh(var_col_names, importances)
    ax.set_ylabel('Input variable')
    ax.set_xlabel('Feature importance')
    ax.set_title('Feature importance for model')

    return ax


def train_test_model(
    model,
    trains: list[list],
    tests: list[list]
) -> list:
    """Train provided model. Trains in format [x_train, y_train]; similar with tests"""
    model.fit(trains[0], trains[1])
    predictions = model.predict(tests[0])
    model.score(tests[0], tests[1])

    return model

In [25]:
pkl_paths = [df_noaa_xbd_pkl_path, df_xbd_points_path, df_topographic_efs_path, df_terrain_efs_path]
df_merged = read_and_merge_pkls(pkl_paths)
df_merged.columns

In [33]:
out = rename_and_drop_duplicated_cols(df_merged)
out.columns

Index(['xbd_obs_geometry', 'damage_class', 'disaster_name', 'capture_date',
       'xbd_obs_lon', 'xbd_obs_lat', 'event_start', 'event_end',
       'stations_lat_lons', 'noaa_index', 'tag', 'num_entries',
       'noaa_obs_date', 'record_id', 'sys_status', 'noaa_obs_lat',
       'noaa_obs_lon', 'max_sust_wind', 'min_p', 'r_ne_34', 'r_se_34',
       'r_nw_34', 'r_sw_34', 'r_ne_50', 'r_se_50', 'r_nw_50', 'r_sw_50',
       'r_ne_64', 'r_se_64', 'r_nw_64', 'r_sw_64', 'strength',
       'noaa_obs_geometry', 'shortest_distance_to_track', 'disaster_name',
       'flood_risk', 'storm_surge', 'soil_density', 'sand_content',
       'clay_content', 'silt_content', 'elevation', 'slope', 'aspect',
       'dis2coast'],
      dtype='object')

In [40]:
# replace necessary columns with mean TODO: ask Ruari about this
cols_for_mean = ['soil_density','sand_content','clay_content','silt_content']
df_model_ready = replace_cols_with_mean(out,cols_for_mean)
df_model_ready

Unnamed: 0_level_0,xbd_obs_geometry,damage_class,disaster_name,capture_date,xbd_obs_lon,xbd_obs_lat,event_start,event_end,stations_lat_lons,noaa_index,...,flood_risk,storm_surge,soil_density,sand_content,clay_content,silt_content,elevation,slope,aspect,dis2coast
xbd_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,POINT (-77.9244320492178 34.78850199541164),2,FLORENCE,2018-09-20 16:04:41+00:00,-77.924432,34.788502,2018-09-17 16:32:47,2018-09-21 16:04:41,,51154,...,0.0,0,132,452,218,330,9,0.53033,135.0,44915.193168
1,POINT (-77.92458589472702 34.78817409635829),1,FLORENCE,2018-09-20 16:04:41+00:00,-77.924586,34.788174,2018-09-17 16:32:47,2018-09-21 16:04:41,,51154,...,0.0,0,132,452,218,330,7,0.559017,116.565048,44926.181407
2,POINT (-85.61007417082075 30.200042174373575),0,MICHAEL,2018-10-13 16:48:15+00:00,-85.610074,30.200042,2018-10-12 16:48:15,2018-10-14 16:48:15,,51448,...,0.0,0,130.97161,396.531,297.422501,306.068774,18,2.915476,30.963757,5134.550778
3,POINT (-85.61056875858309 30.20001239764311),0,MICHAEL,2018-10-13 16:48:15+00:00,-85.610569,30.200012,2018-10-12 16:48:15,2018-10-14 16:48:15,,51448,...,0.0,0,130.97161,396.531,297.422501,306.068774,15,4.257347,310.236359,5130.894941
4,POINT (-85.6105468715275 30.20060087544323),1,MICHAEL,2018-10-13 16:48:15+00:00,-85.610547,30.200601,2018-10-12 16:48:15,2018-10-14 16:48:15,,51448,...,0.0,0,130.97161,396.531,297.422501,306.068774,11,2.236068,296.565063,5066.617333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23049,POINT (-77.91796670865669 34.64623298093464),0,FLORENCE,2018-09-20 16:04:41+00:00,-77.917967,34.646233,2018-09-17 16:32:47,2018-09-21 16:04:41,,51154,...,0.0,0,133,430,235,335,8,0.25,0.0,37723.343619
23050,POINT (-77.91772594373008 34.64627829178596),0,FLORENCE,2018-09-20 16:04:41+00:00,-77.917726,34.646278,2018-09-17 16:32:47,2018-09-21 16:04:41,,51154,...,0.0,0,133,430,235,335,8,3.010399,221.633545,37713.90215
23051,POINT (-77.9176517767119 34.646497499155615),0,FLORENCE,2018-09-20 16:04:41+00:00,-77.917652,34.646497,2018-09-17 16:32:47,2018-09-21 16:04:41,,51154,...,0.0,0,133,430,235,335,8,3.010399,221.633545,37729.07407
23052,POINT (-77.91835147876859 34.64612284454794),0,FLORENCE,2018-09-20 16:04:41+00:00,-77.918351,34.646123,2018-09-17 16:32:47,2018-09-21 16:04:41,,51154,...,0.0,0,133,430,235,335,9,0.75,90.0,37735.137477


In [None]:
var_cols = ['max_sust_wind', 'min_p', 'r_ne_34', 'r_se_34',
       'r_nw_34', 'r_sw_34', 'r_ne_50', 'r_se_50', 'r_nw_50', 'r_sw_50',
       'r_ne_64', 'r_se_64', 'r_nw_64', 'r_sw_64', 'strength', 'shortest_distance_to_track',
       'flood_risk', 'storm_surge', 'soil_density', 'sand_content',
       'clay_content', 'silt_content', 'elevation', 'slope', 'aspect',
       'dis2coast']
    
train_test_model(df_model_ready,var_cols,model_name='LogisticRegression')

# Logistic Regression Model

In [None]:
model = LogisticRegression()
model.fit(x_train, y_train)

In [None]:
predictions = model.predict(x_test)

In [None]:
model.score(x_test, y_test)

In [None]:
importance = model.coef_[0]
display(importance)

In [None]:
confusion_matrix = metrics.confusion_matrix(y_test, predictions)

In [None]:
sns.heatmap(confusion_matrix/np.sum(confusion_matrix), annot=True, fmt=".2%", linewidths=.5, square = True, cmap = 'Blues_r')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
plt.show()

# Random forest hyperparameter tuning

In [None]:
# this section is Work In Progress. 

from hyperopt import hp,fmin,tpe,STATUS_OK,Trials

space = {'criterion': hp.choice('criterion', ['entropy', 'gini']),
        'max_depth': hp.quniform('max_depth', 10, 1200, 10),
        'max_features': hp.choice('max_features', ['auto', 'sqrt','log2', None]),
        'min_samples_leaf': hp.uniform('min_samples_leaf', 0, 0.5),
        'min_samples_split' : hp.uniform ('min_samples_split', 0, 1),
        'n_estimators' : hp.choice('n_estimators', [10, 50, 300, 750, 1200,1300,1500])
    }

def objective(space):
    model = RandomForestClassifier(criterion = space['criterion'], max_depth = space['max_depth'],
                                 max_features = space['max_features'],
                                 min_samples_leaf = space['min_samples_leaf'],
                                 min_samples_split = space['min_samples_split'],
                                 n_estimators = space['n_estimators'], 
                                 )
    
    accuracy = model.score(x_train, y_train)

    # We aim to maximize accuracy, therefore we return it as a negative value
    return {'loss': -accuracy, 'status': STATUS_OK }

In [None]:
from sklearn.model_selection import cross_val_score
trials = Trials()
best = fmin(fn= objective,
            space= space,
            algo= tpe.suggest,
            max_evals = 80,
            trials= trials)
best

In [None]:
best["criterion"] = "entropy"

In [None]:
model = RandomForestClassifier(**best)
model.fit(x_train, y_train)

In [None]:
predictions = model.predict(x_test)
model.score(x_test, y_test)

In [None]:
importance = best.feature_importances_
display(importance)

In [None]:
confusion_matrix = metrics.confusion_matrix(y_test, predictions)

In [None]:
sns.heatmap(confusion_matrix/np.sum(confusion_matrix), annot=True, fmt=".2%", linewidths=.5, square = True, cmap = 'Blues_r')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
plt.show()

# Random Forest Model

In [None]:
model = RandomForestClassifier()
model.fit(x_train, y_train)

In [None]:
predictions = model.predict(x_test)

In [None]:
model.score(x_test, y_test)

In [None]:
importance = model.feature_importances_
display(importance)

In [None]:
confusion_matrix = metrics.confusion_matrix(y_test, predictions)

In [None]:
sns.heatmap(confusion_matrix/np.sum(confusion_matrix), annot=True, fmt=".2%", linewidths=.5, square = True, cmap = 'Blues_r')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
plt.show()

# XGBoost Model

In [None]:
model = XGBClassifier()

In [None]:
model.fit(x_train, y_train)

In [None]:
predictions = model.predict(x_test)

In [None]:
model.score(x_test, y_test)

In [None]:
importance = model.feature_importances_
display(importance)

In [None]:
confusion_matrix = metrics.confusion_matrix(y_test, predictions)

In [None]:
sns.heatmap(confusion_matrix/np.sum(confusion_matrix), annot=True, fmt=".2%", linewidths=.5, square = True, cmap = 'Blues_r')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
plt.show()