# Imports

In [None]:
# install conda (this restarts the colab kernel -- wait until restart before running subsequent blocks)

!pip install condacolab
import condacolab
condacolab.install_miniconda()

In [None]:
!pip install geopandas
!pip install hyperopt

In [2]:
import numpy as np
import pandas as pd
import geopandas as gpd
import seaborn as sns
import xgboost
from pathlib import Path
from functools import reduce

# from google.colab import drive
from matplotlib import pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from xgboost import XGBClassifier

# Data Setup

In [None]:
drive.mount("/content/drive/")

In [None]:
df = pd.read_pickle("/content/drive/MyDrive/ai4er/python/hurricane/hurricane-harm-herald/data/xBD_data/EFs/df_points_posthurr_flood_risk_storm_surge_soil_properties.pkl")

In [8]:
### TODO: replace when putting on drive
google_drive_personal_key = '/Users/orlandotimmerman/Library/CloudStorage/GoogleDrive-rt582@cam.ac.uk/.shortcut-targets-by-id/132Xl9yWOGKPM7ybLH0oa9c3dJGYrXkjC/'
# noaa six-hourly closest to each xbd point
df_noaa_xbd_pkl_path = google_drive_personal_key + 'datasets/EFs/weather_data/xbd_obs_noaa_six_hourly.pkl'
# xbd observation points
df_xbd_points_path = google_drive_personal_key + 'datasets/xBD_data/xbd_points_posthurr_reformatted.pkl'
# topographic (flood and storm surge risk, soil properties)
df_topographic_efs_path = google_drive_personal_key + 'datasets/processed_data/df_points_posthurr_flood_risk_storm_surge_soil_properties.pkl'
# terrain efs
df_terrain_efs_path = google_drive_personal_key + 'datasets/processed_data/Terrian_EFs.pkl'

In [14]:
def check_files_in_list_exist(
	file_list: list[str] | list[Path]
	):
	"""State which files don't exist and remove from list"""
	files_found = []
	for fl in file_list:
		# attempt conversion to Path object if necessary
		if type(fl) != Path:
			try:
				fl = Path(fl)
			except TypeError:
				print(f'{fl} could not be converted to Path object')
		
		if fl.is_file():
			files_found += fl,
		else:
			print(f'{fl} not found. Removing from list.')

	return files_found


def read_and_merge_pkls(
	pkl_paths: list[str] | list[Path]
) -> pd.DataFrame:
	"""Read in pkl files from list of file paths and merge on index"""
	# check all files exist
	pkl_paths_present = check_files_in_list_exist(pkl_paths)
	df_list = [pd.read_pickle(pkl) for pkl in pkl_paths_present]

	return reduce(lambda df1,df2: pd.merge(df1,df2,left_index=True,right_index=True), df_list)

In [21]:
pkl_paths = [df_noaa_xbd_pkl_path, df_xbd_points_path, df_topographic_efs_path, df_terrain_efs_path]
df_merged = read_and_merge_pkls(pkl_paths)

In [37]:
df_merged.columns

Index(['xbd_obs_geometry', 'damage_class_x', 'disaster_name_x',
       'xbd_capture_date', 'xbd_obs_lon', 'xbd_obs_lat', 'event_start',
       'event_end', 'closest_stations', 'stations_lat_lons', 'noaa_index',
       'tag', 'name', 'num_entries', 'noaa_obs_date', 'record_id',
       'sys_status', 'noaa_obs_lat', 'noaa_obs_lon', 'max_sust_wind', 'min_p',
       'r_ne_34', 'r_se_34', 'r_nw_34', 'r_sw_34', 'r_ne_50', 'r_se_50',
       'r_nw_50', 'r_sw_50', 'r_ne_64', 'r_se_64', 'r_nw_64', 'r_sw_64',
       'r_max_wind', 'strength', 'noaa_obs_geometry',
       'shortest_distance_to_track', 'geometry_x', 'damage_class_y',
       'disaster_name_y', 'capture_date_y', 'lon_x', 'lat_x', 'geometry_y',
       'damage_class', 'disaster_name', 'capture_date', 'lat_y', 'lon_y',
       'flood_risk', 'storm_surge', 'soil_density', 'sand_content',
       'clay_content', 'silt_content', 'latitude', 'longitude', 'geometry',
       'elevation', 'slope', 'aspect', 'dis2coast'],
      dtype='object')

In [39]:
def rename_and_drop_duplicated_cols(
    df: pd.DataFrame
) -> pd.DataFrame:
    """Drop columns which are copies of others and rename the 'asdf_x' headers which would have resulted"""
    # need to ensure no bad types first
    df = drop_cols_containing_lists(df)
    # remove duplicated columns
    dropped_df = df.T.drop_duplicates().T
    # rename columns for clarity (especially those which are shared between dfs). Will be able to remove most with better
    # column naming further up the process
    new_col_names = {col: col.replace('_x', '') for col in dropped_df.columns if col.endswith('_x')}
    
    return dropped_df.rename(columns=new_col_names)


def drop_cols_containing_lists(
    df: pd.DataFrame
) -> pd.DataFrame:
    """It seemed like the best solution at the time: and to be fair, I can't really think of better...
    N.B. for speed, only looks at values in first row – if there is a multi-type column, this would be the least of
    our worries...
    """
    df = df.loc[:, df.iloc[0].apply(lambda x: type(x) != list)]    
    return df


In [43]:
out = rename_and_drop_duplicated_cols(df_merged)

TypeError: unhashable type: 'list'

In [32]:

# remove cols containing lists (closest weather stations)
dropped_df = df_merged.drop(columns=['closest_stations','stations_lat_lons'],axis=1)
# remove duplicated columns
dropped_df = dropped_df.T.drop_duplicates().T
# rename columns for clarity (especially those which are shared between dfs). Will be able to remove most with better
# column naming further up the process
new_col_names = {col: col.replace('_x', '') for col in dropped_df.columns if col.endswith('_x')}
df = dropped_df.rename(columns=new_col_names)


# cols_name_mapping = {
#     'capture_date_x': 'xbd_capture_date'
# }
# df_merged.rename(columns=cols_name_mapping,inplace=True)

In [34]:
df.columns

Index(['xbd_obs_geometry', 'damage_class', 'disaster_name', 'xbd_capture_date',
       'xbd_obs_lon', 'xbd_obs_lat', 'event_start', 'event_end', 'noaa_index',
       'tag', 'num_entries', 'noaa_obs_date', 'record_id', 'sys_status',
       'noaa_obs_lat', 'noaa_obs_lon', 'max_sust_wind', 'min_p', 'r_ne_34',
       'r_se_34', 'r_nw_34', 'r_sw_34', 'r_ne_50', 'r_se_50', 'r_nw_50',
       'r_sw_50', 'r_ne_64', 'r_se_64', 'r_nw_64', 'r_sw_64', 'r_max_wind',
       'strength', 'noaa_obs_geometry', 'shortest_distance_to_track',
       'disaster_name', 'flood_risk', 'storm_surge', 'soil_density',
       'sand_content', 'clay_content', 'silt_content', 'elevation', 'slope',
       'aspect', 'dis2coast'],
      dtype='object')

In [None]:
df["y"] = df["damage_class"]
df["y"] = df["y"].astype(int)

In [None]:
# run this if you want to do binary classification
df.loc[df["y"] > 0, "y"] = 1

In [None]:
def replace_with_mean(df: pd.core.frame.DataFrame, column: str):
    df.loc[df[column] == 0, column] = df[column][df[column] > 0].mean()

replace_with_mean(df, "soil_density")
replace_with_mean(df, "sand_content")
replace_with_mean(df, "clay_content")
replace_with_mean(df, "silt_content")

In [None]:
x_train, x_test, y_train, y_test = train_test_split(df[["storm_surge", "flood_risk", "soil_density", "sand_content", "clay_content", 
                                                        "silt_content"]],
                                                    df["y"], test_size=0.25, random_state=1)

# Add weather data

In [None]:
df_weather = pd.read_pickle("/content/drive/MyDrive/ai4er/python/hurricane/hurricane-harm-herald/data/xBD_data/weather_data/xbd_obs_stations.pkl")

In [None]:
df_weather

# Logistic Regression Model

In [None]:
model = LogisticRegression()
model.fit(x_train, y_train)

In [None]:
predictions = model.predict(x_test)

In [None]:
model.score(x_test, y_test)

In [None]:
importance = model.coef_[0]
display(importance)

In [None]:
confusion_matrix = metrics.confusion_matrix(y_test, predictions)

In [None]:
sns.heatmap(confusion_matrix/np.sum(confusion_matrix), annot=True, fmt=".2%", linewidths=.5, square = True, cmap = 'Blues_r')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
plt.show()

# Random forest hyperparameter tuning

In [None]:
# this section is Work In Progress. 

from hyperopt import hp,fmin,tpe,STATUS_OK,Trials

space = {'criterion': hp.choice('criterion', ['entropy', 'gini']),
        'max_depth': hp.quniform('max_depth', 10, 1200, 10),
        'max_features': hp.choice('max_features', ['auto', 'sqrt','log2', None]),
        'min_samples_leaf': hp.uniform('min_samples_leaf', 0, 0.5),
        'min_samples_split' : hp.uniform ('min_samples_split', 0, 1),
        'n_estimators' : hp.choice('n_estimators', [10, 50, 300, 750, 1200,1300,1500])
    }

def objective(space):
    model = RandomForestClassifier(criterion = space['criterion'], max_depth = space['max_depth'],
                                 max_features = space['max_features'],
                                 min_samples_leaf = space['min_samples_leaf'],
                                 min_samples_split = space['min_samples_split'],
                                 n_estimators = space['n_estimators'], 
                                 )
    
    accuracy = model.score(x_train, y_train)

    # We aim to maximize accuracy, therefore we return it as a negative value
    return {'loss': -accuracy, 'status': STATUS_OK }

In [None]:
from sklearn.model_selection import cross_val_score
trials = Trials()
best = fmin(fn= objective,
            space= space,
            algo= tpe.suggest,
            max_evals = 80,
            trials= trials)
best

In [None]:
best["criterion"] = "entropy"

In [None]:
model = RandomForestClassifier(**best)
model.fit(x_train, y_train)

In [None]:
predictions = model.predict(x_test)
model.score(x_test, y_test)

In [None]:
importance = best.feature_importances_
display(importance)

In [None]:
confusion_matrix = metrics.confusion_matrix(y_test, predictions)

In [None]:
sns.heatmap(confusion_matrix/np.sum(confusion_matrix), annot=True, fmt=".2%", linewidths=.5, square = True, cmap = 'Blues_r')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
plt.show()

# Random Forest Model

In [None]:
model = RandomForestClassifier()
model.fit(x_train, y_train)

In [None]:
predictions = model.predict(x_test)

In [None]:
model.score(x_test, y_test)

In [None]:
importance = model.feature_importances_
display(importance)

In [None]:
confusion_matrix = metrics.confusion_matrix(y_test, predictions)

In [None]:
sns.heatmap(confusion_matrix/np.sum(confusion_matrix), annot=True, fmt=".2%", linewidths=.5, square = True, cmap = 'Blues_r')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
plt.show()

# XGBoost Model

In [None]:
model = XGBClassifier()

In [None]:
model.fit(x_train, y_train)

In [None]:
predictions = model.predict(x_test)

In [None]:
model.score(x_test, y_test)

In [None]:
importance = model.feature_importances_
display(importance)

In [None]:
confusion_matrix = metrics.confusion_matrix(y_test, predictions)

In [None]:
sns.heatmap(confusion_matrix/np.sum(confusion_matrix), annot=True, fmt=".2%", linewidths=.5, square = True, cmap = 'Blues_r')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
plt.show()