In [73]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.metrics import mean_squared_error,log_loss
from sklearn.model_selection import KFold
import lightgbm as lgb
from tqdm import tqdm_notebook
import geopandas as gpd

import warnings
# Install a blanket "ignore" filter: no warning of any category is shown for the rest of the session.
warnings.filterwarnings("ignore")

from IPython.core.interactiveshell import InteractiveShell

# Echo the value of every top-level expression in a cell, not just the last one.
# The cells below rely on this to show several .describe() results from a single cell.
InteractiveShell.ast_node_interactivity = "all"

In [74]:
# Folder the input CSVs are read from.
DATA_FOLDER = './input_files/'
# Folder that holds earlier submissions and receives the files written by this notebook.
OUTPUT_DATA_FOLDER = './output_files/'
# Built from OUTPUT_DATA_FOLDER so the two paths cannot drift apart.
# NOTE(review): no cell visible here references this constant — confirm it is still needed.
OUTPUT_FILE_PATH = OUTPUT_DATA_FOLDER + 'blend_fadhloun*0.6_our_4_models_*0.4.csv'

In [75]:
# 'target_2019' column of the previously saved submission '0.07189Sub.csv';
# it is the baseline that the threshold rules further down adjust.
sub_7189 = pd.read_csv(f'{OUTPUT_DATA_FOLDER}0.07189Sub.csv').loc[:, 'target_2019']

In [76]:

# Read the three input tables from DATA_FOLDER, in the same order as before:
# Train.csv -> df, SampleSubmission.csv -> sub_df, wetlands.csv -> wetlands.
df, sub_df, wetlands = (
    pd.read_csv(f'{DATA_FOLDER}{file_name}')
    for file_name in ('Train.csv', 'SampleSubmission.csv', 'wetlands.csv')
)

In [77]:
# Every column of the raw table whose name contains 'precip', kept in file order.
precip_cols = list(filter(lambda name: 'precip' in name, df.columns))
# Show the column count and half of it (the split point used when building train/test).
len(precip_cols), len(precip_cols) // 2

(34, 17)

In [78]:
# Build two frames with an identical schema from the raw table:
#   train -> first half of the precipitation columns
#   test  -> second half of the precipitation columns
# Both get the same week-numbered column names plus the shared static columns.
# NOTE(review): assumes the first half of precip_cols is 2015 and the second half 2019,
# in matching week order — confirm against the Train.csv header.

# Derive the split point from the data instead of hard-coding 17 / 34.
n_weeks = len(precip_cols) // 2
assert len(precip_cols) == 2 * n_weeks, (
    f'expected an even number of precipitation columns, got {len(precip_cols)}'
)
precip_cols_2015 = precip_cols[:n_weeks]
precip_cols_2019 = precip_cols[n_weeks:]

# Year-agnostic names so both frames expose the same feature columns.
week_names = [f'precip_week_{i}' for i in range(1, n_weeks + 1)]
# Columns copied unchanged into both frames (note: both carry target_2015).
static_cols = ['X', 'Y', 'target_2015', 'elevation', 'LC_Type1_mode', 'Square_ID']

train = pd.concat(
    [df[precip_cols_2015].rename(columns=dict(zip(precip_cols_2015, week_names))), df[static_cols]],
    axis=1,
)
test = pd.concat(
    [df[precip_cols_2019].rename(columns=dict(zip(precip_cols_2019, week_names))), df[static_cols]],
    axis=1,
)

In [79]:
# Weekly precipitation feature names shared by train and test.
# Match on the 'precip_week_' prefix rather than the substring 'precip': the derived
# 'total_precip' column also contains 'precip', so a substring match would pick it up
# (and double-count it in the totals) if this cell were re-run after that column exists.
precip_cols = [c for c in train.columns if c.startswith('precip_week_')]
precip_cols

['precip_week_1',
 'precip_week_2',
 'precip_week_3',
 'precip_week_4',
 'precip_week_5',
 'precip_week_6',
 'precip_week_7',
 'precip_week_8',
 'precip_week_9',
 'precip_week_10',
 'precip_week_11',
 'precip_week_12',
 'precip_week_13',
 'precip_week_14',
 'precip_week_15',
 'precip_week_16',
 'precip_week_17']

In [80]:
# Copy the 'wetland_dist' column of wetlands.csv onto train (test does not receive it).
# The assignment aligns on the row index, not on Square_ID.
# NOTE(review): assumes wetlands.csv rows are in the same order as Train.csv — TODO confirm;
# index labels missing from wetlands would silently become NaN here.
train['wetland_dist'] = wetlands['wetland_dist']

In [81]:
# Row-wise sum of all weekly precipitation columns of train, stored as 'total_precip'.
weekly_precip_train = train.loc[:, precip_cols]
train['total_precip'] = weekly_precip_train.sum(axis='columns')
train['total_precip'].describe()

count    16466.000000
mean       248.931844
std         35.757286
min        194.198633
25%        231.024118
50%        242.589218
75%        255.214224
max        393.047856
Name: total_precip, dtype: float64

In [82]:
# Row-wise sum of all weekly precipitation columns of test, stored as 'total_precip'.
weekly_precip_test = test.loc[:, precip_cols]
test['total_precip'] = weekly_precip_test.sum(axis='columns')
test['total_precip'].describe()

count    16466.000000
mean       213.497496
std         27.206368
min        175.362350
25%        199.321592
50%        207.612676
75%        220.583010
max        323.796194
Name: total_precip, dtype: float64

In [83]:
# GeoDataFrame over the raw table with one 2-D point (X, Y) per row.
# NOTE(review): `gdf` is rebound in cell In [85] before any visible cell reads this
# value — this cell looks like a leftover; confirm before removing.
raw_points = gpd.points_from_xy(x=df['X'], y=df['Y'])
gdf = gpd.GeoDataFrame(df, geometry=raw_points)

In [84]:
# Stack train on top of test with a fresh 0..n-1 index.
# Note that this rebinds `df`, so the raw Train.csv table is no longer reachable under that name.
df = pd.concat([train, test], ignore_index=True)
df.shape

(32932, 25)

In [85]:
# GeoDataFrame over test with one 3-D point (X, Y, elevation) per row, plus the
# columns the threshold rules below work on.
points_3d = gpd.points_from_xy(x=test.X, y=test.Y, z=test.elevation)
gdf = gpd.GeoDataFrame(test, geometry=points_3d)
# 2015 flood target taken from train (aligned on the row index).
gdf['target_2015'] = train['target_2015']
# Baseline 2019 predictions that will be adjusted in place.
gdf['best_sub'] = sub_7189
# Total precipitation of the test frame under a shorter name.
gdf['precip'] = test['total_precip']

In [86]:
### Elevation summary, followed by the percentage of rows with elevation below 55 metres.
### Only about 2% of the data lies below 55 metres.
### Why 55? The minimum elevation is about 45 metres, so 55 is roughly 10 metres above the minimum.
train['elevation'].describe()
n_low_lying = (train['elevation'] < 55).sum()
100 * n_low_lying / len(train)

count    16466.000000
mean       592.848206
std        354.790357
min         45.541444
25%        329.063852
50%        623.000000
75%        751.434813
max       2803.303645
Name: elevation, dtype: float64

1.9919834811125956

In [87]:
### Let's filter our data and see how the floods for 2015 behaved and how our model is underpredicting the floods.
### Low-lying areas are obviously more susceptible to floods, and if these areas are near wetlands, we can almost guarantee
### that huge floods will hit that area.
### In the 2015 floods, the areas that were completely flooded (1), had a wetland distance of 0, and were elevated less than
### 10 metres above the minimum elevation.
#### We increase their flooding.
#### This is manual thresholding, and definitely a model could do better, but we had no time to explore the model at the end

# Rows on a wetland (distance 0), fully flooded in 2015 and below 55 m elevation.
sel = (train['wetland_dist'] == 0) & (train['target_2015'] == 1) & (train['elevation'] < 55)
train.loc[sel, 'target_2015'].describe()
gdf.loc[sel, 'best_sub'].describe()

# Both masks are taken from the untouched predictions. The result is the same as
# evaluating them one after the other, because a value raised to exactly 0.9 is not > 0.9.
below_090 = sel & (gdf['best_sub'] < 0.9)
above_090 = sel & (gdf['best_sub'] > 0.9)

# Predictions under 0.9 are lifted to 0.9.
gdf.loc[below_090, 'best_sub'] = 0.9
# Predictions over 0.9 are lifted to at least 0.95. clip(lower=...) only ever raises a
# value, so a prediction already above 0.95 is kept instead of being pulled down to 0.95.
gdf.loc[above_090, 'best_sub'] = gdf.loc[above_090, 'best_sub'].clip(lower=0.95)
gdf.loc[sel, 'best_sub'].describe()

count    71.0
mean      1.0
std       0.0
min       1.0
25%       1.0
50%       1.0
75%       1.0
max       1.0
Name: target_2015, dtype: float64

count    71.000000
mean      0.894322
std       0.032087
min       0.812437
25%       0.877716
50%       0.897927
75%       0.917281
max       0.942641
Name: best_sub, dtype: float64

count    71.000000
mean      0.923239
std       0.025115
min       0.900000
25%       0.900000
50%       0.900000
75%       0.950000
max       0.950000
Name: best_sub, dtype: float64

In [88]:
# Second rule: rows on a wetland, below 55 m, that were almost (but not fully)
# flooded in 2015, i.e. 0.9 < target_2015 < 1.
on_wetland = train['wetland_dist'] == 0
low_lying = train['elevation'] < 55
nearly_flooded = (train['target_2015'] > 0.9) & (train['target_2015'] < 1)
sel = on_wetland & nearly_flooded & low_lying
train.loc[sel, 'target_2015'].describe()
gdf.loc[sel, 'best_sub'].describe()
# Floor the selected predictions at 0.8; values already at or above 0.8 are unchanged.
gdf.loc[sel, 'best_sub'] = gdf.loc[sel, 'best_sub'].clip(lower=0.8)
gdf.loc[sel, 'best_sub'].describe()

count    43.000000
mean      0.957711
std       0.032596
min       0.900114
25%       0.931220
50%       0.965133
75%       0.989255
max       0.998703
Name: target_2015, dtype: float64

count    43.000000
mean      0.805709
std       0.081186
min       0.620614
25%       0.758401
50%       0.820981
75%       0.869966
max       0.942417
Name: best_sub, dtype: float64

count    43.000000
mean      0.836501
std       0.042277
min       0.800000
25%       0.800000
50%       0.820981
75%       0.869966
max       0.942417
Name: best_sub, dtype: float64

In [89]:
# Reload the baseline submission file, then replace its predictions with the adjusted
# ones from gdf (aligned on the row index). The two summaries show before and after.
baseline_path = f'{OUTPUT_DATA_FOLDER}0.07189Sub.csv'
sub_df = pd.read_csv(baseline_path)
sub_df.loc[:, 'target_2019'].describe()
sub_df['target_2019'] = gdf['best_sub']
sub_df.loc[:, 'target_2019'].describe()

count    16466.000000
mean         0.039105
std          0.141375
min          0.000002
25%          0.000720
50%          0.003500
75%          0.013732
max          0.947108
Name: target_2019, dtype: float64

count    16466.000000
mean         0.039310
std          0.142533
min          0.000002
25%          0.000720
50%          0.003500
75%          0.013732
max          0.950000
Name: target_2019, dtype: float64

In [90]:
# Write the adjusted submission to the output folder, without the index column.
rectified_path = f'{OUTPUT_DATA_FOLDER}rectifying_7189_sub_only_wetland_threshold_9.csv'
sub_df.to_csv(rectified_path, index=False)

In [91]:
# Frequency of each predicted value; the manually set thresholds (0.9, 0.95, 0.8)
# show up as the most common values.
sub_df.loc[:, 'target_2019'].value_counts()

0.900000    38
0.950000    33
0.800000    18
0.000629     5
0.000629     5
            ..
0.012647     1
0.004507     1
0.013777     1
0.022383     1
0.000904     1
Name: target_2019, Length: 16283, dtype: int64