<a href="https://colab.research.google.com/github/brendanfoo/predict_refugee_count/blob/main/Investigating_large_events.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# imports etc

In [None]:
# Imports
import pandas as pd
import geopandas as gpd
import random
import altair as alt
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# Modelling
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import randint
from sklearn.preprocessing import LabelEncoder

# Tree Visualisation
from sklearn.tree import export_graphviz
from IPython.display import Image
import graphviz

import warnings
# Suppress FutureWarning messages from this point on so they do not clutter cell output.
warnings.filterwarnings("ignore", category=FutureWarning)

In [None]:
# Mount Google Drive at /content/drive (Colab only); the CSV paths read later in
# this notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the merged flood/displacement table. The first CSV column is discarded
# (presumably an index written out by an earlier to_csv -- confirm).
FULLMERGE_CSV = '/content/drive/MyDrive/210RefugeeMoves/datasets/FULLMERGE.csv'
fullmerge = pd.read_csv(FULLMERGE_CSV).iloc[:, 1:]

In [None]:
# List the columns available in the merged table.
fullmerge.columns

Index(['ISO3', 'IDPs from Event', 'econ', 'gov', 'soc', 'cap', 'eco', 'exp',
       'food', 'hab', 'health', 'infra', 'sens', 'Area', 'Began', 'Ended',
       'MainCause', 'Severity', 'Duration', 'Magnitude', 'Population',
       'Population Density (People per Sq. Km.)',
       'Net international migrants, both sexes', 'Scaled_IDP'],
      dtype='object')

In [None]:
# Load the displacement records; the file is read as ISO-8859-1 text.
GIDD_CSV = '/content/drive/MyDrive/210RefugeeMoves/colab/UNHCRdata.csv'
gidd = pd.read_csv(GIDD_CSV, encoding='ISO-8859-1')

In [None]:
# Sum the raw disaster displacements for every (country, event start date) pair.
displacement_cols = ['ISO3', 'Date of Event (start)', 'Disaster Internal Displacements (Raw)']
base = (
    gidd[displacement_cols]
    .groupby(['ISO3', 'Date of Event (start)'])
    .sum()
    .reset_index()
)

# The year is taken from the first four characters of the start-date string.
base['Year'] = base['Date of Event (start)'].str.slice(0, 4).astype(str).astype(int)

# Shorter column names for the comparisons below; `giddy` is an independent deep copy.
base2 = base.rename(columns={
    'Disaster Internal Displacements (Raw)': 'IDPs from Event',
    'Date of Event (start)': 'Date',
})
giddy = base2.copy(deep=True)

In [None]:
# Load the flood records with ISO codes. The first CSV column is discarded
# (presumably a saved index -- confirm).
FLOODS_CSV = '/content/drive/MyDrive/210RefugeeMoves/datasets/floods_with_ISO.csv'
floods = pd.read_csv(FLOODS_CSV).iloc[:, 1:]

# events above 3mil

In [None]:
# Rows with at least three million displaced people, trimmed to the descriptive columns.
summary_cols = ['ISO3', 'IDPs from Event', 'Area', 'Began', 'Ended', 'MainCause']
fullmerge.loc[fullmerge['IDPs from Event'] >= 3000000, summary_cols]

Unnamed: 0,ISO3,IDPs from Event,Area,Began,Ended,MainCause
463,CHN,3760000,251527,2020-06-15,2020-07-05,Monsoonal Rain
464,CHN,3760000,154387,2020-06-29,2020-07-03,Torrential Rain
465,CHN,3760000,107563,2020-06-27,2020-07-30,Monsoonal Rain and Dam release
1707,PHL,4095280,32477,2013-11-08,2013-11-19,Tropical Storm Haiyan/Yolanda


One triplet of rows shares the same country and year/month (CHN, 2020-06).

In [None]:
# Chinese events with at least three million displaced people.
# Both conditions are combined into a single mask: chaining a second boolean
# filter built from the unfiltered frame forces pandas to reindex the mask and
# emits "Boolean Series key will be reindexed to match DataFrame index".
giddy[(giddy['IDPs from Event'] >= 3000000) & (giddy['ISO3'] == 'CHN')]

  giddy[giddy['IDPs from Event'] >= 3000000][giddy['ISO3'] == 'CHN']


Unnamed: 0,ISO3,Date,IDPs from Event,Year
1338,CHN,2008-12-05,15000000,2008
1339,CHN,2009-01-01,4030507,2009
1340,CHN,2010-01-01,15920060,2010
1341,CHN,2011-01-01,4489545,2011
1342,CHN,2012-01-01,5730800,2012
1658,CHN,2020-06-01,3760000,2020


In [None]:
# Flood records for China that began in June 2020.
# Both conditions are combined into a single mask: chaining a second boolean
# filter built from the unfiltered frame forces pandas to reindex the mask and
# emits "Boolean Series key will be reindexed to match DataFrame index".
floods[(floods['ISO3'] == 'CHN') & (floods['Year Month'] == '2020-06')]

  floods[floods['ISO3'] == 'CHN'][floods['Year Month'] == '2020-06']


Unnamed: 0,Country,Area,Began,Ended,MainCause,Severity,ISO3,Year Month
1677,China,251527,2020-06-15,2020-07-05,Monsoonal Rain,2,CHN,2020-06
1685,China,154387,2020-06-29,2020-07-03,Torrential Rain,2,CHN,2020-06
1686,China,107563,2020-06-27,2020-07-30,Monsoonal Rain and Dam release,2,CHN,2020-06


# events above 2 mil (below 3 mil)

In [None]:
# Events with at least 2 million and fewer than 3 million displaced people.
# The upper bound is exclusive so an event of exactly 3,000,000 is listed only
# in the ">= 3 million" table, and the two conditions form a single mask so
# pandas does not have to reindex a second chained boolean filter.
idps = fullmerge['IDPs from Event']
summary_cols = ['ISO3', 'IDPs from Event', 'Area', 'Began', 'Ended', 'MainCause']
fullmerge.loc[(idps >= 2000000) & (idps < 3000000), summary_cols].sort_values('IDPs from Event')

  fullmerge[fullmerge['IDPs from Event'] >= 2000000][fullmerge['IDPs from Event'] <= 3000000][['ISO3', 'IDPs from Event', 'Area', 'Began', 'Ended', 'MainCause']].sort_values('IDPs from Event')


Unnamed: 0,ISO3,IDPs from Event,Area,Began,Ended,MainCause
1691,PHL,2062402,17181,2009-01-20,2009-01-27,Heavy Rain
1036,IND,2100000,75488,2008-08-05,2008-08-11,Heavy monsoon rains
1037,IND,2100000,80590,2008-08-30,2008-09-08,Monsoon Rains
381,CHN,2166500,551385,2017-06-22,2017-07-03,Monsoonal Rain
1173,IND,2225340,345643,2021-05-25,2021-05-28,Tropical Storm Yaas and Storm Surge
1174,IND,2225340,331180,2021-05-16,2021-05-23,Tropical Storm Tauktae
1038,IND,2400000,464318,2008-09-22,2008-09-29,Dam release and Heavy Rain
1121,IND,2441213,171282,2020-05-19,2020-05-20,Tropical Storm Amphan
1122,IND,2441213,37967,2020-05-23,2020-06-03,Tropical Storm Amphan
1692,PHL,2499241,10774,2011-01-01,2011-01-18,Heavy Rain


4 pairs of rows repeat the same year/month.

# between 1 and 2 mil

In [None]:
# Events with at least 1 million and fewer than 2 million displaced people.
# The upper bound is exclusive so an event of exactly 2,000,000 is listed only
# in the "2 to 3 million" table, and the two conditions form a single mask so
# pandas does not have to reindex a second chained boolean filter.
idps = fullmerge['IDPs from Event']
summary_cols = ['ISO3', 'IDPs from Event', 'Area', 'Began', 'Ended', 'MainCause']
fullmerge.loc[(idps >= 1000000) & (idps < 2000000), summary_cols].sort_values('IDPs from Event')

  fullmerge[fullmerge['IDPs from Event'] >= 1000000][fullmerge['IDPs from Event'] <= 2000000][['ISO3', 'IDPs from Event', 'Area', 'Began', 'Ended', 'MainCause']].sort_values('IDPs from Event')


Unnamed: 0,ISO3,IDPs from Event,Area,Began,Ended,MainCause
1043,IND,1000000,102661,2013-10-12,2013-11-10,Heavy Rain
1042,IND,1000000,246385,2013-10-12,2013-10-14,Tropical Storm Phailin
1040,IND,1042271,152700,2013-06-11,2013-06-18,Monsoonal Rain
1041,IND,1042271,38019,2013-06-23,2013-07-15,Monsoonal Rain
1039,IND,1042271,131743,2013-06-12,2013-06-27,Monsoonal Rain
1046,IND,1073673,189576,2014-07-20,2014-07-29,Monsoonal Rain
150,BGD,1100000,72848,2013-05-14,2013-05-16,Tropical Storm Mahasen
1075,IND,1200000,629962,2015-07-15,2015-08-19,Monsoonal Rain and Tropical Storm K
1765,PHL,1250133,84651,2020-10-24,2020-11-02,Tropical Storm Molave
445,CHN,1298000,470032,2019-06-07,2019-06-10,Monsoonal Rain


5 sets of repeat entries

# manually merging repeats

- group rows with matching values in ISO3, Year-Month, and IDPs
>- for these groups, keep all rows, but reduce each row's IDP value by dividing it by the number of repeats (or in proportion to area)

Can't combine them all into one row because all aspects of magnitude differ greatly, so each row from the flood archive is clearly a different occasion that is presumably part of the larger event recorded by GIDD


This seems to be another artifact of flood data in general not being very well recorded, and certainly not in a standardized way. It's hard to analyze flood data when everyone gathers a different incomplete set of information with different definitions of what constitutes a "flood"

### testing with only rows over a million

In [None]:
# Events with one million or more displaced people, sorted ascending by that count.
overmil_cols = ['ISO3', 'IDPs from Event', 'Area', 'Began', 'Ended', 'MainCause']
overmil = (
    fullmerge
    .loc[fullmerge['IDPs from Event'] >= 1000000, overmil_cols]
    .sort_values('IDPs from Event')
)
overmil.shape  # author's note: there are 10 sets of pseudo-duplicates here

(47, 6)

In [None]:
# Optional display tweak, left disabled:
#   pd.options.display.float_format = '{:.0f}'.format

# Parse the start dates (unparseable values become NaT because of errors='coerce')
# and derive a 'YYYY-MM' key from them.
began = pd.to_datetime(overmil['Began'], errors='coerce')
overmil['Began'] = began
overmil['Year_Month'] = began.dt.strftime('%Y-%m')

# Rows that share displacement count, country and start month form one group;
# each row receives a share of the count proportional to its flooded area.
group_keys = ['IDPs from Event', 'ISO3', 'Year_Month']
group_area = overmil.groupby(group_keys)['Area'].transform('sum')
overmil['Total_Group_Area'] = group_area
overmil['Scaled_IDP'] = overmil['Area'] * overmil['IDPs from Event'] / group_area
overmil

Unnamed: 0,ISO3,IDPs from Event,Area,Began,Ended,MainCause,Year_Month,Total_Group_Area,Scaled_IDP
1042,IND,1000000,246385,2013-10-12,2013-10-14,Tropical Storm Phailin,2013-10,349046,705881
1043,IND,1000000,102661,2013-10-12,2013-11-10,Heavy Rain,2013-10,349046,294119
1040,IND,1042271,152700,2013-06-11,2013-06-18,Monsoonal Rain,2013-06,322463,493561
1039,IND,1042271,131743,2013-06-12,2013-06-27,Monsoonal Rain,2013-06,322463,425823
1041,IND,1042271,38019,2013-06-23,2013-07-15,Monsoonal Rain,2013-06,322463,122887
1046,IND,1073673,189576,2014-07-20,2014-07-29,Monsoonal Rain,2014-07,189576,1073673
150,BGD,1100000,72848,2013-05-14,2013-05-16,Tropical Storm Mahasen,2013-05,72848,1100000
1075,IND,1200000,629962,2015-07-15,2015-08-19,Monsoonal Rain and Tropical Storm K,2015-07,629962,1200000
1765,PHL,1250133,84651,2020-10-24,2020-11-02,Tropical Storm Molave,2020-10,84651,1250133
445,CHN,1298000,470032,2019-06-07,2019-06-10,Monsoonal Rain,2019-06,470032,1298000


### doing it again but with full data

In [None]:

# Split each reported displacement count across the flood records that share it.
#
# Rows with the same displacement count, country and start month are treated as
# one reported event spread over several flood records; each row receives a
# share of the count proportional to its flooded area.

# Select the needed columns first, then copy: this avoids duplicating the whole
# frame and guarantees the assignments below act on an independent object.
fullmerge_scaledy = fullmerge[['ISO3', 'IDPs from Event', 'Area', 'Began']].copy(deep=True)

# errors='coerce' turns unparseable start dates into NaT instead of raising.
fullmerge_scaledy['Began'] = pd.to_datetime(fullmerge_scaledy['Began'], errors='coerce')
fullmerge_scaledy['Year_Month'] = fullmerge_scaledy['Began'].dt.strftime('%Y-%m')

group_keys = ['IDPs from Event', 'ISO3', 'Year_Month']
fullmerge_scaledy['Total_Group_Area'] = fullmerge_scaledy.groupby(group_keys)['Area'].transform('sum')

area_share = (
    fullmerge_scaledy['Area']
    * fullmerge_scaledy['IDPs from Event']
    / fullmerge_scaledy['Total_Group_Area']
)
# A row whose start date could not be parsed (NaN group key) or whose group has
# zero total area yields NaN here. Such a row cannot be matched to siblings, so
# it keeps its unscaled count -- the same result a single-row group gets --
# rather than silently writing NaN into `fullmerge`.
fullmerge_scaledy['Scaled_IDP'] = area_share.fillna(fullmerge_scaledy['IDPs from Event'])

# Round up to whole people and store the result on the main table.
fullmerge['Scaled_IDP'] = np.ceil(fullmerge_scaledy['Scaled_IDP'])
fullmerge[['ISO3', 'IDPs from Event', 'Began', 'Area', 'Scaled_IDP']].sort_values('IDPs from Event')

Unnamed: 0,ISO3,IDPs from Event,Began,Area,Scaled_IDP
2511,USA,1,2019-12-28,138727,1
1144,IND,1,2020-06-20,59427,1
1143,IND,1,2020-06-02,199451,1
650,GRC,1,2016-11-26,36417,1
892,IDN,2,2018-12-05,136440,2
...,...,...,...,...,...
1114,IND,2623349,2019-06-27,67023,2623349
463,CHN,3760000,2020-06-15,251527,1841837
464,CHN,3760000,2020-06-29,154387,1130520
465,CHN,3760000,2020-06-27,107563,787645
