## Spatial temporal risk assessment to aid in planning for malaria elimination in Senegal

##### (Submission by Kristin, Arul, and Jacqueline)

### Python code used in our analysis

### 1. Importing shape file and Geo-spatial visualization

In [None]:
%matplotlib inline 
# importing necessary libraries
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt


In [None]:
import fiona
# Load the arrondissement shapefile directly into a GeoDataFrame.
# The previous fiona.open() call on the same path has been dropped: the handle
# it returned was never closed and the name `senegal` was rebound to the
# GeoDataFrame on the very next line, so it only leaked an open file.
senegal = gpd.read_file("senegal_arr_2014_wgs.shp")

In [None]:
# checking for the information on the data
# (info() prints a summary of the GeoDataFrame loaded from the shapefile)
senegal.info()

In [None]:
# first rows of the shapefile table
senegal.head()

In [None]:
# default plot of the shapes, no styling arguments
senegal.plot()

In [None]:
# confirm what kind of object gpd.read_file returned
type(senegal)

In [None]:
# number of rows in the table
len(senegal)

In [None]:
# Geo-spatial mapping of the Senegal shapes using the 'jet' colormap
senegal.plot(cmap = 'jet')

In [None]:
# Geo-spatial mapping of the Senegal shapes, coloured by the 'REG' column
senegal.plot(cmap = 'jet', column = 'REG')

In [None]:
# Same map as above with the figure size set to (10, 10)
senegal.plot(cmap = 'jet', column = 'REG', figsize  = (10,10))

In [None]:
# the geometry column of the GeoDataFrame
senegal.geometry

### 2. Mapping individual trajectories of 1000 individuals to determine travel patterns 

In [None]:
import pandas as pd
import skmob

In [None]:
# read the file that will be the trajectory data frame (tdf) - JANUARY
tdf_01 = skmob.TrajDataFrame.from_file('/Users/thewooz/Documents/DATA_WEBBASED/SET3/SET3_M01.CSV.gz',header=None)

# read the file that will be the trajectory data frame (tdf) - OCTOBER
tdf_10 = skmob.TrajDataFrame.from_file('/Users/thewooz/Documents/DATA_WEBBASED/SET3/SET3_M10.CSV.gz',header=None)

In [None]:
tdf_01.head()

In [None]:
# positional column labels 0, 1, 2 -> descriptive names, applied to both months
newcols = dict(enumerate(('uid', 'datetime', 'arr_id')))
for monthly_tdf in (tdf_01, tdf_10):
    monthly_tdf.rename(columns=newcols, inplace=True)

In [None]:
# merge with the average latitude and longitude for each arrondissement

# read in the lat and long data
coord = pd.read_csv('/Users/thewooz/Documents/DATA_WEBBASED/ContextData/SITE_ARR_LONLAT.CSV', index_col=0)

# get the average lat and long per neighborhood
coord_mean = coord.groupby(['arr_id']).mean()

# merge tdf file with coordinates
tdf_01 = pd.merge(tdf_01, coord_mean, on='arr_id', how ='left')
tdf_10 = pd.merge(tdf_10, coord_mean, on='arr_id', how ='left')

In [None]:
tdf_10.head()

In [None]:
# rename columns
newcols = {
    'lon': 'lng'
}
tdf_01.rename(columns=newcols, inplace=True)
tdf_10.rename(columns=newcols, inplace=True)

In [None]:
tdf_10.head()

In [None]:
# create a trajectory map for 1000 people
map_f_01 = tdf_01.plot_trajectory(max_users=1000, hex_color='#000000',start_end_markers=False,opacity=0.25,weight=1)
map_f_10 = tdf_10.plot_trajectory(max_users=1000, hex_color='#000000',start_end_markers=False,opacity=0.25,weight=1)

In [None]:
map_f_01

In [None]:
map_f_10

### 3. Creating a dataframe needed to use "flow"

In [None]:
# Import packages
import pandas as pd
import numpy as np

In [None]:
# quarterly unduplicated data: one record per person, per day, per arrondissement
# NOTE(review): these q*_coord.pkl files are written by the save cell at the end
# of section 4, so that section must have been run at least once beforehand.
q1, q2, q3, q4 = map(
    pd.read_pickle,
    ('q1_coord.pkl', 'q2_coord.pkl', 'q3_coord.pkl', 'q4_coord.pkl'),
)

In [None]:
# look at the first rows of the q1 data
# (head has to be called: a bare `q1.head` only displays the bound method)
q1.head()

In [None]:
# keep only the two columns needed from here on: user_id and arr_id
q1_reduced = q1[['user_id', 'arr_id']]
q2_reduced = q2[['user_id', 'arr_id']]
q3_reduced = q3[['user_id', 'arr_id']]
q4_reduced = q4[['user_id', 'arr_id']]

In [None]:
# obtain the mode arrondissement for each user in each quarter
# NOTE:  this takes way too long to process - changing to capture just the first arrondissement for each user_id

#q1_modearr = q1_reduced.groupby('user_id').apply(pd.DataFrame.mode).reset_index(drop=True)
#q2_modearr = q2_reduced.groupby('user_id').apply(pd.DataFrame.mode).reset_index(drop=True)
#q3_modearr = q3_reduced.groupby('user_id').apply(pd.DataFrame.mode).reset_index(drop=True)
#q4_modearr = q4_reduced.groupby('user_id').apply(pd.DataFrame.mode).reset_index(drop=True)


# unduplicate - one user_id per QUARTER, keeping only the first arrondisement per person
q1_firstarr = q1_reduced.reset_index().drop_duplicates(subset=['user_id'], keep='first').set_index('index')
q2_firstarr = q2_reduced.reset_index().drop_duplicates(subset=['user_id'], keep='first').set_index('index')
q3_firstarr = q3_reduced.reset_index().drop_duplicates(subset=['user_id'], keep='first').set_index('index')
q4_firstarr = q4_reduced.reset_index().drop_duplicates(subset=['user_id'], keep='first').set_index('index')

In [None]:
q1_firstarr.head()

In [None]:
# find the most populous arrondissements in order to find which ones to include in the analysis
q1_firstarr['arr_id'].value_counts().head(6)

In [None]:
# group each QUARTER by the arr_id
q1_gb = q1_firstarr.groupby('arr_id') 
q2_gb = q2_firstarr.groupby('arr_id') 
q3_gb = q3_firstarr.groupby('arr_id') 
q4_gb = q4_firstarr.groupby('arr_id') 

In [None]:
[q1_gb.get_group(x) for x in q1_gb.groups]

We were unable to work out how to loop through the data files in order to compare all arrondissements and quarters. Therefore, the code below captures the number of people who moved from the most populous arrondissement (4) to the next most populous ones (3, 7, 25, 2, and 6). These counts are then used to manually create a dataframe that can be used in Scikit-Mobility Flow.

In [None]:
# q1->q2:  4->3 
sum(q1_gb.get_group(4)['user_id'].isin(q2_gb.get_group(3)['user_id']))

In [None]:
# q1->q2:  4->7 
sum(q1_gb.get_group(4)['user_id'].isin(q2_gb.get_group(7)['user_id']))

In [None]:
# q1->q2:  4->25
sum(q1_gb.get_group(4)['user_id'].isin(q2_gb.get_group(25)['user_id']))

In [None]:
# q1->q2:  4->2
sum(q1_gb.get_group(4)['user_id'].isin(q2_gb.get_group(2)['user_id']))

In [None]:
# q1->q2:  4->6
sum(q1_gb.get_group(4)['user_id'].isin(q2_gb.get_group(6)['user_id']))

In [None]:
# q2->q3:  4->3
sum(q2_gb.get_group(4)['user_id'].isin(q3_gb.get_group(3)['user_id']))

In [None]:
# q2->q3:  4->7
sum(q2_gb.get_group(4)['user_id'].isin(q3_gb.get_group(7)['user_id']))

In [None]:
# q2->q3:  4->25
sum(q2_gb.get_group(4)['user_id'].isin(q3_gb.get_group(25)['user_id']))

In [None]:
# q2->q3:  4->2
sum(q2_gb.get_group(4)['user_id'].isin(q3_gb.get_group(2)['user_id']))

In [None]:
# q2->q3:  4->6
sum(q2_gb.get_group(4)['user_id'].isin(q3_gb.get_group(6)['user_id']))

In [None]:
# q3->q4:  4->3
sum(q3_gb.get_group(4)['user_id'].isin(q4_gb.get_group(3)['user_id']))

In [None]:
# q3->q4:  4->7
sum(q3_gb.get_group(4)['user_id'].isin(q4_gb.get_group(7)['user_id']))

In [None]:
# q3->q4:  4->25
sum(q3_gb.get_group(4)['user_id'].isin(q4_gb.get_group(25)['user_id']))

In [None]:
# q3->q4:  4->2
sum(q3_gb.get_group(4)['user_id'].isin(q4_gb.get_group(2)['user_id']))

In [None]:
# q3->q4:  4->6
sum(q3_gb.get_group(4)['user_id'].isin(q4_gb.get_group(6)['user_id']))

In [None]:
# totals for each journey, worked out manually from the counts above;
# one row per journey - the columns are later named origin, destination, flow
journeys = [
    (4, 3, 6587),
    (4, 7, 1969),
    (4, 25, 320),
    (4, 2, 4957),
    (4, 6, 1378),
]
flowdf = pd.DataFrame(np.array(journeys))

In [None]:
print(flowdf)

In [None]:
# Define the new names of your columns
newcols = {
    0: 'origin', 
    1: 'destination', 
    2: 'flow'
}
flowdf.rename(columns=newcols, inplace=True)

In [None]:
flowdf['origin'] = flowdf['origin'].astype(str)
flowdf['destination'] = flowdf['destination'].astype(str)

In [None]:
flowdf.info()

In [None]:
print(flowdf)

In [None]:
flowdf.to_pickle('flow_df.pkl')

### 4. Combining months into quarters and merge with longitude and latitude data

In [None]:
# read in the Jan - Dec pickled files: ds3_01_undup.pkl ... ds3_12_undup.pkl
m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12 = (
    pd.read_pickle(f'ds3_{month:02d}_undup.pkl') for month in range(1, 13)
)

In [None]:
# concat the months to form quarters
q1 = pd.concat([m1,m2,m3], ignore_index=True)
q2 = pd.concat([m4,m5,m6], ignore_index=True)
q3 = pd.concat([m7,m8,m9], ignore_index=True)
q4 = pd.concat([m10,m11,m12], ignore_index=True)

In [None]:
q1.shape

In [None]:
q2.shape

In [None]:
q3.shape

In [None]:
q4.shape

In [None]:
# read in the lat and long data
coord = pd.read_csv('/Users/thewooz/Documents/DATA_WEBBASED/ContextData/SITE_ARR_LONLAT.CSV', index_col=0)

In [None]:
coord.head()

In [None]:
# get the average lat and long per neighborhood
coord_mean = coord.groupby(['arr_id']).mean()

In [None]:
coord_mean.head()

In [None]:
coord_mean.shape

In [None]:
q3.shape

In [None]:
# Attach the mean arrondissement coordinates to every quarterly record.
# This step was missing: q1_coord ... q4_coord were displayed and pickled in the
# following cells without ever being assigned, so a fresh kernel stopped here
# with a NameError. The join mirrors the trajectory merge in section 2
# (left join on arr_id, keeping every record of the quarter).
q1_coord = pd.merge(q1, coord_mean, on='arr_id', how='left')
q2_coord = pd.merge(q2, coord_mean, on='arr_id', how='left')
q3_coord = pd.merge(q3, coord_mean, on='arr_id', how='left')
q4_coord = pd.merge(q4, coord_mean, on='arr_id', how='left')

# a left join on the one-row-per-arr_id table should leave the row count
# equal to q3.shape[0] shown in the previous cell
q3_coord.shape

In [None]:
q3_coord.head()

In [None]:
# save the quarter data with coordinates

q1_coord.to_pickle('q1_coord.pkl')
q2_coord.to_pickle('q2_coord.pkl')
q3_coord.to_pickle('q3_coord.pkl')
q4_coord.to_pickle('q4_coord.pkl')

### 5. Creating a Flow pattern using Scikit Mobility

In [None]:
import skmob
import geopandas as gpd
import pandas as pd

In [None]:
# read in the flow dataframe
flowdf = pd.read_pickle('/Users/thewooz/Documents/ClassNotes/WebData/SenegalProject/flow_df.pkl')

In [None]:
# read in the shapes
shapes = pd.read_pickle('/Users/thewooz/Documents/ClassNotes/WebData/SenegalProject/senegal.pkl')

In [None]:
# Define new names of columns
shapes.rename(columns={'ARR_ID':'tile_ID'}, inplace=True)

In [None]:
shapes.head(10)

In [None]:
import geopandas as gpd
from skmob import FlowDataFrame 

In [None]:
# identify the tessellation file: wrap the shapes table in a GeoDataFrame
tessellation = gpd.GeoDataFrame(shapes)

In [None]:
# build the FlowDataFrame from the flow table and the tessellation above
fdf = FlowDataFrame(flowdf,tessellation=tessellation) 

In [None]:
# draw the flows, passing min_flow=50
fdf.plot_flows(min_flow=50)

#### NOTE: A bug was reported to the package developers on November 19, 2019 but the developers have not provided a corrected release.