## mobility exploration 
### We want to use mobility data not only to be more confident in our phylodynamic results, but also to understand why we find more introductions into North King County (KC) than into South KC, and where they are coming from.
### The first step is to calculate the proportion of introductions from N or S into the other versus the introductions coming from outside.
### Then it would be great to look at where the introductions into each area are coming from, i.e. from which state.
### Lastly, the idea is that the biggest reduction in mobility happened in the tech sector in Seattle, so look at mobility changes in N and S by industry.

In [None]:
import pandas as pd
import numpy as np
import altair as alt
from datetime import datetime as dt
alt.data_transformers.disable_max_rows()

In [None]:
# Relative paths to the tab-separated input tables loaded in the next cell.
# NOTE: `within_mob` is rebound to a DataFrame further down in this notebook.
# Movement table with a `category` column (e.g. "within North", "North to South").
within_mob = '../data-files/movement_within_king_co.tsv'
# Visits per `poi_geo` ("North King" / "South King") broken down by `visitor_state`.
outside_state = '../data-files/north_vs_south_king_co_visitors_by_state (1).tsv'
# Same two tables with an additional `industry` column.
within_industry = '../data-files/movement_within_king_co_by_industry.tsv'
outside_industry = '../data-files/north_vs_south_king_co_visitors_by_state_and_industry.tsv'

In [None]:
# Load all four tab-separated tables the same way; column 1 of each is parsed as a date.
(within_df,
 outside_state_df,
 within_industry_df,
 outside_industry_df) = (
    pd.read_csv(tsv_path, sep='\t', parse_dates=[1])
    for tsv_path in (within_mob, outside_state, within_industry, outside_industry)
)

In [None]:
within_df

In [None]:
outside_state_df

In [None]:
within_industry_df

In [None]:
outside_industry_df

In [None]:
# Rows for movement that stays inside one area (category "within North" or "within South").
withinonly = within_df[within_df["category"].isin(["within North", "within South"])]

In [None]:
# Visit totals over time for movement that stays within North or within South.
# `withinonly` holds both categories, so colour by category: without a colour/detail
# channel the single line mark joins North and South points into one zig-zag series.
alt.Chart(withinonly).mark_line().encode(
    alt.X('start_date:T', axis=alt.Axis(title=None, grid=False)),
    alt.Y('sum_visits:Q'),
    alt.Color('category:N')).properties(
    width=800,
    height=400
)

## proportion of cases from north or south vs from outside

In [None]:
#within_df = within_df[(within_df.category == "North to South") | (within_df.category == "South to North")]

In [None]:
# Restrict both tables to rows whose start_date is after 2020-01-01.
# Both names are rebound, so later cells only see the filtered rows; `withinonly`
# (built above, before this filter) still holds the earlier dates.
within_df = within_df.loc[within_df["start_date"] > "2020-01-01"]
outside_state_df = outside_state_df.loc[outside_state_df["start_date"] > "2020-01-01"]

In [None]:
within_df

In [None]:
# One frame per movement category: cross-area flows first, then within-area movement.
ns_within_df, sn_within_df, n_within_df, s_within_df = (
    within_df[within_df.category == label]
    for label in ("North to South", "South to North", "within North", "within South")
)


In [None]:
# The data are weekly summaries, except around 2021-07-19 / 2021-07-21, where one window
# covers only 3 days before the weekly pattern continues; that leaves one extra entry.
# Drop the 2021-07-21 rows for now. TODO: follow up with Amanda about this.
# reset_index() renumbers the rows from 0 and keeps the old labels in an `index` column.
ns_within_df, sn_within_df, n_within_df, s_within_df = (
    frame[frame.start_date != "2021-07-21"].reset_index()
    for frame in (ns_within_df, sn_within_df, n_within_df, s_within_df)
)

In [None]:
s_within_df

In [None]:
# Visitor-by-state rows for each destination area (`poi_geo`).
n_outside_df, s_outside_df = (
    outside_state_df[outside_state_df.poi_geo == area]
    for area in ("North King", "South King")
)

In [None]:
# Collapse the per-state rows into one visit total per start_date, ordered by date.
n_outside_df_all, s_outside_df_all = (
    frame.groupby("start_date")["sum_visits"].sum().reset_index().sort_values('start_date')
    for frame in (n_outside_df, s_outside_df)
)


In [None]:
n_outside_df_all


In [None]:
s_outside_df_all

In [None]:
within_v_outside = pd.DataFrame()
percent_intro_est = pd.DataFrame()

In [None]:
within_v_outside['date'] = n_outside_df_all.start_date
percent_intro_est['date'] = n_outside_df_all.start_date

In [None]:
# Visit counts are matched to each output row by calendar date, not by row position.
# The within-county frames and the out-of-county totals are built and filtered
# separately, so their integer indexes only line up when every table happens to contain
# exactly the same dates in the same order. A date missing from a table yields NaN.
def _visits_by_date(frame, dates):
    """Return ``frame``'s summed ``sum_visits`` (as float) for each entry of ``dates``."""
    per_date = frame['sum_visits'].astype(float).groupby(frame['start_date']).sum()
    return dates.map(per_date)


# Ratio of cross-area visits to visits from the outside table, per destination area.
_ratio_dates = within_v_outside['date']
within_v_outside['ratio_n'] = (
    _visits_by_date(sn_within_df, _ratio_dates) / _visits_by_date(n_outside_df_all, _ratio_dates)
)
within_v_outside['ratio_s'] = (
    _visits_by_date(ns_within_df, _ratio_dates) / _visits_by_date(s_outside_df_all, _ratio_dates)
)

# Share of all visits to an area that come from elsewhere:
# (cross-area + outside) / (cross-area + outside + within-area).
_pct_dates = percent_intro_est['date']
_north_incoming = _visits_by_date(sn_within_df, _pct_dates) + _visits_by_date(n_outside_df_all, _pct_dates)
_south_incoming = _visits_by_date(ns_within_df, _pct_dates) + _visits_by_date(s_outside_df_all, _pct_dates)
percent_intro_est['north'] = _north_incoming / (_north_incoming + _visits_by_date(n_within_df, _pct_dates))
percent_intro_est['south'] = _south_incoming / (_south_incoming + _visits_by_date(s_within_df, _pct_dates))

In [None]:
within_v_outside

In [None]:
percent_intro_est

In [None]:
percent_intro_est.to_csv("percent_intro_from_mob.csv")

In [None]:
# Line chart of `ratio_n` over time (final size 800x400, as before).
north_intro = (
    alt.Chart(within_v_outside, width=800, height=400)
    .mark_line(interpolate='monotone', opacity=1.0)
    .encode(
        x=alt.X('date:T', axis=alt.Axis(title=None, grid=False)),
        y=alt.Y('ratio_n:Q', axis=alt.Axis(title="ratio within vs out - north", grid=False)),
    )
)

In [None]:
# Line chart of `ratio_s` over time, drawn in orange (final size 800x400, as before).
south_intro = (
    alt.Chart(within_v_outside, width=800, height=400)
    .mark_line(interpolate='monotone', opacity=1.0, color="orange")
    .encode(
        x=alt.X('date:T', axis=alt.Axis(title=None, grid=False)),
        y=alt.Y('ratio_s:Q', axis=alt.Axis(title="ratio within vs out - south", grid=False)),
    )
)

In [None]:
north_intro + south_intro

In [None]:
# North (default colour) and South (orange) share of visits coming from elsewhere,
# layered on one y axis. Both layers use the same axis title: the south title
# previously had a typo ("mobiliity"), so the layered chart showed two titles.
n_percent = alt.Chart(percent_intro_est, width = 750).mark_line(interpolate='monotone', opacity = 1.0).encode(
    alt.X('date:T', axis=alt.Axis(title=None, grid=False)),
    alt.Y('north:Q',axis=alt.Axis(title="percent cases from intro -mobility", grid=False))).properties(
    width=800,
    height=400
)

s_percent = alt.Chart(percent_intro_est, width = 750).mark_line(interpolate='monotone', opacity = 1.0,  color = "orange").encode(
    alt.X('date:T', axis=alt.Axis(title=None, grid=False)),
    alt.Y('south:Q',axis=alt.Axis(title="percent cases from intro -mobility", grid=False))).properties(
    width=800,
    height=400
)

percent_cases_from_intro_mob = n_percent+s_percent
percent_cases_from_intro_mob

In [None]:
percent_cases_from_intro_mob.save('percent_cases_from_intro_mobility.png')

### how do the numbers of intro into n v s compare? 

In [None]:
# Total outside visits per destination area and start_date, ordered by date.
_area_date_totals = outside_state_df.groupby(["poi_geo", "start_date"])["sum_visits"].sum()
outside_df_all = _area_date_totals.reset_index().sort_values('start_date')


In [None]:
outside_df_all

In [None]:
# Outside visits over time, one line per destination area (final size 800x400).
(
    alt.Chart(outside_df_all, width=800, height=400)
    .mark_line(interpolate='monotone', opacity=1.0)
    .encode(
        x=alt.X('start_date:T', axis=alt.Axis(title=None, grid=False)),
        y=alt.Y('sum_visits:Q', axis=alt.Axis(title="visits", grid=False)),
        color=alt.Color('poi_geo:N'),
    )
)

In [None]:
# Same totals, restricted to rows whose start_date is after 2020-12-03.
outside_df_short = outside_df_all.loc[outside_df_all["start_date"] > "2020-12-03"]

In [None]:
# Same chart as for the full period, using the shortened date range.
(
    alt.Chart(outside_df_short, width=800, height=400)
    .mark_line(interpolate='monotone', opacity=1.0)
    .encode(
        x=alt.X('start_date:T', axis=alt.Axis(title=None, grid=False)),
        y=alt.Y('sum_visits:Q', axis=alt.Axis(title="visits", grid=False)),
        color=alt.Color('poi_geo:N'),
    )
)

## Now gonna work on seeing where the introductions are coming from 

In [None]:
# Order rows by area, then date, then visit count (all ascending).
# TODO: change to descending order and then take only the top rows
# (the original note was left unfinished — confirm what was intended).
outside_state_df_test = outside_state_df.sort_values(by=["poi_geo", "start_date", "sum_visits"])


In [None]:
outside_state_df_test

In [None]:
# Per-area visitor rows, excluding visitors whose state is WA.
north_state, south_state = (
    outside_state_df_test[
        (outside_state_df_test.poi_geo == area) & (outside_state_df_test.visitor_state != "WA")
    ]
    for area in ("North King", "South King")
)

In [None]:
# Heatmap for North King: visitor state (rows) by date (columns), coloured by visit count.
north_heat = (
    alt.Chart(north_state)
    .mark_rect(interpolate='monotone')
    .encode(
        x=alt.X('start_date:T'),
        y=alt.Y('visitor_state:N'),
        color=alt.Color('sum_visits:Q'),
    )
)

In [None]:
# Heatmap for South King: visitor state (rows) by date (columns), coloured by visit count.
south_heat = (
    alt.Chart(south_state)
    .mark_rect(interpolate='monotone')
    .encode(
        x=alt.X('start_date:T'),
        y=alt.Y('visitor_state:N'),
        color=alt.Color('sum_visits:Q'),
    )
)

In [None]:
north_heat

In [None]:
south_heat

In [None]:
(north_heat | south_heat).resolve_scale(y = "independent")

# mobility changes in north and south by industry

In [None]:
within_industry_df

In [None]:
test = within_industry_df[within_industry_df.poi_geo == "North King"].sort_values(["start_date","sum_visits"])

In [None]:
test[test.start_date == "2019-10-21"].industry.tolist()

In [None]:
# Cross-area movement (either direction) for the "Full-Service Restaurants" industry only.
i_within_df = within_industry_df[
    within_industry_df.category.isin(["North to South", "South to North"])
    & (within_industry_df.industry == "Full-Service Restaurants")
]

In [None]:
i_within_df

In [None]:
# Split the restaurant rows by direction; reset_index() renumbers rows from 0 and keeps
# the old labels in an `index` column.
ns_within_df_res, sn_within_df_res = (
    i_within_df[i_within_df.category == direction].reset_index()
    for direction in ("North to South", "South to North")
)

In [None]:
# Keep only restaurant rows; this rebinds `outside_industry_df` to the filtered table.
outside_industry_df = outside_industry_df.loc[outside_industry_df["industry"] == "Full-Service Restaurants"]

In [None]:
# Restaurant visitor rows for each destination area.
n_outside_df_res, s_outside_df_res = (
    outside_industry_df[outside_industry_df.poi_geo == area]
    for area in ("North King", "South King")
)

In [None]:
outside_industry_df

In [None]:
# One restaurant visit total per start_date for each area, ordered by date.
n_outside_df_all_res, s_outside_df_all_res = (
    frame.groupby("start_date")["sum_visits"].sum().reset_index().sort_values('start_date')
    for frame in (n_outside_df_res, s_outside_df_res)
)


In [None]:
# Result table for the restaurant ratios, seeded with the dates of the North totals.
within_v_outside_res = pd.DataFrame({'date': n_outside_df_all_res.start_date})


In [None]:
# Match numerator and denominator on calendar date instead of row position: the
# cross-area and outside restaurant tables are filtered independently, so equal
# integer index labels do not guarantee equal dates. Missing dates yield NaN.
def _visits_by_date(frame, dates):
    """Return ``frame``'s summed ``sum_visits`` (as float) for each entry of ``dates``."""
    per_date = frame['sum_visits'].astype(float).groupby(frame['start_date']).sum()
    return dates.map(per_date)


_res_dates = within_v_outside_res['date']
within_v_outside_res['ratio_n'] = (
    _visits_by_date(sn_within_df_res, _res_dates) / _visits_by_date(n_outside_df_all_res, _res_dates)
)
within_v_outside_res['ratio_s'] = (
    _visits_by_date(ns_within_df_res, _res_dates) / _visits_by_date(s_outside_df_all_res, _res_dates)
)

In [None]:
sn_within_df_res

In [None]:
n_outside_df_all_res

In [None]:
within_v_outside_res

In [None]:
# Restaurant-only version of the north ratio chart; rebinds `north_intro`.
north_intro = (
    alt.Chart(within_v_outside_res, width=800, height=400)
    .mark_line(interpolate='monotone', opacity=1.0)
    .encode(
        x=alt.X('date:T', axis=alt.Axis(title=None, grid=False)),
        y=alt.Y('ratio_n:Q', axis=alt.Axis(title="ratio within vs out - north", grid=False)),
    )
)

In [None]:
# Restaurant-only version of the south ratio chart (orange); rebinds `south_intro`.
south_intro = (
    alt.Chart(within_v_outside_res, width=800, height=400)
    .mark_line(interpolate='monotone', opacity=1.0, color="orange")
    .encode(
        x=alt.X('date:T', axis=alt.Axis(title=None, grid=False)),
        y=alt.Y('ratio_s:Q', axis=alt.Axis(title="ratio within vs out - south", grid=False)),
    )
)

In [None]:
north_intro + south_intro

## plotting mobility over time for N and south using average of 2019 as a baseline

In [None]:
# The 2019 baseline computed below needs 2019 rows. `within_df` was cut down to dates
# after 2020-01-01 earlier in the notebook, which would leave the 2019 mean empty (NaN),
# so start from `withinonly`: the same two "within" categories, selected before that
# date filter was applied.
within_mob = withinonly[withinonly.start_date <= "2022-03-08"]


In [None]:
within_mob

In [None]:
# Separate frames per area. Take explicit copies so the column assignments below (and
# in later cells) modify independent frames rather than slices of `within_mob`, which
# triggers pandas' SettingWithCopyWarning.
north_df = within_mob[within_mob.category == "within North"].copy()
south_df = within_mob[within_mob.category == "within South"].copy()
north_df["sum_visits"] = north_df["sum_visits"].astype(float)
south_df["sum_visits"] = south_df["sum_visits"].astype(float)

In [None]:
def convert_format_year(number):
    """Return the four-digit year of a date as a string, e.g. ``"2019"``.

    ``number`` may be a ``'%Y-%m-%d'`` string or an already-parsed date object
    with a ``strftime`` method (``datetime``, ``pandas.Timestamp``). Columns
    loaded with ``parse_dates`` hold the latter, which ``strptime`` rejects.

    Raises ``ValueError`` if a string does not match ``'%Y-%m-%d'``.
    """
    if isinstance(number, str):
        number = dt.strptime(number, '%Y-%m-%d')
    return number.strftime('%Y')

In [None]:
# Add a string `year` column derived from start_date. assign() returns a new frame, so
# nothing is written into a filtered slice of another DataFrame.
north_df = north_df.assign(year=north_df['start_date'].map(convert_format_year))
south_df = south_df.assign(year=south_df['start_date'].map(convert_format_year))

In [None]:
# Baseline: mean of sum_visits over the rows whose year is "2019", per area.
n_2019_mean = north_df.loc[north_df['year'] == "2019", 'sum_visits'].mean()
s_2019_mean = south_df.loc[south_df['year'] == "2019", 'sum_visits'].mean()

In [None]:
# Keep rows from 2020-01-01 onward and express visits as a multiple of the 2019 mean
# (1.0 == baseline). The filtered frames are copied first so the new column is added
# to an independent frame rather than to a slice (avoids SettingWithCopyWarning).
north_df = north_df[north_df.start_date >= "2020-01-01"].copy()
south_df = south_df[south_df.start_date >= "2020-01-01"].copy()
north_df['normalized_visits'] = north_df.sum_visits.divide(n_2019_mean)
south_df['normalized_visits'] = south_df.sum_visits.divide(s_2019_mean)

In [None]:
nmob_total = pd.concat([north_df, south_df])

In [None]:
nmob_total.to_csv("../data-files/total_normalized_mobility.csv")

In [None]:
# Normalised visits for both areas in one chart, coloured by `poi_geo` (final size 800x400).
# NOTE(review): the axis title says "Percent Change", but `normalized_visits` is
# sum_visits divided by the 2019 mean, i.e. a ratio where 1.0 is the baseline.
lineplot_mob_n = (
    alt.Chart(nmob_total, width=800, height=400)
    .mark_line(interpolate='monotone', opacity=1.0)
    .encode(
        x=alt.X('start_date:T', axis=alt.Axis(title=None, grid=False)),
        y=alt.Y('normalized_visits:Q', axis=alt.Axis(title="Percent Change in Mobility Compared to 2019", grid=False)),
        color=alt.Color("poi_geo"),
    )
)
# Dashed horizontal rule at y = 1.0.
line = alt.Chart(pd.DataFrame({'y': [1.0]})).mark_rule(strokeDash=[1, 1]).encode(y='y')

In [None]:
lineplot_mob_n

In [None]:
# Separate North (default colour) and South (orange) layers of normalised visits,
# plus a dashed rule at 1.0; the layers are combined in the next cell.
# NOTE(review): the axis title says "Percent Change", but `normalized_visits` is
# sum_visits divided by the 2019 mean, i.e. a ratio where 1.0 is the baseline.
lineplot_mob_n = (
    alt.Chart(north_df, width=800, height=400)
    .mark_line(interpolate='monotone', opacity=1.0)
    .encode(
        x=alt.X('start_date:T', axis=alt.Axis(title=None, grid=False)),
        y=alt.Y('normalized_visits:Q', axis=alt.Axis(title="Percent Change in Mobility Compared to 2019", grid=False)),
    )
)

lineplot_mob_s = (
    alt.Chart(south_df, width=800, height=400)
    .mark_line(interpolate='monotone', opacity=1.0, color='orange')
    .encode(
        x=alt.X('start_date:T', axis=alt.Axis(title=None, grid=False)),
        y=alt.Y('normalized_visits:Q', axis=alt.Axis(title="Percent Change in Mobility Compared to 2019", grid=False)),
    )
)

line = alt.Chart(pd.DataFrame({'y': [1.0]})).mark_rule(strokeDash=[1, 1]).encode(y='y')

In [None]:
change_mob = lineplot_mob_n + lineplot_mob_s + line
change_mob

In [None]:
change_mob.save('change_mob.png')