In [4]:
import baltic as bt
import pandas as pd
import numpy as np
import matplotlib as mpl
from matplotlib import pyplot as plt
from datetime import datetime as dt
from datetime import timedelta
import time
import pymc3
import math
import arviz as az
import re
#from hpd import hpd
import scipy.stats as stats
from io import StringIO
import altair as alt
from altair import datum
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')


## first calculate introductions

In [5]:
from datetime import date
current_date = str(date.today())

In [6]:
def read_in_migration_rates_mascot(log_file_path):
    
    mig_rates_dict = {"sample":[]}
    
    with open(log_file_path, "r") as infile:
        line_number = 0
        for line in infile:
            line_number += 1
            if not line.startswith("#"):  # log combiner will sometimes put the entire xml at the start of the log file
                # use the first line to find the migration rate columns
                
                if "sample" in line.lower():
                    all_cols = line.split("\t")
                    migration_column_indices = []   # list to store column indices
                    mig_rates_key = {}   # dictionary to store the column index to map to column name
                    counter = 0
                    for i in range(len(all_cols)):
                        
                        col = all_cols[i]
                        if col == "immigrationRate.1":
                            counter = counter + 1
                        if ("immigrationRate" in col) and (counter <2):
                            migration_column_indices.append(i)


                    # make an empty dictionary to store migration rates and generate dictionary to convert index to name
                    for m in migration_column_indices:
                        name = line.split("\t")[m]
                        mig_rates_key[m] = name
                        mig_rates_dict[name] = []
                    
                # read in actual parameter estimates and store in dictionary
                else:
                    sample = line.split("\t")[0]
                    mig_rates_dict["sample"].append(sample)

                    for index in migration_column_indices:
                        name = mig_rates_key[index]
                        mig_rates_dict[name].append(line.split("\t")[index])
                
                
    return(mig_rates_dict)

In [7]:
#log_file_path = "/Users/miguelparedes/Desktop/gitrepos/ncov-king-county/Simulations/Validation-Constant/results/glm_randomkc_clusters_combined_1500.log"
log_file_path = "/Users/miguelparedes/Desktop/glm_multi/rhino/variant_builds/results/glm_randomkc_clusters_combined_3000.log"


In [8]:
migration_rates = read_in_migration_rates_mascot(log_file_path)
mig_df = pd.DataFrame.from_dict(migration_rates)

In [9]:
burnin_percent = 0.1

mig_df = pd.DataFrame.from_dict(migration_rates)
print(len(mig_df))

rows_to_remove = int(len(mig_df)* burnin_percent)
mig_df = mig_df.iloc[rows_to_remove:]

print(len(mig_df))
mig_df = mig_df.reset_index()
mig_df.head()

543
489


Unnamed: 0,index,sample,immigrationRate.1,immigrationRate.2,immigrationRate.3,immigrationRate.4,immigrationRate.5,immigrationRate.6,immigrationRate.7,immigrationRate.8,...,immigrationRate.48,immigrationRate.49,immigrationRate.50,immigrationRate.51,immigrationRate.52,immigrationRate.53,immigrationRate.54,immigrationRate.55,immigrationRate.56,immigrationRate.57
0,54,2700000,0.6167372327623897,3.756328078387172,2.505543871718059,0.644647788956165,3.6919933721983504,1.8405402189329096,3.1826152890466823,2.805886191009435,...,0.3410696696632178,-1.280505801408147,-0.7010436760449685,3.537822285360268,2.4644600026180705,2.428375447704777,3.019437511347051,0.5916057926744429,2.4756080696936973,2.2604922828095284
1,55,2750000,1.171553546835594,3.756328078387172,2.505543871718059,0.644647788956165,3.6919933721983504,1.8405402189329096,3.1826152890466823,2.805886191009435,...,0.3410696696632178,-0.7477825422777871,-0.7010436760449685,3.537822285360268,2.4644600026180705,2.428375447704777,3.019437511347051,0.5916057926744429,2.4756080696936973,2.2604922828095284
2,56,2800000,1.171553546835594,3.756328078387172,2.505543871718059,0.644647788956165,3.6919933721983504,1.8405402189329096,3.1826152890466823,2.805886191009435,...,0.3410696696632178,-0.7477825422777871,0.8968793324137998,3.537822285360268,2.4644600026180705,2.428375447704777,2.95462998570818,0.8831126590628702,2.4756080696936973,2.2604922828095284
3,57,2850000,1.171553546835594,3.756328078387172,2.505543871718059,0.644647788956165,3.6919933721983504,1.8405402189329096,3.1826152890466823,2.805886191009435,...,0.3410696696632178,-0.7477825422777871,0.2589176183882153,3.537822285360268,2.4644600026180705,2.428375447704777,2.95462998570818,0.9280311746896684,2.4756080696936973,2.2604922828095284
4,58,2900000,1.171553546835594,3.756328078387172,2.505543871718059,0.7456704491196315,3.6919933721983504,1.7582833602183774,3.1826152890466823,2.805886191009435,...,0.3410696696632178,-0.7477825422777871,0.2589176183882153,3.537822285360268,2.4644600026180705,2.428375447704777,2.95462998570818,0.9280311746896684,2.4756080696936973,2.803941472156363


In [10]:
# make a new dataframe that summarizes the 95% HPD estimate with mean for each deme and interval 
def generate_summary_mr_df(input_df):
    
    
    new_df = pd.DataFrame()

    for i in input_df.columns.tolist():
        if "immigrationRate" in i:
            #deme = i.split("_")[1]
            interval = i.split(".")[1]
            next_interval = int(interval)+1
            local_series = input_df[i].astype('float').to_numpy()
            mean_log = local_series.mean()
            mean_linear = np.exp(mean_log)
            hpd_95 = az.hdi(local_series, 0.95)
            lower_hpd_log_95 = hpd_95[0]
            lower_hpd_linear_95 = math.exp(lower_hpd_log_95)
            upper_hpd_log_95 = hpd_95[1]
            upper_hpd_linear_95 = math.exp(upper_hpd_log_95)
            hpd_50 = az.hdi(local_series, 0.50)
            lower_hpd_log_50 = hpd_50[0]
            lower_hpd_linear_50 = math.exp(lower_hpd_log_50)
            upper_hpd_log_50 = hpd_50[1]
            upper_hpd_linear_50 = math.exp(upper_hpd_log_50)
            

            
            
            try:
                local_df = pd.DataFrame.from_dict({"interval":interval, "mean_mr_log":mean_log,"mean_mr_linear":mean_linear, 
                                                   "upper_hpd_log_95":upper_hpd_log_95,"lower_hpd_log_95":[lower_hpd_log_95], 
                                                   "upper_hpd_log_50":upper_hpd_log_50,"lower_hpd_log_50":lower_hpd_log_50,
                                                   "upper_hpd_linear":upper_hpd_linear_95,"lower_hpd_linear":lower_hpd_linear_95,
                                                   "upper_hpd_linear_50":upper_hpd_linear_50,"lower_hpd_linear_50":lower_hpd_linear_50})
                new_df = new_df.append(local_df)
            except:
                pass
            
    return(new_df)

In [11]:
mr_summary = generate_summary_mr_df(mig_df)
mr_summary['days'] = mr_summary.interval.astype(int) *14
mr_summary['date'] = dt.strptime("2022-03-06",  "%Y-%m-%d") - mr_summary.days.map(timedelta)
mr_summary.date = mr_summary.date.astype(str)

In [12]:
mr_summary

Unnamed: 0,interval,mean_mr_log,mean_mr_linear,upper_hpd_log_95,lower_hpd_log_95,upper_hpd_log_50,lower_hpd_log_50,upper_hpd_linear,lower_hpd_linear,upper_hpd_linear_50,lower_hpd_linear_50,days,date
0,1,3.078469,21.725111,3.846287,1.902737,3.447645,2.891701,46.818912,6.704219,31.426286,18.023946,14,2022-02-20
0,2,2.757764,15.764559,3.500023,2.166131,2.976408,2.5946,33.116221,8.724464,19.617223,13.391231,28,2022-02-06
0,3,2.194821,8.97839,2.764022,1.824187,2.276323,2.021998,15.863511,6.197754,9.740799,7.5534,42,2022-01-23
0,4,2.164396,8.709343,2.927266,0.74567,2.591441,2.142796,18.676503,2.107854,13.348995,8.523239,56,2022-01-09
0,5,3.268858,26.281307,3.710409,2.816338,3.421156,3.047611,40.870508,16.715533,30.604764,21.064969,70,2021-12-26
0,6,2.807697,16.571715,3.251196,1.758283,3.172082,2.815677,25.821204,5.802468,23.857096,16.704489,84,2021-12-12
0,7,3.489267,32.761909,3.686845,3.134114,3.590095,3.418579,39.918693,22.968272,36.237527,30.526021,98,2021-11-28
0,8,2.852901,17.33801,3.360479,1.938599,3.071877,2.845461,28.802985,6.94901,21.582366,17.209496,112,2021-11-14
0,9,2.64759,14.119964,3.293637,2.160559,2.881679,2.476807,26.940664,8.675988,17.844216,11.903199,126,2021-10-31
0,10,2.535407,12.621568,3.387177,1.888011,2.464818,1.99501,29.582329,6.606219,11.761342,7.352278,140,2021-10-17


In [13]:
line = alt.Chart(mr_summary).mark_area(interpolate='monotone').encode(
    alt.X('date:T', axis=alt.Axis(title="Date", grid=False)),
    alt.Y('lower_hpd_linear_50',axis=alt.Axis(title="introductions", grid=False)),
    alt.Y2('upper_hpd_linear_50' )
).properties(
    width=850,
    height=300
)

band = alt.Chart(mr_summary).mark_area(
    opacity=0.3, interpolate='monotone'
).encode(
    alt.X('date:T', axis=alt.Axis(title="Date", grid=False)),
    alt.Y('lower_hpd_linear'),
    alt.Y2('upper_hpd_linear')
).properties(
    width=850,
    height=300
)

band + line

## now working on Ne diff

In [14]:
def read_in_Ne_changes_mascot(log_file_path):
    
    Ne_skyline_dict = {"sample":[]}
    
    with open(log_file_path, "r") as infile:
        line_number = 0
        for line in infile:
            line_number += 1
            if not line.startswith("#"):  # log combiner will sometimes put the entire xml at the start of the log file
                # use the first line to find the migration rate columns
            
            # use the first line to find the migration rate columns
                if "posterior" in line:
                    all_cols = line.split("\t")
                    Ne_column_indices = []   # list to store column indices
                    Nes_key = {}   # dictionary to store the column index to map to column name

                    for i in range(len(all_cols)):
                        col = all_cols[i]
                        if "Ne." in col:
                            Ne_column_indices.append(i)

                    # make an empty dictionary to store Nes and generate dictionary to convert index to name
                    for n in Ne_column_indices:
                        name = line.split("\t")[n]
                        deme = name.split(".")[1]# the syntax here is "NeLog.state01" where 0 is deme and 1 is interval 1
                        interval = name.split(".")[2]
                       
                        Nes_key[n] = name
                        Ne_skyline_dict[name] = []


                # read in actual parameter estimates and store in dictionary
                else:
                    sample = line.split("\t")[0]
                    Ne_skyline_dict["sample"].append(sample)

                    for index in Ne_column_indices:
                        name = Nes_key[index]
                        Ne_skyline_dict[name].append(line.split("\t")[index])
                    
                
    return(Ne_skyline_dict)

In [15]:
# make a new dataframe that summarizes the 95% HPD estimate with mean for each deme and interval 
def generate_summary_df(input_df):
    
    
    new_df = pd.DataFrame()

    for i in input_df.columns.tolist():
        if "Ne" in i:
            deme = i.split(".")[1]
            #print(deme)
            interval = i.split(".")[2]
            #print(interval)
            #print(i)
            next_interval = int(interval)+1
            local_series = input_df[i].astype('float').to_numpy()
            #print(local_series)
            mean_log = local_series.mean()
            mean_linear = 10**mean_log
            hpd_95 = az.hdi(local_series, 0.95)
            lower_hpd_log_95 = hpd_95[0]
            lower_hpd_linear_95 = 10**lower_hpd_log_95
            upper_hpd_log_95 = hpd_95[1]
            upper_hpd_linear_95 = 10**upper_hpd_log_95
            hpd_50 = az.hdi(local_series, 0.50)
            lower_hpd_log_50 = hpd_50[0]
            lower_hpd_linear_50 = 10**lower_hpd_log_50
            upper_hpd_log_50 = hpd_50[1]
            upper_hpd_linear_50 = 10**upper_hpd_log_50
            
            try:
                next_local_series = input_df["Ne"+"."+ str(deme) +"." + str(next_interval)].astype('float').to_numpy()
                diff_series = np.subtract(local_series, next_local_series)
                #print(local_series)
                #print(next_local_series)
                #print(diff_series)
                diff_mean_log = diff_series.mean()
                diff_hpd_95 = az.hdi(diff_series, 0.95)
                diff_lower_hpd_log_95 = diff_hpd_95[0]
                diff_lower_hpd_linear_95 = math.exp(diff_lower_hpd_log_95)
                diff_upper_hpd_log_95 = diff_hpd_95[1]
                diff_upper_hpd_linear_95 = math.exp(diff_upper_hpd_log_95)
                diff_hpd_50 = az.hdi(diff_series, 0.50)
                diff_lower_hpd_log_50 = diff_hpd_50[0]
                diff_lower_hpd_linear_50 = math.exp(diff_lower_hpd_log_50)
                diff_upper_hpd_log_50 = diff_hpd_50[1]
                diff_upper_hpd_linear_50 = math.exp(diff_upper_hpd_log_50)
            except KeyError:
                pass   
            
            try:
                local_df = pd.DataFrame.from_dict({"deme":deme, "interval":interval, "mean_Ne_log":mean_log,"mean_Ne_linear":mean_linear, 
                                                   "upper_hpd_log_95":upper_hpd_log_95,"lower_hpd_log_95":[lower_hpd_log_95], 
                                                   "upper_hpd_log_50":upper_hpd_log_50,"lower_hpd_log_50":lower_hpd_log_50,
                                                   "upper_hpd_linear":upper_hpd_linear_95,"lower_hpd_linear":lower_hpd_linear_95,
                                                   "diff_mean_Ne_log":diff_mean_log, 
                                                   "diff_upper_hpd_log_95":diff_upper_hpd_log_95,"diff_lower_hpd_log_95":diff_lower_hpd_log_95, 
                                                   "diff_upper_hpd_log_50":diff_upper_hpd_log_50,"diff_lower_hpd_log_50":diff_lower_hpd_log_50,
                                                   "diff_upper_hpd_linear":diff_upper_hpd_linear_95,"diff_lower_hpd_linear":diff_lower_hpd_linear_95,
                                                   "diff_upper_hpd_linear_50":diff_upper_hpd_linear_50,"diff_lower_hpd_linear_50":diff_lower_hpd_linear_50})
                new_df = new_df.append(local_df)
                #print(new_df)
            except:
                pass
            
    return(new_df)

In [16]:
Ne_skyline = read_in_Ne_changes_mascot(log_file_path)

In [17]:
Ne_df = pd.DataFrame.from_dict(Ne_skyline)
print(len(Ne_df))
burnin_percent = 0.1

rows_to_remove = int(len(Ne_df)* burnin_percent)
Ne_df = Ne_df.iloc[rows_to_remove:]

print(len(Ne_df))
Ne_df = Ne_df.reset_index()
Ne_df

543
489


Unnamed: 0,index,sample,Ne.0.0,Ne.0.1,Ne.0.2,Ne.0.3,Ne.0.4,Ne.0.5,Ne.0.6,Ne.0.7,...,Ne.1.789,Ne.1.790,Ne.1.791,Ne.1.792,Ne.1.793,Ne.1.794,Ne.1.795,Ne.1.796,Ne.1.797,Ne.1.798
0,54,2700000,2.3303983693571313,2.5011797357422334,2.6071090660191056,2.7049754437547997,3.0774907917426657,3.1664337310498585,3.3629945119345765,3.803757263184544,...,0.19056408543997258,0.19055592779300493,0.19054773183590137,0.19053949757363542,0.19053122501120337,0.1905229141536248,0.19051456500594247,0.19050617757322197,0.19049775186055212,0.19048928787304492
1,55,2750000,1.950589413799392,2.1112169932056153,2.212148003329347,2.306565788187288,2.6651412667525762,2.7531529745186845,2.9460963284049453,3.3824516020772397,...,0.12566620179970595,0.12566107944379296,0.12565593302159375,0.12565076253608237,0.12564556799024662,0.12564034938708826,0.12563510672962275,0.12562984002087954,0.12562454926390146,0.12561923446174555
2,56,2800000,2.2309273413810713,2.405546090269881,2.5133124667376574,2.611971423079921,2.9960700802887574,3.0858586570897324,3.288719393593682,3.7478280758562748,...,0.19647439707962866,0.19646582957361744,0.19645722183956876,0.19644857388280315,0.19643988570866577,0.19643115732252625,0.1964223887297788,0.1964135799358429,0.19640473094616218,0.1963958417662051
3,57,2850000,2.3389410818709107,2.5211021428295366,2.632710976438961,2.733908049368838,3.133209802279933,3.2245420928236572,3.4341155469431963,3.9094191818204855,...,0.23189315755369583,0.23188315374798474,0.23187310296550637,0.23186300521240658,0.23185286049486017,0.23184266881907004,0.2318324301912678,0.2318221446177137,0.23181181210469629,0.23180143265853292
4,58,2900000,2.2811059473017483,2.457421417631718,2.5642942794887054,2.659834738486044,3.044497279313731,3.1296531343253737,3.329695506487324,3.7848708601517953,...,0.25205531409986603,0.2520434095085997,0.2520314490637275,0.25201943277325456,0.25200736064522267,0.25199523268771123,0.2519830489088363,0.2519708093167513,0.2519585139196466,0.25194616272574966
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
484,538,26900000,0.9739790198144312,1.1096676267916237,1.1937567528809765,1.2687284708931033,1.6052819765599804,1.6789467415179644,1.8674046943953642,2.3315297575521137,...,0.08227342269797006,0.08226876690218439,0.08226408930636003,0.08225938991424549,0.0822546687296067,0.08224992575622662,0.08224516099790592,0.08224037445846222,0.08223556614173075,0.08223073605156381
485,539,26950000,0.9698626962889908,1.1015734886705384,1.1826469719335815,1.2543518890408636,1.577972839215589,1.6475494994126356,1.8271625069960862,2.2684210106093072,...,0.09598705350575423,0.09598158420835662,0.09597608930397514,0.09597056879704356,0.09596502269201618,0.0959594509933676,0.09595385370559299,0.09594823083320803,0.09594258238074879,0.09593690835277195
486,540,27000000,1.1909239483355327,1.5653564444485362,1.8902053437603608,2.203491815090235,2.64196190102173,3.0594123432600058,3.4602336119814714,3.6405287294959567,...,0.10806425995814657,0.10806108322640756,0.10805789153311823,0.10805468487960977,0.10805146326721969,0.10804822669729164,0.10804497517117553,0.10804170869022738,0.10803842725580957,0.10803513086929056
487,541,27050000,1.204445948318848,1.5931552798149407,1.937316031421295,2.258415055197365,2.7540106133780102,3.1891708683004785,3.574598513579466,3.717871174731838,...,0.12599786852512432,0.12599420029537647,0.12599051478820866,0.1259868120051433,0.12598309194770999,0.12597935461744528,0.1259756000158929,0.12597182814460375,0.12596803900513587,0.12596423259905412


In [18]:
ne_summary = generate_summary_df(Ne_df)


In [19]:
test = ne_summary

In [20]:
test['days'] = test.interval.astype(int)
test['date'] = dt.strptime("2021-03-06",  "%Y-%m-%d") - test.days.map(timedelta)
test.date = test.date.astype(str)

In [21]:
test

Unnamed: 0,deme,interval,mean_Ne_log,mean_Ne_linear,upper_hpd_log_95,lower_hpd_log_95,upper_hpd_log_50,lower_hpd_log_50,upper_hpd_linear,lower_hpd_linear,...,diff_upper_hpd_log_95,diff_lower_hpd_log_95,diff_upper_hpd_log_50,diff_lower_hpd_log_50,diff_upper_hpd_linear,diff_lower_hpd_linear,diff_upper_hpd_linear_50,diff_lower_hpd_linear_50,days,date
0,0,0,1.591578,39.046141,2.562461,0.766635,1.606547,0.945526,365.141338,5.842990,...,-0.095356,-4.189948e-01,-0.121490,-0.164048,0.909049,0.657708,0.885600,0.848701,0,2021-03-06
0,0,1,1.764740,58.175507,2.747413,0.895667,1.835763,1.101573,559.001112,7.864434,...,-0.055682,-3.576150e-01,-0.079153,-0.112751,0.945840,0.699342,0.923899,0.893373,1,2021-03-05
0,0,2,1.886124,76.934939,2.961958,0.964134,1.925500,1.125053,916.131258,9.207340,...,-0.049035,-3.380455e-01,-0.066548,-0.105509,0.952147,0.713163,0.935618,0.899866,2,2021-03-04
0,0,3,2.008002,101.859713,3.227163,1.035531,2.151169,1.288563,1687.184288,10.852523,...,0.385840,-5.016194e-01,-0.277629,-0.411816,1.470850,0.605549,0.757577,0.662446,3,2021-03-03
0,0,4,2.262211,182.899056,3.492500,1.120361,2.288209,1.446555,3108.137118,13.193542,...,-0.031042,-4.202926e-01,-0.062819,-0.104090,0.969435,0.656855,0.939114,0.901144,4,2021-03-02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,1,794,0.164512,1.460536,0.414071,0.026019,0.130624,0.029652,2.594601,1.061743,...,0.000017,6.037845e-07,0.000006,0.000001,1.000017,1.000001,1.000006,1.000001,794,2019-01-02
0,1,795,0.164506,1.460514,0.414048,0.026019,0.130619,0.029650,2.594469,1.061741,...,0.000017,6.065656e-07,0.000006,0.000001,1.000017,1.000001,1.000006,1.000001,795,2019-01-01
0,1,796,0.164499,1.460492,0.414026,0.026018,0.130613,0.029649,2.594336,1.061740,...,0.000017,6.093466e-07,0.000006,0.000001,1.000017,1.000001,1.000006,1.000001,796,2018-12-31
0,1,797,0.164493,1.460470,0.414004,0.026017,0.130608,0.029648,2.594202,1.061738,...,0.000017,6.121274e-07,0.000006,0.000001,1.000017,1.000001,1.000006,1.000001,797,2018-12-30


## below is the dNe/dt by week
### need to add in uninfectious rate


In [22]:
line = alt.Chart(test).mark_area().encode(
    alt.X('date:T', axis=alt.Axis(title="Date", grid=False)),
    alt.Y('diff_lower_hpd_log_50',axis=alt.Axis(title="dNe/dt")),
    alt.Y2('diff_upper_hpd_log_50'),
    color=alt.Color('deme:N')
).properties(
    width=850,
    height=300
)

band = alt.Chart(test).mark_area(
    opacity=0.3
).encode(
    alt.X('date:T', axis=alt.Axis(title="Date", grid=False)),
    alt.Y('diff_lower_hpd_log_95'),
    alt.Y2('diff_upper_hpd_log_95'),
    color=alt.Color('deme:N')
).properties(
    width=850,
    height=300
)

band + line

## calculating transmission rate

In [23]:
def generate_summary_diff_df(input_df):
    
    
    new_df = pd.DataFrame()
   
    for i in input_df.columns.tolist():
        if "Ne" in i:
            deme = i.split(".")[1]
            interval = i.split(".")[2]
            next_interval = int(interval)+1
            local_series = input_df[i].astype('float').to_numpy()
           
            try:
                new_df["Ne"+"."+ str(deme) +".diff." + str(interval)] = (365/7)*(np.log(input_df[i].astype("float")) - np.log(input_df["Ne"+"."+ str(deme) +"." + str(next_interval)].astype('float')))
            
            
            except KeyError:
                pass 
            
            
    return(new_df)

In [24]:
ne_diff_summary = generate_summary_diff_df(Ne_df)

In [25]:
ne_diff_summary

Unnamed: 0,Ne.0.diff.0,Ne.0.diff.1,Ne.0.diff.2,Ne.0.diff.3,Ne.0.diff.4,Ne.0.diff.5,Ne.0.diff.6,Ne.0.diff.7,Ne.0.diff.8,Ne.0.diff.9,...,Ne.1.diff.788,Ne.1.diff.789,Ne.1.diff.790,Ne.1.diff.791,Ne.1.diff.792,Ne.1.diff.793,Ne.1.diff.794,Ne.1.diff.795,Ne.1.diff.796,Ne.1.diff.797
0,-3.687714,-2.162857,-1.921509,-6.727563,-1.485621,-3.140348,-6.421786,1.548322,-2.403928,-0.974209,...,0.002222,0.002232,0.002243,0.002253,0.002264,0.002274,0.002285,0.002296,0.002306,0.002317
1,-4.126219,-2.435042,-2.179350,-7.534503,-1.694107,-3.531856,-7.201959,1.710669,-2.724737,-1.127227,...,0.002115,0.002125,0.002136,0.002146,0.002156,0.002166,0.002176,0.002186,0.002196,0.002206
2,-3.929465,-2.285142,-2.007692,-7.153801,-1.539700,-3.319847,-6.813937,1.675765,-2.511420,-0.986454,...,0.002263,0.002274,0.002285,0.002295,0.002306,0.002317,0.002328,0.002338,0.002349,0.002360
3,-3.910603,-2.258721,-1.966724,-7.108420,-1.498220,-3.283355,-6.759248,1.688555,-2.460220,-0.941207,...,0.002239,0.002249,0.002260,0.002271,0.002281,0.002292,0.002303,0.002313,0.002324,0.002335
4,-3.882151,-2.219758,-1.907422,-7.043029,-1.438433,-3.230703,-6.681112,1.707640,-2.387215,-0.876686,...,0.002451,0.002463,0.002474,0.002486,0.002498,0.002509,0.002521,0.002533,0.002544,0.002556
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
484,-6.800787,-3.808761,-3.176018,-12.268392,-2.339506,-5.547111,-11.574411,3.093918,-3.969485,-1.319259,...,0.002937,0.002951,0.002965,0.002979,0.002993,0.003007,0.003021,0.003035,0.003049,0.003063
485,-6.639891,-3.702952,-3.069332,-11.967933,-2.249861,-5.395492,-11.279531,3.042302,-3.836621,-1.247467,...,0.002957,0.002971,0.002985,0.002999,0.003013,0.003027,0.003042,0.003056,0.003070,0.003084
486,-14.255029,-9.832678,-7.996515,-9.462807,-7.649413,-6.419477,-2.648483,-2.623549,-2.450543,-0.089051,...,0.001526,0.001533,0.001540,0.001547,0.001555,0.001562,0.001569,0.001576,0.001584,0.001591
487,-14.584192,-10.198466,-7.996615,-10.344882,-7.649499,-5.949078,-2.049129,-2.277217,-2.290192,0.209988,...,0.001511,0.001518,0.001525,0.001532,0.001540,0.001547,0.001554,0.001561,0.001568,0.001576


In [26]:
uninfectious_rate = 52.1429

#taken from https://www.medrxiv.org/content/10.1101/2020.09.12.20193284v1.full.pdf 
#the percentages are really dependant on this rate... 

In [27]:
ne_diff_summary += uninfectious_rate

In [28]:
ne_diff_summary

Unnamed: 0,Ne.0.diff.0,Ne.0.diff.1,Ne.0.diff.2,Ne.0.diff.3,Ne.0.diff.4,Ne.0.diff.5,Ne.0.diff.6,Ne.0.diff.7,Ne.0.diff.8,Ne.0.diff.9,...,Ne.1.diff.788,Ne.1.diff.789,Ne.1.diff.790,Ne.1.diff.791,Ne.1.diff.792,Ne.1.diff.793,Ne.1.diff.794,Ne.1.diff.795,Ne.1.diff.796,Ne.1.diff.797
0,48.455186,49.980043,50.221391,45.415337,50.657279,49.002552,45.721114,53.691222,49.738972,51.168691,...,52.145122,52.145132,52.145143,52.145153,52.145164,52.145174,52.145185,52.145196,52.145206,52.145217
1,48.016681,49.707858,49.963550,44.608397,50.448793,48.611044,44.940941,53.853569,49.418163,51.015673,...,52.145015,52.145025,52.145036,52.145046,52.145056,52.145066,52.145076,52.145086,52.145096,52.145106
2,48.213435,49.857758,50.135208,44.989099,50.603200,48.823053,45.328963,53.818665,49.631480,51.156446,...,52.145163,52.145174,52.145185,52.145195,52.145206,52.145217,52.145228,52.145238,52.145249,52.145260
3,48.232297,49.884179,50.176176,45.034480,50.644680,48.859545,45.383652,53.831455,49.682680,51.201693,...,52.145139,52.145149,52.145160,52.145171,52.145181,52.145192,52.145203,52.145213,52.145224,52.145235
4,48.260749,49.923142,50.235478,45.099871,50.704467,48.912197,45.461788,53.850540,49.755685,51.266214,...,52.145351,52.145363,52.145374,52.145386,52.145398,52.145409,52.145421,52.145433,52.145444,52.145456
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
484,45.342113,48.334139,48.966882,39.874508,49.803394,46.595789,40.568489,55.236818,48.173415,50.823641,...,52.145837,52.145851,52.145865,52.145879,52.145893,52.145907,52.145921,52.145935,52.145949,52.145963
485,45.503009,48.439948,49.073568,40.174967,49.893039,46.747408,40.863369,55.185202,48.306279,50.895433,...,52.145857,52.145871,52.145885,52.145899,52.145913,52.145927,52.145942,52.145956,52.145970,52.145984
486,37.887871,42.310222,44.146385,42.680093,44.493487,45.723423,49.494417,49.519351,49.692357,52.053849,...,52.144426,52.144433,52.144440,52.144447,52.144455,52.144462,52.144469,52.144476,52.144484,52.144491
487,37.558708,41.944434,44.146285,41.798018,44.493401,46.193822,50.093771,49.865683,49.852708,52.352888,...,52.144411,52.144418,52.144425,52.144432,52.144440,52.144447,52.144454,52.144461,52.144468,52.144476


In [29]:
north_ne_diff =  ne_diff_summary.filter(regex='Ne.0.')
south_ne_diff =  ne_diff_summary.filter(regex='Ne.1.')



In [30]:
south_ne_diff

Unnamed: 0,Ne.1.diff.0,Ne.1.diff.1,Ne.1.diff.2,Ne.1.diff.3,Ne.1.diff.4,Ne.1.diff.5,Ne.1.diff.6,Ne.1.diff.7,Ne.1.diff.8,Ne.1.diff.9,...,Ne.1.diff.788,Ne.1.diff.789,Ne.1.diff.790,Ne.1.diff.791,Ne.1.diff.792,Ne.1.diff.793,Ne.1.diff.794,Ne.1.diff.795,Ne.1.diff.796,Ne.1.diff.797
0,50.032316,53.709987,49.661898,51.730242,51.063052,52.775359,51.956367,51.360850,53.046157,52.280841,...,52.145122,52.145132,52.145143,52.145153,52.145164,52.145174,52.145185,52.145196,52.145206,52.145217
1,49.763798,53.841224,49.337606,51.605916,50.882628,52.774789,51.839553,51.246789,53.102198,52.224000,...,52.145015,52.145025,52.145036,52.145046,52.145056,52.145066,52.145076,52.145086,52.145096,52.145106
2,49.916679,53.883985,49.539667,51.806170,51.063574,52.919746,52.075096,51.337620,53.174293,52.391020,...,52.145163,52.145174,52.145185,52.145195,52.145206,52.145217,52.145228,52.145238,52.145249,52.145260
3,49.945381,53.932140,49.583906,51.888774,51.124824,52.997049,52.178247,51.363625,53.223698,52.469265,...,52.145139,52.145149,52.145160,52.145171,52.145181,52.145192,52.145203,52.145213,52.145224,52.145235
4,49.987675,54.001673,49.647810,52.006526,51.212419,53.107108,52.324553,51.401299,53.294178,52.580203,...,52.145351,52.145363,52.145374,52.145386,52.145398,52.145409,52.145421,52.145433,52.145444,52.145456
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
484,48.460230,55.690242,47.941138,52.333700,50.810636,54.260335,53.009768,50.959560,54.445247,53.332358,...,52.145837,52.145851,52.145865,52.145879,52.145893,52.145907,52.145921,52.145935,52.145949,52.145963
485,48.565677,55.663799,48.072382,52.410563,50.898472,54.291789,53.092333,51.009936,54.445625,53.383881,...,52.145857,52.145871,52.145885,52.145899,52.145913,52.145927,52.145942,52.145956,52.145970,52.145984
486,57.384955,42.960496,48.792165,48.755356,53.034029,52.368050,50.011515,44.360826,61.190390,48.921140,...,52.144426,52.144433,52.144440,52.144447,52.144455,52.144462,52.144469,52.144476,52.144484,52.144491
487,57.384840,43.129028,48.873248,48.911851,53.183412,52.647948,50.206812,44.165393,61.629348,49.092661,...,52.144411,52.144418,52.144425,52.144432,52.144440,52.144447,52.144454,52.144461,52.144468,52.144476


In [31]:
# make a new dataframe that summarizes the 95% HPD estimate with mean for each deme and interval 
def generate_summary_df(input_df):
    
    
    new_df = pd.DataFrame()

    for i in input_df.columns.tolist():
        if "Ne" in i:
            #deme = i.split("_")[1]
            interval = i.split(".")[3]
            local_series = input_df[i].astype('float').to_numpy()
            mean_percent = local_series.mean()
            hpd_95 = az.hdi(local_series, 0.95)
            lower_hpd_log_95 = hpd_95[0]
            upper_hpd_log_95 = hpd_95[1]
            hpd_50 = az.hdi(local_series, 0.50)
            lower_hpd_log_50 = hpd_50[0]
            upper_hpd_log_50 = hpd_50[1]
            

            
            
            try:
                local_df = pd.DataFrame.from_dict({"interval":interval, "mean_percent":mean_percent, 
                                                   "upper_hpd_log_95":upper_hpd_log_95,"lower_hpd_log_95":[lower_hpd_log_95], 
                                                   "upper_hpd_log_50":upper_hpd_log_50,"lower_hpd_log_50":lower_hpd_log_50})
                new_df = new_df.append(local_df)
            except:
                pass
            
    return(new_df)

In [32]:
test_north = generate_summary_df(north_ne_diff)

In [33]:
test_north

Unnamed: 0,interval,mean_percent,upper_hpd_log_95,lower_hpd_log_95,upper_hpd_log_50,lower_hpd_log_50
0,0,46.442888,50.795321,38.617309,47.983303,46.290327
0,1,48.656682,50.926267,42.971134,49.571142,48.753272
0,2,48.886982,50.347213,43.959767,50.029048,49.052830
0,3,45.855701,57.439708,39.439628,45.968160,42.055447
0,4,49.063801,50.799423,44.625885,50.559421,49.849197
...,...,...,...,...,...,...
0,793,52.145029,52.145769,52.144062,52.145530,52.145021
0,794,52.145039,52.145782,52.144068,52.145543,52.145031
0,795,52.145049,52.145796,52.144073,52.145555,52.145040
0,796,52.145059,52.145809,52.144078,52.145567,52.145050


In [34]:
line = alt.Chart(test_north).mark_area().encode(
    alt.X('interval:O', axis=alt.Axis(title="Date", grid=False)),
    alt.Y('lower_hpd_log_50',axis=alt.Axis(title="dNe/dt")),
    alt.Y2('upper_hpd_log_50'),
    color=alt.Color('deme:N')
).properties(
    width=850,
    height=300
)

band = alt.Chart(test_north).mark_area(
    opacity=0.3
).encode(
    alt.X('interval:O', axis=alt.Axis(title="Date", grid=False)),
    alt.Y('lower_hpd_log_95'),
    alt.Y2('upper_hpd_log_95'),
    color=alt.Color('deme:N')
).properties(
    width=850,
    height=300
)

band + line

## calculating backward migration rates

In [35]:
def read_in_migration_rates_mascot(log_file_path):
    
    mig_rates_dict = {"sample":[]}
    
    with open(log_file_path, "r") as infile:
        line_number = 0
        for line in infile:
            line_number += 1
            if not line.startswith("#"):  # log combiner will sometimes put the entire xml at the start of the log file
                # use the first line to find the migration rate columns
                
                if "sample" in line.lower():
                    all_cols = line.split("\t")
                    migration_column_indices = []   # list to store column indices
                    rate_scaler_indicies = []
                    error_indicies = []
                    individual_scaler = []
                    mig_rates_key = {}   # dictionary to store the column index to map to column name
                    rate_scaler_key = {}
                    error_key = {}
                    individual_scaler_key = {}
                    counter = 0
                    for i in range(len(all_cols)):
                        col = all_cols[i]
                        if col == "immigrationRate.1": #mascot repeats the intro rates which causes an array error. this is done to prevent that
                            counter = counter + 1
                        if ("immigrationRate" or "migrationGLM.1Clock" or "migrationError" or "migrationGLM.1scaler." in col) and (counter <2):
                            migration_column_indices.append(i)

                               
                    # make an empty dictionary to store migration rates and generate dictionary to convert index to name
                    for m in migration_column_indices:
                        name = line.split("\t")[m]
                        mig_rates_key[m] = name
                        mig_rates_dict[name] = []
                    
                # read in actual parameter estimates and store in dictionary
                else:
                    sample = line.split("\t")[0]
                    mig_rates_dict["sample"].append(sample)

                    for index in migration_column_indices:
                        name = mig_rates_key[index]
                        mig_rates_dict[name].append(line.split("\t")[index])
                
                
    return(mig_rates_dict)

In [36]:
migration_rates_f = read_in_migration_rates_mascot(log_file_path)

In [37]:
mig_df_f = pd.DataFrame.from_dict(migration_rates_f)


In [38]:
burnin_percent = 0.1
print(len(mig_df_f))
rows_to_remove = int(len(mig_df_f)* burnin_percent)
mig_df_f = mig_df_f.iloc[rows_to_remove:]

print(len(mig_df_f))
mig_df_f = mig_df_f.reset_index()
#mig_df_f

543
489


In [39]:
def extract_covariate(xml, covar):
    with open(xml, "r") as infile:
            line_number = 0
            for line in infile:
                line_number += 1
                if not line.startswith("#"):  # log combiner will sometimes put the entire xml at the start of the log file
                    # use the first line to find the migration rate columns
                    
                    if covar in line:
                        #print(line)
                        s = r"(?<=>)[^<:]+(?=:?<)"
                        match = re.search(s, line)[0]
                        
                        return match
                    
                    
                        


In [40]:
xml = '/Users/miguelparedes/Desktop/gitrepos/ncov-king-county/Simulations/Validation-Constant/xmls/glm_randomkc_clusters_combined_3000.xml'

In [41]:
npi = extract_covariate(xml, "NPI_dates")
mvmt = extract_covariate(xml, "safegraph_between_total_mvmt")


In [42]:
NPI_dates = npi.split()
NPI_dates = [int(x) for x in NPI_dates]
NPI_dates

safegraph_between_total_mvmt = mvmt.split()
safegraph_between_total_mvmt = [float(x) for x in safegraph_between_total_mvmt]

In [43]:
len(safegraph_between_total_mvmt)

1602

In [44]:
predictors = {"npi_dates": NPI_dates, "mvmt":safegraph_between_total_mvmt }
predict_df = pd.DataFrame(predictors)
predict_df['log_mvmt'] = np.log(predict_df.mvmt)
predict_df['std_log_mvmt'] = (predict_df.log_mvmt - predict_df.log_mvmt.mean())/ predict_df.log_mvmt.std()
#log standarization


In [45]:
predict_df


Unnamed: 0,npi_dates,mvmt,log_mvmt,std_log_mvmt
0,0,1.397995e+06,14.150550,0.69729
1,0,1.397995e+06,14.150550,0.69729
2,0,1.397995e+06,14.150550,0.69729
3,0,1.397995e+06,14.150550,0.69729
4,0,1.397995e+06,14.150550,0.69729
...,...,...,...,...
1597,0,1.885497e+06,14.449702,2.39882
1598,0,1.885497e+06,14.449702,2.39882
1599,0,1.885497e+06,14.449702,2.39882
1600,0,1.885497e+06,14.449702,2.39882


In [46]:
mig_rates = {}
counter_n = 0
counter_s = 0
for index_1, row_1 in predict_df.iterrows():
    

    if index_1 %2 == 0:
        mig_rates[str(counter_n) + "_N_to_S_b"] = []
        for index_2, row_2 in mig_df_f.iterrows():
        
            mig_rate_base = (float(row_1.std_log_mvmt)*float(row_2['migrationGLM.1scaler.safegraph_between_total_mvmt']))+ (float(row_1.npi_dates)*float(row_2["migrationGLM.1scaler.NPI_dates"])) + (float(row_2['migrationErrorGLM.1.1'])) 
            mig_rate_f = (math.exp(mig_rate_base)) * (float(row_2['migrationGLM.1Clock']))
            mig_rate_b = mig_rate_f* ((float(row_2["Ne.0."+ str(counter_n)]))/(float(row_2["Ne.1."+ str(counter_n)])))
            mig_rates[str(counter_n) + "_N_to_S_b"].append(mig_rate_b)   
            
        counter_n= counter_n+1
                
    else:
        mig_rates[str(counter_s) + "_S_to_N_b"] = []
        for index_2, row_2 in mig_df_f.iterrows():
        
            mig_rate_base = (float(row_1.std_log_mvmt)*float(row_2['migrationGLM.1scaler.safegraph_between_total_mvmt']))+ (float(row_1.npi_dates)*float(row_2["migrationGLM.1scaler.NPI_dates"])) + (float(row_2['migrationErrorGLM.1.2'])) 
            mig_rate_f = (math.exp(mig_rate_base)) * (float(row_2['migrationGLM.1Clock']))
            mig_rate_b = mig_rate_f* ((float(row_2["Ne.1."+ str(counter_s)]))/(float(row_2["Ne.0."+ str(counter_s)])))
            mig_rates[str(counter_s) + "_S_to_N_b"].append(mig_rate_b)
        counter_s = counter_s +1
    #mig_rates[index_1].append(interval_rate)
    

KeyError: 'Ne.0.799'

In [None]:
mr_b_df = pd.DataFrame(mig_rates)

In [None]:
mr_b_df

In [None]:
north_mrb =  mr_b_df.filter(regex='S_to_N')
south_mrb =  mr_b_df.filter(regex='N_to_S')

In [None]:
north_mrb

In [None]:
# make a new dataframe that summarizes the 95% HPD estimate with mean for each deme and interval 
def generate_summary_df(input_df):
    
    
    new_df = pd.DataFrame()

    for i in input_df.columns.tolist():
        #print(i)
        if ("_N_to_S_b" in str(i)) | ("_S_to_N_b" in str(i)):
            #deme = i.split("_")[1]
            interval = i.split("_")[0]
            local_series = input_df[i].astype('float').to_numpy()
            mean_percent = local_series.mean()
            hpd_95 = az.hdi(local_series, 0.95)
            lower_hpd_log_95 = hpd_95[0]
            upper_hpd_log_95 = hpd_95[1]
            hpd_50 = az.hdi(local_series, 0.50)
            lower_hpd_log_50 = hpd_50[0]
            upper_hpd_log_50 = hpd_50[1]


            
            
            try:
                local_df = pd.DataFrame.from_dict({"interval":interval, "mean_percent":mean_percent, 
                                                   "upper_hpd_log_95":upper_hpd_log_95,"lower_hpd_log_95":[lower_hpd_log_95], 
                                                   "upper_hpd_log_50":upper_hpd_log_50,"lower_hpd_log_50":lower_hpd_log_50})
                new_df = new_df.append(local_df)
            except:
                pass
            
    return(new_df)

In [None]:
north_mrb_df = generate_summary_df(north_mrb)
south_mrb_df = generate_summary_df(south_mrb)

In [None]:
south_mrb_df['days'] = south_mrb_df.interval.astype(int) 
south_mrb_df['date'] = dt.strptime("2022-03-06",  "%Y-%m-%d") - south_mrb_df.days.map(timedelta)
south_mrb_df.date = south_mrb_df.date.astype(str)

north_mrb_df['days'] = north_mrb_df.interval.astype(int)
north_mrb_df['date'] = dt.strptime("2022-03-06",  "%Y-%m-%d") - north_mrb_df.days.map(timedelta)
north_mrb_df.date = north_mrb_df.date.astype(str)

In [None]:
south_mrb_plot = alt.Chart(south_mrb_df, width = 750).mark_area(interpolate='monotone', opacity = 1.0, color = "orange").encode(
    alt.X('date:T', axis=alt.Axis(title=None, grid=False)),
    alt.Y('upper_hpd_log_95',axis=alt.Axis(title="introductions", grid=False)),
    alt.Y2('lower_hpd_log_95' )
).properties(
    width=800,
    height=300
)

In [None]:
north_mrb_plot = alt.Chart(north_mrb_df).mark_area(interpolate='monotone', opacity = 1.0).encode(
    alt.X('date:T', axis=alt.Axis(title=None, grid=False)),
    alt.Y('lower_hpd_log_95',axis=alt.Axis(title="introductions", grid=False)),
    alt.Y2('upper_hpd_log_95' )
).properties(
    width=800,
    height=300
)

In [None]:
north_mrb_plot + south_mrb_plot

### North

In [None]:
def generate_north_intro_df(input_df):
    
    
    new_df = pd.DataFrame()
   
    for i in input_df.columns.tolist():
        if "S_to_N" in i:
            interval = i.split("_")[0]
            print(interval)
            print(i)
            
            immigration_interval = math.ceil((int(interval)+1)/14)
            print(immigration_interval)

            
            try:
                new_df["intro"+".north." + str(interval)] = input_df[i].astype("float") + mig_df["immigrationRate."+str(immigration_interval)].astype('float').map(math.exp)
                #print(input_df[i].astype("float"))
                #print(mig_df["immigrationRate."+str(immigration_interval)].astype('float'))
            
            except KeyError:
                pass 
            
            
    return(new_df)

In [None]:
intro_north_df = generate_north_intro_df(north_mrb)

In [None]:
intro_north_df

In [None]:
def generate_percent_intro_df(input_df):
    
    temp_df = pd.DataFrame()
    new_df = pd.DataFrame()
   
    for i in input_df.columns.tolist():
        if "north" in i:
            interval = i.split(".")[2]
            print(interval)
            print(i)
            

            
            try:
                temp_df["total."+ str(interval)] = north_ne_diff["Ne.0.diff." + str(interval)].astype("float") +  input_df[i].astype("float")

                new_df["intro.percent"+".north." + str(interval)] = input_df[i].astype("float").div(temp_df["total."+ str(interval)], axis = 0) 
            
            
            except KeyError:
                pass 
            
            
    return(new_df)

In [None]:
percent_df_north = generate_percent_intro_df(intro_north_df)

In [None]:
percent_df_north

In [None]:
# make a new dataframe that summarizes the 95% HPD estimate with mean for each deme and interval 
def generate_summary_df(input_df):
    
    
    new_df = pd.DataFrame()

    for i in input_df.columns.tolist():
        if "percent" in i:
            #deme = i.split("_")[1]
            interval = i.split(".")[3]
            local_series = input_df[i].astype('float').to_numpy()
            mean_percent = local_series.mean()
            hpd_95 = az.hdi(local_series, 0.95)
            lower_hpd_log_95 = hpd_95[0]
            upper_hpd_log_95 = hpd_95[1]
            hpd_50 = az.hdi(local_series, 0.50)
            lower_hpd_log_50 = hpd_50[0]
            upper_hpd_log_50 = hpd_50[1]
            

            
            
            try:
                local_df = pd.DataFrame.from_dict({"interval":interval, "mean_percent":mean_percent, 
                                                   "upper_hpd_log_95":upper_hpd_log_95,"lower_hpd_log_95":[lower_hpd_log_95], 
                                                   "upper_hpd_log_50":upper_hpd_log_50,"lower_hpd_log_50":lower_hpd_log_50})
                new_df = new_df.append(local_df)
            except:
                pass
            
    return(new_df)

In [None]:
final_north_df = generate_summary_df(percent_df_north)

In [None]:
final_north_df

In [None]:
final_north_df['days'] = final_north_df.interval.astype(int)
final_north_df['date'] = dt.strptime("2022-03-06",  "%Y-%m-%d") - final_north_df.days.map(timedelta)
final_north_df = final_north_df[final_north_df.date >"2020-01-31"]
final_north_df.date = final_north_df.date.astype(str)

In [None]:
final_north_df.to_csv("north_percent_intro.csv")

In [None]:
line1 = alt.Chart(final_north_df).mark_area(interpolate='monotone').encode(
    alt.X('date:T', axis=alt.Axis(title="Date", grid=False)),
    alt.Y('lower_hpd_log_50',axis=alt.Axis(title="", grid=False)),
    alt.Y2('upper_hpd_log_50' )
).properties(
    width=1850,
    height=300
).transform_filter(
    (datum.lower_hpd_log_50 >0) & (datum.upper_hpd_log_50 < 1)
)

band1 = alt.Chart(final_north_df).mark_area(
    opacity=0.3, interpolate='monotone'
).encode(
    alt.X('date:T', axis=alt.Axis(title="Date", grid=False)),
    alt.Y('lower_hpd_log_95', axis=alt.Axis(title="", grid=False)),
    alt.Y2('upper_hpd_log_95')
).properties(
    width=1850,
    height=300
).transform_filter(
    (datum.lower_hpd_log_95 >0) & (datum.upper_hpd_log_95 < 0.7)
)

band1 + line1

## South

In [None]:
def generate_south_intro_df(input_df):
    
    
    new_df = pd.DataFrame()
   
    for i in input_df.columns.tolist():
        if "N_to_S" in i:
            interval = i.split("_")[0]
            #print(interval)
            #print(i)
            
            immigration_interval = math.ceil((int(interval)+1)/14)
            #print(immigration_interval)

            
            try:
                new_df["intro"+".south." + str(interval)] = input_df[i].astype("float") + mig_df["immigrationRate."+str(immigration_interval)].astype('float').map(math.exp)
            
            
            except KeyError:
                pass 
            
            
    return(new_df)

In [None]:
intro_south_df = generate_south_intro_df(south_mrb)

In [None]:
intro_south_df

In [None]:
def generate_percent_intro_s_df(input_df):
    
    temp_df = pd.DataFrame()
    new_df = pd.DataFrame()
   
    for i in input_df.columns.tolist():
        if "south" in i:
            interval = i.split(".")[2]
            #print(interval)
            #print(i)
            

            
            try:
                temp_df["total."+ str(interval)] = south_ne_diff["Ne.1.diff." + str(interval)].astype("float") + input_df[i].astype("float")
                #print(south_ne_diff["Ne.1.diff." + str(interval)].astype("float"))
               # print(input_df[i].astype("float"))
                new_df["intro.percent"+".south." + str(interval)] = input_df[i].astype("float").div(temp_df["total."+ str(interval)], axis = 0) 
            
            
            except KeyError:
                pass 
            
            
    return(new_df)

In [None]:
percent_df_south = generate_percent_intro_s_df(intro_south_df)

In [None]:
percent_df_south

In [None]:
# make a new dataframe that summarizes the 95% HPD estimate with mean for each deme and interval 
def generate_summary_df(input_df):
    
    
    new_df = pd.DataFrame()

    for i in input_df.columns.tolist():
        if "percent" in i:
            #deme = i.split("_")[1]
            interval = i.split(".")[3]
            local_series = input_df[i].astype('float').to_numpy()
            mean_percent = local_series.mean()
            hpd_95 = az.hdi(local_series, 0.95)
            lower_hpd_log_95 = hpd_95[0]
            upper_hpd_log_95 = hpd_95[1]
            hpd_50 = az.hdi(local_series, 0.50)
            lower_hpd_log_50 = hpd_50[0]
            upper_hpd_log_50 = hpd_50[1]
            

            
            
            try:
                local_df = pd.DataFrame.from_dict({"interval":interval, "mean_percent":mean_percent, 
                                                   "upper_hpd_log_95":upper_hpd_log_95,"lower_hpd_log_95":[lower_hpd_log_95], 
                                                   "upper_hpd_log_50":upper_hpd_log_50,"lower_hpd_log_50":lower_hpd_log_50})
                new_df = new_df.append(local_df)
            except:
                pass
            
    return(new_df)

In [None]:
final_south_df = generate_summary_df(percent_df_south)

In [None]:
final_south_df

In [None]:
final_south_df['days'] = final_south_df.interval.astype(int)
final_south_df['date'] = dt.strptime("2022-03-06",  "%Y-%m-%d") - final_south_df.days.map(timedelta)
final_south_df = final_south_df[final_south_df.date >"2020-01-31"]
final_south_df.date = final_south_df.date.astype(str)

In [None]:
final_south_df.to_csv("south_percent_intro.csv")

In [None]:
line2 = alt.Chart(final_south_df).mark_area(interpolate='monotone', opacity = 1 ,color = "#f58518").encode(
    alt.X('date:T', axis=alt.Axis(title="Date", grid=False)),
    alt.Y('lower_hpd_log_50',axis=alt.Axis(title="Percent of cases due to introductions", grid=False)),
    alt.Y2('upper_hpd_log_50' )
).properties(
    width=1850,
    height=300
).transform_filter(
    (datum.lower_hpd_log_50 >0) & (datum.upper_hpd_log_50 < 1)
)

band2 = alt.Chart(final_south_df).mark_area(
    opacity=0.3, interpolate='monotone', color = "#f58518"
).encode(
    alt.X('date:T', axis=alt.Axis(title="Date", grid=False)),
    alt.Y('lower_hpd_log_95', axis=alt.Axis(title="", grid=False)),
    alt.Y2('upper_hpd_log_95')
).properties(
    width=1850,
    height=300
).transform_filter(
    (datum.lower_hpd_log_95 >0) & (datum.upper_hpd_log_95 < 0.7)
)

band2 + line2

In [None]:
band1+ line1+ band2+ line2

In [None]:
#highlighting important NPIs in WA
data = {'date': [ "2020-03-23", "2020-06-01", "2020-11-18", "2021-02-14"], 'event':[ "Stay at home", "Stay at home lifted", "Closing restaurants", "Reopening restaurants"]}

npidf = pd.DataFrame(data)
npidf.date = pd.to_datetime(npidf.date)

rule = alt.Chart(npidf).mark_rule(
    color="black",
    strokeWidth=2, 
    opacity = 0.3
).encode(
    alt.X('date:T', axis=alt.Axis(title=None))
).properties(
    width=1850,
    height=300
)

text = alt.Chart(npidf).mark_text(
    align='left',
    baseline='middle',
    dx=2,
    dy=-135,
    size=10
).encode(
    alt.X('date:T',axis=alt.Axis(title=None)),
    text='event',
    color=alt.value('#000000')
).properties(
    width=1850,
    height=300
)

In [None]:
band1+ line1+ band2+ line2 + text + rule

In [None]:
percent_case_intro = band1+ line1+ band2+ line2 + text + rule

In [None]:
#percent_case_intro.save("percent_case_intro.png")

In [None]:
# exploratory work to look into introductions within / introductions from outside

In [None]:
def generate_north_intro_prop(input_df):
    
    
    new_df = pd.DataFrame()
   
    for i in input_df.columns.tolist():
        if "S_to_N" in i:
            interval = i.split("_")[0]
            immigration_interval = math.ceil(int(interval+1)/14)

            try:
                new_df["intro"+".north." + str(interval)] = input_df[i].astype("float")/ (mig_df["immigrationRate."+str(immigration_interval)].astype('float').map(math.exp))
      
            except KeyError:
                pass 
            
            
    return(new_df)

In [None]:
intro_north_df = generate_north_intro_prop(north_mrb)

In [None]:
def generate_south_intro_df(input_df):
    
    
    new_df = pd.DataFrame()
   
    for i in input_df.columns.tolist():
        if "N_to_S" in i:
            interval = i.split("_")[0]
            immigration_interval = math.ceil(int(interval)/14)

           
            try:
                new_df["intro"+".south." + str(interval+1)] = input_df[i].astype("float") / mig_df["immigrationRate."+str(immigration_interval)].astype('float').map(math.exp)
                #print(input_df[i].astype("float"))
                #print(mig_df["immigrationRate."+str(immigration_interval)].astype('float'))
            
            except KeyError:
                pass 
            
            
    return(new_df)

In [None]:
intro_south_df = generate_south_intro_df(south_mrb)

In [None]:
# make a new dataframe that summarizes the 95% HPD estimate with mean for each deme and interval 
def generate_summary_df(input_df):
    
    
    new_df = pd.DataFrame()

    for i in input_df.columns.tolist():
        if "intro" in i:
            #deme = i.split("_")[1]
            interval = i.split(".")[2]
            local_series = input_df[i].astype('float').to_numpy()
            mean_percent = local_series.mean()
            hpd_95 = az.hdi(local_series, 0.95)
            lower_hpd_log_95 = hpd_95[0]
            upper_hpd_log_95 = hpd_95[1]
            hpd_50 = az.hdi(local_series, 0.50)
            lower_hpd_log_50 = hpd_50[0]
            upper_hpd_log_50 = hpd_50[1]
            

            
            
            try:
                local_df = pd.DataFrame.from_dict({"interval":interval, "mean":mean_percent, 
                                                   "upper_hpd_log_95":upper_hpd_log_95,"lower_hpd_log_95":[lower_hpd_log_95], 
                                                   "upper_hpd_log_50":upper_hpd_log_50,"lower_hpd_log_50":lower_hpd_log_50})
                new_df = new_df.append(local_df)
            except:
                pass
            
    return(new_df)

In [None]:
ratio_intro_n = generate_summary_df(intro_north_df)
ratio_intro_s = generate_summary_df(intro_south_df)

In [None]:
ratio_intro_n['days'] = ratio_intro_n.interval.astype(int) *7
ratio_intro_n['date'] = dt.strptime("2022-03-06",  "%Y-%m-%d") - ratio_intro_n.days.map(timedelta)
ratio_intro_n.date = ratio_intro_n.date.astype(str)

ratio_intro_s['days'] = ratio_intro_s.interval.astype(int) * 7 
ratio_intro_s['date'] = dt.strptime("2022-03-06",  "%Y-%m-%d") - ratio_intro_s.days.map(timedelta)
ratio_intro_s.date = ratio_intro_s.date.astype(str)

In [None]:
line2 = alt.Chart(ratio_intro_n).mark_area(interpolate='monotone').encode(
    alt.X('date:T', axis=alt.Axis(title="Date", grid=False)),
    alt.Y('lower_hpd_log_50',axis=alt.Axis(title="percent of cases due to introductions", grid=False)),
    alt.Y2('upper_hpd_log_50' )
).properties(
    width=850,
    height=300
)

band2 = alt.Chart(ratio_intro_n).mark_area(
    opacity=0.3, interpolate='monotone', 
).encode(
    alt.X('date:T', axis=alt.Axis(title="Date", grid=False)),
    alt.Y('lower_hpd_log_95', axis=alt.Axis(title=None, grid=False)),
    alt.Y2('upper_hpd_log_95')
).properties(
    width=850,
    height=300
)

band2 + line2

In [None]:
line1 = alt.Chart(ratio_intro_s).mark_area(interpolate='monotone', color = "orange").encode(
    alt.X('date:T', axis=alt.Axis(title="Date", grid=False)),
    alt.Y('lower_hpd_log_50',axis=alt.Axis(title="percent of cases due to introductions", grid=False)),
    alt.Y2('upper_hpd_log_50' )
).properties(
    width=850,
    height=300
)

band1 = alt.Chart(ratio_intro_s).mark_area(
    opacity=0.3, interpolate='monotone', color = "orange"
).encode(
    alt.X('date:T', axis=alt.Axis(title="Date", grid=False)),
    alt.Y('lower_hpd_log_95', axis=alt.Axis(title=None, grid=False)),
    alt.Y2('upper_hpd_log_95')
).properties(
    width=850,
    height=300
)

band1 + line1

In [None]:
band1 + line1 + band2 + line2

In [None]:
#growth rates over time -- not used
test = test.reset_index()
regions = ['north', "south"]
test['growth_rate'] = np.nan
for region in regions:
    for index, row in test.iterrows():
        if row.deme in region: 
            try:
                #print(index)
                old_index = int(index) +1
                current_ne = np.log(float(test.iloc[index, 3]))
                print(current_ne)
                old_ne = np.log(float(test.iloc[old_index, 3]))
                #print(old_index)
                test.iloc[index, 13] = (365/7)*(current_ne - old_ne)
            except IndexError:
                print(index)
                test.iloc[index, 13] = np.nan
        else:
            pass