## Script for combining each variant's cluster assignments and metadata into one file to feed into the MASCOT GLM
### Runs once for each variant build (Alpha, Delta, Omicron, Other)

In [5]:
import pandas as pd
import dendropy
import numpy as np
import re
import random

In [6]:
# Build one MASCOT GLM input table per variant build: the cluster assignment
# of every strain, annotated with its sampling date, location, variant and
# Nextstrain clade taken from the build metadata.
variant = ["alpha", "delta", "omicron", "other"]

# Output column in the cluster table -> source column in the metadata table.
metadata_columns = {
    "location": "ns_kc",
    "date": "date",
    "variant": "variant",
    "nextstrain_clade": "Nextstrain_clade",
}

for name in variant:
    newickpath = f"../../nextstrain_build/results/{name}_tree.nwk"
    clusters = f"../../nextstrain_build/cluster_assignment/{name}_new_cluster_assignment.tsv"
    metadata = f"../../nextstrain_build/results/{name}_sub_subsampled_metadata.tsv"

    # Read in all three files. Passing `path=` lets DendroPy open and close
    # the Newick file itself instead of leaking an open file handle.
    tree2 = dendropy.Tree.get(path=newickpath, schema="newick")
    df = pd.read_csv(clusters, sep="\t", index_col="strain")
    meta_df = pd.read_csv(metadata, sep="\t", index_col="strain")

    # The metadata file name starts with the variant name, so tag rows with it.
    meta_df["variant"] = name
    meta_df["ns_kc"] = meta_df["ns_kc"].replace(
        "other_King County", "Other_King_County"
    )

    # Collect the tip labels once so membership tests are O(1) per strain.
    # NOTE(review): DendroPy converts unquoted underscores in Newick labels to
    # spaces unless `preserve_underscores=True` is passed -- confirm that the
    # strain names in these trees contain no underscores.
    leaf_labels = {
        leaf.taxon.label
        for leaf in tree2.leaf_node_iter()
        if leaf.taxon is not None
    }
    in_tree = df.index.isin(leaf_labels)

    # Only strains that are tips of the tree receive metadata; every other
    # strain keeps NaN and is removed by the date filter below. `.loc` raises
    # KeyError when a tree strain is absent from the metadata table.
    annotations = meta_df.loc[df.index[in_tree], list(metadata_columns.values())]
    for target, source in metadata_columns.items():
        # Object dtype so string values can be stored next to NaN placeholders.
        values = np.full(len(df), np.nan, dtype=object)
        values[in_tree] = annotations[source].to_numpy()
        df[target] = values

    # Remove blanks and NAs for dates.
    df = df.replace("", np.nan)
    df = df.dropna(subset=["date"])

    # Remove strains for which we have no north/south regional information.
    df = df[df["location"] != "Other_King_County"]

    # One output file per variant build.
    df.to_csv(f"../data/kc_clusters_{name}_new.tsv", sep="\t")


  # Reloads the metadata table from the path currently held in `metadata`,
  # i.e. the file of the last variant processed by the loop above.
  # NOTE(review): this looks like a leftover scratch cell, and its two-space
  # indent is not valid if run in the same cell as the loop -- confirm
  # whether it is still needed.
  meta_df = pd.read_csv(metadata, sep ="\t", index_col = 'strain')
