# hmpxv data preparation - geographic binning 
### This script reads a metadata.tsv file and populates the 'location' column with regional tags.
The goal is to avoid dropping countries that have only 1 or 2 hmpxv sequences by grouping their sequences with those of nearby countries.

In [1]:
import os
import pandas as pd

In [2]:
## import metadata.tsv (tab-separated)
metadata = pd.read_csv('/Users/nashwa/Desktop/git/hmpxv_dynamics/monkeypox-build/data/metadata.tsv', sep='\t')
## work on an independent copy: pd.DataFrame(metadata) would share its data with
## `metadata`, so in-place edits to `meta` could leak back into the raw table
meta = metadata.copy()

In [3]:
## define bins
## every country is folded into a regional bin here, no matter how many samples it has.
## NOTE(review): Mexico is tagged SOUTH_AMERICA below -- confirm this grouping is intended.

_region_members = {
    "USA_CANADA": ('USA', 'Canada'),
    "UK_IRELAND": ('United Kingdom', 'Ireland'),
    "EUROPE_SOUTH": ('Italy', 'Spain', 'Portugal'),
    "EUROPE_NORTH": ('Finland', 'Sweden'),
    "EUROPE_WEST": ('France', 'Belgium', 'Netherlands', 'Germany', 'Switzerland'),
    "EUROPE_EAST": ('Austria', 'Slovakia', 'Czech Republic', 'Slovenia'),
    "ASIA_EAST": ('Japan', 'South Korea', 'Taiwan'),
    "SOUTH_AMERICA": ('Mexico', 'Peru', 'Colombia', 'Brazil'),
}

## invert region -> countries into the country -> region lookup used for tagging
dic = {
    country: region
    for region, countries in _region_members.items()
    for country in countries
}

In [5]:
## populate 'location' column with new values

## Look every country up in `dic` in a single vectorised pass. The earlier row loop
## started at label 1, so the first record (label 0) was never binned.
binned = meta['country'].map(dic)
if 'location' in meta.columns:
    ## rows whose country has no bin (or is missing) keep their existing location
    binned = binned.fillna(meta['location'])
meta['location'] = binned

In [72]:
## export tsv (tab-separated) with the regionally binned locations
## NOTE(review): to_csv also writes the row index as an unnamed first column by default -- confirm downstream tools expect that
geo_bin_path = '/Users/nashwa/Desktop/git/hmpxv_dynamics/monkeypox-build/data/geo_bin.tsv'
meta.to_csv(geo_bin_path, sep='\t')

In [6]:
## new binning method
## here, countries with >100 sequences are left alone, while countries with fewer samples are grouped together.

## start from an independent copy: pd.DataFrame(metadata) would share its data with
## `metadata`, so edits made for one binning method could show up in the other
big_meta = metadata.copy()

## define bins; a stand-alone country is its own bin (label == country name)
## groups are listed in tagging order; EUROPE_WEST appears twice because Germany,
## which stands alone, sits between its members.
## ireland is an exception, remains binned with UK samples
_bin_members = [
    ("USA", ('USA',)),
    ("Canada", ('Canada',)),
    ("UK_IRELAND", ('United Kingdom', 'Ireland')),
    ("EUROPE_SOUTH", ('Italy', 'Spain')),
    ("Portugal", ('Portugal',)),
    ("EUROPE_NORTH", ('Finland', 'Sweden')),
    ("EUROPE_WEST", ('France', 'Belgium', 'Netherlands')),
    ("Germany", ('Germany',)),
    ("EUROPE_WEST", ('Switzerland',)),
    ("EUROPE_EAST", ('Austria', 'Slovakia', 'Czech Republic', 'Slovenia')),
    ("ASIA_EAST", ('Japan', 'South Korea', 'Taiwan')),
    ("SOUTH_AMERICA", ('Mexico', 'Peru', 'Colombia', 'Brazil')),
]

## flatten the groups into the country -> bin lookup used for tagging
big_dic = {
    country: label
    for label, countries in _bin_members
    for country in countries
}


## populate 'location' column with new values
## Look every country up in `big_dic` in a single vectorised pass. The earlier row
## loop started at label 1, so the first record (label 0) was never binned.
big_binned = big_meta['country'].map(big_dic)
if 'location' in big_meta.columns:
    ## rows whose country has no bin (or is missing) keep their existing location
    big_binned = big_binned.fillna(big_meta['location'])
big_meta['location'] = big_binned

In [None]:
## export tsv (tab-separated) for the second binning method
## NOTE(review): to_csv also writes the row index as an unnamed first column by default -- confirm downstream tools expect that
big_geo_bin_path = '/Users/nashwa/Desktop/git/hmpxv_dynamics/monkeypox-build/data/big_geo_bin.tsv'
big_meta.to_csv(big_geo_bin_path, sep='\t')

In [None]:
## in case to_csv causes issues with date parsing in BEAUTI -- e.g. a slash-separated
## date such as '5/07/2022' being read in Y/M/D order -- this rewrites every
## 'M/D/YYYY' string in the 'date' column as 'YYYY-MM-DD' (Y-m-d).

import datetime
import re

## exactly month/day/4-digit-year; matching the whole value (instead of testing the
## first character) covers every month and never touches dates already in Y-m-d form
_mdy_pattern = re.compile(r'\d{1,2}/\d{1,2}/\d{4}')


def _to_iso_date(value):
    """Return `value` as 'YYYY-MM-DD' when it is an 'M/D/YYYY' string.

    Anything else (non-strings such as NaN, other date layouts, or
    impossible dates like '13/45/2022') is returned unchanged.
    """
    if not isinstance(value, str) or _mdy_pattern.fullmatch(value) is None:
        return value
    try:
        parsed = datetime.datetime.strptime(value, '%m/%d/%Y')
    except ValueError:
        ## pattern matched but the calendar date is invalid; leave it for manual review
        return value
    return parsed.strftime('%Y-%m-%d')


## independent copy so the cleaned dates do not leak back into big_meta
m = big_meta.copy()

## convert the whole column at once; the earlier loop started at label 1 and skipped row 0
m['date'] = m['date'].map(_to_iso_date)

In [None]:
## export tsv (tab-separated) with the reformatted dates
## NOTE(review): to_csv also writes the row index as an unnamed first column by default -- confirm downstream tools expect that
clean_dates_path = '/Users/nashwa/Desktop/git/hmpxv_dynamics/monkeypox-build/data/clean_dates.tsv'
m.to_csv(clean_dates_path, sep='\t')