## working on subsampling based on case counts

In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import random
import altair as alt

In [2]:
#read in seq data
seqs = pd.read_csv("../monkeypox-build/data/metadata_region_3000_all.tsv", sep = "\t")
raw_cases = pd.read_csv("../case-rt-analysis/data/monkeypox-cases-counts.tsv", sep="\t")
raw_cases = raw_cases[raw_cases.location != "World"]
raw_cases.location[raw_cases["location"] == "United States"] = "USA"
#raw_cases.location[raw_cases["location"] == "United Kingdom"] = "UnitedKingdom"

In [3]:
seqs

Unnamed: 0.1,Unnamed: 0,accession,date,region_coarse,country,strain
0,0,ON838940,2022-05-20,Southern Europe,Spain,4061
1,1,ON838939,2022-06-06,Southern Europe,Spain,8887
2,2,OP413713,2022-07-25,Western Europe,United Kingdom,9000231
3,3,OP413714,2022-08-01,Western Europe,United Kingdom,9000289
4,4,OP413715,2022-08-17,Western Europe,United Kingdom,9000327
...,...,...,...,...,...,...
3008,3008,OP440544,2022-07-25,North America,USA,hMPX/Human/USA/CA-LACPHL-MA00053/2022
3009,3009,OP722496,2022-07-20,North America,USA,MPX/Human/USA/CA-LACPHL-MA00237/2022
3010,3010,OP539929,2022-08-30,North America,USA,MPX/Human/USA/CA-LACPHL-MA00144/2022
3011,3011,OQ330985,2022-10-12,North America,USA,Monkeypox virus/Human/USA/CA-LACPHL-MA00379/2022


In [4]:
seqs.date = pd.to_datetime(seqs.date)
raw_cases.date = pd.to_datetime(raw_cases.date)

In [5]:
seqs["year_month"] = seqs.date.dt.strftime('%Y-%m')
raw_cases["year_month"] = raw_cases.date.dt.strftime('%Y-%m')


In [6]:
seqs.country.value_counts()

USA               1220
Germany            646
Portugal           501
United Kingdom     286
Canada             140
Colombia            89
Slovenia            34
France              25
Belgium             18
Slovakia            11
Austria             11
Peru                10
Spain                9
Switzerland          7
Italy                6
Name: country, dtype: int64

In [7]:
unique_dates = seqs.year_month.unique().tolist()

In [8]:
unique_dates.remove("2021-11") 
unique_dates.remove("2022-03")
#raw_cases

In [9]:
unique_dates

['2022-05',
 '2022-06',
 '2022-07',
 '2022-08',
 '2022-09',
 '2022-10',
 '2022-11',
 '2022-12',
 '2023-01']

In [10]:
monthly_cases = raw_cases.groupby(["year_month", "location"])["cases"].sum().reset_index()

In [11]:
unique_locations = monthly_cases.location.unique().tolist()

In [12]:
unique_locations

['Austria',
 'Belgium',
 'France',
 'Germany',
 'Italy',
 'Portugal',
 'Slovenia',
 'Spain',
 'Switzerland',
 'United Kingdom',
 'Canada',
 'Colombia',
 'Peru',
 'USA',
 'Slovakia']

In [13]:
temp_seqs = pd.DataFrame()
temp_cases = pd.DataFrame()
final_df = pd.DataFrame(columns = ["accession", "date", "region_coarse", "country", "strain", "year_month"])
for item in unique_dates: 
    temp_seqs = seqs[seqs.year_month == item]
    temp_cases = monthly_cases[monthly_cases.year_month == item]
    sampled_countries = random.choices(population = temp_cases.location.unique().tolist(), weights = temp_cases.cases, k = int(950/len(monthly_cases.year_month.unique().tolist())))
    #print(sampled_countries)
    for country in sampled_countries:
        try:
            subset_seqs = pd.DataFrame()
            subset_seqs = temp_seqs[temp_seqs.country == country].sample(1)
            temp_seqs.drop(subset_seqs.index, inplace = True)
            #print(subset_seqs)
            final_df = final_df.append(subset_seqs, ignore_index=True)
        except ValueError:
            continue 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_seqs.drop(subset_seqs.index, inplace = True)
  final_df = final_df.append(subset_seqs, ignore_index=True)


In [14]:
final_df.region_coarse.value_counts()

North America      290
Western Europe     131
Southern Europe     52
South America       43
Central Europe       1
Name: region_coarse, dtype: int64

In [15]:
#Adding back in the earliest seqs
final_df =  final_df.append(seqs[seqs.year_month == "2022-03"])

  final_df =  final_df.append(seqs[seqs.year_month == "2022-03"])


In [16]:
region_count = final_df.groupby(["year_month"])["country"].value_counts().reset_index(name="count")

In [17]:
colors = {
    "Austria": "#B3C8BC",
    "Belgium": "#438B62",
    "Canada": "#133253",
    "Colombia": "#2D1E45",
    "France": "#CCDAD2",
    "Germany": "#356D4C",
    "Italy": "#D07954",
    "Peru": "#695B82",
    "Portugal": "#EEA160",
    "Slovakia": "#D0A854",
    "Slovenia": "#EEC060",
    "Spain": "#F4C49C",
    "Switzerland": "#81A48F",
    "United Kingdom": "#1C3828",
    "USA": "#2664A5"
}


In [19]:


region_count['order'] = region_count['country'].replace(
    {val: i for i, val in enumerate(colors.keys())}
)

alt.Chart(region_count).mark_bar(size = 70).encode(
    alt.X("year_month:T",axis=alt.Axis(title = "", grid=False, )),
    alt.Y("count",axis=alt.Axis( grid=False, offset = 30)), 
    alt.Color("country:N", scale=alt.Scale(domain=list(colors.keys()), range=list(colors.values())), legend=alt.Legend(title = "Country", orient = "right",offset = -125, labelFontSize = 14, titleFontSize = 14, symbolSize = 110)),
    alt.Order("order", sort= "descending")).properties(
    width = 800,
    height = 600
).configure_view(
    strokeWidth=0)

In [235]:
final_df["accession"].tolist()

['OP414372',
 'ON694341',
 'ON950045',
 'ON619835',
 'ON682267',
 'OP205139',
 'ON782055',
 'ON745225',
 'ON745215',
 'ON619838',
 'ON694335',
 'ON682263',
 'ON780017',
 'OP414375',
 'ON808416',
 'OP022171',
 'ON843163',
 'ON843169',
 'ON619837',
 'ON808413',
 'ON649712',
 'ON782054',
 'OP120937',
 'ON649709',
 'ON585036',
 'ON619836',
 'OP394229',
 'OP205124',
 'ON755040',
 'ON622718',
 'OP205114',
 'ON614676',
 'ON808417',
 'OP022170',
 'ON585035',
 'ON838940',
 'ON649720',
 'OP205084',
 'ON843173',
 'ON808414',
 'ON585032',
 'ON755039',
 'OP205083',
 'ON808415',
 'OP205067',
 'ON622722',
 'OP414363',
 'ON843167',
 'OP414362',
 'ON780016',
 'ON694336',
 'OP414360',
 'ON622712',
 'ON649711',
 'OP414377',
 'ON843172',
 'ON682265',
 'ON843165',
 'ON682268',
 'OQ274887',
 'ON585037',
 'OP414367',
 'OP205110',
 'OP279043',
 'OP205081',
 'OP580156',
 'OP382478',
 'OP205091',
 'OP715818',
 'OP390189',
 'OP838893',
 'ON880538',
 'OP414324',
 'ON853680',
 'OP324514',
 'OP390188',
 'OP324524',

In [236]:
final_df.accession.to_csv("../monkeypox-build/data/include_list_500.tsv", sep = "\t", header = False, index = False)