## script for combining each variant build fasta with aligned sequences and metadata with cluster assignments into two seperate combined fasta and metadata files. This is the last step before feeding into MASCOT GLM

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
alpha_clus = "../data/kc_clusters_alpha_new.tsv"
delta_clus = "../data/kc_clusters_delta_new.tsv"
omicron_clus = "../data/kc_clusters_omicron_new.tsv"
other_clus = "../data/kc_clusters_other_new.tsv"

alpha_fasta = "../data/kc_clusters_alpha_new.fasta"
delta_fasta = "../data/kc_clusters_delta_new.fasta"
omicron_fasta = "../data/kc_clusters_omicron_new.fasta"
other_fasta = "../data/kc_clusters_other_new.fasta"

First we'll focus on combining the cluster files as updating the cluster numbers to continue in numberical order 

In [3]:
alphadf = pd.read_csv(alpha_clus, sep ="\t")
deltadf = pd.read_csv(delta_clus, sep ="\t")
omicrondf = pd.read_csv(omicron_clus, sep ="\t")
otherdf = pd.read_csv(other_clus, sep ="\t")


In [4]:
#this makes a list of all unique clusters and keeps running count in order to stitch everything into a single list
#with continuous numbering
df_list = [alphadf, deltadf, omicrondf, otherdf]
unique_count = 0
for df in df_list:
    df['is_unique'] = ~df['cluster'].duplicated()
    print( df.is_unique.value_counts())
    print( df.is_unique.value_counts()[1])
    unique_count = unique_count + df.is_unique.value_counts()[1]
        

False    1738
True     1133
Name: is_unique, dtype: int64
1133
True     1754
False    1167
Name: is_unique, dtype: int64
1754
True     1689
False    1122
Name: is_unique, dtype: int64
1689
True     1388
False    1388
Name: is_unique, dtype: int64
1388


In [5]:
unique_list = ([i +1 for i in range(unique_count)])
#concantinates each variant list all together
result = pd.concat(df_list)

In [6]:
#in the new df, identify again which cluster is unique and then assign it a number from the single list with 
#continuous numbering
unique_column = np.array(result.is_unique).astype(int)
unique_column[unique_column==1] = unique_list
result["new_clusters"] = unique_column

In [7]:
#for every other member of a cluster other than the first one, we first make it a NA and then forward fill with the 
#new cluster number in the continuous list.
result.new_clusters[result.new_clusters == 0 ] = np.nan
result.loc[:,'new_clusters'] = result.loc[:,'new_clusters'].ffill()
result['new_clusters'] = result.new_clusters.astype(int)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result.new_clusters[result.new_clusters == 0 ] = np.nan


In [8]:
result

Unnamed: 0,strain,cluster,location,date,variant,is_unique,new_clusters
0,USA/WA-UW-21050529358/2021,1,North_King_County,2021-05-05,alpha,True,1
1,USA/WA-UW-21050476716/2021,2,North_King_County,2021-05-04,alpha,True,2
2,USA/WA-UW-2021041397284/2021,3,North_King_County,2021-04-13,alpha,True,3
3,USA/WA-UW-2021041489375/2021,3,North_King_County,2021-04-14,alpha,False,3
4,USA/WA-Altius-ALTCOV-NBDJ5QXGCUU6C2F6/2021,3,South_King_County,2021-04-27,alpha,False,3
...,...,...,...,...,...,...,...
2771,USA/WA-CDC-UW22030161746/2022,1423,North_King_County,2022-03-01,other,True,5962
2772,USA/WA-CDC-UW22030214877/2022,1424,North_King_County,2022-03-02,other,True,5963
2773,USA/WA-CDC-UW22030572755/2022,1424,North_King_County,2022-03-05,other,False,5963
2774,USA/WA-CDC-UW22022865732/2022,1425,North_King_County,2022-02-28,other,True,5964


In [9]:
result.to_csv('../data/kc_clusters_combined_new.tsv',index = False , sep="\t")

now we focus on combining all the fasta files together into one file. 

In [None]:
##adapted from : https://www.biostars.org/p/270186/

direct = '../data/fasta/'

combined = open('../data/kc_clusters_combined_new.fasta', 'w')
for f in os.listdir(direct):
    fh = open(os.path.join(direct, f))
    for line in fh:
        combined.write(line)
    fh.close()
combined.close()