## script for combining each variant build fasta with aligned sequences and metadata with cluster assignments into two seperate combined fasta and metadata files. This is the last step before feeding into MASCOT GLM

In [7]:
import pandas as pd
import numpy as np
import os

In [8]:
alpha_clus = "../data/kc_clusters_alpha_new.tsv"
delta_clus = "../data/kc_clusters_delta_new.tsv"
omicron_clus = "../data/kc_clusters_omicron_new.tsv"
other_clus = "../data/kc_clusters_other_new.tsv"

alpha_fasta = "../data/kc_clusters_alpha_new.fasta"
delta_fasta = "../data/kc_clusters_delta_new.fasta"
omicron_fasta = "../data/kc_clusters_omicron_new.fasta"
other_fasta = "../data/kc_clusters_other_new.fasta"

First we'll focus on combining the cluster files as updating the cluster numbers to continue in numberical order 

In [9]:
alphadf = pd.read_csv(alpha_clus, sep ="\t")
deltadf = pd.read_csv(delta_clus, sep ="\t")
omicrondf = pd.read_csv(omicron_clus, sep ="\t")
otherdf = pd.read_csv(other_clus, sep ="\t")


In [10]:
alphadf

Unnamed: 0,strain,cluster,location,date,variant
0,USA/WA-UW-58813/2021,1,North_King_County,2021-01-29,alpha
1,USA/WA-UW-67093/2021,2,South_King_County,2021-03-15,alpha
2,USA/WA-UW-67915/2021,3,South_King_County,2021-03-16,alpha
3,USA/WA-S6887/2021,4,South_King_County,2021-04-06,alpha
4,USA/WA-CDC-UW21051428325/2021,5,South_King_County,2021-05-14,alpha
...,...,...,...,...,...
2790,USA/WA-UW-2021040843617/2021,1130,North_King_County,2021-04-08,alpha
2791,USA/WA-UW-21071214811/2021,1131,South_King_County,2021-07-12,alpha
2792,USA/WA-S10320/2021,1131,South_King_County,2021-07-08,alpha
2793,USA/WA-CDC-UW21062453012/2021,1131,South_King_County,2021-06-24,alpha


In [11]:
#this makes a list of all unique clusters and keeps running count in order to stitch everything into a single list
#with continuous numbering
df_list = [alphadf, deltadf, omicrondf, otherdf]
unique_count = 0
for df in df_list:
    df['is_unique'] = ~df['cluster'].duplicated()
    print( df.is_unique.value_counts())
    print( df.is_unique.value_counts()[1])
    unique_count = unique_count + df.is_unique.value_counts()[1]
        

False    1699
True     1096
Name: is_unique, dtype: int64
1096
True     1668
False    1101
Name: is_unique, dtype: int64
1668
True     1704
False     829
Name: is_unique, dtype: int64
1704
True     1369
False    1242
Name: is_unique, dtype: int64
1369


In [12]:
unique_list = ([i +1 for i in range(unique_count)])
#concantinates each variant list all together
result = pd.concat(df_list)

In [13]:
#in the new df, identify again which cluster is unique and then assign it a number from the single list with 
#continuous numbering
unique_column = np.array(result.is_unique).astype(int)
unique_column[unique_column==1] = unique_list
result["new_clusters"] = unique_column

In [14]:
#for every other member of a cluster other than the first one, we first make it a NA and then forward fill with the 
#new cluster number in the continuous list.
result.new_clusters[result.new_clusters == 0 ] = np.nan
result.loc[:,'new_clusters'] = result.loc[:,'new_clusters'].ffill()
result['new_clusters'] = result.new_clusters.astype(int)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result.new_clusters[result.new_clusters == 0 ] = np.nan


In [15]:
result

Unnamed: 0,strain,cluster,location,date,variant,is_unique,new_clusters
0,USA/WA-UW-58813/2021,1,North_King_County,2021-01-29,alpha,True,1
1,USA/WA-UW-67093/2021,2,South_King_County,2021-03-15,alpha,True,2
2,USA/WA-UW-67915/2021,3,South_King_County,2021-03-16,alpha,True,3
3,USA/WA-S6887/2021,4,South_King_County,2021-04-06,alpha,True,4
4,USA/WA-CDC-UW21051428325/2021,5,South_King_County,2021-05-14,alpha,True,5
...,...,...,...,...,...,...,...
2606,USA/WA-CDC-UW21051960527/2021,1453,North_King_County,2021-05-19,other,True,5834
2607,USA/WA-S8491/2021,1454,South_King_County,2021-05-22,other,True,5835
2608,USA/WA-CDC-UW21050739083/2021,1455,South_King_County,2021-05-07,other,True,5836
2609,USA/WA-CDC-UW21050633664/2021,1456,North_King_County,2021-05-06,other,True,5837


In [16]:
result.to_csv('../data/kc_clusters_combined_new.tsv',index = False , sep="\t")

now we focus on combining all the fasta files together into one file. 

In [None]:
##adapted from : https://www.biostars.org/p/270186/

direct = '../data/fasta/'

combined = open('../data/kc_clusters_combined_new.fasta', 'w')
for f in os.listdir(direct):
    fh = open(os.path.join(direct, f))
    for line in fh:
        combined.write(line)
    fh.close()
combined.close()