In [4]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from datetime import datetime as dt
from scipy.signal import lfilter
import pickle as pickle
import sys

In [6]:
# Load the CNCB metadata sheet and derive a normalised, lowercase 'Country'
# column from 'Location' so country names can be matched later on.
column_names = [
    'Virus Strain Name',
    'Accession ID',
    'Related ID',
    'Nuc.Completeness',
    'Sequence Quality',
    'Host',
    'Location',
    'Sample Collection Date',
    'Submitting Lab',
]
# index_col=1 makes the column at position 1 of the selected columns the index
# (presumably 'Accession ID' -- confirm against the sheet's column order).
# Missing cells are replaced by a single space.
meta = pd.read_excel('../metadata_B_1_1_7.xlsx', usecols=column_names, index_col=1).fillna(' ')

# Drop low quality and partial reads in one pass (the original author was
# "pretty sure" CNCB does not run variant annotation for these anyway).
is_usable = (meta['Sequence Quality'] != 'Low') & (meta['Nuc.Completeness'] != 'Partial')
meta = meta[is_usable].copy()

# The country is the text before the first '/' in 'Location', stripped and lowercased.
country_raw = meta['Location'].str.split('/').str[0].str.strip().str.lower()

# Known spelling variants / typos mapped to the spelling used downstream.
country_fixes = {
    '\u200eromania': 'romania',
    'viet nam': 'vietnam',
    'czech repubic': 'czech republic',
    'ivory coast': 'cotedivoire',
}
meta['Country'] = country_raw.replace(country_fixes)

# Remove Crimea and Palestine
meta = meta[~meta['Country'].isin(['crimea', 'palestine'])]

print(meta.shape)

(6556, 9)


In [10]:
# Walk every sequence in `meta`, find its CNCB variant GFF3 file and collect the
# variants it lists.
#
# Results of this cell:
#   missing              -- identifiers for which no GFF3 file could be found
#   aggregated_mutations -- maps a comma-joined variant descriptor
#                           (start,end,Ref,Alt,VEP,variant type,Ref_AA,Alt_AA,AA)
#                           to a list of three lists; only the first one is
#                           filled here, with one country entry per occurrence.
gff_dir = '../gff3_cncb_restructured'
all_names = os.listdir(gff_dir)
# Set copy for O(1) membership tests inside the loop; `all_names` stays a list.
available_files = set(all_names)
column_names = ['variant type','start','end', 'info']
missing = []
aggregated_mutations = {}

# To restrict a run to one country, iterate over e.g.
# meta[meta['Country']=='united states'].index -- 'Country' values are lowercased.
for identifier in tqdm(meta.index):

    # Searching for correct identifier
    #--------------------------
    # Try the accession id first, then each comma-separated related id.
    # A 'Related ID' of ' ' means there is no alternate name.
    file_name = ''
    primary_name = '2019-nCoV_'+identifier+'_variants.gff3'
    if primary_name in available_files:
        file_name = primary_name
    else:
        related_ids = meta.loc[identifier,'Related ID']
        if related_ids != ' ':
            for alt_identifier in related_ids.replace(' ','').split(','):
                alt_name = '2019-nCoV_'+alt_identifier+'_variants.gff3'
                if alt_name in available_files:
                    file_name = alt_name
                    break
    # No file under any known name: record the identifier and move on.
    if file_name == '':
        missing.append(identifier)
        continue
    #--------------------------

    # Filtering files with no variants
    #--------------------------
    file_path = f'{gff_dir}/{file_name}'
    with open(file_path) as text_file:
        lines = text_file.readlines()

    # Count only the leading run of lines that START with '#'. Testing for '#'
    # anywhere in the line would also count data rows whose attributes contain
    # a '#', and skiprows below would then drop real variants.
    header_rows = 0
    for line in lines:
        if not line.startswith('#'):
            break
        header_rows += 1

    # Nothing but header lines means the file lists no mutations.
    if header_rows >= len(lines):
        continue
    #--------------------------

    country = meta.loc[identifier,'Country']

    gff = pd.read_csv(file_path,sep='\t',skiprows=header_rows,usecols=[1,3,4,8],names=column_names)
    # The info column is expected to hold five ';'-separated fields; the first
    # two are discarded, the rest are Ref, Alt and Description.
    info_df = pd.DataFrame(gff['info'].str.split(';').values.tolist(),columns=[0,1,'Ref','Alt','Description']).drop([0,1],axis=1)
    gff = gff.drop(['info'],axis=1)
    gff['Country'] = country
    temp_df = pd.concat([gff,info_df],axis=1)

    # Amino-acid details exist only for missense and synonymous variants.
    # They are read from the second ','-separated part of Description.
    is_missense = temp_df['Description'].str.contains('missense_variant')
    is_synonymous = temp_df['Description'].str.contains('synonymous_variant')
    aa_change = temp_df['Description'].str.split(',').str[1]

    # Reference residue: third character from the end for missense rows,
    # last character for synonymous rows.
    temp_df['Ref_AA'] = pd.concat([aa_change[is_missense].str[-3],aa_change[is_synonymous].str[-1]])
    # Alternate residue: last character, missense rows only.
    temp_df['Alt_AA'] = aa_change[is_missense].str[-1]

    # Full amino-acid change string: the text after the last '.'.
    aa_label = aa_change.str.split('.').str[-1]
    temp_df['AA'] = pd.concat([aa_label[is_missense],aa_label[is_synonymous]])

    # Rows without amino-acid details get empty strings so the string
    # concatenation below does not turn the whole descriptor into NaN.
    temp_df = temp_df.fillna('')
    vep_label = temp_df['Description'].str.split(',').str[0].str.split('=').str[1]
    temp_df['descriptor'] = (
        temp_df['start'].astype(str)+','+temp_df['end'].astype(str)+','+temp_df['Ref']+','+temp_df['Alt']
        +','+vep_label+','+temp_df['variant type']+','+temp_df['Ref_AA']+','+temp_df['Alt_AA']
        +','+temp_df['AA']
    )

    for var in temp_df['descriptor']:
        if var not in aggregated_mutations:
            aggregated_mutations[var] = [[],[],[]]

        # The country is the same for every mutation of the same file.
        aggregated_mutations[var][0].append(country)

print(len(missing))

100%|██████████| 6556/6556 [04:20<00:00, 25.19it/s]

0





In [13]:
# Build one row per unique variant descriptor, with the descriptor's
# comma-separated fields as columns, then attach occurrence statistics and
# write the table to CSV.
descriptor_col = pd.DataFrame(list(aggregated_mutations.keys()),columns=['descriptor'])
parsed_fields = pd.DataFrame(descriptor_col['descriptor'].str.split(',').to_list())
unique_muts = descriptor_col.join(parsed_fields)  # parsed descriptor fields become columns 0..8
unique_muts.set_index('descriptor',inplace=True)

# Columns 0 and 1 are the start/end positions; sort on them numerically.
for position_col in (0, 1):
    unique_muts[position_col] = pd.to_numeric(unique_muts[position_col])
unique_muts = unique_muts.sort_values([0,1])
unique_muts.columns = ['Start','End','Ref','Alt','VEP','Variant Type','Ref_AA','Alt_AA','AA']

# Per-descriptor country lists (first slot of each aggregated entry), in table order.
country_lists = {desc: aggregated_mutations[desc][0] for desc in unique_muts.index}

# Total occurrences, number of distinct countries, and occurrences per country.
counts = {desc: len(seen) for desc, seen in country_lists.items()}
num_countries = {desc: len(set(seen)) for desc, seen in country_lists.items()}
counted_countries = {desc: dict(Counter(seen)) for desc, seen in country_lists.items()}

descriptor_keys = unique_muts.index.to_series()
unique_muts['counts'] = descriptor_keys.map(counts)
unique_muts['countries'] = descriptor_keys.map(num_countries)
unique_muts['counted_countries'] = descriptor_keys.map(counted_countries)

# Shorten the index to the first four descriptor fields, joined by '_'.
unique_muts.index = unique_muts.index.str.split(',').str[0:4].str.join('_')
unique_muts.to_csv("B_1_1_7_variants.csv")