# Signature mutations in patient samples

Notebook to assess the prevalence of the signature mutations defining the lineages B.1.1.7 and 501.V2 in all 5758 non-B.1.1.7 and non-501.V2 consensus sequences from GISAID obtained from clinical samples collected in Switzerland before December 24, 2020.

In [None]:
import pandas as pd

In [None]:
def count_UK_mutations(sequence):
    '''
    Count the signature mutations of the B.1.1.7 lineage in one aligned sequence.

    sequence: aligned sequence as string. Positions are read at fixed offsets,
              so the string has to cover index 28976; a shorter string raises
              an IndexError.
    return:   dict with
              'sum'    - number of signature mutations found,
              'co_occ' - list with the names of the mutations found if more
                         than one is present, otherwise an empty list,
              plus one 0/1 entry per signature mutation.
    '''
    # (mutation name, 0-based start index, accepted spellings of the variant).
    # The names carry 1-based positions, hence index = position - 1.
    signature = (
        ('C3267T', 3266, ('t', 'T')),
        ('C5388A', 5387, ('a', 'A')),
        ('T6954C', 6953, ('c', 'C')),
        # NOTE(review): each deletion window below is one position shorter
        # than the range in its name (8 of 9, 5 of 6 and 2 of 3 positions).
        # Kept as is so that the counts do not change - confirm whether the
        # last position should be checked as well.
        ('del_11288_11296', 11287, ('--------',)),
        ('del_21765_21770', 21764, ('-----',)),
        ('del_21991_21993', 21990, ('--',)),
        ('A23063T', 23062, ('t', 'T')),
        ('C23271A', 23270, ('a', 'A')),
        ('C23604A', 23603, ('a', 'A')),
        ('C23709T', 23708, ('t', 'T')),
        ('T24506G', 24505, ('g', 'G')),
        ('G24914C', 24913, ('c', 'C')),
        ('C27972T', 27971, ('t', 'T')),
        ('G28048T', 28047, ('t', 'T')),
        ('A28111G', 28110, ('g', 'G')),
        ('C28977T', 28976, ('t', 'T')),
        ('GAT28280CTA', 28279, ('cta', 'CTA')),
    )

    counts = {}
    for name, start, accepted in signature:
        width = len(accepted[0])
        if width == 1:
            # plain indexing on purpose: a truncated sequence must fail loudly
            # instead of being silently counted as "mutation absent"
            observed = sequence[start]
        else:
            observed = sequence[start:start + width]
        counts[name] = 1 if observed in accepted else 0

    # sum of mutations found in current consensus sequence
    total = sum(counts.values())

    # check for co-occurrence of mutations: every mutation is reported from
    # its own counter
    co_occurring = []
    if total > 1:
        co_occurring = [name for name, found in counts.items() if found > 0]

    # information about the current consensus sequence (sample); the key order
    # is 'sum', 'co_occ', then the mutations in the order of the table above
    result = {'sum': total, 'co_occ': co_occurring}
    result.update(counts)
    return result
    

In [None]:
def count_SA_mutations(sequence):
    '''
    Count the signature mutations of the 501.V2 lineage in one aligned sequence.

    sequence: aligned sequence as string; it is indexed at fixed offsets up to
              index 28886, a shorter string raises an IndexError.
    return:   dict with
              'sum'    - number of signature mutations found,
              'co_occ' - string representation of the list of mutations found
                         if more than one is present, otherwise '[]',
              plus one 0/1 entry per signature mutation.
    '''
    to_a = {'a', 'A'}
    to_c = {'c', 'C'}
    to_g = {'g', 'G'}
    to_t = {'t', 'T'}

    # (mutation name, 0-based index = position in the name - 1, variant base)
    targets = (
        ('C1059T', 1058, to_t),
        ('G5230T', 5229, to_t),
        ('A10323G', 10322, to_g),
        ('A21801C', 21800, to_c),
        ('G22813T', 22812, to_t),
        ('G23012A', 23011, to_a),
        ('A23063T', 23062, to_t),
        ('C23664T', 23663, to_t),
        ('G25563T', 25562, to_t),
        ('C25904T', 25903, to_t),
        ('C26456T', 26455, to_t),
        ('C28887T', 28886, to_t),
    )

    hits = {}
    for label, offset, variant in targets:
        hits[label] = 1 if sequence[offset] in variant else 0

    n_found = sum(hits.values())

    # names are only listed when at least two mutations occur together
    together = []
    if n_found > 1:
        together = [label for label in hits if hits[label] > 0]

    # the list is stored as a string
    summary = {'sum': n_found, 'co_occ': str(together)}
    summary.update(hits)
    return summary

### Collect Swiss patient samples from sars_cov_2-ETHZ-database 

In [None]:
from getpass import getpass

import psycopg2

db_host = 'id-hdb-psgr-cp61.ethz.ch'
db_name = 'sars_cov_2'
db_user = input("Enter username for database " + db_name + ":\n")
# getpass does not echo the typed password, so it does not end up in the
# notebook output.
db_password = getpass("Enter password for user " + db_user + ":\n")

# Connect to database. The parameters are passed as keywords so that psycopg2
# does the quoting; credentials containing quotes or spaces cannot break the
# connection string.
try:
    conn = psycopg2.connect(dbname=db_name, user=db_user, host=db_host,
                            password=db_password)
except psycopg2.Error as e:
    raise Exception("I am unable to connect to the database.", e) from e

# Fetch all sequences with country 'Switzerland'. The cursor is closed by the
# with-block and the connection by the finally-clause, also when the query
# fails.
try:
    with conn.cursor() as cursor:
        cursor.execute("select gs.strain, gs.date, gs.date_str, gs.division, gs.pangolin_lineage, gs.originating_lab, gs.submitting_lab, gs.date_submitted, gs.aligned_seq from gisaid_sequence gs where gs.country ='Switzerland'")
        data = cursor.fetchall()
finally:
    conn.close()

### Summarize data in dataframe df

In [None]:
# One row per fetched database record; the column names are given in the
# order of the fields in the select statement.
df = pd.DataFrame(
    data,
    columns=(
        'strain',
        'date',
        'date_str',
        'division',
        'pangolin_lineage',
        'originating_lab',
        'submitting_lab',
        'date_submitted',
        'aligned_seq',
    ),
)

### Exclude sequences that are already identified as UK- or SA-variant from the analysis

In [None]:
# Drop the sequences that are already assigned to one of the two variants.
# The 501.V2 variant carries the Pango lineage name B.1.351, so both labels
# are excluded. isin() is False for a missing lineage, so sequences without
# an assignment stay in the analysis.
excluded_lineages = [
    'B.1.1.7',   # UK variants
    '501.V2',    # SA variants
    'B.1.351',   # SA variants (Pango designation)
]
df = df[~df['pangolin_lineage'].isin(excluded_lineages)]

### Analysis is only performed with sequences with a sample date before 24 Dec 2020

In [None]:
# Parse the sample date into its own column. assign() returns a new dataframe
# instead of writing into df, which may itself be a filtered selection
# (avoids the pandas SettingWithCopyWarning).
df = df.assign(date_datetime=pd.to_datetime(df['date']))
# NOTE(review): "<=" also keeps samples dated 2020-12-24 itself, while the
# section heading says "before" - confirm which one is intended.
df = df[df['date_datetime'] <= '2020-12-24']

### Number of samples analysed 

In [None]:
# number of rows (sequences) left after the filters
len(df)

### Prevalence of the signature mutations of the B.1.1.7 lineage summarized in df_UK

In [None]:
# columns of the dataframe where we count the observed mutations
# NOTE(review): 'A28095T' is listed here, but count_UK_mutations as defined in
# this notebook returns no such key, so that column is never filled - confirm
# whether the check is missing or the column should go.
UK_columns = ['id', 'division', 'date', 'sum', 'co_occ', 'n_sample', 'C3267T', 'C5388A', 'T6954C',
              'del_11288_11296', "del_21765_21770", 'del_21991_21993', 'A23063T',
              'C23271A', 'C23604A', 'C23709T', 'T24506G', 'G24914C', 'C27972T',
              'G28048T', 'A28111G', 'C28977T', 'GAT28280CTA', 'A28095T']

# go through each consensus sequence of the clinical samples and check if the
# mutations are found. The rows are collected first and the dataframe is built
# once: DataFrame.append copied the whole frame on every call and is no longer
# available in pandas >= 2.0.
UK_records = []
for index, row in df.iterrows():
    info_dict = {'id': row['strain'], 'division': row['division'], 'date': row['date'], 'n_sample': 1}
    mut_dict = count_UK_mutations(row['aligned_seq'])
    info_dict.update(mut_dict)
    UK_records.append(info_dict)

df_UK = pd.DataFrame(UK_records, columns=UK_columns)

In [None]:
# Show the sequences in which at least one signature mutation was counted
df_UK.loc[df_UK['sum'] > 0]

### Prevalence of the UK signature mutations before and after 2020-10-23

In [None]:
cut_off_date = '2020-10-23'

# split the samples at the cut-off date (the cut-off date itself belongs to
# the "after" part)
df_UK['date_datetime'] = pd.to_datetime(df_UK['date'])
df_UK_before_oct = df_UK[df_UK['date_datetime'] < cut_off_date]
df_UK_after_oct = df_UK[df_UK['date_datetime'] >= cut_off_date]

UK_mut = ['C3267T', 'C5388A', 'T6954C', 'del_11288_11296', "del_21765_21770", 'del_21991_21993', 'A23063T', 'C23271A',
          'C23604A', 'C23709T', 'T24506G', 'G24914C', 'C27972T', 'G28048T', 'A28111G', 'C28977T', 'GAT28280CTA',
          'A28095T']

# One frequency table per sample set: all samples, before and after the
# cut-off. The rows are collected in a list and each table is built once,
# because DataFrame.append is no longer available in pandas >= 2.0.
UK_freq_tables = []
for samples in (df_UK, df_UK_before_oct, df_UK_after_oct):
    n_samples = samples.shape[0]
    freq_records = []
    for mut in UK_mut:
        abs_freq = samples[mut].sum()
        if n_samples:
            relativ_freq = abs_freq / n_samples * 100  # in percent
        else:
            # no samples in this set: the relative frequency is undefined
            relativ_freq = float('nan')
        freq_records.append({'mutation': mut,
                             'abs_freq': abs_freq,
                             'relativ_freq': relativ_freq})
    UK_freq_tables.append(pd.DataFrame(freq_records,
                                       columns=['mutation', 'abs_freq', 'relativ_freq']))

df_UK_freq, df_UK_freq_before_oct, df_UK_freq_after_oct = UK_freq_tables
    
#df_UK_freq.to_csv('df_UK_freq.csv')
#df_UK_freq_before_oct.to_csv('df_UK_freq_before_oct.csv')
#df_UK_freq_after_oct.to_csv('df_UK_freq_after_oct.csv')

### Prevalence of the signature mutations of the 501.V2 lineage summarized in df_SA

In [None]:
# columns of the dataframe where we count the observed mutations
SA_columns = ['id', 'division', 'date', 'sum', 'co_occ', 'n_sample', 'C1059T', 'G5230T', 'A10323G', 'A21801C',
              'G22813T', 'G23012A', 'A23063T', 'C23664T', 'G25563T', 'C25904T', 'C26456T', 'C28887T']

# go through each consensus sequence of the clinical samples and check if the
# mutations are found. The rows are collected first and the dataframe is built
# once: DataFrame.append copied the whole frame on every call and is no longer
# available in pandas >= 2.0.
SA_records = []
for index, row in df.iterrows():
    info_dict = {'id': row['strain'], 'division': row['division'], 'date': row['date'], 'n_sample': 1}
    mut_dict = count_SA_mutations(row['aligned_seq'])
    info_dict.update(mut_dict)
    SA_records.append(info_dict)

df_SA = pd.DataFrame(SA_records, columns=SA_columns)

In [None]:
# Show the sequences in which at least one signature mutation was counted
df_SA.loc[df_SA['sum'] > 0]

In [None]:
cut_off_date = '2020-10-23'

# split the samples at the cut-off date (the cut-off date itself belongs to
# the "after" part)
df_SA['date_datetime'] = pd.to_datetime(df_SA['date'])
df_SA_before_oct = df_SA[df_SA['date_datetime'] < cut_off_date]
df_SA_after_oct = df_SA[df_SA['date_datetime'] >= cut_off_date]

SA_mut = ['C1059T', 'G5230T', 'A10323G', 'A21801C', 'G22813T', 'G23012A', 'A23063T', 'C23664T', 'G25563T',
          'C25904T', 'C26456T', 'C28887T']

# One frequency table per sample set: all samples, before and after the
# cut-off. The rows are collected in a list and each table is built once,
# because DataFrame.append is no longer available in pandas >= 2.0.
SA_freq_tables = []
for samples in (df_SA, df_SA_before_oct, df_SA_after_oct):
    n_samples = samples.shape[0]
    freq_records = []
    for mut in SA_mut:
        abs_freq = samples[mut].sum()
        if n_samples:
            relativ_freq = abs_freq / n_samples * 100  # in percent
        else:
            # no samples in this set: the relative frequency is undefined
            relativ_freq = float('nan')
        freq_records.append({'mutation': mut,
                             'abs_freq': abs_freq,
                             'relativ_freq': relativ_freq})
    SA_freq_tables.append(pd.DataFrame(freq_records,
                                       columns=['mutation', 'abs_freq', 'relativ_freq']))

df_SA_freq, df_SA_freq_before_oct, df_SA_freq_after_oct = SA_freq_tables
    
#df_SA_freq.to_csv('df_SA_freq.csv')
#df_SA_freq_before_oct.to_csv('df_SA_freq_before_oct.csv')
#df_SA_freq_after_oct.to_csv('df_SA_freq_after_oct.csv')

### Grouping the co-occurring mutations together

In [None]:
# Keep the samples with more than one signature mutation, remove the
# per-mutation columns together with 'id' and 'sum', and add up what is left
# for every distinct combination in 'co_occ'.
# NOTE(review): sum() is applied to every remaining column, not only to
# 'n_sample' - confirm that this is intended.
SA_columns_to_drop = ['C1059T', 'G5230T', 'A10323G', 'A21801C', 'G22813T', 'G23012A', 'A23063T',
                      'C23664T', 'G25563T', 'C25904T', 'C26456T', 'C28887T', 'id', 'sum']
df_SA_co_occ = (
    df_SA.loc[df_SA['sum'] > 1]
    .drop(columns=SA_columns_to_drop)
    .groupby('co_occ')
    .sum()
)
df_SA_co_occ