Hitchhiking mutations refer to genetic variations that become more prevalent in a population due to their close physical linkage to a specific beneficial mutation. In essence, these mutations "hitch a ride" with the advantageous mutation during the process of natural selection, leading to their increased frequency in the population over time.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import numpy as np 
import matplotlib.pylab as plt

In [63]:
# Load the co-occurring mutation calls from CSV.
fname = "crpv.cooccurring_mutations.csv"
df = pd.read_csv(fname)

# Build a compact label: reference base + position + variant base (e.g. "G1323A").
# The position text may end in ".0" (presumably because the column is parsed as
# float -- TODO confirm), so strip only a trailing literal ".0".
# NOTE: the earlier `.str.split(".0").str[0]` treated ".0" as a regular
# expression ("any character followed by 0") and therefore truncated every
# position containing a zero, e.g. "1303.0" -> "1".
position_label = df['position'].astype(str).str.replace(r"\.0$", "", regex=True)
df['mutation'] = df['ref'] + position_label + df['var']

# Keep a fixed subset of columns; everything else from the CSV is dropped.
df = df[[
    'haplotype_id', 'start', 'position', 'ref', 'var',
    'reads', 'support', 'coverage', 'passage', 'file', 'mutation',
]]

In [64]:
# The two mutations whose co-occurrence is examined below.
mutations_of_interest = ['G1323A', 'G1363A']

# Restrict to those mutations and to the "K ... a" passages: only in sample
# K-10-a does it look like these could be hitchhiker mutations.
df = df.loc[
    df['mutation'].isin(mutations_of_interest)
    & df['passage'].isin(["K_p10_a", "K_p5_a", "K_p3_a", "K_p2_a", "K_p1_a"])
]

In [65]:
# Which haplotype windows cover the two positions of interest?
#   1279-1332: 1323 only
#   1297-1350: 1323 only
#   1315-1368: both 1323 and 1363
#   1333-1386: 1363 only
#   1351-1404: 1363 only
# List the distinct haplotype ids remaining after the filters above.
df.loc[:, 'haplotype_id'].unique()

array(['haplotype0-1315-1368', 'haplotype0-1333-1386',
       'haplotype0-1297-1350', 'haplotype2-1279-1332',
       'haplotype2-1315-1368', 'haplotype3-1315-1368',
       'haplotype5-1315-1368', 'haplotype1-1333-1386',
       'haplotype2-1333-1386', 'haplotype6-1333-1386',
       'haplotype0-1351-1404', 'haplotype2-1351-1404',
       'haplotype3-1351-1404', 'haplotype5-1351-1404',
       'haplotype2-1297-1350', 'haplotype1-1315-1368',
       'haplotype4-1315-1368', 'haplotype6-1315-1368',
       'haplotype3-1333-1386', 'haplotype5-1333-1386',
       'haplotype7-1333-1386', 'haplotype1-1351-1404',
       'haplotype4-1351-1404', 'haplotype1-1297-1350',
       'haplotype0-1279-1332', 'haplotype1-1279-1332',
       'haplotype4-1279-1332', 'haplotype5-1279-1332',
       'haplotype6-1279-1332'], dtype=object)

In [66]:
# Keep only the window starting at 1315, the one that spans both mutations.
df = df.loc[df['start'].eq(1315)]

In [67]:
# Preview up to 50 rows, ordered by passage and then haplotype, restricted to
# the columns needed to compare read support for the two mutations.
(
    df[['passage', 'haplotype_id', 'mutation', 'reads', 'coverage', 'support']]
    .sort_values(by=['passage', 'haplotype_id'])
    .head(50)
)

Unnamed: 0,passage,haplotype_id,mutation,reads,coverage,support
53721,K_p10_a,haplotype0-1315-1368,G1323A,1.0,4803.0,0.699822
53724,K_p10_a,haplotype1-1315-1368,G1323A,3.0,4803.0,0.999885
53726,K_p10_a,haplotype1-1315-1368,G1363A,3.0,4803.0,0.999885
53727,K_p10_a,haplotype2-1315-1368,G1323A,4756.0,4803.0,1.0
53728,K_p10_a,haplotype2-1315-1368,G1363A,4756.0,4803.0,1.0
53729,K_p10_a,haplotype3-1315-1368,G1323A,2.0,4803.0,0.829556
53731,K_p10_a,haplotype3-1315-1368,G1363A,2.0,4803.0,0.829556
53732,K_p10_a,haplotype4-1315-1368,G1323A,5.0,4803.0,1.0
53734,K_p10_a,haplotype4-1315-1368,G1363A,5.0,4803.0,1.0
53735,K_p10_a,haplotype5-1315-1368,G1323A,26.0,4803.0,1.0


## Get additional information about those mutations



In [68]:
# Load the annotated mutation table from CSV.
df_anno = pd.read_csv("all_mutations.annotated.filtered.csv")

In [71]:
# Label each annotation row as reference base + position + alternate base,
# in the same "G1323A" form used for `mutations_of_interest`.
df_anno['mutation'] = (
    df_anno['REF']
    .add(df_anno['POS'].astype(str))
    .add(df_anno['ALT'])
)

In [72]:
# Keep only the annotation rows for the mutations of interest.
df_anno = df_anno.loc[df_anno['mutation'].isin(mutations_of_interest)]

In [75]:
# Narrow the annotation table to the codon / amino-acid / feature columns
# plus the mutation label.
df_anno = df_anno.loc[:, [
    'RefCodon',
    'AltCodon',
    'RefAminoAcid',
    'AltAminoAcid',
    'CodonPosition',
    'SNPCodonPosition',
    'AminoAcidChange',
    'IsSynonymous',
    'Product',
    'ProteinID',
    'VariantType',
    'FeatureType',
    'mutation',
]]

In [76]:
# Display the annotation rows kept for the mutations of interest.
# The same mutation appears on several rows (presumably one per sample/file -- TODO confirm).
df_anno

Unnamed: 0,RefCodon,AltCodon,RefAminoAcid,AltAminoAcid,CodonPosition,SNPCodonPosition,AminoAcidChange,IsSynonymous,Product,ProteinID,VariantType,FeatureType,mutation
21392,CAG,CAA,Q,Q,205,2,Q205Q,1,nonstructural[space]polyprotein,NP_647481.1,SNP,CDS,G1323A
21394,GAT,AAT,D,N,219,0,D219N,0,nonstructural[space]polyprotein,NP_647481.1,SNP,CDS,G1363A
23487,GAT,AAT,D,N,219,0,D219N,0,nonstructural[space]polyprotein,NP_647481.1,SNP,CDS,G1363A
24255,CAG,CAA,Q,Q,205,2,Q205Q,1,nonstructural[space]polyprotein,NP_647481.1,SNP,CDS,G1323A
24258,GAT,AAT,D,N,219,0,D219N,0,nonstructural[space]polyprotein,NP_647481.1,SNP,CDS,G1363A
25035,CAG,CAA,Q,Q,205,2,Q205Q,1,nonstructural[space]polyprotein,NP_647481.1,SNP,CDS,G1323A
25038,GAT,AAT,D,N,219,0,D219N,0,nonstructural[space]polyprotein,NP_647481.1,SNP,CDS,G1363A
25763,CAG,CAA,Q,Q,205,2,Q205Q,1,nonstructural[space]polyprotein,NP_647481.1,SNP,CDS,G1323A
25766,GAT,AAT,D,N,219,0,D219N,0,nonstructural[space]polyprotein,NP_647481.1,SNP,CDS,G1363A
25978,CAG,CAA,Q,Q,205,2,Q205Q,1,nonstructural[space]polyprotein,NP_647481.1,SNP,CDS,G1323A
