In [1]:
import argparse
from Bio import SeqIO
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import geopy.distance
from scipy import stats
import os
import altair as alt
import altair_saver

In [2]:
alignment = '/Users/miguelparedes/Desktop/gitrepos/kc_puma_build/results/north-america_usa_washington_king-county/aligned.fasta'
metadata = '/Users/miguelparedes/Desktop/gitrepos/kc_puma_build/data/metadata_kc_puma.tsv'

In [15]:
def strains_mapping(alignment, metadata):
    '''
    Returns dictionary mapping strains to location and sequence.
    Only includes strains with region = Seattle.
    Removes strains with location = '?'.
    '''
    diction = {}
    ambig = ['N', 'R', 'M', 'S', 'Y', 'W', 'K', 'D', 'B', 'H', '-']
    with open(alignment) as align:
        for index, record in enumerate(SeqIO.parse(align, 'fasta')):
            seq = str(record.seq)
            seq = seq.replace('A', '0')
            seq = seq.replace('T', '1')
            seq = seq.replace('G', '2')
            seq = seq.replace('C', '3')
            for letter in ambig:
                    seq = seq.replace(letter, "4")
            seq_array = np.asarray(list(seq), dtype = int)
            diction[record.id] = {'seq' : seq_array, 'location' : ''}
    with open(metadata) as mfile:
        meta_file = pd.read_csv(mfile, sep = '\t')
        for row in meta_file.itertuples():
            if row.location == 'King County':
                if row.strain in diction:
                    diction[row.strain]['ns_kc'] = row.ns_kc
                    diction[row.strain]['location'] = row.location
    mapping = {strains: attributes for strains, attributes in diction.items() if attributes['location'] != '' and attributes['ns_kc'] != "other_King County"}
    #subsetting to only SCAN seqs that have a n/s info
    mapping_df = pd.DataFrame.from_dict(mapping, orient = "index")
    mapping_df = mapping_df.groupby("ns_kc").sample(n=148, random_state=545647) #smashed on keyboard
    mapping_sub = mapping_df.to_dict('index')
    return mapping_sub


def hamming(array1, array2):
    '''
    calculates the number of nucleotide differences per site for each pair while not including those site with ambiguities
    '''
    array1_mask = np.ma.masked_where(array1 > 3, array1)
    array2_mask = np.ma.masked_where(array2 > 3, array2)
    return np.sum(array1_mask != array2_mask)

def genetic_distance(mapping):
    '''
    Returns dictionary containing strain, closest strain, and genetic distance (computed as hamming distance) to closest strain.
    '''
    counter = 0
    interval = 100
    length = len(mapping)
    gen_distance_dict = {}
    seq_dict = {}
    for strainA, attributesA in mapping.items():
        
        
        if counter % interval == 0:
            print("[", end = '')
            for x in range(int(counter/interval)):
                print("-", end = '')
            for x in range(int(length/interval) - int(counter/interval)):
                print(" ", end = '')
            print("]")
        
        gen_distance_low = None
        for strainB, attributesB in mapping.items():
            if strainA != strainB:
                gen_distance = hamming(mapping[strainA]['seq'], mapping[strainB]['seq'])
                if gen_distance_low is None or gen_distance < gen_distance_low:
                    gen_distance_low = gen_distance
                    strain_name = strainB
                    strain_loc = attributesB['ns_kc']
               # seq_dict[strainA] = {'percent missing': percent_missing, 'total length': len_seq}
        gen_distance_dict[strainA] = {'gen_distance' : gen_distance_low, 'kc_loc': attributesA['ns_kc'], 'closest_strain' : strain_name, 'closest_strain_loc': strain_loc}
        
        counter += 1
    return gen_distance_dict


In [16]:
   #Make dictionary mapping strain to location and sequence
    strains = strains_mapping(alignment, metadata)

    
    #Compute genetic distance for each strain
    gen_distance = genetic_distance(strains)


  if (await self.run_code(async_code, result, async_=True)):


[  ]
[- ]
[--]


In [14]:
    mapping_df = pd.DataFrame.from_dict(strains, orient = "index")
    mapping_df = mapping_df.groupby("ns_kc").sample(n=1, random_state=135354) #smashed on keyboard
    mapping_sub = mapping_df.to_dict('index')


In [15]:
mapping_sub

{'USA/WA-S2848/2020': {'seq': array([4, 4, 4, ..., 4, 4, 4]),
  'location': 'King County',
  'ns_kc': 'North_King_County'},
 'USA/WA-S2726/2020': {'seq': array([4, 4, 4, ..., 4, 4, 4]),
  'location': 'King County',
  'ns_kc': 'North_King_County'},
 'USA/WA-S2777/2020': {'seq': array([4, 4, 4, ..., 4, 4, 4]),
  'location': 'King County',
  'ns_kc': 'North_King_County'},
 'USA/WA-S590/2020': {'seq': array([4, 4, 4, ..., 4, 4, 4]),
  'location': 'King County',
  'ns_kc': 'North_King_County'},
 'USA/WA-S3923/2020': {'seq': array([4, 4, 4, ..., 4, 4, 4]),
  'location': 'King County',
  'ns_kc': 'North_King_County'},
 'USA/WA-S5042/2020': {'seq': array([4, 4, 4, ..., 4, 4, 4]),
  'location': 'King County',
  'ns_kc': 'North_King_County'},
 'USA/WA-S1485/2020': {'seq': array([4, 4, 4, ..., 4, 4, 4]),
  'location': 'King County',
  'ns_kc': 'North_King_County'},
 'USA/WA-S2771/2020': {'seq': array([4, 4, 4, ..., 4, 4, 4]),
  'location': 'King County',
  'ns_kc': 'North_King_County'},
 'USA/WA-

In [17]:
distance_df = pd.DataFrame.from_dict(gen_distance, orient = 'index')

In [6]:
distance_df

Unnamed: 0,gen_distance,kc_loc,closest_strain,closest_strain_loc
USA/WA-S3488/2020,3,North_King_County,USA/WA-S3895/2021,South_King_County
USA/WA-S2715/2020,1,North_King_County,USA/WA-S2714/2020,North_King_County
USA/WA-S3812/2021,1,North_King_County,USA/WA-S3813/2021,North_King_County
USA/WA-S3503/2020,0,North_King_County,USA/WA-S4669/2020,North_King_County
USA/WA-S3813/2021,0,North_King_County,USA/WA-S3817/2021,North_King_County
...,...,...,...,...
USA/WA-S3761/2020,1,South_King_County,USA/WA-S3765/2020,South_King_County
USA/WA-S5014/2020,6,South_King_County,USA/WA-S2893/2020,South_King_County
USA/WA-S3471/2020,2,South_King_County,USA/WA-S3499/2020,South_King_County
USA/WA-S4666/2020,13,South_King_County,USA/WA-S591/2020,North_King_County


In [18]:
distance_df['trans_pair'] = np.nan
for index, row in distance_df.iterrows():
    if row['kc_loc'] and row['closest_strain_loc'] == 'North_King_County':
        distance_df.loc[index,'trans_pair'] = "NN"
    if row['kc_loc'] and row['closest_strain_loc'] == 'South_King_County':
        distance_df.loc[index,'trans_pair'] = "SS"
    elif row['kc_loc'] != row['closest_strain_loc']: 
        distance_df.loc[index,'trans_pair'] = "NS"

In [19]:
distance_df['trans_pair'].value_counts()

NN    120
SS    103
NS     73
Name: trans_pair, dtype: int64

In [70]:
def write_histo(gen_dis_dict, histo = "test_histo.png", table = "test_table.tsv"):
        distance_df = pd.DataFrame.from_dict(gen_distance, orient = 'index')
        fig, ax1 = plt.subplots(figsize = (15, 10))
        ax1.hist(distance_df.gen_distance, bins = len(distance_df.gen_distance), width = 5, align = 'left', color="#4C90C0")
        #ax1.set_xlim(0, 25)
        plt.xlabel('distance (# of nucleotides)', size = '32')
        plt.ylabel('count', size = '32')
        ax1.spines['right'].set_visible(False)
        ax1.spines['top'].set_visible(False)
        ax1.tick_params(axis='both', which='major', labelsize=28)
        ax1.tick_params(axis='both', which='minor', labelsize=28)
        plt.tight_layout()
        fig.savefig(histo)
        plt.close()


In [63]:
 write_histo(gen_distance, histo = "test2_histo.png")

NameError: name 'write_histo' is not defined

In [9]:
distance_df

Unnamed: 0,gen_distance,kc_loc,closest_strain,closest_strain_loc,trans_pair
USA/WA-S3488/2020,3,North_King_County,USA/WA-S3895/2021,South_King_County,SS
USA/WA-S2715/2020,1,North_King_County,USA/WA-S2714/2020,North_King_County,NN
USA/WA-S3812/2021,1,North_King_County,USA/WA-S3813/2021,North_King_County,NN
USA/WA-S3503/2020,0,North_King_County,USA/WA-S4669/2020,North_King_County,NN
USA/WA-S3813/2021,0,North_King_County,USA/WA-S3817/2021,North_King_County,NN
...,...,...,...,...,...
USA/WA-S3761/2020,1,South_King_County,USA/WA-S3765/2020,South_King_County,SS
USA/WA-S5014/2020,6,South_King_County,USA/WA-S2893/2020,South_King_County,SS
USA/WA-S3471/2020,2,South_King_County,USA/WA-S3499/2020,South_King_County,SS
USA/WA-S4666/2020,13,South_King_County,USA/WA-S591/2020,North_King_County,NS


In [25]:
chart = alt.Chart(distance_df).mark_bar().encode(
    alt.X('gen_distance', bin=alt.Bin(extent=[0, 30], step=1)), 
    alt.Color('trans_pair:N'),
    alt.Y("count()", stack="normalize", title='count')

)

text = alt.Chart(distance_df).mark_text(align='center', baseline='middle', dy = 15).encode(
    x=alt.X('gen_distance', bin=alt.Bin(extent=[0, 30], step=1)),
    y=alt.Y('count()', stack="normalize"),
    detail='trans_pair:N',
    text=alt.Text('count()')
)
chart + text


In [None]:
# chart.save('gen_distance_kc_1_step.png')

In [34]:
distance_df.gen_distance.value_counts()

0     77
1     58
3     20
2     19
6     18
7     18
4     15
8     14
9      8
5      7
11     7
10     6
13     6
12     5
14     5
16     3
18     3
15     2
17     2
20     2
25     1
19     1
27     1
Name: gen_distance, dtype: int64

In [74]:
bins = [-0.01,5, 10, 15, 20, 25, 30]
pd.crosstab(pd.cut(distance_df.gen_distance, bins = bins),distance_df.trans_pair, normalize = 'index')

trans_pair,NN,NS,SS
gen_distance,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"(-0.01, 5.0]",0.412621,0.184466,0.402913
"(5.0, 10.0]",0.396552,0.310345,0.293103
"(10.0, 15.0]",0.333333,0.541667,0.125
"(15.0, 20.0]",0.571429,0.285714,0.142857
"(25.0, 30.0]",0.0,1.0,0.0


In [89]:
test =  range(0, 0.001, 0.0001)

TypeError: 'float' object cannot be interpreted as an integer

In [88]:
for n in test:
  print(n)

0
2
4
6
8


In [68]:
pd.cut(distance_df.gen_distance, 10
       )

USA/WA-S267/2020      (9.83e-05, 0.000197]
USA/WA-S266/2020     (-9.83e-07, 9.83e-05]
USA/WA-S271/2020     (-9.83e-07, 9.83e-05]
USA/WA-S285/2020      (9.83e-05, 0.000197]
USA/WA-S290/2020      (9.83e-05, 0.000197]
                             ...          
USA/WA-S5529/2021    (-9.83e-07, 9.83e-05]
USA/WA-S5547/2021    (-9.83e-07, 9.83e-05]
USA/WA-S5510/2021      (0.000492, 0.00059]
USA/WA-S5504/2021    (-9.83e-07, 9.83e-05]
USA/WA-S5505/2021     (0.000197, 0.000295]
Name: gen_distance, Length: 581, dtype: category
Categories (10, interval[float64]): [(-9.83e-07, 9.83e-05] < (9.83e-05, 0.000197] < (0.000197, 0.000295] < (0.000295, 0.000393] ... (0.00059, 0.000688] < (0.000688, 0.000786] < (0.000786, 0.000885] < (0.000885, 0.000983]]