In [1]:
import numpy as np
import pandas as pd
import pickle as pkl
from Bio import pairwise2
from tqdm.notebook import tqdm

In [2]:
# BLOSUM62 substitution scores; `matlist.blosum62` is also the default `matrix`
# argument of percent_identity defined in this cell.
# NOTE(review): Bio.SubsMat appears to be deprecated/removed in recent Biopython
# releases -- confirm the environment pins a version that still ships it.
from Bio.SubsMat import MatrixInfo as matlist
# Module-level alias; not referenced by the other cells shown in this notebook.
matrix = matlist.blosum62

def percent_identity(sequence_1, sequence_2, matrix = matlist.blosum62):
    """Return the best identity fraction over the optimal global alignments.

    The sequences are aligned with ``pairwise2.align.globaldd`` using
    ``matrix`` for match scores and, for both sequences, a gap-open penalty
    of -11 and a gap-extend penalty of -1. For every alignment returned, the
    number of columns holding the same non-gap character in both aligned
    strings is divided by the alignment's ``end`` index, and the largest such
    ratio is returned.

    Parameters
    ----------
    sequence_1, sequence_2 : str
        Sequences to align.
    matrix : dict
        Substitution scores passed to the aligner (defaults to BLOSUM62).

    Returns
    -------
    numpy.float64
        Maximum identity fraction across the returned alignments.

    Raises
    ------
    ValueError
        If the aligner returns no alignments (presumably the case when one
        of the sequences is empty -- confirm against the Biopython version
        in use).
    """
    alignments = pairwise2.align.globaldd(sequence_1, sequence_2, matrix, -11, -1, -11, -1)
    if not alignments:
        # np.max([]) would raise an opaque "zero-size array" ValueError here;
        # fail with the same exception type but an actionable message.
        raise ValueError('percent_identity: the aligner returned no alignments '
                         '(is one of the sequences empty?)')

    identities = []
    for alignment in alignments:
        # When a == b, checking that a is not a gap is enough to exclude
        # gap-vs-gap columns.
        matches = sum(a == b and a != '-'
                      for a, b in zip(alignment.seqA, alignment.seqB))
        identities.append(matches / alignment.end)
    return np.max(identities)



In [3]:
# Load the extracted data and keep only the columns this notebook uses.
full_df = pd.read_csv('../data_extracted/all_data.csv')
# Take an explicit copy: adding a column to a bare column-slice of full_df
# triggers pandas' SettingWithCopyWarning and leaves it ambiguous which frame
# is being modified.
df = full_df[['virus', 'protein', 'aaseq']].copy()
# Integer parsed from everything after the first three characters of the
# virus name (e.g. 'HPV16' -> 16), so viruses sort numerically rather than
# lexicographically.
df['virus_int'] = df['virus'].apply(lambda x: int(x[3:]))
# Unique virus names in ascending numeric order.
viruses = df.sort_values('virus_int')['virus'].drop_duplicates().values
pd.DataFrame(viruses).to_csv('../results/figure_1_viruses.csv')
print(viruses)

['HPV1' 'HPV2' 'HPV3' 'HPV4' 'HPV5' 'HPV7' 'HPV8' 'HPV9' 'HPV10' 'HPV12'
 'HPV14' 'HPV15' 'HPV16' 'HPV17' 'HPV18' 'HPV19' 'HPV20' 'HPV21' 'HPV22'
 'HPV23' 'HPV24' 'HPV25' 'HPV27' 'HPV28' 'HPV29' 'HPV31' 'HPV32' 'HPV33'
 'HPV34' 'HPV35' 'HPV36' 'HPV37' 'HPV38' 'HPV39' 'HPV41' 'HPV42' 'HPV45'
 'HPV47' 'HPV48' 'HPV49' 'HPV50' 'HPV51' 'HPV52' 'HPV54' 'HPV56' 'HPV57'
 'HPV58' 'HPV59' 'HPV60' 'HPV61' 'HPV65' 'HPV67' 'HPV68' 'HPV69' 'HPV70'
 'HPV75' 'HPV76' 'HPV80' 'HPV88' 'HPV92' 'HPV93' 'HPV95' 'HPV96' 'HPV100'
 'HPV132' 'HPV151' 'HPV197']


In [4]:
# Proteins compared across viruses; their order defines the first axis of `results`.
proteins = ['E1', 'E2', 'E6', 'E7']

In [5]:
# results[i, j, k] holds percent_identity between the sequences of
# proteins[i] in viruses[j] and viruses[k].
results = np.zeros((len(proteins), len(viruses), len(viruses)))

for i, protein in enumerate(proteins):
    # Look up each virus's sequence for this protein once, instead of
    # re-filtering the DataFrame twice for every (j, k) pair. As before, the
    # first matching row is used and a missing entry raises IndexError.
    sequences = [
        df[(df['protein'] == protein) & (df['virus'] == virus)].aaseq.values[0]
        for virus in viruses
    ]
    # Wrap the sized list rather than the enumerate() generator so tqdm knows
    # the total and can render real progress (it previously showed "0it").
    for j, left_sequence in enumerate(tqdm(sequences, desc=protein)):
        for k, right_sequence in enumerate(sequences):
            results[i, j, k] = percent_identity(left_sequence,
                                                right_sequence)

# Persist the full identity tensor for the figure-generation step.
with open('../results/figure_1.pkl', 'wb') as file:
    pkl.dump(results, file)

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]