In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from Bio import Align
from Bio import SeqIO
from Bio import pairwise2 as pw2

In [None]:
def pairwise_seq_similarity(fasta1,fasta2,match=2,mismatch=-1,gap_open=-10,gap_extend=-0.5) :
    """Return the optimal global alignment score of two single-record FASTA files.

    Parameters
    ----------
    fasta1, fasta2
        Paths (or handles) passed to ``SeqIO.read`` with format "fasta".
        ``SeqIO.read`` requires each input to hold exactly one record.
    match, mismatch
        Scores for an identical / a differing pair of aligned letters.
    gap_open, gap_extend
        Scores for the first position of a gap and for every further
        position of the same gap.

    Returns
    -------
    float
        Score of the best global alignment under the given scoring scheme.
    """
    # Read the single sequence stored in each FASTA file.
    seq1 = SeqIO.read(fasta1, "fasta").seq
    seq2 = SeqIO.read(fasta2, "fasta").seq

    aligner = Align.PairwiseAligner()
    aligner.mode = 'global'
    # The match/mismatch scores must be set explicitly: a fresh
    # PairwiseAligner otherwise scores matches as 1 and mismatches as 0,
    # which is not the 2 / -1 scheme this function is meant to use.
    aligner.match_score = match
    aligner.mismatch_score = mismatch
    aligner.open_gap_score = gap_open
    aligner.extend_gap_score = gap_extend

    # Only the score is needed, so skip building an alignment object.
    return aligner.score(seq1, seq2)

In [None]:
# Score every sequence file in `seq_path` against the L. paragasseri query
# and write the result as a one-column, tab-separated table.
seq_path = '/home/bruce1996/data/MCI/manuscript_v3/supplement/subtype/exist_lacto/'
query_seq = seq_path + 'Lactobacillus_paragasseri.fa'
# os.listdir returns entries in arbitrary order; sort so that the rows of the
# output table come out in the same order on every run.
seq_list = sorted(os.listdir(seq_path))
sim_score = np.zeros(len(seq_list))

for idx,seq in enumerate(seq_list) :
    print("Calculate pairwise similirity of %d st sequence : %s" % (idx,seq))
    sim_score[idx] = pairwise_seq_similarity(query_seq,os.path.join(seq_path,seq))

# Strip only a trailing '.fa' extension. A plain str.replace would also delete
# '.fa' occurring inside a name (e.g. 'L.farciminis.fa') and corrupt the label.
seq_names = [x[:-len('.fa')] if x.endswith('.fa') else x for x in seq_list]
score_df = pd.DataFrame({'Score' : sim_score},index=seq_names)
score_df.to_csv('/home/bruce1996/data/MCI/manuscript_v3/supplement/sim_score_against_paragasseri.txt',sep='\t')

In [None]:
# Load the two score tables (one per query species); the first column of each
# file holds the sequence names and becomes the index.
paragasseri = pd.read_csv('/home/bruce1996/data/MCI/manuscript_v3/supplement/sim_score_against_paragasseri.txt',sep='\t',index_col=0)
rogosae = pd.read_csv('/home/bruce1996/data/MCI/manuscript_v3/supplement/sim_score_against_rogosae.txt',sep='\t',index_col=0)

# Put the two tables side by side (rows are aligned on the index), name the
# columns after the query species, and order rows from highest to lowest score.
score_df = pd.concat([paragasseri,rogosae],axis=1)
score_df.columns = ["Lactobacillus_paragasseri","Lactobacillus_rogosae"]
score_df = score_df.sort_values(by=["Lactobacillus_paragasseri","Lactobacillus_rogosae"],ascending=False)

# Draw on an explicit Axes so the figure that gets saved is unambiguously the
# one holding the heatmap, regardless of other figures open in the kernel.
fig, ax = plt.subplots(figsize=(7,10))
sns.heatmap(score_df,linewidths=.5,cmap='YlGnBu',annot=True,fmt='g',ax=ax)
# Keep the two column labels horizontal.
plt.setp(ax.get_xticklabels(),rotation=0)
fig.savefig('/home/bruce1996/data/MCI/manuscript_v3/supplement/sim_score_heatmap.png',dpi=300,format='png',bbox_inches='tight')