I considered analyzing sequence divergence of ISs but decided to go with plotting all and visual inspection.

Here, I annotate the sequences and visualization is done in the next (07) ipynb. ipynb.

# Load the data 

1. load IS borders
2. annotate the genes within as fasta
3. save as df
4. visualize 

In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import sys
import os
import glob
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
sys.path.append('../src/SyRI_IS/')
import importlib, subprocess
import run_utils
from sklearn.cluster import AgglomerativeClustering
import SyRI_IS

In [2]:
analysis_dir = './is_variants_analysis/full'
os.makedirs(analysis_dir, exist_ok=True)

master_dir = 'exp/multiple_runs11/'
exportdir = os.path.join(master_dir, 'export')
tmpdir = os.path.join(master_dir, 'tmp')
fastadir = '../fasta/'

In [3]:
# fasta file of all detected iss
is_fasta = os.path.join(analysis_dir, 'iss.fasta')
# blastn output of genes inside iss
blastn_out = os.path.join(analysis_dir, 'annotation_of_variants.blastn')

# make fasta file of all iss and ideally, its cluster ID
## input is positions
is_positions = pd.read_csv(os.path.join(exportdir, 'classify_IS_events', 'IS_positions.csv'))
is_positions = is_positions.sort_values(['length', 'Line', 'Gen', 'start'])
is_positions['global_is_id'] = np.arange(is_positions.shape[0])

## cluster iss
is1 = os.path.join(fastadir, 'IS1.fasta')
is1_seq = SeqIO.read(is1, 'fasta').seq
# cluster by lengths and get unique variants
clustering = AgglomerativeClustering(n_clusters=None, distance_threshold=100, linkage='complete')
clustering = clustering.fit(is_positions.length.values.reshape(-1, 1))
is_positions['variant_cluster'] = clustering.labels_

records = []
for i, row in is_positions.iterrows():
	seq = SeqIO.read(os.path.join(tmpdir, row.Line, f'{row.Line}_genome.{row.Gen}.fasta'), 'fasta')
	seq_ = seq.seq[row.start-1:row.end]
	if row.IS_strand == 'reverse':
		seq_ = seq_.reverse_complement()
	records.append(SeqRecord(seq_, id=f'{row.Line}.{row.Gen}.{row.cluster_id}', description='', name=''))
SeqIO.write(records, is_fasta, 'fasta') # make fasta file
is_positions.to_csv(os.path.join(analysis_dir, 'IS_positions.csv'), index=False) # save positions
dbdir = SyRI_IS.makedb(is_fasta, 'NA', analysis_dir)

# annotate the genes inside the iss
C = 'blastn -task blastn -query ' + os.path.join('../../test/refs/IS1_internal.fasta') + ' -evalue 1e-5 -db ' + dbdir
# max_target_seqs is set to 100000 to get all hits this is essential when using all
C += ' -outfmt "7 sseqid qseqid qstart qend sstart send length evalue bitscore pident " -max_target_seqs 100000 -out ' 
C += blastn_out
!{C}

variant_annotation_df = pd.read_csv(blastn_out, sep='\t', comment='#', header=None)
variant_annotation_df.columns = ['sseqid', 'qseqid', 'qstart', 'qend', 'sstart', 'send', 'length', 'evalue', 'bitscore', 'pident']
variant_annotation_df.head()



Building a new DB, current time: 07/03/2024 13:53:51
New DB name:   /home/kanai/documents/analysis/myTELR/syri/ipynb/is_variants_analysis/full/blastdb/iss.fasta
New DB title:  ./is_variants_analysis/full/iss.fasta
Sequence type: Nucleotide
Deleted existing Nucleotide BLAST database named /home/kanai/documents/analysis/myTELR/syri/ipynb/is_variants_analysis/full/blastdb/iss.fasta
Keep MBits: T
Maximum file size: 3000000000B
Adding sequences from FASTA; added 2369 sequences in 0.041986 seconds.




Unnamed: 0,sseqid,qseqid,qstart,qend,sstart,send,length,evalue,bitscore,pident
0,L10-2.3.19,IS1-IRL,1,23,1,23,23,5e-06,42.8,100.0
1,L10-2.3.19,IS1-IRL,1,23,696,718,23,5e-06,42.8,100.0
2,L10-2.2.20,IS1-IRL,1,23,1,23,23,5e-06,42.8,100.0
3,L10-2.2.20,IS1-IRL,1,23,695,717,23,5e-06,42.8,100.0
4,L02-2.2.22,IS1-IRL,1,23,1,23,23,5e-06,42.8,100.0


In [4]:
# annotate the genes inside the iss
dbdir = os.path.join(analysis_dir, 'blastdb', 'iss.fasta')
blastn_out = os.path.join(analysis_dir, 'annotation_of_variants.megablast')
C = 'blastn -task megablast -query ' + os.path.join('../../test/refs/IS1_internal.fasta') + ' -db ' + dbdir
C += ' -outfmt "7 sseqid qseqid qstart qend sstart send length evalue bitscore pident " -out '
C += blastn_out
!{C}

variant_annotation_df = pd.read_csv(blastn_out, sep='\t', comment='#', header=None)
variant_annotation_df.columns = ['sseqid', 'qseqid', 'qstart', 'qend', 'sstart', 'send', 'length', 'evalue', 'bitscore', 'pident']
variant_annotation_df.head()

Unnamed: 0,sseqid,qseqid,qstart,qend,sstart,send,length,evalue,bitscore,pident
0,L10-2.3.19,IS1FS,1,699,1059,1757,699,0.0,1291.0,100.0
1,L10-2.3.19,IS1FS,323,699,3789,4165,377,0.0,697.0,100.0
2,L10-2.3.19,IS1FS,323,699,6197,6573,377,0.0,697.0,100.0
3,L10-2.3.19,IS1FS,323,699,8605,8981,377,0.0,697.0,100.0
4,L10-2.3.19,IS1FS,323,699,11013,11389,377,0.0,697.0,100.0


In [5]:
variant_annotation_df.tail()

Unnamed: 0,sseqid,qseqid,qstart,qend,sstart,send,length,evalue,bitscore,pident
4090,L04-1.3.6,Full_IS1,2072,2108,2106,2070,37,1.36e-09,63.9,97.297
4091,L04-1.3.5,Full_IS1,1,3092,1,3093,3093,0.0,5705.0,99.968
4092,L04-1.3.5,Full_IS1,2918,2959,2006,1965,42,4.85e-14,78.7,100.0
4093,L04-1.3.5,Full_IS1,1964,2005,2960,2919,42,4.85e-14,78.7,100.0
4094,L04-1.3.5,Full_IS1,2072,2108,2106,2070,37,1.36e-09,63.9,97.297


In [6]:
# length of ISs
is_fasta = os.path.join(analysis_dir, 'iss.fasta')
pd.DataFrame({'id': [s.id for s in SeqIO.parse(is_fasta, 'fasta')],
              		'length': [len(s.seq) for s in SeqIO.parse(is_fasta, 'fasta')]}).\
						to_csv(os.path.join(analysis_dir, 'is_variant_lengths.csv'), index=False)

The remaining analysis is performed in the following notebook (in R).
07_visualize_all_iss.ipynb