# Details about true plasmids for sample 117

In [38]:
from IPython.display import Image
from IPython.display import HTML

# Render a small HTML/JavaScript snippet as this cell's output.
# The script defines code_toggle(), which hides or shows every `div.input`
# element, and registers it to run once when the document is ready; since
# code_show starts as true, that first call hides the inputs.
# The form below it renders a button that calls code_toggle() again on click.
# NOTE(review): relies on `$` (presumably jQuery) being provided by the page
# that renders the notebook — confirm for the target front end.
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')

In [1]:
from __future__ import division
import numpy as np
import pandas as pd

def read_file(filename):
	"""Return the meaningful lines of a text file.

	The file is read in one go and split on "\\n". Empty lines and lines
	whose first character is '#' (comments) are dropped; every other line is
	returned unchanged, in file order.

	Parameters:
		filename: path of the file to read.

	Returns:
		list of str, one entry per kept line (without the trailing newline).

	Raises:
		IOError/OSError if the file cannot be opened.
	"""
	# The context manager guarantees the handle is closed, even on error.
	with open(filename, "r") as handle:
		content = handle.read()
	return [line for line in content.split("\n") if line and line[0] != '#']

#Storing contig details
#-----------------------------------------------
def get_id(line):
	"""Return the contig id, i.e. the second field of an already split line."""
	contig_id = line[1]
	return contig_id
def get_nucleotide_seq(line):
	"""Return the nucleotide sequence, i.e. the third field of an already split line."""
	sequence = line[2]
	return sequence
def compute_GCratio(seq):
	"""Return the GC ratio of a sequence.

	The ratio is the number of 'G'/'C' occurrences divided by the number of
	symbols in the sequence. Only upper-case 'G' and 'C' are counted.

	Parameters:
		seq: iterable of single-character symbols (typically a str).

	Returns:
		float in [0, 1]; 0.0 for an empty sequence, which would otherwise
		trigger a division by zero.
	"""
	gc_count = 0
	seq_len = 0
	for nucl in seq:
		seq_len += 1
		if nucl in ('G', 'C'):
			gc_count += 1
	if seq_len == 0:
		# No symbols at all: define the ratio as 0 instead of dividing by zero.
		return 0.0
	return gc_count / seq_len
def get_length(line):
	"""Return the contig length as an int.

	The value is taken from the fourth field of the split line: the field is
	split on ':' and its third piece is converted with int().
	Presumably a GFA-style tag such as 'LN:i:3319' — verify against the
	assembly file.
	"""
	length_tag = line[3]
	raw_value = length_tag.split(':')[2]
	return int(raw_value)
def get_read_depth(line):
	"""Return the contig read depth as a float.

	The value is taken from the fifth field of the split line: the field is
	split on ':' and its third piece is converted with float().
	Presumably a GFA-style tag such as 'dp:f:28.64' — verify against the
	assembly file.
	"""
	depth_tag = line[4]
	raw_value = depth_tag.split(':')[2]
	return float(raw_value)

def update_contigs_dict(contigs_dict, line):
	"""Create (or overwrite) the entry of one contig in contigs_dict.

	`line` is a split segment line of the assembly file. The entry is keyed
	by the contig id and holds the following attributes:
	  'Sequence'                nucleotide sequence (get_nucleotide_seq)
	  'Length'                  contig length, int (get_length)
	  'Read_depth'              overall read depth, float (get_read_depth)
	  'Seed'                    seed indicator, initialised to 0
	  'GC_cont'                 GC ratio of the sequence (compute_GCratio)
	  'Gene_coverage_intervals' list of covered intervals, initially empty
	  'Gene_coverage'           gene coverage, initialised to 0
	  'Density'                 initialised to 0

	Returns the same contigs_dict object, updated in place.
	"""
	contig_id = get_id(line)
	sequence = get_nucleotide_seq(line)
	gc_fraction = compute_GCratio(sequence)
	length = get_length(line)
	depth = get_read_depth(line)

	# Every attribute is parsed before the entry is stored, so a malformed
	# line never leaves a partial entry behind.
	contigs_dict[contig_id] = {
		'Sequence': sequence,
		'Length': length,
		'Read_depth': depth,
		'Seed': 0,							#Default
		'GC_cont': gc_fraction,
		'Gene_coverage_intervals': [],		#Default
		'Gene_coverage': 0,					#Default
		'Density': 0,						#Default
	}
	return contigs_dict

def get_link(line):
	"""Translate a split link line into a pair of contig extremities.

	Fields 1-4 of `line` are read as (contig1, orientation1, contig2,
	orientation2). The result has the form ((c1, e1), (c2, e2)) where c1, c2
	are the two linked contigs and e1, e2 the connected extremities:
	  - first contig:  '+' gives 'h', anything else gives 't'
	  - second contig: '+' gives 't', anything else gives 'h'
	"""
	first_contig, first_orient = line[1], line[2]
	second_contig, second_orient = line[3], line[4]
	first_ext = 'h' if first_orient == '+' else 't'
	second_ext = 't' if second_orient == '+' else 'h'
	return ((first_contig, first_ext), (second_contig, second_ext))

def get_data(assembly_file, contigs_dict, links_list):
	"""Load contigs and links from an assembly file.

	Each kept line of the file (see read_file) is split on tabs and
	dispatched on its first field:
	  'S'  -> a contig entry is added via update_contigs_dict
	  'L'  -> the link returned by get_link is appended to links_list
	Lines of any other type are ignored.

	Parameters:
		assembly_file: path of the assembly file.
		contigs_dict: dict to fill with contig entries (updated in place).
		links_list: list to extend with links (updated in place).

	Returns:
		(contigs_dict, links_list)
	"""
	for raw_line in read_file(assembly_file):
		fields = raw_line.split("\t")
		record_type = fields[0]
		if record_type == 'S':
			contigs_dict = update_contigs_dict(contigs_dict, fields)
		elif record_type == 'L':
			links_list.append(get_link(fields))
	return contigs_dict, links_list

def get_seeds(seeds_file, seeds_set):
	"""Add the seed ids listed in seeds_file to seeds_set.

	The first tab-separated field of every kept line (see read_file) is
	taken as a seed id. The set is updated in place and also returned.
	"""
	seeds_set.update(row.split("\t")[0] for row in read_file(seeds_file))
	return seeds_set

def get_union(intervals):
	"""Merge gene covering intervals into their union.

	Intervals are processed in sorted order. An interval is fused with the
	previous merged one when it overlaps it or starts directly after it
	(e.g. (1, 3) and (4, 6) become [1, 6]). The length of the union is what
	the gene coverage is computed from.

	Returns a list of [begin, end] lists, sorted by begin.
	"""
	merged = []
	for lo, hi in sorted(intervals):
		touches_previous = bool(merged) and merged[-1][1] >= lo - 1
		if not touches_previous:
			merged.append([lo, hi])
			continue
		previous = merged[-1]
		if hi > previous[1]:
			previous[1] = hi
	return merged

def get_gene_coverage(mapping_file, contigs_dict):
	"""Compute the gene coverage of every contig from a mapping file.

	Each kept line of the mapping file (see read_file) is split on tabs;
	field 1 is the contig id and fields 8 and 9 are the start and end of the
	hit on that contig. Hits on contigs absent from contigs_dict are
	reported on stdout and skipped. Start/end are stored in increasing order
	in the contig's 'Gene_coverage_intervals'.

	Afterwards, for every contig, 'Gene_coverage' is set to the length of
	the union of its intervals (see get_union) divided by its 'Length', and
	'Density' is set to 1 when that coverage is positive.

	Parameters:
		mapping_file: path of the tab-separated mapping file.
		contigs_dict: contig entries as built by update_contigs_dict
			(updated in place).

	Returns:
		contigs_dict
	"""
	for row in read_file(mapping_file):
		fields = row.split("\t")
		sseqid = fields[1]
		sstart, send = fields[8], fields[9]
		if sseqid not in contigs_dict:
			print(sseqid, "not in contigs_dict")
			continue
		start, end = int(sstart), int(send)
		# Hits may be reported in reverse; store them as (low, high).
		contigs_dict[sseqid]['Gene_coverage_intervals'].append((min(start, end), max(start, end)))

	for entry in contigs_dict.values():
		union = get_union(entry['Gene_coverage_intervals'])
		# Interval bounds are inclusive, hence the +1.
		covered = sum(hi - lo + 1 for lo, hi in union)
		entry['Gene_coverage'] = covered / entry['Length']
		if entry['Gene_coverage'] > 0:
			entry['Density'] = 1

	return contigs_dict

In [2]:
# Sample analysed in this notebook and the directory holding its pipeline output.
sample_id = '117'
sample_dir = '/home/aniket/python_scripts/Plasmids/data/unicycler_pipeline/'

# Input files of the sample: assembly graph, gene-to-contig mapping, seed contigs.
assembly_file = sample_dir + 'sample_' + sample_id + '/assembly.gfa'
mapping_file = sample_dir + 'sample_' + sample_id + '/filtered_genes_to_contigs.csv'
seeds_file = sample_dir + 'sample_' + sample_id + '/seed_contigs.csv'

# Empty containers, filled by the loaders below.
contigs_dict, links_list, seeds_set = {}, [], set()

contigs_dict, links_list = get_data(assembly_file, contigs_dict, links_list)
seeds_set = get_seeds(seeds_file, seeds_set)
contigs_dict = get_gene_coverage(mapping_file, contigs_dict)

In [3]:
chain_file = 'contig_chains.csv'

# Map each plasmid name to its list of contig ids.
# Every kept line has the form '<plasmid>;<chain>', where the chain is a
# comma-separated list of entries. The last character of each entry is
# dropped (presumably an orientation sign — TODO confirm against the file).
contig_chains = {}
string_list = read_file(chain_file)
for line in string_list:
    fields = line.split(';')
    plasmid_name, chain = fields[0], fields[1]
    contig_list = [entry[:-1] for entry in chain.split(',')]
    contig_chains[plasmid_name] = contig_list

## Individual plasmids

In [4]:
# Size of the loaded assembly: one contigs_dict entry per 'S' line and one
# links_list entry per 'L' line of the assembly file (see get_data).
print("Number of contigs: ", len(contigs_dict))
print("Number of links: ", len(links_list))

Number of contigs:  110
Number of links:  154


In [5]:
# Per-plasmid summary.
#   plasmid_contigs[plasmid]: DataFrame with one row per contig of the chain
#   objective_values:         DataFrame with one row per plasmid
# All averages are weighted by contig length. A contig that occurs several
# times in a chain contributes once per occurrence.
objective_rows = []
plasmid_contigs = {}
for plasmid, contig_list in contig_chains.items():
    contig_rows = []
    avg_gd, avg_GC, total_len = 0, 0, 0
    for contig in contig_list:
        info = contigs_dict[contig]
        contig_rows.append([contig, info['Gene_coverage'], info['GC_cont'], info['Read_depth'], info['Length']])

        total_len += info['Length']
        avg_gd += info['Gene_coverage'] * info['Length']
        avg_GC += info['GC_cont'] * info['Length']
    avg_gd = avg_gd / total_len
    avg_GC = avg_GC / total_len

    # GC penalty: length-weighted mean absolute deviation of the contigs'
    # GC content from the plasmid's average GC content.
    GC_pen = sum(
        abs(avg_GC - contigs_dict[contig]['GC_cont']) * contigs_dict[contig]['Length']
        for contig in contig_list
    ) / total_len

    # Columns are named at construction, so no in-place rename is needed.
    plasmid_contigs[plasmid] = pd.DataFrame(
        contig_rows,
        columns=['Contig', 'Gene density', 'GC content', 'Read depth', 'Length'],
    )
    objective_rows.append([plasmid, avg_gd, GC_pen, total_len])

# Naming the columns here keeps the schema even when there are no plasmids.
objective_values = pd.DataFrame(
    objective_rows,
    columns=['Plasmid', 'Gene density', 'GC penalty', 'Length'],
)
    

In [6]:
# Print each plasmid name followed by its per-contig table.
for plasmid, contig_table in plasmid_contigs.items():
    print(plasmid)
    print(contig_table)
    print("\n\n")

CP012933.1
  Contig  Gene density  GC content  Read depth  Length
0     32      0.617656    0.433564   28.637866    3319



CP012936.1
  Contig  Gene density  GC content  Read depth  Length
0     16      0.868749    0.482067    1.943970   45950
1     62      1.000000    0.666667    2.412919     132
2     23      0.941576    0.568938    1.709725    7514
3     62      1.000000    0.666667    2.412919     132
4     17      0.907291    0.504197    2.004008   44203
5     49      0.919588    0.441237    2.443771     485
6     88      0.000000    0.617021    0.696625      47
7     47      1.000000    0.458955    2.552712     536



CP012935.1
  Contig  Gene density  GC content  Read depth  Length
0     29      0.772855    0.510627   18.252055    3905



CP012931.1
   Contig  Gene density  GC content  Read depth  Length
0      15      0.795411    0.445887    1.221212   52520
1      42      1.000000    0.449242    2.579621     857
2      58      1.000000    0.468085    2.872408     188
3      2

In [7]:
# Display the per-plasmid summary table: gene density, GC penalty and total length.
objective_values

Unnamed: 0,Plasmid,Gene density,GC penalty,Length
0,CP012933.1,0.617656,0.0,3319
1,CP012936.1,0.892383,0.016502,98999
2,CP012935.1,0.772855,0.0,3905
3,CP012931.1,0.828968,0.027852,239113
4,CP012934.1,0.38879,0.0,3372
5,CP012932.1,0.472328,0.0,2096


# Connected components

In [8]:
# Group the plasmid chains into connected components: two chains belong to
# the same component when they share at least one contig (directly or
# through other chains).
# The components built so far are always pairwise disjoint, so a new chain
# has to be merged with EVERY component it intersects, not just the first.
conn_comp = []
for plasmid in contig_chains:
    chain_contigs = contig_chains[plasmid]
    chain_set = set(chain_contigs)

    overlapping, untouched = [], []
    for comp in conn_comp:
        shared = chain_set & set(comp)
        if shared:
            print("Intersects", list(shared))
            overlapping.append(comp)
        else:
            untouched.append(comp)

    if not overlapping:
        # Chain shares no contig with any existing component: keep it as is.
        conn_comp.append(chain_contigs)
        continue

    # Fuse the chain with all intersecting components. Contigs are kept once,
    # in first-seen order, so the result does not depend on set ordering.
    merged, seen = [], set()
    for group in overlapping + [chain_contigs]:
        for contig in group:
            if contig not in seen:
                seen.add(contig)
                merged.append(contig)
    conn_comp = untouched + [merged]
print(conn_comp)

[['32'], ['16', '62', '23', '62', '17', '49', '88', '47'], ['29'], ['15', '42', '58', '24', '58', '35', '42', '11', '43', '20', '43', '54', '53', '26', '53', '44', '43', '28', '28'], ['31'], ['33']]
