Adapted from the original `parse_xml.py` script: https://github.com/AnneliektH/EVEs_arthropod/blob/master/parse_xml.py

In [2]:
#from __future__ import division
from Bio.Blast import NCBIXML
import csv
import sys
import pandas as pd

In [2]:
# Input: BLAST XML report to parse. Output: CSV that will receive one row per HSP.
blast_xml_path = '/Users/callamartyn/chou_lab/EVE/out/Iscap_blastx_vprot_1e-03.out'
output = '/Users/callamartyn/chou_lab/EVE/out/Iscap.csv'

# NCBIXML.parse hands back a generator of records; nothing is read until it is iterated.
result = NCBIXML.parse(open(blast_xml_path))

In [3]:
# Column names for the output CSV, one per value written for each HSP.
header = (
    'sequence',
    'length',
    'perc_identity',
    'gaps',
    'frame',
    'position_on_hit_start',
    'position_on_hit_stop',
    'position_on_query_start',
    'position_on_query_stop',
    'evalue',
    'score',
    'direction',
)

In [4]:
# Flatten the BLAST report into a CSV: one row per HSP of every alignment of
# every record, preceded by the header row.
#
# newline='' hands line-ending control to the csv module, as its documentation
# requires; without it, platforms that translate '\n' get a blank line after
# every row.
with open(output, 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(header)

    # One record per query in the BLAST XML.
    for record in result:

        # A record without hits has an empty alignment list, so nothing is
        # written for it.
        for alignment in record.alignments:

            for hsp in alignment.hsps:
                # Divide as floats *before* scaling: casting after an integer
                # division would collapse the percentage to 0 or 100.
                perc_identity = (hsp.identities / float(hsp.align_length)) * 100

                # Values in the same order as `header`.
                row = (
                    alignment.title,       # sequence: title of the database hit
                    hsp.align_length,      # length
                    perc_identity,         # perc_identity
                    hsp.gaps,              # gaps
                    hsp.frame[0],          # frame: first element of the frame tuple
                    hsp.sbjct_start,       # position_on_hit_start
                    hsp.sbjct_end,         # position_on_hit_stop
                    hsp.query_start,       # position_on_query_start
                    hsp.query_end,         # position_on_query_stop
                    hsp.expect,            # evalue
                    hsp.score,             # score
                    record.query,          # direction: the query description line
                )
                writer.writerow(row)

# The `with` block has already closed the CSV; only the record generator is
# left to shut down.
result.close()


ExpatError: no element found: line 65921, column 49

In [5]:
# NOTE: `result` is the generator returned by NCBIXML.parse, so it has no
# length and this call raises TypeError. Records would have to be counted
# while iterating instead.
len(result)

TypeError: object of type 'generator' has no len()

In [None]:
# Read the CSV of parsed BLAST hits into a DataFrame for filtering.
df = pd.read_csv(
    filepath_or_buffer='/Users/callamartyn/chou_lab/EVE/test.csv',
)

In [None]:
# Reduce the hit table to one row per (sequence, query start) and per
# (sequence, query stop), then write the result next to the input CSV.
#
# NOTE(review): the rows kept are those with the *maximum* e-value in each
# group, i.e. the least significant hit. Confirm that max rather than min is
# intended.

# Rows whose e-value equals the maximum within their (sequence, query start) group.
max_eval = df.groupby(['sequence', 'position_on_query_start'])['evalue'].transform('max')
df4 = df[df.evalue == max_eval]

# Rows whose e-value equals the maximum within their (sequence, query stop) group.
max_eval = df.groupby(['sequence', 'position_on_query_stop'])['evalue'].transform('max')
df5 = df[df.evalue == max_eval]

# Stack both selections. pd.concat is used because DataFrame.append was
# removed in pandas 2.0; the result is the same.
df = pd.concat([df4, df5])

# Keep the first row for every repeated (sequence, query start) ...
df = df.drop_duplicates(['sequence', 'position_on_query_start'])

# ... and for every repeated (sequence, query stop).
df = df.drop_duplicates(['sequence', 'position_on_query_stop'])

# Finally drop rows that repeat both coordinates.
df = df.drop_duplicates(['sequence', 'position_on_query_start', 'position_on_query_stop'])

# Build "<name>_filtered.csv". str.rstrip('.csv') removes any trailing run of
# the characters '.', 'c', 's', 'v' and not the suffix, so a base name ending
# in one of those letters would be mangled. Cut the exact suffix instead.
csv_suffix = '.csv'
output_stem = output[:-len(csv_suffix)] if output.endswith(csv_suffix) else output
df.to_csv(output_stem + '_filtered.csv', index=False)

result.close()

Trying to replace the first part of BLAST_filter.sh (making a BED file)

In [8]:
# read in filtered file
df = pd.read_csv('/Users/callamartyn/chou_lab/EVE/test_filtered.csv')

In [9]:
# get query start and end positions from full table
bed = df.iloc[:,7:9]
# get accession number from "direction" column and insert into first position
bed.insert(0,'accession', [x.split(' ')[0] for x in df.direction])

In [10]:
# Display the accession / query start / query stop table for a visual check.
bed

Unnamed: 0,accession,position_on_query_start,position_on_query_stop
0,DS981339.1,519,761
1,DS981339.1,666,1025
2,DS981338.1,292,390
3,DS981338.1,92,364
4,DS981338.1,593,811
5,DS981338.1,527,685
6,DS981337.1,229,507
7,DS981337.1,229,507
8,DS981337.1,970,1197
9,DS981337.1,211,324


In [None]:
# Save the table as tab-separated text with a .bed extension, without the
# header row or the index column.
bed.to_csv(
    '/Users/callamartyn/chou_lab/EVE/out/bed_files/test.bed',
    sep='\t',
    header=False,
    index=False,
)

Figuring out how to filter duplicates

In [None]:
# Start again from the filtered hit table and show it.
df = pd.read_csv(
    filepath_or_buffer='/Users/callamartyn/chou_lab/EVE/test_filtered.csv',
)
df

In [None]:
# Order hits by ascending e-value so that keep="first" retains the
# lowest-e-value hit, then drop hits that share a query start or a query stop
# within the same "direction" value, and renumber the rows from zero.
df = (
    df.sort_values('evalue')
      .drop_duplicates(["direction", "position_on_query_start"], keep="first")
      .drop_duplicates(["direction", "position_on_query_stop"], keep="first")
      .reset_index(drop=True)
)
df

Filtering overlapping hits

In [None]:
# Group the hits by their "direction" value so overlaps are only checked
# within one group.
df_grouped = df.groupby(by='direction')

In [None]:
# results: row indices to keep, filled in by the overlap check.
# to_be_skipped: row indices that have already been dealt with and must not
# be examined again.
results, to_be_skipped = [], []

In [None]:
def _in_half_open(position, start, stop):
    """Return True when start <= position < stop.

    For integers this is the same test as ``position in range(start, stop)``,
    without the linear scan that membership falls back to for numpy integers.
    """
    return start <= position < stop


# Set mirrors of the two bookkeeping lists give O(1) membership tests. The
# lists themselves are still updated, so later cells see them as before.
already_handled = set(to_be_skipped)
selected = set(results)

for group_name, df_group in df_grouped:

    # Pull the needed columns out once per group, by name rather than by
    # position, instead of re-running iterrows() for every pair of hits.
    hits = list(zip(
        df_group.index,
        df_group['position_on_query_start'],
        df_group['position_on_query_stop'],
        df_group['score'],
    ))

    for index, start, stop, score in hits:

        # This hit, or one overlapping it, has already been dealt with.
        if index in already_handled:
            continue

        # index -> score for this hit and every hit overlapping it
        similar = {}

        for index2, start2, stop2, score2 in hits:

            # never compare a hit with itself
            if index == index2:
                continue

            # hits with identical coordinates are not treated as overlapping
            if start == start2 and stop == stop2:
                continue

            # overlap: an endpoint of one hit lies inside the other's
            # half-open [start, stop) interval
            if (_in_half_open(start, start2, stop2) or
                    _in_half_open(start2, start, stop) or
                    _in_half_open(stop, start2, stop2) or
                    _in_half_open(stop2, start, stop)):
                similar[index] = score
                similar[index2] = score2

        if similar:
            # keep the best-scoring member of the overlapping set
            max_index = max(similar, key=similar.get)

            # The same winner can be chosen again from a later overlapping
            # set; record it once so the final selection has no duplicate
            # rows.
            if max_index not in selected:
                results.append(max_index)
                selected.add(max_index)

            # everything in the set has now been dealt with
            to_be_skipped.extend(similar)
            already_handled.update(similar)
        else:
            # nothing overlaps this hit: keep it as is
            results.append(index)
            selected.add(index)
            to_be_skipped.append(index)
            already_handled.add(index)


In [None]:
# Reorder the table in place by the "direction" column and display it.
df.sort_values(by='direction', inplace=True)
df

In [None]:
# Pull the rows whose index labels were collected in `results` and save them
# (the index column is written too).
unique_csv_path = '/Users/callamartyn/chou_lab/EVE/out/test_unique.csv'
df_unique = df.loc[results]
df_unique.to_csv(unique_csv_path)