**Some thoughts:**

- all the working variables can be made instance attributes, which are None when the object is initialized
- it's better to 'unload' the main working method:
- create a get_dataframe method instead of the optional parameter: it will take the current distance state and return its data. This is more logical: you call obj.simgen() until you get the plot you like, then call obj.get_dataframe() to get the data for analysis or storage
- don't forget to make the attributes private, e.g. distance should be private, because the user can modify it somehow and the results will be broken


In [275]:
from Bio.Align import MultipleSeqAlignment

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import pandas as pd
import plotly.graph_objs as go
from Bio import AlignIO
init_notebook_mode(connected=True)




class Simgen(MultipleSeqAlignment):
    """Alignment read from a FASTA file that can draw a similarity plot.

    One sequence of the alignment (the "potential recombinant") is compared
    with every other sequence inside a window that slides along the columns.
    The per-window identities are kept on the instance so they can be plotted
    or exported after ``simgen`` has run.
    """

    def __init__(self, path):
        """Read the FASTA alignment at ``path`` and reset the plotting state.

        Parameters
        ----------
        path : str
            Location of the alignment file; it is parsed with
            ``AlignIO.read(path, "fasta")``.
        """
        from Bio import AlignIO

        # The parent constructor has to be called explicitly; it receives the
        # records of the alignment that was just parsed.
        records = (rec for rec in AlignIO.read(path, "fasta"))
        super(Simgen, self).__init__(records)

        # {sequence id: [identity per window]}; empty until simgen() is called
        self.distance = {}
        # alignment (whole or sliced to a region) used by the latest simgen()
        self.align = None
        # x-axis labels, one per window of the latest simgen()
        self.ticks = None

    def simgen(self, pot_rec, window=1, shift=1, region=False, draw=True):
        """Slide a window along the alignment and collect identity data.

        Parameters
        ----------
        pot_rec : int
            Row index (starting with 0) of the sequence under study.
        window : int
            Sliding window size, in alignment columns.
        shift : int
            Step by which the window moves downstream the alignment.
        region : tuple or list of two ints, optional
            Part of the alignment to analyse. The two values are applied as
            a Python slice ``[region[0]:region[1]]`` to the columns. The
            default ``False`` analyses the whole alignment.
        draw : bool, optional
            If true (default) the similarity plot is drawn and nothing is
            returned. If false the data are returned instead.

        Returns
        -------
        pandas.DataFrame or None
            With ``draw=False``: one column per compared sequence id and one
            row per window, indexed by ``self.ticks``. Otherwise ``None``.

        Raises
        ------
        AssertionError
            If ``window`` or ``shift`` is smaller than 1, or if
            ``region[0] >= region[1]``.
        """
        assert window >=1, "wondow can't be a negative or zero"
        assert shift >= 1, "shift can't be a negative or zero"

        if region:
            assert region[0] < region[1], "the value of the first nucleotide position should be less than the second one"
            start, stop = region[0], region[1]
            # NOTE(review): the first label of a region is region[0] itself,
            # while the whole alignment is labelled from 1 - confirm which
            # numbering convention is intended.
            left_border = region[0]
        else:
            start, stop = 0, None
            left_border = 1

        # Slicing returns a new MultipleSeqAlignment, so the alignment this
        # object was initialised with is left unchanged.
        self.align = self[:, start:stop]

        # The labels are derived from the length of the alignment that is
        # actually analysed, so there is exactly one label per window.
        right_border = left_border + self.align.get_alignment_length()
        self._get_x_labels(left_border, right_border, shift)

        self._move_window(window, pot_rec, shift)

        if draw:
            self._draw_simplot()
            return None
        return self.get_dataframe()

    def get_dataframe(self):
        """Return the data of the latest ``simgen`` call as a DataFrame.

        Returns
        -------
        pandas.DataFrame
            Built from ``self.distance`` with ``self.ticks`` as the index.
        """
        return pd.DataFrame(data=self.distance, index=self.ticks)

    def _pdistance(self, seq1, seq2):
        """Return the share of identical positions of two sequences.

        Positions where either sequence has a gap (``'-'``) are skipped.
        The value is ``1 - mismatches / compared``, i.e. an identity rather
        than a distance, which keeps the plot 'upside down'.

        Returns
        -------
        float or None
            ``None`` (after printing a warning) when no position without a
            gap is left to compare.
        """
        compared = 0
        mismatches = 0
        for left, right in zip(seq1, seq2):
            if left == '-' or right == '-':
                continue
            compared += 1
            if left != right:
                mismatches += 1

        try:
            return float(1 - mismatches / compared)
        except ZeroDivisionError as e:
            print(e, ": perhaps your alignment contains only gaps")
            return None

    def _get_x_labels(self, left_border, right_border, shift):
        """Store one tick label per window in ``self.ticks``.

        Parameters
        ----------
        left_border : int
            Label of the first window.
        right_border : int
            Exclusive upper bound of the labels.
        shift : int
            Distance between two neighbouring labels.
        """
        self.ticks = list(range(left_border, right_border, shift))

    def _move_window(self, window, pot_rec, shift):
        """Compare row ``pot_rec`` with every other row, window by window.

        The result is stored in ``self.distance`` as
        ``{sequence id: [value per window]}``. Windows start at columns
        0, ``shift``, 2 * ``shift``, ... of ``self.align`` and span ``window``
        columns (fewer at the end of the alignment).
        """
        parents = list(range(len(self.align)))
        # list.remove raises ValueError when pot_rec is not one of the rows
        parents.remove(pot_rec)

        align_length = self.align.get_alignment_length()
        starts = range(0, align_length, shift)
        query = self.align[pot_rec].seq  # the potential recombinant

        distance_data = {}
        for par in parents:
            parent = self.align[par]
            distance_data[parent.id] = [
                # every window, the first one included, is `window` wide
                self._pdistance(query[start:start + window],
                                parent.seq[start:start + window])
                for start in starts
            ]

        self.distance = distance_data

    def _draw_simplot(self):
        """Draw the similarity plot from ``self.distance`` and ``self.ticks``."""
        data = []
        for key, values in self.distance.items():
            print(key)
            data.append(go.Scatter(y=values, x=self.ticks, name=key))

        layout = go.Layout(
            title="similarity plot",
            xaxis=dict(title="nucleotide position"),
            yaxis=dict(title="sequence identity"),
            legend=dict(x=-0.1, y=1.5))

        fig = go.Figure(data=data, layout=layout)
        iplot(fig)


In [276]:
simgen_obj = Simgen("./data/half_and_equal.fasta")

In [277]:
simgen_obj.simgen(pot_rec=0)

seq2_empty
seq3_empty


In [278]:
simgen_obj.distance

{'seq2_empty': [1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 'seq3_empty': [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]}

In [280]:
simgen_obj.simgen(pot_rec=0, region=(5,7))

seq2_empty
seq3_empty


In [255]:
simgen_obj.simgen(pot_rec=0)

SingleLetterAlphabet() alignment with 3 rows and 11 columns
AAAAATTTTTT seq1_empty
AAAAAGGGGGG seq2_empty
AAAAATTTTTT seq3_empty
<class 'Bio.Align.MultipleSeqAlignment'>


TypeError: _draw_simplot() takes 0 positional arguments but 1 was given

In [167]:
print(simgen_obj)

SingleLetterAlphabet() alignment with 3 rows and 11 columns
AAAAATTTTTT seq1_empty
AAAAAGGGGGG seq2_empty
AAAAATTTTTT seq3_empty


In [143]:
type(simgen_obj)

__main__.Simgen

In [99]:
recs = b[:, 0:1]

In [100]:
recs

<<class 'Bio.Align.MultipleSeqAlignment'> instance (3 records of length 1, SingleLetterAlphabet()) at 1967f36b4a8>

In [102]:
print(recs)

SingleLetterAlphabet() alignment with 3 rows and 1 columns
A seq1_empty
A seq2_empty
A seq3_empty


In [103]:
for rec in recs:
    print(rec)

ID: seq1_empty
Name: seq1_empty
Description: seq1_empty
Number of features: 0
Seq('A', SingleLetterAlphabet())
ID: seq2_empty
Name: seq2_empty
Description: seq2_empty
Number of features: 0
Seq('A', SingleLetterAlphabet())
ID: seq3_empty
Name: seq3_empty
Description: seq3_empty
Number of features: 0
Seq('A', SingleLetterAlphabet())


In [105]:
print(b)

SingleLetterAlphabet() alignment with 3 rows and 11 columns
AAAAATTTTTT seq1_empty
AAAAAGGGGGG seq2_empty
AAAAATTTTTT seq3_empty


pattern to create an object of a multiple sequence alignment and plot it

the initialized object isn't changed.

instead, this object is stored in the 'align' var and is passed downstream

**in here try to slice the object:**

In [120]:
from Bio.SeqRecord import SeqRecord
from Bio.Align import MultipleSeqAlignment
collect_sliced = []

for rec in b._records:
    print(rec)
    print(rec.seq)
    sliced_seq = rec.seq[2:4]
    print(sliced_seq)
    collect_sliced.append(SeqRecord(sliced_seq, id=rec.id, name=rec.name, description=rec.description))

slice_of_alignment = MultipleSeqAlignment(collect_sliced)
    

ID: seq1_empty
Name: seq1_empty
Description: seq1_empty
Number of features: 0
Seq('AAAAATTTTTT', SingleLetterAlphabet())
AAAAATTTTTT
AA
ID: seq2_empty
Name: seq2_empty
Description: seq2_empty
Number of features: 0
Seq('AAAAAGGGGGG', SingleLetterAlphabet())
AAAAAGGGGGG
AA
ID: seq3_empty
Name: seq3_empty
Description: seq3_empty
Number of features: 0
Seq('AAAAATTTTTT', SingleLetterAlphabet())
AAAAATTTTTT
AA


In [122]:
slice_of_alignment

<<class 'Bio.Align.MultipleSeqAlignment'> instance (3 records of length 2, SingleLetterAlphabet()) at 1967f37ceb8>

In [123]:
print(slice_of_alignment)

SingleLetterAlphabet() alignment with 3 rows and 2 columns
AA seq1_empty
AA seq2_empty
AA seq3_empty


In [124]:
print(slice_of_alignment[0])

ID: seq1_empty
Name: seq1_empty
Description: seq1_empty
Number of features: 0
Seq('AA', SingleLetterAlphabet())


In [91]:
b = Simgen("./data/half_and_equal.fasta")

In [92]:
b.simgen(pot_rec=0)

In [93]:
b

<<class '__main__.Simgen'> instance (3 records of length 11, SingleLetterAlphabet()) at 1967f34f710>

In [94]:
print(b)

SingleLetterAlphabet() alignment with 3 rows and 11 columns
AAAAATTTTTT seq1_empty
AAAAAGGGGGG seq2_empty
AAAAATTTTTT seq3_empty


In [95]:
a = Simgen("./data/half_and_equal.fasta")

In [96]:
a.simgen(pot_rec=0, region=(2,3))

TypeError: 'Simgen' object is not callable

In [97]:
a

<<class '__main__.Simgen'> instance (3 records of length 11, SingleLetterAlphabet()) at 1967f34f898>

In [98]:
print(a)

SingleLetterAlphabet() alignment with 3 rows and 11 columns
AAAAATTTTTT seq1_empty
AAAAAGGGGGG seq2_empty
AAAAATTTTTT seq3_empty
