Import sequences

Output: Alignment score & alignment with gaps

**Load Match_HOX.txt.**

The first two columns give the start and end positions of each matched region in the human sequence.

The last two columns give the start and end positions of the corresponding matched region in the fruit fly sequence.

In [3]:
import os, numpy as np, re

In [4]:
# Parse the matched regions from a tab-separated file with four columns per
# line: human start, human end, fruit fly start, fruit fly end.
fruit_fly_matches = []
human_matches = []

with open('Match_HOX.txt', 'r') as infile:
    for line in infile:
        # Remove the line terminator (including a stray '\r' from files with
        # Windows line endings) so it cannot end up inside the last field.
        positions = line.rstrip('\r\n').split('\t')

        # Skip blank or incomplete lines (e.g. a trailing empty line) instead
        # of failing with an IndexError when reading the third/fourth column.
        if len(positions) < 4:
            continue

        # Positions are kept as strings, exactly as they appear in the file.
        human_matches.append({'start': positions[0], 'end': positions[1]})
        fruit_fly_matches.append({'start': positions[2], 'end': positions[3]})

print(fruit_fly_matches)
print(human_matches)

[{'start': '286', 'end': '291'}, {'start': '299', 'end': '342'}, {'start': '344', 'end': '357'}]
[{'start': '120', 'end': '125'}, {'start': '130', 'end': '173'}, {'start': '175', 'end': '188'}]


Import sequences

Implement Needleman-Wunsch algorithm. 

In [5]:
# Two short example sequences used to step through the alignment.
seq_a, seq_b = 'GCATGCU', 'GATTACA'

Construct a grid.

Match: +1

Mismatch or indel: -1

In [6]:
# One extra row and column for the leading gap. seq_a runs along the top
# (columns) and seq_b down the side (rows). Every cell starts as NaN.
n_rows, n_cols = len(seq_b) + 1, len(seq_a) + 1
grid = np.full((n_rows, n_cols), np.nan)
grid

array([[nan, nan, nan, nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan, nan, nan, nan]])

In [8]:
# The first row and first column hold the running values 0, -1, -2, ...
# (one step of -1 per position away from the top-left corner).
n_rows, n_cols = grid.shape
grid[0, :] = np.arange(0, -n_cols, -1)  # first row
grid[:, 0] = np.arange(0, -n_rows, -1)  # first column
grid

array([[ 0., -1., -2., -3., -4., -5., -6., -7.],
       [-1., nan, nan, nan, nan, nan, nan, nan],
       [-2., nan, nan, nan, nan, nan, nan, nan],
       [-3., nan, nan, nan, nan, nan, nan, nan],
       [-4., nan, nan, nan, nan, nan, nan, nan],
       [-5., nan, nan, nan, nan, nan, nan, nan],
       [-6., nan, nan, nan, nan, nan, nan, nan],
       [-7., nan, nan, nan, nan, nan, nan, nan]])

In [9]:
# Scoring scheme for the fill step (match +1, mismatch or indel -1), named
# once here instead of being repeated as literals in each branch.
MATCH_SCORE = 1
MISMATCH_SCORE = -1
GAP_PENALTY = -1

# Fill the grid row by row, left to right, so that the diagonal, top and left
# neighbours of a cell are already scored when the cell is visited. Row 0 and
# column 0 are the pre-filled borders and are therefore not visited.
n_rows, n_cols = grid.shape
for b in range(1, n_rows):        # b: row index, position b-1 in seq_b
    for a in range(1, n_cols):    # a: column index, position a-1 in seq_a
        print()
        print('Cell: (' + str(a) + ',' + str(b) + ')')
        print(seq_a[a-1] + '—' + seq_b[b-1])

        # Scores of the previous diagonal, top and left cells.
        scores = np.array([grid[b-1, a-1], grid[b-1, a], grid[b, a-1]])
        print('Old scores [Diagonal, Top, Left]: ' + str(scores))

        # Moving diagonally pairs the two characters: reward a match,
        # penalise a mismatch.
        if seq_a[a-1] == seq_b[b-1]:
            scores[0] += MATCH_SCORE
        else:
            scores[0] += MISMATCH_SCORE
        scores[1] += GAP_PENALTY  # moving down represents an indel
        scores[2] += GAP_PENALTY  # moving right also represents an indel

        print('New scores [Diagonal, Top, Left]: ' + str(scores))
        print('Highest score: ' + str(scores.max()))

        # The cell keeps the best of the three candidate scores.
        grid[b, a] = scores.max()


Cell: (1,1)
G—G
Old scores [Diagonal, Top, Left]: [ 0. -1. -1.]
New scores [Diagonal, Top, Left]: [ 1. -2. -2.]
Highest score: 1.0

Cell: (2,1)
C—G
Old scores [Diagonal, Top, Left]: [-1. -2.  1.]
New scores [Diagonal, Top, Left]: [-2. -3.  0.]
Highest score: 0.0

Cell: (3,1)
A—G
Old scores [Diagonal, Top, Left]: [-2. -3.  0.]
New scores [Diagonal, Top, Left]: [-3. -4. -1.]
Highest score: -1.0

Cell: (4,1)
T—G
Old scores [Diagonal, Top, Left]: [-3. -4. -1.]
New scores [Diagonal, Top, Left]: [-4. -5. -2.]
Highest score: -2.0

Cell: (5,1)
G—G
Old scores [Diagonal, Top, Left]: [-4. -5. -2.]
New scores [Diagonal, Top, Left]: [-3. -6. -3.]
Highest score: -3.0

Cell: (6,1)
C—G
Old scores [Diagonal, Top, Left]: [-5. -6. -3.]
New scores [Diagonal, Top, Left]: [-6. -7. -4.]
Highest score: -4.0

Cell: (7,1)
U—G
Old scores [Diagonal, Top, Left]: [-6. -7. -4.]
New scores [Diagonal, Top, Left]: [-7. -8. -5.]
Highest score: -5.0

Cell: (1,2)
G—A
Old scores [Diagonal, Top, Left]: [-1.  1. -2.]
New sc

In [10]:
# Display the grid after the fill loop in the previous cell.
grid

array([[ 0., -1., -2., -3., -4., -5., -6., -7.],
       [-1.,  1.,  0., -1., -2., -3., -4., -5.],
       [-2.,  0.,  0.,  1.,  0., -1., -2., -3.],
       [-3., -1., -1.,  0.,  2.,  1.,  0., -1.],
       [-4., -2., -2., -1.,  1.,  1.,  0., -1.],
       [-5., -3., -3., -1.,  0.,  0.,  0., -1.],
       [-6., -4., -2., -2., -1., -1.,  1.,  0.],
       [-7., -5., -3., -1., -2., -2.,  0.,  0.]])