## Algorithm 2: SGT feature extractor


In [141]:
import numpy as np

def getpositions(S, V):
    '''
    Locate every value of V that occurs in the sequence S.

    S: sequence (numpy array) to search in
    V: iterable of candidate values, visited in order

    return: list of (value, positions) tuples, one per value of V that is
            present in S, in the order the values appear in V. `positions`
            is the raw result of np.where, i.e. a tuple holding one index
            array, e.g.

            [(209981, (array([8]),)), (320033, (array([6]),))]

            Values of V that do not occur in S are left out.
    '''
    found = []
    for symbol in V:
        if symbol not in S:
            # nothing to record for a value that never occurs
            continue
        found.append((symbol, np.where(S == symbol)))
    return found
    
    
def sgt(S, V, ls, k=1):
    '''
    Extract Sequence Graph Transform (SGT) features (algorithm 2).

    S: sequence of symbols (array-like)
    V: set domain of all values (array-like); its order fixes the row and
       column order of the result
    ls: is length sensitive; when true the pair counts are divided by the
        number of elements of S that belong to V
    k: hyperparameter kappa, defaults to 1 for supervised learning,
       typically selected from {1, 5, 10}

    return: sgt matrix of shape (len(V), len(V)). Entry [i, j] summarises
            the occurrences of V[i] that are followed, at any later
            position, by V[j]. Pairs that never occur in that order
            (including every pair involving a symbol absent from S) are 0.

    Side effect: prints the two intermediate matrices under the labels
    'Kappa' and 'Theta'.
    '''
    S = np.asarray(S)
    V = np.asarray(V)

    size = V.shape[0]
    W0 = np.zeros((size, size))
    Wk = np.zeros((size, size))

    # Index positions of each symbol of V inside S, computed once so the
    # loops below never have to search for a symbol again.
    positions = [np.flatnonzero(S == symbol) for symbol in V]

    length = 0
    for i, U in enumerate(positions):
        if U.size == 0:
            # Symbol absent from S: its row stays zero. Skip it rather than
            # abandoning the remaining symbols.
            continue

        for j, V2 in enumerate(positions):
            if V2.size == 0:
                continue

            # gaps[a, b] = V2[b] - U[a]; only strictly positive gaps are
            # (u, v) pairs in which v comes after u.
            gaps = V2[np.newaxis, :] - U[:, np.newaxis]
            gaps = gaps[gaps > 0]

            W0[i, j] = gaps.size
            Wk[i, j] = np.sum(np.exp(-k * gaps))

        length += U.size

    # An empty sequence has length 0; W0 is all zeros then, so there is
    # nothing to normalise.
    if ls and length > 0:
        W0 = np.divide(W0, length)

    print('Kappa')
    print(Wk)

    print('Theta')
    print(W0)

    # Where no pair was observed both matrices are 0; define the feature as 0
    # there instead of evaluating 0/0 (NaN plus a RuntimeWarning).
    ratio = np.divide(Wk, W0, out=np.zeros_like(Wk), where=W0 > 0)

    return np.power(ratio, 1.0 / k)

## Test validation 

In [143]:
# Bind the result to its own name: assigning it to `sgt` would replace the
# function with an array, so re-running this cell (or calling sgt again)
# would fail with "'numpy.ndarray' object is not callable".
features = sgt(
    S=np.array(["B", "B", "A", "C", "A", "C", "A", "A", "B", "A"]),
    V=np.array(["A", "B", "C"]),
    ls=True,
    k=5,
)
features

Kappa
[[  6.87476068e-03   6.78334899e-03   1.34761999e-02]
 [  1.35216019e-02   6.73794700e-03   4.57079071e-05]
 [  1.35216040e-02   3.05916208e-07   4.53999298e-05]]
theta
[[ 1.   0.4  0.3]
 [ 1.1  0.3  0.4]
 [ 0.7  0.2  0.1]]


array([[ 0.36936141,  0.44246287,  0.53763711],
       [ 0.41488439,  0.46803816,  0.16277446],
       [ 0.45413611,  0.06869332,  0.21449197]])