In [49]:
import pandas as pd
from collections import deque
from tqdm import tqdm

class DataHandler():
    """Load sequences from a CSV file and build sparse k-mer embeddings.

    Each ``*_preprocess`` method replaces ``self.data`` with a list holding
    one ``dict`` per sequence (feature -> count). Those dicts are the sparse
    vectors consumed by the dict-based kernels of ``Kernel``.
    """

    def __init__(self, fname):
        """Read the ``seq`` column of the CSV file ``fname``."""
        self.X = pd.read_csv(fname)['seq']
        # Until a *_preprocess method runs, ``data`` is simply the raw sequences.
        self.data = self.X
        # k-mer -> integer id, filled by populate_kmer_set.
        self.kmer_set = {}
        # Not read or written by any method of this class; kept for compatibility.
        self.neigborhoods = {}

        # Alphabet used to enumerate substitutions in m_neighborhood.
        self.alph = "GATC"
        # Cache: (kmer, m) -> ids of the known k-mers within m mismatches.
        self.precomputed = {}

    def spectrum_preprocess(self, k):
        """Set ``self.data`` to the k-mer count dictionaries of every sequence.

        The number of windows is derived from each sequence's own length, so
        sequences of different lengths are handled (a sequence shorter than
        ``k`` yields an empty dict).
        """
        embedding = []
        print("Computing kmer embedding")
        for x in tqdm(self.X):
            counts = {}
            for j in range(len(x) - k + 1):
                kmer = x[j: j + k]
                counts[kmer] = counts.get(kmer, 0) + 1
            embedding.append(counts)
        self.data = embedding

    def populate_kmer_set(self, k):
        """Register every length-``k`` substring of the sequences in ``kmer_set``.

        Ids are allocated from ``len(self.kmer_set)``, so they stay unique
        even when the method is called several times (other values of ``k``,
        or after ``self.X`` was extended). When new k-mers are added, the
        neighbourhood cache is dropped because its entries only list k-mers
        that were known when they were computed.
        """
        print("Populating kmer set")
        grew = False
        for x in tqdm(self.X):
            for j in range(len(x) - k + 1):
                kmer = x[j: j + k]
                if kmer not in self.kmer_set:
                    self.kmer_set[kmer] = len(self.kmer_set)
                    grew = True
        if grew:
            self.precomputed.clear()

    def mismatch_preprocess(self, k, m):
        """Set ``self.data`` to the (k, m)-mismatch embedding of every sequence.

        For each window of length ``k``, every *known* k-mer (see
        ``populate_kmer_set``) within ``m`` substitutions of the window gets
        its count incremented. The result is one ``{kmer_id: count}`` dict per
        sequence.
        """
        embedding = []
        print("Computing mismatch embedding")
        for x in tqdm(self.X):
            counts = {}
            for j in range(len(x) - k + 1):
                kmer = x[j: j + k]
                # The cache key includes m: the neighbourhood depends on it.
                key = (kmer, m)
                if key not in self.precomputed:
                    self.precomputed[key] = [
                        self.kmer_set[neighbor]
                        for neighbor in self.m_neighborhood(kmer, m)
                        if neighbor in self.kmer_set
                    ]
                for idx in self.precomputed[key]:
                    counts[idx] = counts.get(idx, 0) + 1
            embedding.append(counts)
        self.data = embedding

    def m_neighborhood(self, kmer, m):
        """Return the strings over ``self.alph`` within ``m`` substitutions of ``kmer``.

        Candidates are grown one position at a time, breadth first. A prefix
        that still has mismatch budget branches over the whole alphabet; a
        prefix that has used exactly ``m`` mismatches can only copy the
        original letter.
        """
        mismatch_list = deque([(0, "")])
        for letter in kmer:
            # Only expand the prefixes that were queued before this position.
            for _ in range(len(mismatch_list)):
                mismatches, candidate = mismatch_list.popleft()
                if mismatches < m:
                    for a in self.alph:
                        # A differing letter costs one mismatch (bool adds as 0/1).
                        mismatch_list.append((mismatches + (a != letter), candidate + a))
                elif mismatches == m:
                    mismatch_list.append((mismatches, candidate + letter))
        return [candidate for _, candidate in mismatch_list]
                
        


In [50]:
import numpy as np
from tqdm import tqdm

class Kernel():
    """A kernel function plus helpers to build Gram matrices and evaluate
    kernel expansions.

    The static factories (``gaussian``, ``linear``, ...) return plain
    two-argument callables; pass one of them to the constructor:
    ``Kernel(Kernel.mismatch())``.
    """

    @staticmethod
    def _sparse_dot(x, y):
        """Dot product of two sparse vectors stored as ``{feature: value}`` dicts."""
        # Walk the smaller dict; the product is symmetric.
        if len(y) < len(x):
            x, y = y, x
        return sum(value * y[key] for key, value in x.items() if key in y)

    @staticmethod
    def gaussian(sigma):
        """Return ``k(x, y) = exp(-||x - y||^2 / (2 sigma^2)) / (sqrt(2 pi) sigma)``."""
        return lambda x, y : 1/(np.sqrt(2*np.pi)*sigma) * np.exp(-np.linalg.norm(x - y)**2/(2*sigma**2))

    @staticmethod
    def linear():
        """Return ``k(x, y) = <x, y>`` for dense vectors."""
        return lambda x, y: np.dot(x, y)

    @staticmethod
    def polynomial(c, n):
        """Return ``k(x, y) = (<x, y> + c) ** n`` for dense vectors."""
        return lambda x, y : (np.dot(x, y) + c)**n

    @staticmethod
    def spectrum():
        """Return the dot product between two ``{kmer: count}`` dicts."""
        return Kernel._sparse_dot

    @staticmethod
    def mismatch():
        """Return the dot product between two ``{kmer_id: count}`` dicts."""
        return Kernel._sparse_dot

    @staticmethod
    def sparse_gaussian(sigma):
        """Gaussian kernel on sparse dict vectors (same scaling as ``gaussian``)."""
        def f(x, y):
            ps = Kernel._sparse_dot
            # ||x - y||^2 expanded with dot products only.
            norm = ps(x, x) - 2*ps(x, y) + ps(y, y)
            return 1/(np.sqrt(2*np.pi)*sigma) * np.exp(-norm/(2*sigma**2))
        return f

    @staticmethod
    def sparse_poly(c, n):
        """Polynomial kernel ``(<x, y> + c) ** n`` on sparse dict vectors."""
        def f(x, y):
            return (Kernel._sparse_dot(x, y) + c)**n
        return f

    def __init__(self, func, normalized = False):
        """
        func: callable ``k(x, y)`` returning a number.
        normalized: when true, ``gram`` and ``eval_f`` use the normalised
            kernel ``k(x, y) / sqrt(k(x, x) k(y, y))``.
        """
        self.kernel = func
        self.normalized = normalized
        # sqrt(k(x_i, x_i)) of the data last passed to gram() (normalised mode only).
        self.diag = np.array([])

    def gram(self, data):
        """Return the ``n x n`` Gram matrix of ``data`` (indexable, length ``n``).

        Only the lower triangle is evaluated; the matrix is filled by
        symmetry. In normalised mode ``self.diag`` is updated and every entry
        is divided by ``diag[i] * diag[j]`` (a zero self-similarity therefore
        produces non-finite values).
        """
        n = len(data)
        K = np.zeros((n, n))
        print("Computing Gram Matrix")
        for i in tqdm(range(n)):
            for j in range(i+1):
                prod_scal = self.kernel(data[i], data[j])
                K[i, j] = prod_scal
                K[j, i] = prod_scal

        if self.normalized:
            self.diag = np.sqrt(np.diag(K))
            K = K / np.outer(self.diag, self.diag)

        return K

    def eval_f(self, x, alpha, data):
        """Evaluate ``sum_i alpha[i] * k(x, data[i])`` at the point ``x``.

        In normalised mode each term is divided by
        ``sqrt(k(x, x)) * self.diag[i]``, so ``gram`` must have been called
        beforehand on the same ``data`` for ``self.diag`` to line up.
        """
        if self.normalized:
            norm_x = np.sqrt(self.kernel(x, x))
            result = np.sum([(alpha[i]*self.kernel(x, xi))/(norm_x * self.diag[i]) for i, xi in enumerate(data)])
        else:
            result = np.sum([alpha[i]*self.kernel(x, xi) for i, xi in enumerate(data)])
        return result


In [51]:
import numpy as np
#from LargeMargin import LargeMargin
#from Kernel import Kernel
from tqdm import tqdm
import pandas as pd

def project(v):
    """Euclidean projection of ``v`` onto the probability simplex.

    Returns the closest vector ``w`` (in L2 distance) with ``w >= 0`` and
    ``sum(w) == 1``. The algorithm sorts the entries in *decreasing* order,
    finds the last position ``rho`` whose entry stays positive after the
    shift, and soft-thresholds ``v`` by the matching ``theta``.

    v: 1-D sequence or array of numbers.
    Returns a float ``numpy`` array of the same length (empty input returns
    an empty array).
    """
    v = np.asarray(v, dtype=float)
    if v.size == 0:
        return v

    # The threshold search is only valid on a decreasing sequence.
    mu = np.sort(v)[::-1]
    cumul_sum = np.cumsum(mu)
    ranks = np.arange(1, len(mu) + 1)

    # Position 0 always qualifies (mu[0] - (mu[0] - 1) = 1 > 0), so rho exists.
    rho = np.nonzero(mu - (cumul_sum - 1) / ranks > 0)[0][-1]
    theta = (cumul_sum[rho] - 1) / (rho + 1)

    return np.maximum(v - theta, 0)

def MKL(kernels, y, lmda, T, step=0.01):
    """Learn simplex weights for a combination of Gram matrices.

    Runs ``T`` rounds of projected gradient descent. Each round solves the
    SVM on the current weighted sum of kernels, computes one gradient
    component per kernel from the resulting ``alpha``, takes a step and
    projects the weights back onto the simplex.

    kernels: sequence of ``n x n`` Gram matrices.
    y: labels, forwarded to ``LargeMargin.SVM``.
    lmda: regularisation parameter, forwarded to ``LargeMargin.SVM`` and used
        to scale the gradient.
    T: number of iterations.
    step: gradient step size (defaults to the previously hard-coded 0.01).

    Returns the weight vector ``d`` (uniform ``1/m`` weights when ``T == 0``).
    """
    m = len(kernels)
    d = np.array([1/m for _ in range(m)])

    for _ in range(T):

        K = np.zeros_like(kernels[0])
        for weight, Km in zip(d, kernels):
            K = K + weight*Km

        # Solve the SVM for the weighted sum of kernels.
        alpha = LargeMargin.SVM(K, y, lmda)

        # Flatten so that both column vectors and 1-D arrays are accepted.
        a = np.array(alpha, dtype=float).ravel()
        grad = np.array([-0.5*lmda*np.dot(a, np.dot(Km, a)) for Km in kernels])

        # Gradient step, then projection of the weights onto the simplex.
        d = project(d - step*grad)

    return d

In [52]:
from cvxopt import solvers, matrix, spmatrix, sparse
import numpy as np

class LargeMargin():
    """Namespace for large-margin solvers."""

    @staticmethod
    def SVM(K, y, lmda):
        """Solve the kernel SVM quadratic program with cvxopt.

        The problem handed to ``solvers.qp`` is::

            minimise    1/2 * a' K a - y' a
            subject to  0 <= y_i * a_i <= 1 / (2 * lmda * n)

        K: ``n x n`` Gram matrix.
        y: ``n`` labels (converted to a double-precision column vector).
        lmda: regularisation parameter; a smaller value gives a larger box.

        Returns the solution ``a`` as returned by cvxopt (entry ``'x'`` of
        the solver result). Note that this switches off cvxopt's progress
        output globally.
        """
        print("Optimizing")

        solvers.options['show_progress'] = False

        n = len(y)

        # Objective: 1/2 * a' P a + q' a.
        P = matrix(K)
        q = -matrix(y, (n, 1), tc='d')

        # Constraints G a <= h: diag(y) a <= 1/(2*lmda*n) and -diag(y) a <= 0.
        Gtop = spmatrix(y, range(n), range(n))
        G = sparse([Gtop, -Gtop])
        h = matrix(np.concatenate([np.ones(n)/(2*lmda*n), np.zeros(n)]).reshape((2*n, 1)))

        sol = solvers.qp(P, q, G, h)['x']

        return sol


In [53]:
import numpy as np
import pandas as pd
#from LargeMargin import LargeMargin
from tqdm import tqdm



def write_predictions(predictions, out_fname):
    """Write predictions to ``out_fname`` as a CSV with columns ``Id,Bound``.

    predictions: iterable of numeric labels. Each value ``p`` is stored as
        ``int(abs((p + 1) // 2))``, which maps -1 to 0 and +1 to 1 and leaves
        labels that are already 0/1 unchanged.
    out_fname: path of the CSV file to create.

    ``Id`` is the 0-based position of the prediction. An empty input writes
    just the header line.
    """
    labels = [int(np.abs((pred + 1) // 2)) for pred in predictions]

    data_frame = pd.DataFrame({'Bound': labels})
    data_frame.index.name = 'Id'
    data_frame.to_csv(out_fname)
    
    
def kernel_train(kernel, training_data, ytrain, lmda):
    """Fit the SVM on the Gram matrix of ``training_data``.

    Builds the Gram matrix with ``kernel.gram`` and returns whatever
    ``LargeMargin.SVM`` produces for it with labels ``ytrain`` and
    regularisation ``lmda``.
    """
    gram_matrix = kernel.gram(training_data)
    return LargeMargin.SVM(gram_matrix, ytrain, lmda)

def kernel_predict(kernel, alpha, training, test):
    """Return the sign of the decision function for every point of ``test``.

    The decision value of a point is ``kernel.eval_f(point, alpha, training)``;
    the result is a list with one ``np.sign`` value per test point, in order.
    """
    return [np.sign(kernel.eval_f(point, alpha, training)) for point in tqdm(test)]

def score(predict, yreal):
    """Return the fraction of positions where ``predict`` equals ``yreal``.

    Positions ``0 .. len(yreal) - 1`` are compared by index.
    """
    total = len(yreal)
    hits = 0
    for pos in range(total):
        hits += int(predict[pos] == yreal[pos])
    return hits / total

def split_data(dataset, y, k, m, n_folds=4, fold_size=500):
    """Preprocess ``dataset`` and build cross-validation index splits.

    Calls ``dataset.populate_kmer_set(k)`` and
    ``dataset.mismatch_preprocess(k, m)``, then cuts the first
    ``n_folds * fold_size`` sample indices (and the matching slices of ``y``)
    into consecutive folds. The defaults reproduce the previous fixed layout
    of 4 folds of 500 samples.

    Returns a list with one ``(train, ytrain, test, ytest)`` tuple per fold:
    ``test`` / ``ytest`` are the held-out index range and label slice,
    ``train`` / ``ytrain`` the concatenation of all the other folds.

    Raises ValueError when ``n_folds`` is smaller than 2, because no training
    fold would remain.
    """
    if n_folds < 2:
        raise ValueError("n_folds must be at least 2")

    dataset.populate_kmer_set(k)
    dataset.mismatch_preprocess(k, m)

    idx = range(len(dataset.data))
    bounds = [(fold_size*i, fold_size*(i + 1)) for i in range(n_folds)]
    data_tranches = [idx[lo:hi] for lo, hi in bounds]
    label_tranches = [y[lo:hi] for lo, hi in bounds]

    pairs = []
    for i in range(n_folds):
        test, ytest = data_tranches[i], label_tranches[i]
        train = np.concatenate([data_tranches[j] for j in range(n_folds) if j != i])
        ytrain = np.concatenate([label_tranches[j] for j in range(n_folds) if j != i])

        pairs.append((train, ytrain, test, ytest))
    return pairs

In [54]:
import numpy as np
import pandas as pd
from tqdm import tqdm


# from DataHandler import DataHandler
# from LargeMargin import LargeMargin
# from Kernel import Kernel
# from utils import kernel_train, kernel_predict, write_predictions



print('''
------------------------------------------------------------
        DATASET 0
------------------------------------------------------------
''')

fname = ''
dataset = DataHandler('Xtr.csv')

labels = pd.read_csv('Ytr.csv')
print(len(labels['Bound']))
# Map the {0, 1} labels to {-1, +1} for the SVM.
y = 2.0*np.array(labels['Bound']) - 1

test = DataHandler('Xte.csv')

# Stack the test sequences after the training ones so that one Gram matrix
# holds both the train/train block and the test/train block.
dataset.X = pd.concat([dataset.X, test.X], axis = 0, ignore_index = True)

# One mismatch-kernel Gram matrix (m = 1) per k-mer length.
gram_by_k = {}
for k in (9, 10, 11):
    dataset.populate_kmer_set(k = k)
    dataset.mismatch_preprocess(k = k, m = 1)
    gram_by_k[k] = Kernel(Kernel.mismatch()).gram(dataset.data)
K9, K10, K11 = gram_by_k[9], gram_by_k[10], gram_by_k[11]

# The final kernel is the plain sum of the three.
K = K9 + K10 + K11

# The first len(y) rows are the labelled samples, the remaining rows the test set.
n_train = len(y)
training = list(range(n_train))
testing = list(range(n_train, K.shape[0]))

lmda = 0.0000001

alpha = LargeMargin.SVM(K[training][:, training], y, lmda)

# Decision values of all test points at once: K[test, train] . alpha
alpha_vec = np.array(alpha, dtype=float).ravel()
scores = np.dot(K[np.ix_(testing, training)], alpha_vec)
# Kept as a list because later code relabels the entries in place.
pred0 = list(np.sign(scores))


# print('''
# ------------------------------------------------------------
#         DATASET 1
# ------------------------------------------------------------
# ''')

# fname = '1'
# dataset = DataHandler('Data/Xtr'+fname+'.csv')

# labels = pd.read_csv('data/Ytr'+fname+'.csv')
# y = 2.0*np.array(labels['Bound']) - 1

# test = DataHandler('data/Xte'+fname+'.csv')


# dataset.X = pd.concat([dataset.X, test.X], axis = 0, ignore_index = True)


# dataset.populate_kmer_set(k = 9)
# dataset.mismatch_preprocess(k=9, m=1)
# K9 = Kernel(Kernel.mismatch()).gram(dataset.data)

# dataset.populate_kmer_set(k = 10)
# dataset.mismatch_preprocess(k=10, m=1)
# K10 = Kernel(Kernel.mismatch()).gram(dataset.data)

# dataset.populate_kmer_set(k = 11)
# dataset.mismatch_preprocess(k=11, m=1)
# K11 = Kernel(Kernel.mismatch()).gram(dataset.data)


# K = K9 + K10 + K11

# training = [i for i in range(2000)]
# testing = [i for i in range(2000, 3000)]

# lmda = 0.833


# alpha = LargeMargin.SVM(K[training][:, training], y, lmda)

# pred1 = []
# for i in tqdm(testing):
#     val = 0
#     for k, j in enumerate(training):
#         val += alpha[k]*K[i, j]
#     pred1.append(np.sign(val))


# print('''
# ------------------------------------------------------------
#         DATASET 2
# ------------------------------------------------------------
# ''')

# fname = ''
# dataset = DataHandler('data/Xtr'+fname+'.csv')

# labels = pd.read_csv('data/Ytr'+fname+'.csv')
# y = 2.0*np.array(labels['Bound']) - 1

# test = DataHandler('data/Xte'+fname+'.csv')

    
# dataset.populate_kmer_set(12)
# test.kmer_set = dataset.kmer_set

# dataset.mismatch_preprocess(12 , 0)
# test.mismatch_preprocess(12, 0)

# kernel = Kernel(Kernel.sparse_gaussian(7.8))


# lmda = 0.00000001

# alpha = kernel_train(kernel, dataset.data, y, lmda)
# pred2 = kernel_predict(kernel, alpha, dataset.data, test.data)


print('''
------------------------------------------------------------
        KAGGLEIZER
------------------------------------------------------------
''')

out_fname = "Yte.csv"

# `predictions` is the same object as pred0, so the relabelling below is
# visible through both names.
predictions = pred0
# Negative values become 0, everything else (including 0) becomes 1.
predictions[:] = [0 if value < 0 else 1 for value in predictions]

print(len(predictions))

write_predictions(predictions, out_fname)


  0%|          | 0/3000 [00:00<?, ?it/s]


------------------------------------------------------------
        DATASET 0
------------------------------------------------------------

2000
Populating kmer set


100%|██████████| 3000/3000 [00:00<00:00, 32595.60it/s]
  1%|          | 25/3000 [00:00<00:11, 249.56it/s]

Computing mismatch embedding


100%|██████████| 3000/3000 [00:06<00:00, 488.55it/s]
  2%|▏         | 47/3000 [00:00<00:06, 468.18it/s]

Computing Gram Matrix


100%|██████████| 3000/3000 [06:16<00:00,  7.98it/s]
100%|██████████| 3000/3000 [00:00<00:00, 28696.40it/s]
  0%|          | 0/3000 [00:00<?, ?it/s]

Populating kmer set
Computing mismatch embedding


100%|██████████| 3000/3000 [00:08<00:00, 343.01it/s]
  2%|▏         | 53/3000 [00:00<00:05, 526.20it/s]

Computing Gram Matrix


100%|██████████| 3000/3000 [03:31<00:00, 14.17it/s]
100%|██████████| 3000/3000 [00:00<00:00, 27152.34it/s]
  0%|          | 0/3000 [00:00<?, ?it/s]

Populating kmer set
Computing mismatch embedding


100%|██████████| 3000/3000 [00:11<00:00, 262.36it/s]
  3%|▎         | 85/3000 [00:00<00:03, 843.27it/s]

Computing Gram Matrix


100%|██████████| 3000/3000 [01:45<00:00, 28.34it/s]


Optimizing


100%|██████████| 1000/1000 [00:01<00:00, 918.14it/s]


------------------------------------------------------------
        KAGGLEIZER
------------------------------------------------------------

1000





In [55]:
# Submission table: one row per prediction, Id = 0-based position.
# The Id range follows len(predictions) instead of a fixed 1000, so the two
# columns always have the same length.
df = pd.DataFrame({'Bound': predictions,
                   'Id': np.arange(len(predictions))})
df = df[['Id','Bound']]

In [56]:
# Save the submission table without the DataFrame index column.
df.to_csv(path_or_buf="ModelPredictons.csv", index=False)