In [1]:
import numpy as np
import pandas as pd

In [2]:
def read_file(filename):
    with open(filename, 'r') as f:
        content = f.read()
        
    return content

In [3]:
def load_data():
    data = {}
    for index in range(3):
        X_train = pd.read_csv('Xtr%s_mat50.csv' % index, delimiter=' ', names=range(0,50)).values
        seq_train = pd.read_csv('Xtr%s.csv' % index, names='1').values
        X_test = pd.read_csv('Xte%s_mat50.csv' % index, delimiter=' ', names=range(0,50)).values
        seq_test = pd.read_csv('Xte%s.csv' % index, names='1').values
        labels = pd.read_csv('Ytr%s.csv' % index, names=('Id', 'Bound'), skiprows=[0], delimiter=',')

        data[index] = {
            'seq_train': seq_train,
            'X_train': X_train,
            'y_train': labels['Bound'].values,
            'seq_test': seq_test,
            'X_test': X_test,
            'ids': labels['Id'].values,
        }
        
    return data

DATA = load_data()

In [4]:
from data_manipulation import split_train_test_valid, get_precision

## Kernel ridge regression 

In [5]:
from ridge_regression import get_ridge_prediction, kernel_ridge_regression

In [6]:
train_X, train_y, test_X, test_y, valid_X, valid_y = split_train_test_valid(DATA[0]['X_train'], DATA[0]['y_train'])
train_X.shape

(1200, 50)

In [7]:
def linear_K(x1, x2):
    return x1 @ x2.T

In [8]:
REG_PARAMS_SPAN = [10**i for i in range(-10, 10)]

def get_best_reg_param():
    test_precisions = []
    for reg in REG_PARAMS_SPAN:
        alpha = kernel_ridge_regression(linear_K, train_X, train_y, reg)
        pred = get_ridge_prediction(linear_K, train_X, test_X, alpha)
        test_precisions.append(get_precision(pred, test_y))
        
    best_reg_index = max(range(len(REG_PARAMS_SPAN)), key=lambda x: test_precisions[x])
    return REG_PARAMS_SPAN[best_reg_index]

In [78]:
reg_param = get_best_reg_param()
alpha = kernel_ridge_regression(linear_K, train_X, train_y, reg_param)
pred = get_ridge_prediction(linear_K, train_X, valid_X, alpha)
get_precision(pred, valid_y)

0.56999999999999995

## Spectrum kernel 

In [9]:
train_seq, train_y, test_seq, test_y, valid_X, valid_y = split_train_test_valid(DATA[0]['seq_train'], DATA[0]['y_train'])

In [10]:
train_X, train_y, test_X, test_y, valid_X, valid_y = split_train_test_valid(DATA[0]['X_train'], DATA[0]['y_train'])
train_X.shape

(1200, 50)

In [11]:
from spectrum_kernel import transform_to_index_and_save

In [83]:
transform_to_index_and_save(3)

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
0
100
200
300
400
500
600
700
800
900
0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
0
100
200
300
400
500
600
700
800
900
0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
0
100
200
300
400
500
600
700
800
900


In [12]:
def load_data_k(k):
    data = {}
    for index in range(3):
        X_train = np.loadtxt('spectral_preindexed/Xtr%s_spectral_%s.gz' % (index, k))
        seq_train = pd.read_csv('Xtr%s.csv' % index, names='1').values
        X_test = np.loadtxt('spectral_preindexed/Xte%s_spectral_%s.gz' % (index, k))
        seq_test = pd.read_csv('Xte%s.csv' % index, names='1').values
        labels = pd.read_csv('Ytr%s.csv' % index, names=('Id', 'Bound'), skiprows=[0], delimiter=',')

        data[index] = {
            'seq_train': seq_train,
            'X_train': X_train,
            'y_train': labels['Bound'].values,
            'seq_test': seq_test,
            'X_test': X_test,
            'ids': labels['Id'].values,
        }
        
    return data

In [13]:
k = 10
def spectrum_2(word1, word2):
    return sum(word2.count(word1[i:i+k]) for i in range(len(word1)-k+1))

def spectrum_kernel_2(seq1, seq2):
    n1 = seq1.shape[0]
    n2 = seq2.shape[0]

    K = np.zeros((n1,n2))
    for i in range(n1):
        for j in range(n2):
            K[i,j] = spectrum_2(seq1[i,0], seq2[j,0])
    
    return K

K = spectrum_kernel_2(train_seq, train_seq)

KeyboardInterrupt: 

In [39]:
np.sum(K[0,:])

110.0

In [14]:
def spectrum_kernel(x1, x2):        
    return x1 @ x2.T

In [15]:
K = spectrum_kernel(train_X, train_X)
K

array([[ 0.03284499,  0.02398393,  0.02398393, ...,  0.02374764,
         0.02197543,  0.02469282],
       [ 0.02398393,  0.03355388,  0.02433837, ...,  0.02327505,
         0.02587429,  0.02823724],
       [ 0.02398393,  0.02433837,  0.03119093, ...,  0.025638  ,
         0.02339319,  0.02941871],
       ..., 
       [ 0.02374764,  0.02327505,  0.025638  , ...,  0.03379017,
         0.02386578,  0.02398393],
       [ 0.02197543,  0.02587429,  0.02339319, ...,  0.02386578,
         0.0510397 ,  0.02752836],
       [ 0.02469282,  0.02823724,  0.02941871, ...,  0.02398393,
         0.02752836,  0.04347826]])

In [16]:
REG_PARAMS_SPAN = [10**i for i in range(-10, 10)]

def get_best_reg_param():
    test_precisions = []
    for reg in REG_PARAMS_SPAN:
        alpha = kernel_ridge_regression(spectrum_kernel, train_X, train_y, reg)
        pred = get_ridge_prediction(spectrum_kernel, train_X, test_X, alpha)
        test_precisions.append(get_precision(pred, test_y))
        
    best_reg_index = max(range(len(REG_PARAMS_SPAN)), key=lambda x: test_precisions[x])
    return REG_PARAMS_SPAN[best_reg_index], test_precisions[best_reg_index]

In [17]:
get_best_reg_param()

(0.1, 0.57750000000000001)

## Substring Kernel 

In [84]:
train_X, train_y, test_X, test_y, valid_X, valid_y = split_train_test_valid(DATA[0]['seq_train'], DATA[0]['y_train'])
train_X.shape

(1200, 1)

In [None]:
from substring_kernel import K_k, substring_kernel

In [None]:
LAMBDA = 0.1
K_k(LAMBDA, 2, 'car', 'car') - 2*LAMBDA**4+LAMBDA**6 < 10**-5

In [None]:
K_k(0.1, 5, 
   'CAGCTTTTATCACCTTTGAGGGAAAGTCATATTAATTTAATACTGCACACACTTGTACAACAGATCTTCTTTACTATTAAAACTCAGTTTATCAAATCACA',
   'AATAACATACCCCACTCTTTCATCTCAATCAAAAATTGAAAAAGTCAAAGAATCCTGCTTTTTTGTTTTTCTCCAAGCCATTACCCCCTCTTGATCATTGC'
   )

In [None]:
# This is too slow by several orders of magnitude...
for j in range(train_X.shape[0]):
    print(j)
    K_k(0.1, 2, train_X[1,0], train_X[j,0])

## Final results

In [18]:
DATA = load_data_k(6)

In [20]:
DATA[0]['X_train']

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [86]:
def produce_results(preds, ids):
    data = [
        '%s,%s' % (id, pred) for pred, id in zip(preds, ids)
    ]
    
    data.insert(0, 'Id,Bound')
    with open('submission.csv', 'w') as f:
        f.write('\n'.join(data))

In [87]:
def train_tune_and_pred_on_test():
    preds = []
    ids = []
    for k in range(3):
        train_X, train_y, test_X, test_y, valid_X, valid_y = split_train_test_valid(DATA[k]['X_train'], DATA[k]['y_train'])

        # Validation precision
        reg_param, _ = get_best_reg_param()
        alpha = kernel_ridge_regression(linear_K, train_X, train_y, reg_param)
        pred = get_ridge_prediction(linear_K, train_X, valid_X, alpha)
        print("Dataset %s has found a parameter with validation precision %.3f" % (k, get_precision(pred, valid_y)))
        
        # Kaggle submission
        alpha = kernel_ridge_regression(linear_K, DATA[k]['X_train'], DATA[k]['y_train'], reg_param)
        pred = get_ridge_prediction(linear_K, DATA[k]['X_train'], DATA[k]['X_test'], alpha)
        
        preds += list(pred)

    produce_results(preds, range(3000))

In [88]:
train_tune_and_pred_on_test()

TypeError: Object arrays are not currently supported

#### For sequences 

In [45]:
def get_best_reg_param(K_train, K_test):
    test_precisions = []
    for reg in REG_PARAMS_SPAN:
        alpha = kernel_ridge_regression(spectrum_kernel, train_X, train_y, reg, K=K_train)
        pred = get_ridge_prediction(spectrum_kernel, train_X, test_X, alpha, K_x=K_test)
        test_precisions.append(get_precision(pred, test_y))
        
    best_reg_index = max(range(len(REG_PARAMS_SPAN)), key=lambda x: test_precisions[x])
    return REG_PARAMS_SPAN[best_reg_index], test_precisions[best_reg_index]

def train_tune_and_pred_on_test():
    preds = []
    ids = []
    for k in range(3):
        train_seq, train_y, test_seq, test_y, valid_seq, valid_y = split_train_test_valid(DATA[k]['seq_train'], DATA[k]['y_train'])

        K_train = spectrum_kernel_2(train_seq, train_seq)
        print("train K done")
        K_test = spectrum_kernel_2(train_seq, test_seq)
        print("test K done")
        K_valid = spectrum_kernel_2(train_seq, valid_seq)
        print("valid K done")
        reg_param, _ = get_best_reg_param(K_train, K_test)
        
        # Validation precision
        alpha = kernel_ridge_regression(linear_K, train_X, train_y, reg_param, K=K_train)
        pred = get_ridge_prediction(linear_K, train_X, valid_X, alpha, K_x=K_test)
        print("Dataset %s has found a parameter with validation precision %.3f" % (k, get_precision(pred, valid_y)))

In [46]:
train_tune_and_pred_on_test()

train K done
test K done


KeyboardInterrupt: 