In [None]:
import pandas as pd
import numpy as np

# Location of the project and of the per-sample metadata table.
path = "/home/ubuntu/onekgenomes/"
sampleDataFile = "data/sampleData/sampleData.tsv"
df = pd.read_csv(path + sampleDataFile, sep='\t', index_col=0)

# Sorted population codes; each code gets its position as an integer label.
pops = np.sort(df["Population"].unique())
n_pops = len(pops)
target_dict = {pop: label for label, pop in enumerate(pops)}

# Integer target for every sample, in table order.
Y = np.array([target_dict[pop] for pop in df["Population"]])
target_dict["GBR"]

In [None]:
# Population code of every sample, in table order.
pops = df["Population"].tolist()
uniquePops = sorted(set(pops))
numSamples = len(pops)

# One integer label per distinct population.  The labels are enumerated over
# the unique codes themselves: zipping against range(numSamples) only gave the
# right mapping because zip() stops at the shorter of its inputs.
targetDict = {pop: label for label, pop in enumerate(uniquePops)}
Y = np.array([targetDict[pop] for pop in pops])

In [2]:
import sys
import sklearn
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import KernelCenterer    #FIXME
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
import numpy as np
import pandas as pd
import pdb

# Path to project directory.
path = "/home/ubuntu/onekgenomes/"

# Load pairwise distance.
D_sq_all = np.load(path + "data/pdist/summedMats/pdistAll.npy")    # Whole genome.
D_sq_num = np.load(path + "data/pdist/summedMats/pdistNum.npy")    # Excluding X and Y chromosomes.
n_samples = D_sq_all.shape[0]

# Compute linear kernels.
# The scalar 1 / n is subtracted from every entry of the identity, giving the
# matrix that is applied on both sides of the halved distance matrices.
n = n_samples
centering = np.eye(n) - 1 / n
K_all = -centering @ (D_sq_all / 2) @ centering
K_num = -centering @ (D_sq_num / 2) @ centering

# Shuffle.
# One seeded permutation is applied to rows and columns of all four matrices
# so they stay aligned with each other.
np.random.seed(0)
shuffle = np.random.permutation(n_samples)
shuffled_rows_cols = np.ix_(shuffle, shuffle)
D_sq_all = D_sq_all[shuffled_rows_cols]
D_sq_num = D_sq_num[shuffled_rows_cols]
K_all = K_all[shuffled_rows_cols]
K_num = K_num[shuffled_rows_cols]

# Wrapper class for kernels.
class kernel_wrapper(BaseEstimator, TransformerMixin):
    """Pipeline step that turns sample indices into precomputed-kernel slices.

    ``fit`` and ``transform`` receive arrays of sample indices rather than
    feature vectors.  ``transform`` looks the indices up in the module-level
    matrices ``K_all`` / ``K_num`` (linear kernel) or ``D_sq_all`` /
    ``D_sq_num`` (RBF kernel) and returns the block whose rows are the
    requested indices and whose columns are the indices seen by ``fit``.

    Parameters
    ----------
    include_xy : bool
        Use the ``*_all`` matrices when true, the ``*_num`` matrices otherwise.
    gamma : float
        Multiplier in ``exp(-gamma * D_sq)``; only used for the RBF kernel.
    kernel_type : {'linear', 'rbf'}
        Which kernel to serve.
    center_kernel : bool
        Pass the RBF kernel through ``KernelCenterer`` before slicing; has no
        effect on the linear kernel.

    Raises
    ------
    ValueError
        If ``kernel_type`` is not one of the supported values.
    """

    _KERNEL_TYPES = ('linear', 'rbf')

    def __init__(self, include_xy=True, gamma=1, kernel_type='linear', center_kernel=False):
        super(kernel_wrapper, self).__init__()
        self.include_xy = include_xy
        if kernel_type not in self._KERNEL_TYPES:
            raise ValueError("Invalid kernel type.")
        self.kernel_type = kernel_type
        self.gamma = gamma
        self.center_kernel = center_kernel

    def _check_kernel_type(self):
        """Re-validate ``kernel_type``.

        ``set_params`` assigns attributes directly and never goes through
        ``__init__``, so without this check an unknown value would silently
        fall into the RBF branch of ``transform``.
        """
        if self.kernel_type not in self._KERNEL_TYPES:
            raise ValueError("Invalid kernel type.")

    def transform(self, test_ind):
        """Return the kernel block with rows ``test_ind`` and columns ``train_ind``."""
        self._check_kernel_type()

        if self.kernel_type == 'linear':
            K = K_all if self.include_xy else K_num
            return (K[test_ind, :])[:, self.train_ind]

        D_sq = D_sq_all if self.include_xy else D_sq_num
        if self.center_kernel:
            # NOTE(review): the centerer is fit on the kernel over *all*
            # samples, not only self.train_ind -- confirm this is intended.
            K = np.exp(-self.gamma * D_sq)
            K = KernelCenterer().fit_transform(K)
            return (K[test_ind, :])[:, self.train_ind]

        # Without centering the kernel is purely elementwise, so only the
        # requested block needs to be exponentiated; the values are identical
        # to slicing the full kernel matrix.
        return np.exp(-self.gamma * (D_sq[test_ind, :])[:, self.train_ind])

    def fit(self, train_ind, y_train=None, **fit_params):
        """Remember the training indices; they become the kernel columns."""
        self._check_kernel_type()
        self.train_ind = train_ind
        return self


# Dummy inputs.
# The "features" are just sample indices; the kernel step looks them up.
X = np.arange(n_samples)

# Targets.
sampleDataFile = "data/sampleData/sampleData.tsv"
df = pd.read_csv(path + sampleDataFile, sep='\t', index_col=0)
pops = np.sort(df["Population"].unique())
n_pops = len(pops)
target_dict = {pop: label for label, pop in enumerate(pops)}
# Encode every sample, then reorder with the same permutation as the kernels.
y = np.array([target_dict[pop] for pop in df["Population"]])[shuffle]

# Test-train split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Define pipeline.
# 'kern' maps sample indices to a precomputed kernel block, 'svm' consumes it.
pipeline = Pipeline([
    ('kern', kernel_wrapper()),
    ('svm', SVC()),
])

# Parameters for grid search.
# Earlier, wider search kept for reference:
# cv_params = [
#     dict([
#         ('kern__kernel_type', ['linear']),
#         ('kern__include_xy', [True, False]),
#         ('svm__kernel', ['precomputed']),
#         ('svm__C', 10.0 ** np.arange(-8, 1)),
#     ]),
#     dict([
#         ('kern__kernel_type', ['rbf']),
#         ('kern__include_xy', [True, False]),
#         ('kern__gamma', 10.0 ** np.arange(-15, -1)),
#         ('svm__kernel', ['precomputed']),
#         ('svm__C', 10.0**np.arange(-2, 9)),
#     ])
# ]

# Current search: centered RBF kernel only.
cv_params = [
    {
        'kern__kernel_type': ['rbf'],
        'kern__include_xy': [True, False],
        'kern__gamma': 10.0 ** np.arange(-15, -1),
        'kern__center_kernel': [True],
        'svm__kernel': ['precomputed'],
        'svm__C': 10.0 ** np.arange(-2, 9),
    }
]

# Grid search.
# The `iid` argument is no longer passed: scikit-learn deprecated it in 0.22
# and removed it in 0.24, where supplying it raises a TypeError.
model = GridSearchCV(pipeline, cv_params, cv=5, verbose=1, n_jobs=-1,
                     return_train_score=True)
model.fit(X_train, y_train)

# Display results.
print("Best params:")
print(model.best_params_)

print("Best scores:")
print(model.best_score_)

Fitting 5 folds for each of 308 candidates, totalling 1540 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   18.6s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed:  8.7min
[Parallel(n_jobs=-1)]: Done 1540 out of 1540 | elapsed: 10.8min finished


Best params:
{'kern__center_kernel': True, 'kern__gamma': 1e-14, 'kern__include_xy': False, 'kern__kernel_type': 'rbf', 'svm__C': 100000000.0, 'svm__kernel': 'precomputed'}
Best scores:
0.8627059410883674


In [7]:
import pandas as pd
# Tabulate the grid-search results and rank the RBF candidates by CV score.
rdf = pd.DataFrame.from_dict(model.cv_results_)
is_rbf = rdf['param_kern__kernel_type'] == 'rbf'
ldf = rdf[is_rbf].sort_values(by='mean_test_score', ascending=False)

# Raise the row limit so the ranking is not truncated when displayed.
pd.set_option('display.max_rows', 500)
shown_columns = ['param_svm__C', 'param_kern__gamma', 'mean_test_score', 'mean_train_score']
ldf[shown_columns]


Unnamed: 0,param_svm__C,param_kern__gamma,mean_test_score,mean_train_score
147,1000000.0,1e-10,0.862706,1.0
148,10000000.0,1e-10,0.862706,1.0
146,100000.0,1e-10,0.862706,1.0
145,10000.0,1e-10,0.862706,1.0
166,1000.0,1e-09,0.862706,1.0
167,10000.0,1e-09,0.862706,1.0
168,100000.0,1e-09,0.862706,1.0
169,1000000.0,1e-09,0.862706,1.0
127,100000000.0,1e-11,0.862706,1.0
126,10000000.0,1e-11,0.862706,1.0


In [2]:
# Score the fitted grid-search model on the held-out split.
y_pred = model.predict(X_test)
acc_test = accuracy_score(y_true=y_test, y_pred=y_pred)

report = "Test accuracy: {}".format(acc_test)
print(report)

Test accuracy: 0.874251497005988


In [None]:
# Display the raw cross-validation results stored on the fitted grid search.
model.cv_results_

In [None]:
from sklearn.datasets import load_digits
from sklearn.svm import SVC
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score


# Computes the RBF kernel from the pairwise distance matrix.
def laplacianKernel(D_sq, gamma):
    """Return ``exp(-gamma * D_sq)`` evaluated elementwise.

    The name is the one the call further down in this cell already used.

    Parameters
    ----------
    D_sq : array
        Pairwise distance matrix (named as squared distances in this notebook).
    gamma : float
        Multiplier applied to the distances before exponentiation.
    """
    return np.exp(-gamma * D_sq)



# Load the whole-genome distance matrix once; both kernels below are built
# from it, and the sample count is taken from it directly.
pdistFile = path + "data/pdist/summedMats/pdistAll.npy"
D_sq = np.load(pdistFile)
numSamples = D_sq.shape[0]

# RBF kernel.
gamma = 1 / 2929616    # NOTE(review): origin of this constant is not visible here -- confirm.
K = laplacianKernel(D_sq, gamma)


# Compute linear kernel.
# NOTE: this overwrites the RBF kernel computed just above, so the SVM below
# is trained on the linear kernel.
from numpy.linalg import svd

n = numSamples
centering = np.eye(n) - np.ones((n, n)) / n
# Double-centre the halved distances, the same scaling used for K_all / K_num.
K = -centering @ (D_sq / 2) @ centering



# Randomly permute data.
np.random.seed(0)
randPerm = np.random.permutation(numSamples)
K = K[randPerm, :]
K = K[:, randPerm]
# The permuted labels get their own name.  Reassigning Y in place would
# permute it again on every re-run of this cell while K is rebuilt from
# scratch, silently misaligning labels and kernel rows.
Y_perm = Y[randPerm]

# Define classifier with precomputed kernel.
svc = SVC(kernel='precomputed')

# Training and test sets.
# Test rows are paired with training columns, as a precomputed kernel requires.
numTrain = round(0.8 * numSamples)
K_train, K_test = K[:numTrain,:numTrain], K[numTrain:,:numTrain]
Y_train, Y_test = Y_perm[:numTrain], Y_perm[numTrain:]


svc.fit(K_train, Y_train)

Y_pred = svc.predict(K_test)
print('accuracy score: %0.3f' % accuracy_score(Y_test, Y_pred))

In [None]:
from sklearn.metrics.pairwise import chi2_kernel
# Shape check: kernel between a 4-row and a 5-row input.
X1 = [[0, 1], [1, 0], [.2, .8], [.7, .3]]
X2 = [[1, 0], [0, 1], [0.5, 0.5], [0.4, 0.6], [0.3, 0.7]]
chi2_matrix = chi2_kernel(X1, X2, gamma=.5)
chi2_matrix.shape

In [None]:
import numpy as np
# Demo of the two-step row/column selection used by the kernel wrappers.
np.random.seed(0)
t = np.random.randn(3, 5)
ind1 = np.array([1, 2])
ind2 = np.array([2, 4])

# Pick the rows first, then the columns of that intermediate result.
picked_rows = t[ind1, :]
picked_rows[:, ind2]

In [None]:
# Wrapper class for linear kernel.
class linear_kernel(BaseEstimator, TransformerMixin):
    """Pipeline step serving slices of the double-centred linear kernel.

    ``fit`` and ``transform`` receive arrays of sample indices.  The kernel
    is derived from the module-level distance matrix ``D_sq_all`` (when
    ``include_xy`` is true) or ``D_sq_num`` (otherwise).

    Parameters
    ----------
    include_xy : bool
        Selects which of the two distance matrices is used.
    """

    def __init__(self, include_xy=True):
        super(linear_kernel, self).__init__()
        self.include_xy = include_xy

    def transform(self, test_ind):
        """Return the kernel block with rows ``test_ind`` and columns ``train_ind``.

        The block equals the corresponding slice of
        ``-(I - 1/n) @ (D_sq / 2) @ (I - 1/n)``, with ``n`` taken from the
        matrix itself rather than from a notebook-level variable.  The 1/2
        factor matches how ``K_all`` / ``K_num`` are built.
        """
        D_sq = D_sq_all if self.include_xy else D_sq_num

        # Double centering written with means: entry (i, j) of the centred
        # matrix is D[i, j] - mean(row i) - mean(column j) + mean(all).  Only
        # the requested block is centred, avoiding two n-by-n matrix products
        # on every call.
        row_mean = D_sq.mean(axis=1, keepdims=True)
        col_mean = D_sq.mean(axis=0, keepdims=True)
        total_mean = D_sq.mean()

        block = (D_sq[test_ind, :])[:, self.train_ind]
        centred = block - row_mean[test_ind, :] - col_mean[:, self.train_ind] + total_mean
        return -centred / 2

    def fit(self, train_ind, y_train=None, **fit_params):
        """Remember the training indices; they become the kernel columns."""
        self.train_ind = train_ind
        return self

# Wrapper class for RBF kernel.
class rbf_kernel(BaseEstimator, TransformerMixin):
    """Pipeline step serving slices of ``exp(-gamma * D_sq)``.

    ``fit`` and ``transform`` receive arrays of sample indices; the distances
    come from the module-level ``D_sq_all`` (when ``include_xy`` is true) or
    ``D_sq_num`` (otherwise).
    """

    def __init__(self, include_xy=True, gamma=1):
        super().__init__()
        self.include_xy = include_xy
        self.gamma = gamma

    def fit(self, train_ind, y_train=None, **fit_params):
        """Store the training indices, which index the kernel columns."""
        self.train_ind = train_ind
        return self

    def transform(self, test_ind):
        """Return the kernel block with rows ``test_ind`` and columns ``train_ind``."""
        if self.include_xy:
            distances = D_sq_all
        else:
            distances = D_sq_num
        # Slice rows, then columns, before exponentiating the small block.
        block = distances[test_ind, :]
        block = block[:, self.train_ind]
        return np.exp(-self.gamma * block)
