# SVM with RBF Kernel for Classification of Ethnic Origin Based on Genome

In [2]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
import numpy as np
import pandas as pd

class RBFKernel(TransformerMixin, BaseEstimator):
    def __init__(self, sigma=1):
        super(RBFKernel, self).__init__()
        self.sigma = sigma
        
    def transform(self, D_sq, **transform_params):
        return np.exp(-D_sq / (2 * self.sigma ** 2))

    def fit(self, **fit_params):
        return self


# Define the pipeline.
pipeline = Pipeline([
    ("rbf_kernel", RBFKernel()),
    ("svm", SVC(kernel="precomputed")),
])

# Suppose there is some code here to load the pairwise distance matrix and ground truth.
# Load ground truth.
PATH = "/home/ubuntu/one-k-genomes/"
sample_data_file = "{:s}data/sample_data/sample_data.tsv".format(PATH)
df = pd.read_csv(sample_data_file, sep='\t', index_col=0)

# Get list of populations.
pops = df["Population"].tolist()
unique_pops = list(set(pops))
unique_pops.sort()
num_pops = len(pops)

# Create population dictionaries and ground truth.
pop_to_num = dict(zip(unique_pops, range(num_pops)))
num_to_pop = dict(zip(range(num_pops), unique_pops))
Y = np.array([pop_to_num[pop] for pop in pops])

# Get dictionary for population descriptions.
population_file = "{:s}data/sample_data/populations.tsv".format(PATH)
df = pd.read_csv(population_file, sep='\t')
code_to_descrip = dict(zip(
    df["Population Code"].tolist(),
    df["Population Description"].tolist(),
))

# Pairwise distance.
pdist_file = "{:s}data/pdist/summed_mats/pdist_num.npy".format(PATH)
D_sq = np.load(pdist_file)
m = D_sq.shape[0]

# Create an 80-20 train-test split.
m = D_sq.shape[0]
num_train = round(0.2 * m)
D_sq_train = D_sq[:num_train, :num_train]
D_sq_test = D_sq[num_train:, :num_train]
Y_train = Y[:num_train]
Y_test = Y[num_train:]

# Train the pipeline and predict on the test set.
pipeline.set_params(rbf_kernel__sigma=1, svm__C=1).fit(D_sq_train, Y_train)
Y_pred = pipeline.predict(D_sq_test)
accuracy = accuracy_score(Y_pred, Y_test)

TypeError: fit() takes 1 positional argument but 3 were given

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
import pandas as pd
import numpy as np

## Ground Truth

In [None]:
# Load ground truth.
PATH = "/home/ubuntu/one-k-genomes/"
sample_data_file = "{:s}data/sample_data/sample_data.tsv".format(PATH)
df = pd.read_csv(sample_data_file, sep='\t', index_col=0)

# Get list of populations.
pops = df["Population"].tolist()
unique_pops = list(set(pops))
unique_pops.sort()
num_pops = len(pops)

# Create population dictionaries and ground truth.
pop_to_num = dict(zip(unique_pops, range(num_pops)))
num_to_pop = dict(zip(range(num_pops), unique_pops))
Y = np.array([pop_to_num[pop] for pop in pops])

# Get dictionary for population descriptions.
population_file = "{:s}data/sample_data/populations.tsv".format(PATH)
df = pd.read_csv(population_file, sep='\t')
code_to_descrip = dict(zip(
    df["Population Code"].tolist(),
    df["Population Description"].tolist(),
))

## Compute RBF Kernel

The RBF kernel \\(K\\) is computed from the pairwise distance matrix \\(D_{sq}\\) by the formula

\\[
K = \exp\left(-\frac{D_{sq}}{2 \sigma^2}\right),
\\]

where \\(\sigma > 0\\) is a parameter.

In [None]:
class KernelWrapper(TransformerMixin, BaseEstimator):
    def __init__(self, sigma=1):
        super(KernelWrapper, self).__init__()
        self.sigma = sigma
        
    def transform(self, D_sq, **transform_params):
        return np.exp(-D_sq / (2 * self.sigma ** 2))

    def fit(self, **fit_params):
        return self

In [None]:
kern = KernelWrapper(sigma=1)
temp = np.array([[1,2], [3,4]])
temp_array = np.array([[5, 6]])
test = kern.fit()
kern.transform(temp)

## Define pipeline.

In [None]:
pipeline = Pipeline([
    ("kern", KernelWrapper()),
    ("svm", SVC(kernel="precomputed")),
])

In [None]:
# Compute the linear kernel from the pairwise distance matrix.
pdist_file = "{:s}data/pdist/summed_mats/pdist_num.npy".format(PATH)
D_sq = np.load(pdist_file)
m = D_sq.shape[0]

cv_params = [
    dict([
        ('kern__sigma', 10.0 ** np.arange(-15, -1)),
#        ('svm__kernel', ['precomputed']),
        ('svm__C', 10.0**np.arange(-2, 9)),
    ])
]

## Train-Test Split

Use an 80-20 train-test split.

In [None]:
# Randomly permute data.
np.random.seed(0)
rand_perm = np.random.permutation(m)
D_sq = D_sq[rand_perm, :]
D_sq = D_sq[:, rand_perm]
Y = Y[rand_perm]

# Create 80-20 train-test split.
num_test = round(0.2 * m)
num_train = m - num_test
D_sq_train = D_sq[:num_train, :num_train]
D_sq_test = D_sq[num_train:, :num_train]
Y_train = Y[:num_train]
Y_test = Y[num_train:]

In [None]:
pipeline.fit(D_sq_train, Y_train)

In [None]:
classifier = GridSearchCV(pipeline, cv_params, cv=5, verbose=1, n_jobs=-1,
                     iid=True, return_train_score=True)
classifier.fit(D_sq_train, Y_train)

## Cross-Validation and Grid Search

Search for optimal value of the regularization constant \\(C\\) using five-fold cross-validation and iterative grid search.

In [None]:
#param_grid = {'C': 10.0 ** np.arange(-6,4)}
param_grid = {'C': np.linspace(1.0, 2.0, 11) * 1e-6}
classifier = GridSearchCV(SVC(kernel="precomputed"),
                          cv=5,
                          param_grid=param_grid,
                          iid=True,
                          return_train_score=True)
classifier.fit(K_train, Y_train)

## View Results of Grid Search

I chose the smallest value of \\(C\\) that maximized the mean validation accuracy (overfitting).

In [None]:
results_df = pd.DataFrame.from_dict(classifier.cv_results_)
results_df = results_df[[
    "param_C",
    "mean_train_score",
    "mean_test_score"
]]
results_df.sort_values(
    by=["mean_test_score"],
    ascending=False,
    inplace=True,
)
results_df

## Train Model

In [None]:
C = 1.1e-6    # Regularization parameter.
svc = SVC(kernel="precomputed", C=C)
svc.fit(K_train, Y_train)
Y_pred = svc.predict(K_test)
test_accuracy = accuracy_score(Y_pred, Y_test)
Y_train_pred = svc.predict(K_train)
train_accuracy = accuracy_score(Y_train_pred, Y_train)

print("Test Accuracy: {:1.4f}".format(test_accuracy))
print("Train Accuracy: {:1.4f}".format(train_accuracy))

## Error Analysis

In [None]:
incorrect_ind = ~(Y_pred == Y_test)
predicted = [code_to_descrip[num_to_pop[x]] for x in Y_pred[incorrect_ind]]
ground_truth = [code_to_descrip[num_to_pop[x]] for x in Y_test[incorrect_ind]]
df_ = pd.DataFrame({
    "Pred": predicted,
    "GT": ground_truth,
})
swap_ind = df_["Pred"] > df_["GT"]
df_.loc[swap_ind, ["Pred", "GT"]] = df_.loc[swap_ind,["GT", "Pred"]].values
df_ = pd.DataFrame(df_.groupby(["Pred", "GT"]).size()).reset_index()
df_.columns = ["Class 1", "Class 2", "Error Count"]
df_.sort_values(by="Error Count", ascending=False, inplace=True)
df_

Some class labels are not mutually exclusive:

- British in England and Scotland and Utah Residents with Northern and Western European Ancestry
- African Ancestry in the Southwest US and African Caribbean in Barbados
- African Caribbean in Bardados and Yoruba in Nigeria
- Han Chinese in Beijing and Southern Han Chinese

These four groups account for ~79% of the error on the test set.