# SVM with Linear Kernel for Classification of Ethnic Origin Based on Genome

In [1]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, train_test_split
import pandas as pd
import numpy as np

## Ground Truth

In [2]:
# Load ground truth: the sample table is read with its first column as the
# index, and its "Population" column gives each sample's population code.
PATH = "/home/ubuntu/one-k-genomes/"
sample_data_file = "{:s}data/sample_data/sample_data.tsv".format(PATH)
df = pd.read_csv(sample_data_file, sep='\t', index_col=0)

# Per-sample population codes, and the sorted list of distinct codes.
pops = df["Population"].tolist()
unique_pops = sorted(set(pops))
# Number of distinct populations (classes). This used to be len(pops), i.e.
# the number of samples; the dictionaries below were only correct because
# zip() stops at the shorter of its inputs.
num_pops = len(unique_pops)

# Map population code <-> integer class label, then encode every sample's
# population as its integer label to form the ground-truth vector Y.
pop_to_num = {pop: num for num, pop in enumerate(unique_pops)}
num_to_pop = dict(enumerate(unique_pops))
Y = np.array([pop_to_num[pop] for pop in pops])

# Get dictionary mapping population code -> human-readable description.
# NOTE: `df` is rebound here from the sample table to the populations table.
population_file = "{:s}data/sample_data/populations.tsv".format(PATH)
df = pd.read_csv(population_file, sep='\t')
code_to_descrip = dict(zip(
    df["Population Code"].tolist(),
    df["Population Description"].tolist(),
))

## Compute Linear Kernel

The linear kernel \\(K\\) can be computed from the pairwise distance matrix \\(D_{sq}\\) by the formula

\\[
K = -\left(I_m - \frac{\mathbf 1_{m \times m}}{m}\right) \frac{D_{sq}}{2} \left(I_m - \frac{\mathbf 1_{m \times m}}{m}\right).
\\]

In [3]:
# Build the linear kernel K by double-centering the pairwise distance matrix:
# K = -A (D_sq / 2) A, where A = I - (1/m) * ones(m, m).
pdist_file = PATH + "data/pdist/summed_mats/pdist_num.npy"
D_sq = np.load(pdist_file)
m = len(D_sq)  # number of rows of the distance matrix

# Centering matrix: identity minus the constant matrix with entries 1/m.
A = np.identity(m) - np.full((m, m), 1.0 / m)

# Same evaluation order as `- A @ (D_sq / 2) @ A`: negate A first, then
# multiply left to right.
half_D_sq = D_sq / 2
K = (-A) @ half_D_sq @ A

## Train-Test Split

Use an 80-20 train-test split.

In [4]:
# Shuffle the samples with a fixed seed so the split is reproducible.
# NOTE: this cell rebinds K and Y to their permuted versions, so re-running it
# without first re-running the cells that define K and Y permutes them again.
np.random.seed(0)
rand_perm = np.random.permutation(m)
# Apply the same permutation to the rows and columns of the kernel matrix and
# to the labels, so K[i, j] and Y[i] keep referring to the same samples.
K = K[np.ix_(rand_perm, rand_perm)]
Y = Y[rand_perm]

# 80-20 split: the first num_train shuffled samples train, the rest test.
num_test = round(0.2 * m)
num_train = m - num_test
# A precomputed-kernel SVM is fit on the train-vs-train block and predicts
# from the test-vs-train block (rows: test samples, columns: train samples).
K_train, K_test = K[:num_train, :num_train], K[num_train:, :num_train]
Y_train, Y_test = Y[:num_train], Y[num_train:]

## Cross-Validation and Grid Search

Search for the optimal value of the regularization constant \\(C\\) using five-fold cross-validation and an iterative grid search.

In [17]:
# Grid of candidate C values: 11 evenly spaced points from 7.0e-7 to 8.0e-7.
# The commented-out grid below is presumably the coarse first pass of the
# iterative search (powers of ten from 1e-6 to 1e3) -- TODO confirm.
#param_grid = {'C': 10.0 ** np.arange(-6,4)}
param_grid = {'C': np.linspace(7.0, 8.0, 11) * 1e-7}

# Five-fold cross-validated grid search over C for an SVM that takes a
# precomputed kernel matrix. The `iid` argument is intentionally not passed:
# it was deprecated in scikit-learn 0.22 and removed in 0.24, where passing it
# raises a TypeError.
classifier = GridSearchCV(SVC(kernel="precomputed"),
                          cv=5,
                          param_grid=param_grid,
                          return_train_score=True)
classifier.fit(K_train, Y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='precomputed', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=None,
       param_grid={'C': array([7.0e-07, 7.1e-07, 7.2e-07, 7.3e-07, 7.4e-07, 7.5e-07, 7.6e-07,
       7.7e-07, 7.8e-07, 7.9e-07, 8.0e-07])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

## View Results of Grid Search

I chose the smallest value of \\(C\\) that maximized the mean validation accuracy (a smaller \\(C\\) regularizes more strongly, which limits overfitting).

In [18]:
# Tabulate the grid-search results: one row per candidate C with its mean
# train and validation ("test") scores, best validation score first.
score_columns = ["param_C", "mean_train_score", "mean_test_score"]
results_df = (
    pd.DataFrame(classifier.cv_results_)[score_columns]
    .sort_values(by="mean_test_score", ascending=False)
)
results_df

Unnamed: 0,param_C,mean_train_score,mean_test_score
9,7.9e-07,1.0,0.884673
10,8e-07,1.0,0.884174
8,7.8e-07,1.0,0.883674
7,7.7e-07,0.999874,0.883175
5,7.5e-07,0.999874,0.882676
6,7.6e-07,0.999874,0.882177
4,7.4e-07,0.999874,0.87968
3,7.3e-07,0.999874,0.878682
1,7.1e-07,0.999874,0.877184
2,7.2e-07,0.999874,0.876685


## Train Model

In [19]:
# Fit the final SVM on the full training kernel with the chosen C, then
# measure accuracy on both the held-out test set and the training set.
C = 7.9e-7    # Regularization parameter.
svc = SVC(kernel="precomputed", C=C)
svc.fit(K_train, Y_train)

# K_test holds kernel values between test samples (rows) and training
# samples (columns), as a precomputed-kernel SVC expects for prediction.
Y_pred = svc.predict(K_test)
# accuracy_score's documented signature is (y_true, y_pred); the value is the
# same either way, but the documented order keeps the call unambiguous.
test_accuracy = accuracy_score(Y_test, Y_pred)
Y_train_pred = svc.predict(K_train)
train_accuracy = accuracy_score(Y_train, Y_train_pred)

print("Test Accuracy: {:1.4f}".format(test_accuracy))
print("Train Accuracy: {:1.4f}".format(train_accuracy))

Test Accuracy: 0.9301
Train Accuracy: 1.0000


## Error Analysis

In [20]:
# Count test-set misclassifications per unordered pair of populations.
incorrect_ind = Y_pred != Y_test

# Translate integer labels of the misclassified samples into descriptions.
predicted = [code_to_descrip[num_to_pop[x]] for x in Y_pred[incorrect_ind]]
ground_truth = [code_to_descrip[num_to_pop[x]] for x in Y_test[incorrect_ind]]
df_ = pd.DataFrame({"Pred": predicted, "GT": ground_truth})

# Order each pair alphabetically so that (A, B) and (B, A) are counted as the
# same confusion: wherever Pred sorts after GT, the two values trade places.
swap_ind = df_["Pred"] > df_["GT"]
first_label = df_["Pred"].where(~swap_ind, df_["GT"])
second_label = df_["GT"].where(~swap_ind, df_["Pred"])

# Tally each pair and list the most frequent confusions first.
df_ = (
    pd.DataFrame({"Pred": first_label, "GT": second_label})
    .groupby(["Pred", "GT"])
    .size()
    .reset_index(name="Error Count")
    .rename(columns={"Pred": "Class 1", "GT": "Class 2"})
    .sort_values(by="Error Count", ascending=False)
)
df_

Unnamed: 0,Class 1,Class 2,Error Count
0,African Ancestry in Southwest US,African Caribbean in Barbados,10
10,Indian Telugu in the UK,Sri Lankan Tamil in the UK,7
3,"Colombian in Medellin, Colombia","Mexican Ancestry in Los Angeles, California",4
2,British in England and Scotland,Utah residents with Northern and Western Europ...,3
7,"Han Chinese in Bejing, China","Southern Han Chinese, China",3
1,African Caribbean in Barbados,"Yoruba in Ibadan, Nigeria",2
4,Esan in Nigeria,"Yoruba in Ibadan, Nigeria",1
5,"Gujarati Indian in Houston,TX","Punjabi in Lahore,Pakistan",1
6,"Han Chinese in Bejing, China","Japanese in Tokyo, Japan",1
8,Iberian populations in Spain,Puerto Rican in Puerto Rico,1


Some class labels are not mutually exclusive:

- African Ancestry in the Southwest US and African Caribbean in Barbados
- British in England and Scotland and Utah Residents with Northern and Western European Ancestry
- Han Chinese in Beijing and Southern Han Chinese
- African Caribbean in Barbados and Yoruba in Nigeria

These four groups account for ~51% of the error on the test set.