In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import make_scorer
from sklearn.metrics import balanced_accuracy_score
import pickle

# Echo the value of every top-level expression in a cell, not only the last one.
# The inspection cells in this notebook put several bare expressions in one cell
# (e.g. `.head()` followed by `.shape`) and rely on this setting to show them all.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# Load the saved master table from a pickle file in the working directory.
# NOTE(review): unpickling can execute arbitrary code -- only load this file if
# it comes from a trusted source.
data = pd.read_pickle("4000_saved_master1.pkl")

In [3]:
# Preview the first rows, then report the (rows, columns) size of the full table.
data.head()
data.shape

Unnamed: 0,ALA,ALA_0,ALA_1,ALA_2,ALA_3,ALA_4,ALA_5,ALA_6,ALA_7,ALA_8,...,sin_psi,sin_psi_0,sin_psi_1,sin_psi_2,sin_psi_3,sin_psi_4,sin_psi_5,sin_psi_6,sin_psi_7,sin_psi_8
0,0,0,0,0,0,1,0,0,0,0,...,0.165775,0.235706,0.600179,-0.384713,-0.301993,0.581275,0.314405,0.64819,-0.564456,0.397495
1,0,0,0,0,1,0,0,0,0,0,...,0.235706,0.600179,-0.384713,-0.301993,0.581275,0.314405,0.64819,-0.564456,0.397495,0.876304
2,0,0,0,1,0,0,0,0,0,0,...,0.600179,-0.384713,-0.301993,0.581275,0.314405,0.64819,-0.564456,0.397495,0.876304,0.775095
3,0,0,1,0,0,0,0,0,0,0,...,-0.384713,-0.301993,0.581275,0.314405,0.64819,-0.564456,0.397495,0.876304,0.775095,0.564654
4,0,1,0,0,0,0,0,0,0,0,...,-0.301993,0.581275,0.314405,0.64819,-0.564456,0.397495,0.876304,0.775095,0.564654,-0.805682


(980782, 810)

In [20]:
# take only 10% of the data
# A fixed random_state makes the subsample -- and therefore the train/test split
# and every score derived from it -- identical after a kernel restart.
df_10_percent = data.sample(frac=0.1, random_state=1)

In [21]:
# Report the size of the subsample and preview its first rows.
df_10_percent.shape
df_10_percent.head()

(98078, 810)

Unnamed: 0,ALA,ALA_0,ALA_1,ALA_2,ALA_3,ALA_4,ALA_5,ALA_6,ALA_7,ALA_8,...,sin_psi,sin_psi_0,sin_psi_1,sin_psi_2,sin_psi_3,sin_psi_4,sin_psi_5,sin_psi_6,sin_psi_7,sin_psi_8
544774,0,0,0,0,1,1,0,0,0,0,...,-0.274387,0.410341,0.905624,-0.197778,0.455244,0.999724,0.955991,-0.0575084,0.841325,0.857653
323683,0,0,0,0,0,0,0,0,0,0,...,0.264434,0.41343,0.597907,0.708736,-0.309067,-0.607254,0.0371558,-0.61793,-0.0666131,-0.18682
905559,0,0,0,1,0,0,0,1,1,0,...,0.67678,0.402035,-0.286934,0.628367,0.235793,0.205248,-0.716128,-0.567317,-0.690508,-0.754169
584035,0,0,0,0,0,0,1,0,0,0,...,-0.79434,-0.675151,-0.541691,0.326468,0.958126,-0.454601,-0.505097,0.00225212,-0.519678,-0.766986
403049,0,0,0,0,0,0,0,0,0,0,...,0.654244,0.390082,0.691314,-0.252249,0.43303,0.131821,-0.561668,-0.371581,-0.726418,-0.235419


In [22]:
# collect the distinct pdb ids that occur in the sample, then count them
unique_pdb_ids = df_10_percent["pdbId"].unique()
list_pdbs = list(unique_pdb_ids)
len(list_pdbs)

3968

In [23]:
# split into train and test sets (keeping pdbid separate)
def split(list_of_ids, frac=0.70, seed=1):
    """Randomly partition a collection of IDs into a train and a test group.

    Parameters
    ----------
    list_of_ids : sequence
        IDs to partition. The sequence is copied first, so the caller's
        object is left untouched.
    frac : float, default 0.70
        Fraction of the IDs assigned to the training group; the count is
        rounded down.
    seed : int or None, default 1
        Seed for the shuffle. The same seed always yields the same
        partition; pass None for a non-deterministic shuffle.

    Returns
    -------
    (train_IDs, test_IDs) : tuple of list
        Two disjoint lists that together contain every input ID exactly once.
    """
    # Work on a copy: np.random shuffles in place and returns None, so the
    # result of the call must not be assigned and the input must not be reused.
    shuffled_ids = list(list_of_ids)
    np.random.RandomState(seed).shuffle(shuffled_ids)

    n_train = int(np.floor(len(shuffled_ids) * frac))
    train_IDs = shuffled_ids[:n_train]
    test_IDs = shuffled_ids[n_train:]

    return train_IDs, test_IDs

In [24]:
# get train and test data, partitioned by pdbId
# The result is unpacked into new names instead of being assigned back to
# `split`: rebinding the function name to its own return value makes this cell
# fail with "'tuple' object is not callable" as soon as it is run a second time.
train_ids, test_ids = split(list_pdbs)

train_data = df_10_percent[df_10_percent.pdbId.isin(train_ids)]
test_data = df_10_percent[df_10_percent.pdbId.isin(test_ids)]
train_data.shape
test_data.shape

(68770, 810)

(29308, 810)

In [25]:
# remove the columns that are not needed: every column whose name starts
# with 'pdbId' or 'chain', in both the train and the test frame
for prefix in ('pdbId', 'chain'):
    train_data = train_data.drop(columns=train_data.columns[train_data.columns.str.startswith(prefix)])
    test_data = test_data.drop(columns=test_data.columns[test_data.columns.str.startswith(prefix)])

In [26]:
# feature-only versions of the train and test frames (label column 'is_cap' removed)
train_data_nolabel = train_data.drop(columns=['is_cap'])
test_data_nolabel = test_data.drop(columns=['is_cap'])

In [27]:
# Sanity check: each label-free frame should have the same number of rows as
# its labelled counterpart and exactly one column fewer.
train_data.shape
train_data_nolabel.shape
test_data.shape
test_data_nolabel.shape

(68770, 790)

(68770, 789)

(29308, 790)

(29308, 789)

In [28]:
# cast the label column to integers in both frames (input format expected by
# the svm), then show the resulting dtypes
for labelled_frame in (train_data, test_data):
    labelled_frame['is_cap'] = labelled_frame['is_cap'].astype('int')
train_data.is_cap.dtype
test_data.is_cap.dtype

dtype('int64')

dtype('int64')

In [None]:
# use pca to reduce dimensionality and then fit svm classifier
%timeit
pca = PCA(n_components=40)
pca.fit(train_data_nolabel)

reduced_train = pca.transform(train_data_nolabel)
reduced_test = pca.transform(test_data_nolabel)

# tuned_parameters = [{'kernel': ['rbf'], 'gamma': [0.1], 'C': [1]},
#                     {'kernel': ['linear'], 'C': [1, 10]}]

# classifier = GridSearchCV(svm.SVC(), tuned_parameters, cv=10, scoring = make_scorer(balanced_accuracy_score))
# classifier.fit(reduced_train, train_data['is_cap'])

# print('Best params')
# print(classifier.best_params_)

classifier = svm.SVC()
classifier.fit(reduced_train, train_data['is_cap'])

# predict on test data
predicted_pca = classifier.predict(reduced_test)
                                
# calculate class balanced accuracy
balanced_accuracy_score(test_data['is_cap'], predicted_pca)

print(classification_report(test_data['is_cap'], predicted_pca))

PCA(copy=True, iterated_power='auto', n_components=40, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)



In [32]:
# NOTE(review): looks like a leftover check that the kernel was responsive
# again; it contributes nothing to the analysis and can be deleted.
print('hello')

hello


In [33]:
# score the fitted classifier on the PCA-reduced test set
test_labels = test_data['is_cap']
predicted_pca = classifier.predict(reduced_test)

# class balanced accuracy
balanced_accuracy_score(test_labels, predicted_pca)

print(classification_report(test_labels, predicted_pca))

0.502563735054111

              precision    recall  f1-score   support

           0       0.87      1.00      0.93     25449
           1       0.72      0.01      0.01      3859

   micro avg       0.87      0.87      0.87     29308
   macro avg       0.80      0.50      0.47     29308
weighted avg       0.85      0.87      0.81     29308



In [19]:
# fit the classifier on train data
%timeit

tuned_parameters = [{'kernel': ['rbf'], 'gamma': [0.1, 1, 10, 100], 'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

# classifier = GridSearchCV(svm.SVC(), tuned_parameters, cv=10, scoring = make_scorer(balanced_accuracy_score))
# classifier.fit(train_data_nolabel, train_data['is_cap'])

# print('Best params')
# print(classifier.best_params_)

classifier = svm.SVC(gamma=100)
classifier.fit(train_data_nolabel, train_data['is_cap'])

# predict on test data
predicted = classifier.predict(test_data_nolabel)
                                
# calculate class balanced accuracy
balanced_accuracy_score(test_data['is_cap'], predicted)

print(classification_report(test_data['is_cap'], predicted))

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=100, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

0.5

              precision    recall  f1-score   support

           0       0.85      1.00      0.92      2494
           1       0.00      0.00      0.00       431

   micro avg       0.85      0.85      0.85      2925
   macro avg       0.43      0.50      0.46      2925
weighted avg       0.73      0.85      0.78      2925



  'precision', 'predicted', average, warn_for)
