In [None]:
from gensim.models import KeyedVectors
import numpy as np
import os 
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
# NOTE(review): Python reads PYTHONHASHSEED once, at interpreter start-up.
# Assigning it here therefore does not change str/bytes hashing in the
# already-running kernel; only processes spawned after this line inherit it.
# For a reproducible hash seed, export the variable before launching Jupyter.
os.environ['PYTHONHASHSEED']='123'
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score, make_scorer, matthews_corrcoef
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.dummy import DummyClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.manifold import TSNE

In [None]:
def load_genage_embeddings(embedding_file, genage_data):
    """Join an embedding file with GenAge annotations and build ``[X, y]``.

    Parameters
    ----------
    embedding_file : str or file-like
        Space-separated text file. The first line is skipped (presumably the
        word2vec-style "count dimension" header -- confirm against the files);
        every other line is an identifier followed by the embedding values.
    genage_data : pandas.DataFrame
        Annotation table with a 'STRING ID' column. Its last column is used
        as the class label.

    Returns
    -------
    list
        ``[X, y]`` where ``X`` is a float array of shape
        (n_matched_rows, n_embedding_values) and ``y`` holds the
        integer-encoded labels from ``LabelEncoder``. Only identifiers present
        in both inputs are kept (inner join).

    Raises
    ------
    ValueError
        If an embedding column contains a value that cannot be read as float.
    """
    embeddings = pd.read_csv(embedding_file, sep=' ', header=None, skiprows=1)
    embeddings = embeddings.rename(columns={0: 'STRING ID'})
    n_features = embeddings.shape[1] - 1

    merged = pd.merge(embeddings, genage_data, on=['STRING ID'])

    # The merge keeps the left frame's columns first, so the embedding values
    # always sit at positions 1..n_features. Slicing by that count (instead of
    # "everything but the last column") keeps extra annotation columns of
    # genage_data out of the feature matrix.
    # Converting explicitly avoids the object-dtype array that `.values`
    # produces when the string ID column is part of the frame.
    X = merged.iloc[:, 1:1 + n_features].to_numpy(dtype=float)

    # Labels stay object-typed, exactly as the encoder received them before.
    labels = merged.iloc[:, -1].to_numpy(dtype=object)
    y = LabelEncoder().fit_transform(labels)
    return [X, y]

In [None]:
# Read the tab-separated GenAge table and keep only the identifier and the
# longevity annotation. Column order matters: load_genage_embeddings takes the
# class label from the last column of the merged frame.
genage_data = pd.read_csv('genage_full.txt', sep='\t').loc[
    :, ['STRING ID', 'Longevity Influence']
]

In [None]:
def svc_crossvalidation(model_df):
    """Grid-search an SVC and report the best candidate for each metric.

    A single 5-fold ``GridSearchCV`` scores every candidate with F1, accuracy,
    ROC AUC and Matthews correlation at once. The previous version ran one
    complete search (plus an unused refit) per metric over the same grid and
    the same folds, i.e. roughly four times the number of SVC fits for the
    same numbers.

    Parameters
    ----------
    model_df : sequence
        ``[X, y]`` as returned by ``load_genage_embeddings``: feature matrix
        at index 0, encoded labels at index 1.
        NOTE(review): the 'f1' and 'roc_auc' scorers are used with their
        defaults, which presumably requires binary labels -- confirm.

    Returns
    -------
    list
        Twelve entries, ordered F1, accuracy, ROC AUC, MCC within each group:
        ``[best params x4, best mean test score x4, std of that score x4]``.
    """
    X = model_df[0]
    y = model_df[1]

    # The order of the candidate values is kept as-is: ties between equally
    # ranked candidates are resolved in favour of the first one in the grid.
    c_values = [1, 10, 0.1, 100, 0.01, 1000, 0.001]
    param_grid = [
        {'C': c_values, 'kernel': ['linear']},
        {'C': c_values,
         'gamma': [1, 0.5, 3, 0.2, 10, 0.1, 0.03, 0.01, 0.001],
         'kernel': ['rbf']},
    ]

    metric_order = ['f1', 'accuracy', 'roc_auc', 'mcc']
    scoring = {
        'f1': 'f1',
        'accuracy': 'accuracy',
        'roc_auc': 'roc_auc',
        'mcc': make_scorer(matthews_corrcoef),
    }

    # refit=False: with several metrics there is no single "best" estimator,
    # and the fitted estimator was never used by callers anyway.
    search = GridSearchCV(estimator=SVC(), param_grid=param_grid,
                          scoring=scoring, cv=5, refit=False)
    search.fit(X, y)
    results = search.cv_results_

    best_params, best_scores, best_stds = [], [], []
    for metric in metric_order:
        # Same selection rule GridSearchCV applies for best_index_:
        # first candidate holding the minimum (best) rank.
        best_index = int(np.argmin(results['rank_test_' + metric]))
        best_params.append(results['params'][best_index])
        best_scores.append(results['mean_test_' + metric][best_index])
        best_stds.append(results['std_test_' + metric][best_index])

    return best_params + best_scores + best_stds

In [None]:
# "Noscore <N>" embedding files (N presumably the embedding dimension --
# confirm), each joined with the GenAge labels into an [X, y] pair.
noscore_32, noscore_64, noscore_128, noscore_256, noscore_512 = [
    load_genage_embeddings(embedding_path, genage_data)
    for embedding_path in (
        'Noscore 32.txt',
        'Noscore 64.txt',
        'Noscore 128.txt',
        'Noscore 256.txt',
        'Noscore 512.txt',
    )
]

# "model_p<P>_q<Q>_NoScore" embedding files (P/Q look like walk parameters --
# confirm against how the files were generated).
noscore_p25_q200, noscore_p50_q100, noscore_p200_q25 = [
    load_genage_embeddings(embedding_path, genage_data)
    for embedding_path in (
        'model_p25_q200_NoScore.txt',
        'model_p50_q100_NoScore.txt',
        'model_p200_q25_NoScore.txt',
    )
]

In [None]:
# One grid search per embedding. Each result is bound as soon as its search
# finishes, so completed runs stay available if a later one is interrupted.

# "Noscore <N>" embeddings
svc_32 = svc_crossvalidation(model_df=noscore_32)
svc_64 = svc_crossvalidation(model_df=noscore_64)
svc_128 = svc_crossvalidation(model_df=noscore_128)
svc_256 = svc_crossvalidation(model_df=noscore_256)
svc_512 = svc_crossvalidation(model_df=noscore_512)

# "model_p<P>_q<Q>_NoScore" embeddings
svc_p25 = svc_crossvalidation(model_df=noscore_p25_q200)
svc_p50 = svc_crossvalidation(model_df=noscore_p50_q100)
svc_p200 = svc_crossvalidation(model_df=noscore_p200_q25)

In [None]:
# Entries 4..7 of every svc_crossvalidation result are the best mean test
# scores, in the order F1, accuracy, ROC AUC, MCC. Rows follow the order of
# the tuple below (row 0 = svc_32, ..., row 7 = svc_p200).
svc_metrics = pd.DataFrame(
    [
        result[4:8]
        for result in (
            svc_32, svc_64, svc_128, svc_256, svc_512,
            svc_p25, svc_p50, svc_p200,
        )
    ],
    columns=['F1', 'Accuracy', 'AUROC', 'MCC'],
)
svc_metrics.head(20)

Unnamed: 0,F1,Accuracy,AUROC,MCC
0,0.513511,0.646886,0.674411,0.237417
1,0.502155,0.649384,0.677112,0.230828
2,0.539681,0.644289,0.695431,0.237544
3,0.490725,0.644156,0.700844,0.221248
4,0.475806,0.62361,0.679102,0.178701
5,0.527256,0.667699,0.691788,0.275226
6,0.488565,0.636563,0.677091,0.200769
7,0.441793,0.628671,0.646898,0.148082
