In [1]:
import json
import pandas as pd
import numpy as np
import utils
from sklearn import linear_model, svm
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
np.random.seed(42)

In [3]:
def evaluate(model, x, y):
    y_pred = model.predict(x)
    print(f'rmse: {np.sqrt(mean_squared_error(y, y_pred))}')
    print(f'r2: {r2_score(y, y_pred)}')

In [4]:
x_train, y_train = utils.load_citation_h_index_split('train')
x_val, y_val = utils.load_citation_h_index_split('val')
test_pids, x_test, y_test = utils.load_citation_h_index_split('test', include_pids=True)

In [5]:
model = svm.LinearSVR(C=100, fit_intercept=True, max_iter=100000)
model.fit(x_train, y_train)
evaluate(model, x_train, y_train)
print('Val')
evaluate(model, x_val, y_val)

rmse: 103.97348824077486
r2: 0.0021804835419348967
Val
rmse: 89.31744088215052
r2: 0.006659548852593011




In [6]:
x_train, y_train = utils.join_splits([x_train, x_val], [y_train, y_val])

In [7]:
model = svm.LinearSVR(C=1, fit_intercept=True, max_iter=100000)
model.fit(x_train, y_train)
evaluate(model, x_train, y_train)
print('Test')
evaluate(model, x_test, y_test)
h_index_decisions = model.predict(x_test)
h_index_pids = test_pids.copy()

rmse: 102.40093225182942
r2: -0.005247423333121448
Test
rmse: 101.42165983316357
r2: -0.0049143176278740874


In [8]:
x_train, y_train = utils.load_citation_embeddings_split('train')
x_val, y_val = utils.load_citation_embeddings_split('val')
test_ids, x_test, y_test = utils.load_citation_embeddings_split('test', include_pids=True)

In [9]:
model = svm.LinearSVR(C=10, fit_intercept=True, max_iter=1000)
model.fit(x_train, y_train)
evaluate(model, x_train, y_train)
print('Val')
evaluate(model, x_val, y_val)

rmse: 100.37012173272312
r2: 0.07014406616708968
Val
rmse: 90.36449293520099
r2: -0.016766457926515654




In [10]:
x_train, y_train = utils.join_splits([x_train, x_val], [y_train, y_val])

In [11]:
model = svm.LinearSVR(C=1, fit_intercept=True, max_iter=100000)
model.fit(x_train, y_train)
evaluate(model, x_train, y_train)
print('Test')
evaluate(model, x_test, y_test)
embeddings_decisions = model.predict(x_test)
embeddings_pids = test_pids.copy()

rmse: 99.02239360508361
r2: 0.059991051813353735
Test
rmse: 99.05308596024963
r2: 0.041474597089804055


In [12]:
top_args = np.argsort(embeddings_decisions)[-10:]
embeddings_picks = set(np.array(embeddings_pids)[top_args])

In [13]:
top_args = np.argsort(h_index_decisions)[-10:]
h_index_picks = set(np.array(h_index_pids)[top_args])

In [14]:
embeddings_picks

{'0757aaa5dfe2a56aa1d6149631f3ca0964128e1f',
 '1499a526dcb939af44f545ed8de8d8fe4757d821',
 '14be13d44319e8589f46ee484e5039e3a592e6b4',
 '3ab7321faa3649c2837e5849efa364266a291b47',
 '617df8d7573ed3740b445f7267f5d908d11c86b6',
 '8bcbdf9382cd8c38459492c80869eb62fbcacdab',
 '9ff52479dbbd804b495c97f7822c2d4e61083eb2',
 'cc0e4307e90401906ef14c27ef83512e927a9471',
 'f8d173b07d377281b9ed920e0f98f56a2cd07136',
 'fff238844076ad5643dc2ff53153581bd89441ea'}

In [15]:
h_index_picks

{'38673093d908426ad70b17b9e230daf1d106aee8',
 '468da37ce0b0e77ae28628bd40287021446838e7',
 '475d92d6669a2d099293373731f353bf778465d3',
 '5a0dbfc0185158ab07c062f6f8ca1cd6b8f86d29',
 '8ec550ce7e4d45fee4975cb5a1330d536b21fd6e',
 'bcbd7d07d0a233240a11a8157cc06e1b01718409',
 'c6368fbf529af9cce5c49cadc65ea92f6595c066',
 'd3305c6ea9f1121a322fac542375ac6ff96cfcde',
 'd5f2fe667c267f7cb1ed42fca638473b9d838c52',
 'e79a6b0de6ea568825304f2346751a6947b580b6'}

In [16]:
print(embeddings_picks.intersection(h_index_picks))

set()
