In [1]:
import json
import pandas as pd
import numpy as np
import utils
from sklearn import linear_model, svm
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score

In [2]:
# Seed NumPy's global random number generator so NumPy-driven randomness in
# the cells below is repeatable across fresh kernel runs.
np.random.seed(42)

In [3]:
def evaluate(model, x, y):
    """Print the F1 score of ``model``'s predictions on ``x`` against ``y``.

    Args:
        model: Fitted estimator exposing a ``predict`` method.
        x: Features handed to ``model.predict``.
        y: Ground-truth labels the predictions are scored against.

    Returns:
        None. The score is written to stdout as ``f1: <value>``.
    """
    score = f1_score(y, model.predict(x))
    print(f'f1: {score}')

In [4]:
# Load the train / val / test splits through the project's `utils` loader
# (presumably h-index based features, judging by the loader name — confirm
# against utils). The test call passes include_pids=True so it also returns
# the identifiers of the test rows, which are used later to map decision
# scores back to items.
x_train, y_train = utils.load_top_class_h_index_split('train')
x_val, y_val = utils.load_top_class_h_index_split('val')
test_pids, x_test, y_test = utils.load_top_class_h_index_split('test', include_pids=True)

In [5]:
# Fit the scaler on the training split only; later cells reuse it to transform
# the train, val and test features. StandardScaler.fit ignores `y` (it is
# accepted only for API consistency).
# NOTE: the name `preprocressor` is misspelled but is referenced by later
# cells, so it is left unchanged here.
preprocressor = StandardScaler().fit(x_train, y=y_train)

In [6]:
# Fit a class-balanced linear SVM on the standardized training features, then
# report F1 on the training split followed by the validation split.
model = svm.LinearSVC(
    C=0.01,
    class_weight='balanced',
    max_iter=1_000_000,
).fit(preprocressor.transform(x_train), y_train)
evaluate(model, preprocressor.transform(x_train), y_train)  # training score
print('Val')
evaluate(model, preprocressor.transform(x_val), y_val)

f1: 0.26168224299065423

f1: 0.29055690072639223


In [7]:
# Combine the train and validation splits into one training set for the final
# fit that is scored on the test split.
# NOTE(review): this rebinds x_train / y_train, so the cell is not idempotent —
# re-running it presumably joins the validation split in a second time
# (depends on utils.join_splits). Use Restart & Run All rather than re-running.
x_train, y_train = utils.join_splits([x_train, x_val], [y_train, y_val])

In [8]:
# Refit the same class-balanced linear SVM on the joined train+val data
# (scaled with the scaler fitted earlier), then report F1 on that training
# data followed by the held-out test split.
model = svm.LinearSVC(
    C=0.01,
    class_weight='balanced',
    max_iter=1_000_000,
).fit(preprocressor.transform(x_train), y_train)
evaluate(model, preprocressor.transform(x_train), y_train)  # training score
print('Test')
evaluate(model, preprocressor.transform(x_test), y_test)

# Keep the signed decision scores and the matching test ids so the
# highest-scoring items can be compared with the other model later.
h_index_pids = test_pids.copy()
h_index_decisions = model.decision_function(preprocressor.transform(x_test))

f1: 0.2609230769230769

f1: 0.2831050228310502


In [9]:
# Switch to the second feature set: load the train / val / test splits through
# the embeddings loader. This rebinds x_train, y_train, x_val, y_val, x_test
# and y_test, replacing the features loaded for the previous model.
# The test identifiers returned by include_pids=True are bound to `test_ids`
# here (the earlier h-index load cell bound them to `test_pids`).
x_train, y_train = utils.load_top_class_embeddings_split('train')
x_val, y_val = utils.load_top_class_embeddings_split('val')
test_ids, x_test, y_test = utils.load_top_class_embeddings_split('test', include_pids=True)

In [10]:
# Fit a class-balanced linear SVM directly on the embedding features (no
# scaling is applied in this cell), then report F1 on train and validation.
model = svm.LinearSVC(
    C=0.001,
    class_weight='balanced',
    max_iter=1_000_000,
).fit(x_train, y_train)
evaluate(model, x_train, y_train)  # training score
print('Val')
evaluate(model, x_val, y_val)

f1: 0.5670202507232401

f1: 0.28037383177570097


In [11]:
# Combine the embeddings train and validation splits into one training set
# for the final fit that is scored on the test split.
# NOTE(review): this rebinds x_train / y_train, so the cell is not idempotent —
# re-running it presumably joins the validation split in a second time
# (depends on utils.join_splits). Use Restart & Run All rather than re-running.
x_train, y_train = utils.join_splits([x_train, x_val], [y_train, y_val])

In [14]:
# Refit the class-balanced linear SVM on the joined train+val embeddings, then
# report F1 on that training data followed by the held-out test split.
model = svm.LinearSVC(C=1e-3, class_weight='balanced', max_iter=1000000)
model.fit(x_train, y_train)
evaluate(model, x_train, y_train)
print(f'Test')
evaluate(model, x_test, y_test)

# Keep the signed decision scores and the ids that belong to THESE test rows.
# The embeddings load cell binds its test identifiers to `test_ids`; the
# previous code copied `test_pids`, which is the stale id list left over from
# the h-index load cell, so scores could be paired with the wrong items
# whenever the two loaders return rows in a different order or number.
embeddings_decisions = model.decision_function(x_test)
embeddings_pids = test_ids.copy()

f1: 0.539454806312769

f1: 0.2619047619047619


In [26]:
# argsort is ascending, so the last 10 positions are the 10 largest
# decision scores of the embeddings model.
top_args = np.argsort(embeddings_decisions)[-10:]
# Map those positions back to ids; a set is used, so ranking order is dropped.
embeddings_picks = {pid for pid in np.array(embeddings_pids)[top_args]}

In [27]:
# argsort is ascending, so the last 10 positions are the 10 largest
# decision scores of the h-index model. Note that `top_args` is rebound here.
top_args = np.argsort(h_index_decisions)[-10:]
# Map those positions back to ids; a set is used, so ranking order is dropped.
h_index_picks = {pid for pid in np.array(h_index_pids)[top_args]}

In [30]:
# Display the ids whose embeddings-model decision scores were in the top 10.
embeddings_picks

{'0f407906d8a66a1df2a6b6eb721c785f12c27269',
 '1cb73db86c39d52a788b541ddb02e7ee743c882e',
 '491671b77ee9dee2eaa6691f85fae7d3834fbdbe',
 '49970d0410793aa284fc0adca1e5d42881f99e6e',
 '5c72c3c141ef77f44b2270a970a3d55b58af5ca7',
 '728d1de052a40aa41ef8813bf128fb2a6db22597',
 'ac3c4d1f128bef4180ed2cbb2cb02f5c45872424',
 'd8abac3f198f8a66ec4b466074b89c1eb2c042b6',
 'e0f0810e1938693ef2fa37a0e80d9dc6e8f5c4e6',
 'fd225c39db7e20768277da6b44ade77cf8405036'}

In [31]:
# Display the ids whose h-index-model decision scores were in the top 10.
h_index_picks

{'38673093d908426ad70b17b9e230daf1d106aee8',
 '468da37ce0b0e77ae28628bd40287021446838e7',
 '475d92d6669a2d099293373731f353bf778465d3',
 '5a0dbfc0185158ab07c062f6f8ca1cd6b8f86d29',
 '5b5846bc384ecdaed7d6df43e2f744b98a446f44',
 '8ec550ce7e4d45fee4975cb5a1330d536b21fd6e',
 'bcbd7d07d0a233240a11a8157cc06e1b01718409',
 'd3305c6ea9f1121a322fac542375ac6ff96cfcde',
 'd5f2fe667c267f7cb1ed42fca638473b9d838c52',
 'e79a6b0de6ea568825304f2346751a6947b580b6'}

In [32]:
# Ids selected by both models; an empty set means the two top-10 lists do not overlap.
print(embeddings_picks & h_index_picks)

set()
