In [1]:
import json
import pandas as pd
import numpy as np
import utils
from sklearn import linear_model, svm
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, precision_score, recall_score

In [2]:
# Seed NumPy's legacy global RNG so that the np.random.choice baselines at the
# end of this notebook draw the same samples on every Restart & Run All.
np.random.seed(42)

In [3]:
def evaluate(model, x, y):
    """Print F1, precision and recall of ``model``'s predictions on ``x``.

    Args:
        model: Fitted estimator exposing ``predict``.
        x: Inputs handed to ``model.predict``.
        y: Ground-truth labels the predictions are scored against.

    Returns:
        None. Each score is written to stdout on its own line as
        ``<name>: <value>``.
    """
    predictions = model.predict(x)
    scorers = (
        ('f1', f1_score),
        ('precision', precision_score),
        ('recall', recall_score),
    )
    for label, scorer in scorers:
        print(f'{label}: {scorer(y, predictions)}')

In [4]:
# Load the h-index splits. Train and val each unpack into an (x, y) pair
# (presumably features and labels -- confirm against utils); with
# include_pids=True the test split additionally yields its ids first.
(x_train, y_train), (x_val, y_val) = (
    utils.load_top_class_h_index_split(split_name) for split_name in ('train', 'val')
)
test_pids, x_test, y_test = utils.load_top_class_h_index_split('test', include_pids=True)

In [5]:
# Fit the scaler on the training split only; the fitted object is then reused
# to transform the train, val and test inputs in the following cells.
# NOTE(review): the name is misspelled ("preprocressor"); it is kept because
# later cells reference it under this exact spelling.
preprocressor = StandardScaler().fit(x_train, y=y_train)

In [6]:
# Linear SVM on the scaled h-index inputs: scores on the training split are
# printed first (unlabelled), then the validation scores under a 'Val' header.
x_train_scaled = preprocressor.transform(x_train)
x_val_scaled = preprocressor.transform(x_val)

model = svm.LinearSVC(C=1e-2, class_weight='balanced', max_iter=1_000_000).fit(x_train_scaled, y_train)
evaluate(model, x_train_scaled, y_train)
print('Val')
evaluate(model, x_val_scaled, y_val)

f1: 0.26168224299065423
precision: 0.17160367722165476
recall: 0.5508196721311476
Val
f1: 0.29055690072639223
precision: 0.18404907975460122
recall: 0.6896551724137931


In [7]:
# Fold the validation split into the training data before the final fit.
# NOTE(review): x_train/y_train are rebound from their own previous values, so
# this cell is not idempotent -- re-running it without re-running the loading
# cell would presumably join the validation split a second time (confirm
# against utils.join_splits).
x_train, y_train = utils.join_splits([x_train, x_val], [y_train, y_val])

In [8]:
# Refit the h-index SVM on the joined train+val data and report scores on the
# training data (unlabelled) and on the test split (under 'Test').
# NOTE(review): the scaler is still the one fitted on the original training
# split only -- confirm that this is intended.
model = svm.LinearSVC(C=1e-2, class_weight='balanced', max_iter=1000000)
model.fit(preprocressor.transform(x_train), y_train)
evaluate(model, preprocressor.transform(x_train), y_train)
print('Test')
evaluate(model, preprocressor.transform(x_test), y_test)

# Keep the test decision scores, ids and labels under h_index_* names: later
# cells rebind x_test / y_test to the embeddings split.
h_index_decisions = model.decision_function(preprocressor.transform(x_test))
h_index_pids = test_pids.copy()
h_index_test = list(y_test)

# Persist {pid: [decision score, true label]}. The context manager guarantees
# the file is flushed and closed as soon as the dump finishes.
with open('predictions/top_class_abf.json', 'w') as predictions_file:
    json.dump(
        {
            pid: (decision, label)
            for pid, decision, label in zip(h_index_pids, h_index_decisions, h_index_test)
        },
        predictions_file,
    )

f1: 0.2609230769230769
precision: 0.17193836171938362
recall: 0.5408163265306123
Test
f1: 0.2831050228310502
precision: 0.17714285714285713
recall: 0.7045454545454546


In [9]:
# Load the embeddings splits, rebinding x_*/y_* from the h-index data to the
# embeddings data. Train and val each unpack into an (x, y) pair; with
# include_pids=True the test split additionally yields its ids first. Those ids
# are bound to test_ids, a different name from the h-index split's test_pids.
(x_train, y_train), (x_val, y_val) = (
    utils.load_top_class_embeddings_split(split_name) for split_name in ('train', 'val')
)
test_ids, x_test, y_test = utils.load_top_class_embeddings_split('test', include_pids=True)

In [10]:
# Linear SVM on the embedding inputs (no scaler is applied here): scores on the
# training split are printed first (unlabelled), then the validation scores
# under a 'Val' header.
model = svm.LinearSVC(C=1e-3, class_weight='balanced', max_iter=1_000_000).fit(x_train, y_train)
evaluate(model, x_train, y_train)
print('Val')
evaluate(model, x_val, y_val)

f1: 0.5670202507232401
precision: 0.4016393442622951
recall: 0.9639344262295082
Val
f1: 0.28037383177570097
precision: 0.19230769230769232
recall: 0.5172413793103449


In [11]:
# NOTE(review): x_train/y_train are rebound from their own previous values, so
# this cell is not idempotent -- re-running it without re-running the loading
# cell would presumably join the validation split a second time (confirm
# against utils.join_splits).
x_train, y_train = utils.join_splits([x_train, x_val], [y_train, y_val])  # add validation set to training data

In [12]:
# Refit the embeddings SVM on the joined train+val data and report scores on
# the training data (unlabelled) and on the test split (under 'Test').
model = svm.LinearSVC(C=1e-3, class_weight='balanced', max_iter=1000000)
model.fit(x_train, y_train)
evaluate(model, x_train, y_train)
print('Test')
evaluate(model, x_test, y_test)

# Keep the test decision scores, ids and labels under embeddings_* names for
# the comparison cells that follow.
embeddings_decisions = model.decision_function(x_test)
# Pair the scores with the ids returned by the embeddings test split
# (test_ids), not with test_pids from the h-index split: every score must be
# keyed by the item it was computed for, even if the two loaders order or
# filter the test split differently.
embeddings_pids = test_ids.copy()
embeddings_test = list(y_test)

# Persist {pid: [decision score, true label]}. The context manager guarantees
# the file is flushed and closed as soon as the dump finishes.
with open('predictions/top_class_embs.json', 'w') as predictions_file:
    json.dump(
        {
            pid: (decision, label)
            for pid, decision, label in zip(embeddings_pids, embeddings_decisions, embeddings_test)
        },
        predictions_file,
    )

f1: 0.539454806312769
precision: 0.37524950099800397
recall: 0.9591836734693877
Test
f1: 0.2619047619047619
precision: 0.1774193548387097
recall: 0.5


In [13]:
# np.argsort sorts ascending, so the last ten positions index the ten largest
# embeddings-model decision scores; collect the matching ids as a set.
ranked_args = np.argsort(embeddings_decisions)
top_args = ranked_args[-10:]
embeddings_picks = set(np.array(embeddings_pids)[top_args])

In [14]:
# np.argsort sorts ascending, so the last ten positions index the ten largest
# h-index-model decision scores; collect the matching ids as a set.
ranked_args = np.argsort(h_index_decisions)
top_args = ranked_args[-10:]
h_index_picks = set(np.array(h_index_pids)[top_args])

In [15]:
# Display the ids of the ten test items with the largest embeddings-model decision scores.
embeddings_picks

{'0f407906d8a66a1df2a6b6eb721c785f12c27269',
 '1cb73db86c39d52a788b541ddb02e7ee743c882e',
 '491671b77ee9dee2eaa6691f85fae7d3834fbdbe',
 '49970d0410793aa284fc0adca1e5d42881f99e6e',
 '5c72c3c141ef77f44b2270a970a3d55b58af5ca7',
 '728d1de052a40aa41ef8813bf128fb2a6db22597',
 'ac3c4d1f128bef4180ed2cbb2cb02f5c45872424',
 'd8abac3f198f8a66ec4b466074b89c1eb2c042b6',
 'e0f0810e1938693ef2fa37a0e80d9dc6e8f5c4e6',
 'fd225c39db7e20768277da6b44ade77cf8405036'}

In [16]:
# Display the ids of the ten test items with the largest h-index-model decision scores.
h_index_picks

{'38673093d908426ad70b17b9e230daf1d106aee8',
 '468da37ce0b0e77ae28628bd40287021446838e7',
 '475d92d6669a2d099293373731f353bf778465d3',
 '5a0dbfc0185158ab07c062f6f8ca1cd6b8f86d29',
 '5b5846bc384ecdaed7d6df43e2f744b98a446f44',
 '8ec550ce7e4d45fee4975cb5a1330d536b21fd6e',
 'bcbd7d07d0a233240a11a8157cc06e1b01718409',
 'd3305c6ea9f1121a322fac542375ac6ff96cfcde',
 'd5f2fe667c267f7cb1ed42fca638473b9d838c52',
 'e79a6b0de6ea568825304f2346751a6947b580b6'}

In [17]:
# Ids that appear in both models' top-ten sets.
print(embeddings_picks & h_index_picks)

set()


In [18]:
# random baseline: one fair coin flip per test item, scored with the same
# three metrics that evaluate() reports.
y_pred = np.random.choice([0, 1], size=len(x_test), p=[0.5, 0.5]).tolist()
for metric_name, metric_fn in (('f1', f1_score), ('precision', precision_score), ('recall', recall_score)):
    print(f'{metric_name}: {metric_fn(y_test, y_pred)}')

f1: 0.19428571428571428
precision: 0.11670480549199085
recall: 0.5795454545454546


In [19]:
# matching probability baseline: sample every prediction from the empirical
# label distribution of the test split.
# The candidate labels and their probabilities come from the same np.unique
# call, so they stay aligned by construction and the sampling also works when
# the test split contains a single class. For labels that are exactly {0, 1}
# this is identical to sampling from [0, 1] with the class frequencies.
classes, counts = np.unique(y_test, return_counts=True)
y_pred = np.random.choice(classes, size=len(x_test), p=counts / len(y_test)).tolist()
print(f'f1: {f1_score(y_test, y_pred)}')
print(f'precision: {precision_score(y_test, y_pred)}')
print(f'recall: {recall_score(y_test, y_pred)}')

f1: 0.1724137931034483
precision: 0.1744186046511628
recall: 0.17045454545454544
