In [65]:
import pandas as pd

import numpy as np
import csv

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_recall_fscore_support

In [2]:
# Read both tab-separated CoNLL splits and keep only the rows whose 'Word'
# value is present (rows with a missing 'Word' are discarded).
train = (
    pd.read_csv('../data/conll/training.conll', sep="\t")
    .loc[lambda frame: frame['Word'].notna()]
)
val = (
    pd.read_csv('../data/conll/validation.conll', sep="\t")
    .loc[lambda frame: frame['Word'].notna()]
)

In [46]:
# Load the vocabulary table: tab-separated, two columns (id, word), no header row.
# QUOTE_NONE keeps quote characters inside words literal.
# keep_default_na=False stops pandas from converting vocabulary entries that
# spell one of its NA markers (e.g. "null", "nan", "NA") into float NaN, which
# would make those words unreachable by the string lookups done later.
id2wrd = pd.read_csv(
    './conll_all_w2id.txt',
    names=['id', 'word'],
    sep='\t',
    quoting=csv.QUOTE_NONE,
    encoding='utf-8',
    keep_default_na=False,
)


In [55]:
# Load the DeepWalk embeddings into {node id: list of floats}.
# The first line of the file is skipped (presumably a "count dim" header —
# verify against the file); every following line is "<node id> <v1> <v2> ...".
deepwalk_embeddings = {}
with open('./conll_all_deepwalk.embeddings', 'r') as f:
    next(f)
    for line in f:
        # split() with no argument tolerates trailing spaces, repeated spaces,
        # tabs and the newline; split(' ') would produce '' or '\n' fields
        # that int()/float() reject.
        fields = line.split()
        if not fields:
            continue  # ignore blank lines such as a trailing empty line
        deepwalk_embeddings[int(fields[0])] = [float(v) for v in fields[1:]]

# Load the labels file (tab-separated: id, label) and turn it into {id: label}.
labels = pd.read_csv('./conll_all_labels.txt', sep="\t", names=['id', 'label'])
labels = dict(zip(labels.id, labels.label))

In [22]:
# Sanity check: number of ids that have an embedding vector loaded.
len(deepwalk_embeddings)

31735

In [59]:
# Build the feature matrices and label lists for both splits.
# Every token is stringified and lower-cased, mapped to its id through
# word2id, and that id then indexes the embedding table or the label table.

word2id = dict(zip(id2wrd.word, id2wrd.id))


def _lookup(table, words):
    """Return table[word2id[lower-cased word]] for each word, in order."""
    return [table[word2id[str(token).lower()]] for token in words]


train_x = np.asarray(_lookup(deepwalk_embeddings, train.Word))
test_x = np.asarray(_lookup(deepwalk_embeddings, val.Word))

train_y = _lookup(labels, train.Word)
test_y = _lookup(labels, val.Word)

train_x.shape, test_x.shape

((209470, 128), (84483, 128))

In [62]:

from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier (fixed random_state for repeatability)
# on the training features, then predict a label for every validation row.
clf = LogisticRegression(random_state=0)
clf.fit(train_x, train_y)
preds = clf.predict(test_x)



In [81]:
# Per-class metrics restricted to the two MISC tags.
# Note: the label strings passed here carry a leading space.
misc_report = classification_report(test_y, preds, labels=[' B-MISC', ' I-MISC'])
print(misc_report)

              precision    recall  f1-score   support

      B-MISC       0.65      0.46      0.54      3969
      I-MISC       0.87      0.79      0.83      4661

   micro avg       0.78      0.64      0.70      8630
   macro avg       0.76      0.62      0.68      8630
weighted avg       0.77      0.64      0.70      8630



In [83]:
# Store the predictions as a 'prediction' column on the validation frame.
# preds comes from test_x, which was built from val.Word in row order, so the
# values line up positionally with val's rows.
val['prediction'] = preds

In [84]:
# Display the validation frame, now including the 'prediction' column next to 'Tag'.
val

Unnamed: 0,Id,Doc_Id,Sent_Id,Word,Tag,prediction
0,0.0,1.091375e+18,1.0,¿,O,O
1,1.0,1.091375e+18,1.0,Qué,O,O
2,2.0,1.091375e+18,1.0,probabilidad,O,O
3,3.0,1.091375e+18,1.0,hay,O,O
4,4.0,1.091375e+18,1.0,de,O,I-MISC
...,...,...,...,...,...,...
86993,88213.0,1.500171e+18,2500.0,no,O,O
86994,88214.0,1.500171e+18,2500.0,me,O,O
86995,88215.0,1.500171e+18,2500.0,logro,O,O
86996,88216.0,1.500171e+18,2500.0,calmar,O,O
