## Embeddings

In [1]:
import pandas as pd
import numpy as np 

from sklearn.metrics import confusion_matrix

from class_NN import neuralNetwork
from utils import extract_info, get_embeddings

In [2]:
# insert path to embeddings model here
path = "/Users/alessandrapolimeno/Documents/VU/models/sonar-160.tar"

### Load data

In [3]:
# load training data 
path = "../data/pos_data_train.csv"
with open(path, "r") as infile: 
    train_data = infile.readlines()

# load test data
path = "../data/pos_data_test.csv"
with open(path, "r") as infile: 
    test_data = infile.readlines()

In [4]:
train_data = train_data[:1000]
test_data = test_data[:100]

In [5]:
data_all = test_data + train_data # for training the embeddings model on 

### Extact information

In [6]:
# get relevant information from datasets 
data_all, tokens, pos_tags, targets = extract_info(data_all)
data_tr, tokens_tr, pos_tags_tr, targets_tr = extract_info(train_data)
data_te, tokens_te, pos_tags_te, targets_te = extract_info(test_data)

### Initialize NN

In [7]:
# number of nodes 
input_nodes = 160
# is the number of the vocabulary (dimensions)
hidden_nodes = 150
output_nodes = 12 # equals number of possible labels 

# learning rate 
learning_rate = 0.3

# create instance of neural network 
nn = neuralNetwork(input_nodes, hidden_nodes, output_nodes, learning_rate)

In [8]:
print(tokens[:20])

['dat', 'is', 'in', 'italië', ',', 'spanje', 'of', 'engeland', 'misschien', 'geen', 'probleem', ',', 'maar', 'volgens', "'", 'der', 'kaiser', "'", 'in', 'duitsland']


In [9]:
# get word embeddings for tokens 
#embeddings_model = get_embeddings(path, tokens)

In [14]:
file = open(path)
c = 0
for line in file: 
    if c < 10: 
        print(line)
    
        records = line.split()
        if records:
            word = records[0]
            c += 1
            if word in tokens:
                print(word)

Dat;9

is;11

in;8

Italië;6

,;10

Spanje;6

of;3

Engeland;6

misschien;1

geen;9



In [15]:
def get_embeddings(embeddings_path, tokens, dimension=160):
    """
    This code was taken and adapted from the following tutorial: 
    https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html
    (accessed 24 Jan 2021)
    
    :param path: path to downloaded embeddings 
    :param tokens: list containing all tokens that occur in the data
    :param dimension: default = 160 
    
    :return embeddings_dictionary: dict containing embeddings for each token (key = token, values = embeddings)
    
    """
    
    embeddings_dictionary = dict()
    file = open(embeddings_path)

    for line in file:
        records = line.split()
        if records:
            word = records[0]
            if word in tokens:
                try:
                    vector_dimensions = asarray(records[1:], dtype='float32')
                except:
                    pass
            else: 
                vector_dimensions = [0]*dimension
            embeddings_dictionary[word] = vector_dimensions
    file.close()
    return embeddings_dictionary



In [15]:
c = 0
if c < 10:
    for emb in embeddings_model: 
        print(emb)
        c += 1

Dat;9
is;11
in;8
Italië;6
,;10
Spanje;6
of;3
Engeland;6
misschien;1
geen;9
probleem;6
maar;3
volgens;8
';10
Der;6
Kaiser;6
Duitsland;6
wel;1
.;10
sectie;6
Zürich;0
editie;6
De;2
kleine;0
clubs;6
waarvan;1
de;2
begroting;6
dikwijls;1
afhankelijk;0
van;8
opleiding;6
en;3
transfers;6
klagen;11
steen;6
been;6
over;8
dat;9
voornemen;6
Andere;9
zijn;9
bang;0
dat;3
salarissen;6
straks;1
helemaal;1
pan;6
uitrijzen;11
auteur;6
Uefa;6
Fifa;6
zijn;11
bereid;0
afschaffing;6
transfervergoeding;6
te;8
aanvaarden;11
als;3
enkel;9
geldt;11
voor;8
voetballers;6
ouder;0
dan;3
24;7
jaar;6
EU;6
wil;11
af;1
elke;9
vorm;6
financiële;0
schadeloosstelling;6
publicatie;6
nog;1
bescheidenere;0
competities;6
zoals;3
Belgische;0
durven;11
daar;1
niet;1
eens;1
aan;1
denken;11
Morgen;1
Europese;0
wereldvoetbalbond;11
willen;11
spelers;6
14;7
maakt;11
het;9
interessant;0
om;3
blijven;11
investeren;11
eigen;9
talent;6
Bij;8
onder;8
24;6
zou;11
bedingen;11
een;2
vergoeding;6
mogelijk;0
moeten;11
besluit;6
Bosman-arres

### Training

In [10]:
epochs = 5
for e in range(epochs):
    for inp, targ in zip(tokens_tr , targets_tr):
        inp = inp.lower()
        if inp in embeddings_model:
            embedding = embeddings_model[inp]
            nn.train(embedding, targ) 
        else:
            pass

### Testing

In [12]:
scorecard = []
predicted_labels = []
selected_targets = []
#predicted_labels = []
for inp, targ in zip(tokens_te, targets_te):
    inp = inp.lower()
    try: 
        embedding = embeddings_model[inp]
        
        # query network 
        outputs = nn.query(embedding)

        # highest number == label 
        label = np.argmax(outputs)
        predicted_labels.append(label)
        selected_targets.append(targ.index(max(targ)))

        # append correct / incorrect to scorecard 
        #print(targ.index(max(targ)))
        if label == targ.index(max(targ)):
            scorecard.append(1)
        else:
            scorecard.append(0)
            
    except: 
        pass

    
scorecard_array = np.asarray(scorecard)
print(f"performance = ", scorecard_array.sum() / scorecard_array.size)

[]
performance =  nan


  print(f"performance = ", scorecard_array.sum() / scorecard_array.size)


### Evaluation

In [33]:
# Find out the labels that are present (optional)
target_set = set()
for num in selected_targets:
    target_set.add(num)
print(target_set)

{0, 1, 2, 3, 6, 8, 9, 11}


In [32]:
# print confustion matrix 
cm = confusion_matrix(selected_targets, predicted_labels)
df_confusion = pd.DataFrame.from_records(cm)
#print(df_confusion)
print(df_confusion.to_latex())

\begin{tabular}{lrrrrrrrr}
\toprule
{} &  0 &  1 &  2 &  3 &   4 &   5 &  6 &  7 \\
\midrule
0 &  1 &  0 &  0 &  0 &   2 &   0 &  0 &  4 \\
1 &  0 &  7 &  0 &  0 &   0 &   0 &  0 &  1 \\
2 &  0 &  0 &  8 &  0 &   0 &   0 &  0 &  0 \\
3 &  0 &  2 &  0 &  3 &   0 &   3 &  0 &  0 \\
4 &  0 &  0 &  0 &  0 &  29 &   0 &  0 &  0 \\
5 &  0 &  0 &  0 &  0 &   0 &  10 &  0 &  0 \\
6 &  1 &  0 &  0 &  3 &   0 &   0 &  3 &  1 \\
7 &  0 &  0 &  0 &  0 &   1 &   0 &  0 &  7 \\
\bottomrule
\end{tabular}

