<a href="https://colab.research.google.com/github/andrefs/mapi-faml-proj/blob/main/code/faml.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Load stuff


## Libraries

In [1]:
import csv
import numpy as np
from tensorflow.keras.preprocessing import sequence
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error 
from keras.callbacks import ModelCheckpoint
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten

from matplotlib import pyplot as plt
import seaborn as sb
import matplotlib.pyplot as plt
import pandas as pd
import warnings 
warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', category=DeprecationWarning)
from xgboost import XGBRegressor

import urllib.request

## Embeddings

In [2]:
# currently not being used
def normalize(word_vec):
    norm=np.linalg.norm(word_vec)
    if norm == 0: 
       return word_vec
    return word_vec/norm

embs = {}
url = 'https://raw.githubusercontent.com/andrefs/mapi-faml-proj/main/2_clean_datasets/embeddings.txt'
response = urllib.request.urlopen(url)
lines = [l.decode('utf-8') for l in response.readlines()]
reader = csv.reader(lines, delimiter=' ')
for line in reader:
    term = line[0].replace('http://dbpedia.org/resource/' , '')
    vector = [float(x) for x in line[1:]]
    embs[term] = vector


len(embs.keys())

10706

## Relatedness pairs

In [3]:
relness = {}
url = 'https://raw.githubusercontent.com/andrefs/mapi-faml-proj/main/2_clean_datasets/relatedness.tsv'
response = urllib.request.urlopen(url)
lc = 0
lines = [l.decode('utf-8') for l in response.readlines()]
reader = csv.reader(lines, delimiter='\t')
for line in reader:
    lc += 1
    t1 = line[0].replace('http://dbpedia.org/resource/' , '')
    t2 = line[1].replace('http://dbpedia.org/resource/' , '')
    rel = float(line[2])
    relness[t1] = relness.get(t1,{})
    relness[t1][t2] = [float(line[2]), line[3]] # relatedness value and subset (train/test)

lc

18800

# Merge data from both sources

In [4]:
X_m = []
S_m = []
Y_m = []
for t1 in relness:
    for t2 in relness[t1]:
        Y_m.append(float(relness[t1][t2][0]))
        S_m.append(relness[t1][t2][1])
        X_m.append(embs[t1]+embs[t2])
        #X_m.append(np.concatenate((embs[t1],embs[t2])))
        

# Use Numpy
Y = np.array(Y_m)
X = np.matrix(X_m).astype(float)
S = np.array(S_m)

# Use pandas
#X_train = pd.DataFrame(X[np.in1d(S[:], 'Train')])
#X_test  = pd.DataFrame(X[np.in1d(S[:], 'Test')])
#Y_train = pd.DataFrame(Y[np.in1d(S[:], 'Train')])
#Y_test  = pd.DataFrame(Y[np.in1d(S[:], 'Test')])


X_train = X[np.in1d(S[:], 'Train')]
X_test  = X[np.in1d(S[:], 'Test')]
Y_train = Y[np.in1d(S[:], 'Train')]
Y_test  = Y[np.in1d(S[:], 'Test')]

# Insert target column into dataset
#X_train['Target'] = Y_train
# 200 columns = embeddings t1
# 200 columns = embeddings t2
#   1 column  = relatedness value (target)
#X_train.describe()

## Hyperparameter optimization

In [5]:
def setup_model(topo, dropout_rate, input_size, output_size):
    model = keras.Sequential()
    model.add(keras.layers.Dense(topo[0], activation="relu", input_dim = input_size))
    if dropout_rate > 0: model.add(keras.layers.Dropout(dropout_rate))
    for i in range(1,len(topo)):
        model.add(keras.layers.Dense(topo[i], activation="relu"))
        if dropout_rate > 0: model.add(keras.layers.Dropout(dropout_rate))
    model.add(keras.layers.Dense(output_size))
    model.add(keras.layers.Activation('softmax'))
    return model


def train_dnn(model, alg, lr, Xtrain, Ytrain, epochs = 5, batch_size = 64):
    if alg == "adam":
        optimizer = keras.optimizers.Adam(lr = lr)
    elif alg == "rmsprop":
        optimizer = keras.optimizers.RMSprop(lr = lr)
    elif alg == "sgd_momentum":
        optimizer = keras.optimizers.SGD(lr = lr, momentum = 0.9)
    else: optimizer = keras.optimizers.SGD(lr = lr)
    model.compile(optimizer = optimizer, loss = "categorical_crossentropy", metrics = ["accuracy"])
    model.fit(Xtrain, Ytrain, epochs = epochs, batch_size = batch_size, verbose = 0)
    return model

def dnn_optimization(opt_params, Xtrain, Ytrain, Xval, Yval, iterations = 10, verbose = True):
    from random import choice
    if verbose: print("Topology\tDropout\tAlgorithm\tLRate\tValLoss\tValAcc\n")
    best_acc = None

    Ytrain = Ytrain.reshape(-1,1)
    input_size = Xtrain.shape[1]
    output_size = Ytrain.shape[1]

    if "topology" in opt_params: topologies = opt_params["topology"]
    else: topologies = [[100]]
    if "algorithm" in opt_params: algs = opt_params["algorithm"]
    else: algs = ["adam"]
    if "lr" in opt_params: lrs = opt_params["lr"]
    else: lrs = [0.001]
    if "dropout" in opt_params: dropouts = opt_params["dropout"]
    else: dropouts= [0.0]
    for it in range(iterations):
        topo = choice(topologies)
        dropout_rate = choice(dropouts)
        dnn = setup_model (topo, dropout_rate, input_size, output_size)
        alg = choice(algs)
        lr = choice(lrs)
        dnn = train_dnn(dnn, alg, lr, Xtrain, Ytrain)
        val_loss, val_acc = dnn.evaluate(Xval, Yval, verbose = 0)

        if verbose:
            print(topo, "\t", dropout_rate, "\t", alg, "\t", lr, "\t", val_loss, "\t", val_acc)
        
        if best_acc is None or val_acc > best_acc:
            best_acc = val_acc
            best_config = (topo, dropout_rate, alg, lr)
    return best_config, best_acc
                         
                         

opt_pars = {"topology":[[100,50], [400,600,600,200,100]], "algorithm": [ "adam", "rmsprop", "sgd_momentum"], "lr": [0.01, 0.001], "dropout": [0, 0.2, 0.5]}
best_config, best_acc = dnn_optimization(opt_pars, X_train, Y_train, X_test, Y_test)
print(best_config)
print(best_acc)

Topology	Dropout	Algorithm	LRate	ValLoss	ValAcc

[100, 50] 	 0 	 adam 	 0.001 	 0.0 	 0.03424213081598282
[400, 600, 600, 200, 100] 	 0 	 sgd_momentum 	 0.01 	 nan 	 0.4961051642894745
[100, 50] 	 0.2 	 rmsprop 	 0.001 	 0.0 	 0.03424213081598282
[100, 50] 	 0.5 	 rmsprop 	 0.001 	 0.0 	 0.03424213081598282
[100, 50] 	 0 	 adam 	 0.001 	 0.0 	 0.03424213081598282
[400, 600, 600, 200, 100] 	 0.5 	 rmsprop 	 0.001 	 0.0 	 0.03424213081598282
[100, 50] 	 0 	 rmsprop 	 0.001 	 0.0 	 0.03424213081598282
[100, 50] 	 0.2 	 adam 	 0.001 	 0.0 	 0.03424213081598282
[400, 600, 600, 200, 100] 	 0.2 	 adam 	 0.001 	 0.0 	 0.03424213081598282
[100, 50] 	 0.2 	 sgd_momentum 	 0.01 	 nan 	 0.4961051642894745
([400, 600, 600, 200, 100], 0, 'sgd_momentum', 0.01)
0.4961051642894745


In [6]:
X_test

matrix([[ 7.4500e-03,  4.5770e-03, -1.9916e-02, ...,  7.8440e-03,
          4.9000e-05,  4.7807e-02],
        [ 7.4500e-03,  4.5770e-03, -1.9916e-02, ...,  3.1100e-03,
          2.1360e-03, -3.5700e-03],
        [-1.5811e-02, -1.4104e-02,  6.1710e-03, ...,  3.2520e-03,
          3.4900e-03, -1.8630e-02],
        ...,
        [ 2.3294e-02,  3.8200e-03, -8.4220e-03, ..., -7.7490e-03,
         -7.0800e-04, -2.1870e-02],
        [ 1.3664e-02,  1.9900e-04, -2.5430e-02, ..., -1.3498e-02,
          5.4900e-04, -1.0503e-02],
        [ 3.0580e-03, -7.9860e-03, -3.4400e-02, ..., -6.2000e-05,
          8.0900e-04, -4.6170e-03]])