<a href="https://colab.research.google.com/github/andrefs/mapi-faml-proj/blob/main/code/faml.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Load libraries and data


## Libraries

In [1]:
import csv
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error 
from keras.callbacks import ModelCheckpoint
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten

from matplotlib import pyplot as plt
import seaborn as sb
import matplotlib.pyplot as plt
import pandas as pd
import warnings 
# Install an 'ignore' filter with no category/module restriction, so it
# applies to every warning raised after this point.
warnings.filterwarnings('ignore')
# NOTE(review): the blanket filter above already covers DeprecationWarning,
# so this category-specific filter looks redundant — confirm before removing.
warnings.filterwarnings('ignore', category=DeprecationWarning)
from xgboost import XGBRegressor

import urllib.request

## Embeddings

In [2]:
# currently not being used
def normalize(word_vec):
    """Scale a vector to unit Euclidean (L2) length.

    Parameters
    ----------
    word_vec : array-like
        Vector to normalize. Plain Python lists are accepted as well as
        NumPy arrays; the input is coerced to a float array first, because
        dividing a list by a scalar raises ``TypeError``.

    Returns
    -------
    numpy.ndarray
        ``word_vec`` divided by its L2 norm. If the norm is zero the vector
        is returned unscaled, which avoids a division by zero.
    """
    vec = np.asarray(word_vec, dtype=float)
    norm = np.linalg.norm(vec)
    if norm == 0:
        return vec
    return vec / norm

# Map each term (with the DBpedia resource prefix stripped) to its embedding,
# stored as a list of floats.
embs = {}
url = 'https://raw.githubusercontent.com/andrefs/mapi-faml-proj/main/2_clean_datasets/embeddings.txt'
# The context manager closes the HTTP response even if reading/decoding fails.
with urllib.request.urlopen(url) as response:
    lines = [l.decode('utf-8') for l in response.readlines()]
# Each row is space-separated: the term URI followed by the vector components.
reader = csv.reader(lines, delimiter=' ')
for line in reader:
    if not line:
        # csv.reader yields an empty list for a blank line; skip it instead
        # of failing on line[0].
        continue
    term = line[0].replace('http://dbpedia.org/resource/' , '')
    vector = [float(x) for x in line[1:]]
    embs[term] = vector


len(embs)

10706

## Relatedness pairs

In [5]:
# Nested mapping: relness[t1][t2] = [relatedness value, subset label].
relness = {}
url = 'https://raw.githubusercontent.com/andrefs/mapi-faml-proj/main/2_clean_datasets/relatedness_train.tsv'
# The context manager closes the HTTP response even if reading/decoding fails.
with urllib.request.urlopen(url) as response:
    lines = [l.decode('utf-8') for l in response.readlines()]
lc = 0  # number of data rows read from the file
# Each row is tab-separated: term 1 URI, term 2 URI, relatedness, subset.
reader = csv.reader(lines, delimiter='\t')
for line in reader:
    if not line:
        # csv.reader yields an empty list for a blank line; skip it instead
        # of failing on line[0].
        continue
    lc += 1
    t1 = line[0].replace('http://dbpedia.org/resource/' , '')
    t2 = line[1].replace('http://dbpedia.org/resource/' , '')
    rel = float(line[2])
    # A repeated (t1, t2) pair overwrites the earlier entry, so the number of
    # stored pairs can be smaller than lc.
    relness.setdefault(t1, {})[t2] = [rel, line[3]] # relatedness value and subset (train/test)

lc

12637

# Merge data from both sources

In [7]:
# Parallel lists with one entry per (t1, t2) pair stored in relness.
X_m = []  # concatenated embeddings of t1 and t2
S_m = []  # subset label of the pair
Y_m = []  # relatedness value of the pair
for t1 in relness:
    for t2 in relness[t1]:
        value, subset = relness[t1][t2]
        Y_m.append(value)  # already stored as a float when relness was built
        S_m.append(subset)
        X_m.append(np.concatenate((embs[t1],embs[t2])))
        

# Use Numpy
Y = np.array(Y_m)
# A plain 2-D ndarray replaces np.matrix, which NumPy no longer recommends.
X = np.array(X_m, dtype=float)
S = np.array(S_m)

# Boolean row masks per subset (np.isin supersedes the deprecated np.in1d).
is_train = np.isin(S, 'Train')
is_test = np.isin(S, 'Test')

# Use pandas
X_train = pd.DataFrame(X[is_train])
X_test  = pd.DataFrame(X[is_test])
Y_train = pd.DataFrame(Y[is_train])
Y_test  = pd.DataFrame(Y[is_test])

# Insert target column into dataset.
# The raw array is assigned positionally, rather than a one-column DataFrame,
# so the column does not depend on index alignment.
X_train['Target'] = Y[is_train]

# 200 columns = embeddings t1
# 200 columns = embeddings t2
#   1 column  = relatedness value (target)
X_train.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,361,362,363,364,365,366,367,368,369,370,371,372,373,374,375,376,377,378,379,380,381,382,383,384,385,386,387,388,389,390,391,392,393,394,395,396,397,398,399,Target
count,12634.0,12634.0,12634.0,12634.0,12634.0,12634.0,12634.0,12634.0,12634.0,12634.0,12634.0,12634.0,12634.0,12634.0,12634.0,12634.0,12634.0,12634.0,12634.0,12634.0,12634.0,12634.0,12634.0,12634.0,12634.0,12634.0,12634.0,12634.0,12634.0,12634.0,12634.0,12634.0,12634.0,12634.0,12634.0,12634.0,12634.0,12634.0,12634.0,12634.0,...,12634.0,12634.0,12634.0,12634.0,12634.0,12634.0,12634.0,12634.0,12634.0,12634.0,12634.0,12634.0,12634.0,12634.0,12634.0,12634.0,12634.0,12634.0,12634.0,12634.0,12634.0,12634.0,12634.0,12634.0,12634.0,12634.0,12634.0,12634.0,12634.0,12634.0,12634.0,12634.0,12634.0,12634.0,12634.0,12634.0,12634.0,12634.0,12634.0,12634.0
mean,-0.005784,-0.003849,-0.006471,0.00147,-0.000975,0.001638,0.008487,-0.007218,0.002837,0.003102,-0.003032,-0.007769,0.012584,0.00871,-0.009741,0.005933,-0.001043,0.005058,0.003125,-0.01495,0.00077,0.009375,-0.008416,-0.005823,9.4e-05,0.00193,-0.002556,0.008732,0.002993,0.007235,0.012268,0.003947,-0.007214,0.000301,0.006588,0.005323,-0.01091,-0.004798,-0.002721,0.005076,...,-0.001324,0.000285,0.001267,-0.014059,-0.006223,0.010473,0.003779,-0.005423,0.014272,0.008564,-0.006266,0.004603,-0.01033,-0.009634,-0.010086,-0.003992,-0.009375,0.005952,-0.007841,0.015998,-0.00305,-0.001435,0.007151,-0.007673,-0.005177,0.01348,-0.01251,-0.003363,0.004625,0.000824,-0.004589,0.003114,-0.006059,0.009906,0.005619,0.004135,0.000198,-0.000769,-0.009406,0.202508
std,0.010363,0.013415,0.010921,0.015873,0.009555,0.012014,0.022982,0.018173,0.019877,0.010617,0.015469,0.010765,0.022672,0.016232,0.01492,0.016464,0.022672,0.009641,0.012911,0.020921,0.010004,0.01618,0.014413,0.01494,0.026348,0.00995,0.015505,0.00946,0.010982,0.01018,0.015792,0.018174,0.015514,0.01324,0.010281,0.01989,0.017012,0.011142,0.012874,0.013684,...,0.014662,0.012782,0.013827,0.014967,0.012829,0.01313,0.013014,0.013417,0.016738,0.014971,0.014646,0.015006,0.012694,0.013631,0.017762,0.015086,0.021716,0.01181,0.018838,0.015908,0.008611,0.009893,0.020267,0.010484,0.012781,0.015903,0.01179,0.010382,0.008487,0.012525,0.0146,0.010991,0.012867,0.011117,0.019811,0.014581,0.009632,0.010841,0.015362,0.285275
min,-0.070691,-0.066666,-0.26559,-0.152735,-0.208192,-0.104308,-0.117777,-0.103706,-0.155246,-0.292628,-0.436979,-0.075644,-0.087675,-0.102307,-0.229997,-0.50328,-0.576806,-0.04673,-0.142459,-0.148203,-0.103551,-0.23891,-0.064874,-0.373806,-0.237818,-0.053749,-0.076468,-0.038536,-0.185282,-0.090206,-0.129383,-0.137102,-0.079424,-0.095522,-0.038815,-0.231078,-0.29821,-0.057161,-0.071691,-0.032376,...,-0.109124,-0.320076,-0.356973,-0.080544,-0.088249,-0.053687,-0.041403,-0.092535,-0.05767,-0.051905,-0.22963,-0.097756,-0.073263,-0.135089,-0.531305,-0.057748,-0.111699,-0.152144,-0.079186,-0.047341,-0.139123,-0.172307,-0.060543,-0.101662,-0.34145,-0.094072,-0.1,-0.056204,-0.056218,-0.193806,-0.11795,-0.075355,-0.078805,-0.324372,-0.050936,-0.05719,-0.043312,-0.06753,-0.091156,0.0
25%,-0.011079,-0.010853,-0.011474,-0.007952,-0.004814,-0.002685,-0.000812,-0.017904,-0.008969,-0.002817,-0.010634,-0.012849,-6.3e-05,0.001181,-0.021052,-0.001977,-0.011205,-0.000999,-0.003017,-0.032014,-0.005282,0.000108,-0.016198,-0.014959,-0.012385,-0.002841,-0.012495,0.00422,-0.001538,0.001487,0.002155,-0.004565,-0.015677,-0.005801,-0.000474,-0.005233,-0.018708,-0.008643,-0.006791,-0.005274,...,-0.00802,-0.006159,-0.005837,-0.022141,-0.013453,0.001849,-0.004939,-0.014529,0.002448,-0.000869,-0.015687,-0.000219,-0.017344,-0.018519,-0.019238,-0.0133,-0.01872,-0.001213,-0.022246,0.004863,-0.00766,-0.005723,-0.005795,-0.01294,-0.011598,0.004719,-0.021211,-0.009018,0.001739,-0.007317,-0.012572,-0.001771,-0.012715,0.003986,-0.008146,-0.004087,-0.006027,-0.006192,-0.018764,0.0
50%,-0.005481,-0.004901,-0.006041,0.001448,0.000378,0.002618,0.009426,-0.00748,0.000248,0.00276,-0.003506,-0.006812,0.01215,0.009942,-0.007193,0.004802,-0.001151,0.002723,-1e-06,-0.007092,4.3e-05,0.007957,-0.009284,-0.00206,-0.000256,0.002211,-0.000212,0.007718,0.002713,0.005694,0.010456,0.004353,-0.008234,-0.000366,0.005348,0.004445,-0.011746,-0.004475,-2.7e-05,0.00061,...,-0.00176,-0.001899,0.002284,-0.013496,-0.006973,0.009612,-6.9e-05,-0.003907,0.011588,0.004993,-0.006297,0.005293,-0.009145,-0.010279,-0.010452,-0.004008,-0.006506,0.005182,-0.001174,0.014795,-0.002523,-0.00158,0.001695,-0.006791,-0.004716,0.01309,-0.010133,-0.001987,0.004683,-0.002103,-0.003698,0.004721,-0.005532,0.009637,-0.00138,0.002504,0.001156,0.000565,-0.007546,0.1
75%,-0.00065,0.000938,-0.001059,0.009638,0.004152,0.008066,0.019876,0.002666,0.013307,0.008758,0.001539,-0.00231,0.024918,0.017655,0.001938,0.01188,0.009442,0.011669,0.004268,0.000751,0.006304,0.01974,-0.003195,0.003344,0.011146,0.008299,0.007526,0.012572,0.007725,0.011219,0.020705,0.013863,-0.000924,0.005179,0.013614,0.015355,-0.005187,-0.001091,0.004419,0.015184,...,0.004728,0.004019,0.008609,-0.00671,-0.000841,0.019546,0.012722,0.003317,0.025991,0.01843,0.002669,0.012082,-0.001623,-0.000671,-0.002585,0.004939,0.002293,0.01254,0.00635,0.027365,0.001997,0.003582,0.020135,-0.000753,0.001737,0.021718,-0.004071,0.003199,0.00843,0.007631,0.003653,0.008996,0.000611,0.01647,0.021465,0.011687,0.006298,0.005553,-0.000275,0.3
max,0.088176,0.326622,0.055171,0.086859,0.039767,0.056958,0.396632,0.070086,0.310727,0.069572,0.104811,0.075358,0.350088,0.067385,0.048158,0.088309,0.154897,0.117727,0.075825,0.061759,0.058346,0.144205,0.059112,0.048551,0.61607,0.122026,0.278126,0.205555,0.109487,0.196951,0.125913,0.08604,0.258398,0.078407,0.0558,0.094394,0.083999,0.530721,0.235734,0.089713,...,0.08308,0.062764,0.103678,0.183487,0.140384,0.242949,0.151089,0.051398,0.205963,0.337747,0.056792,0.064683,0.0605,0.070605,0.06604,0.229546,0.546755,0.060158,0.215185,0.17858,0.032927,0.043692,0.233584,0.08667,0.052704,0.086813,0.041621,0.097169,0.035962,0.071929,0.245805,0.063359,0.06793,0.056513,0.134504,0.253556,0.186283,0.128153,0.131217,1.0
