In [1]:
import fasttext
import numpy as np
import pickle
from gensim import models
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression
import scipy.stats as stats

# Load the two pretrained embedding models that the later cells read from:
#   w  -> word2vec vectors (gensim KeyedVectors, binary word2vec format)
#   ft -> fastText model
_w2v_path = 'embedding_files/GoogleNews-vectors-negative300.bin'
_ft_path = 'embedding_files/crawl-300d-2M-subword.bin'

w = models.KeyedVectors.load_word2vec_format(_w2v_path, binary=True)
ft = fasttext.load_model(_ft_path)




In [2]:
# Optional shortcut: load the previously trained regressors from disk instead of
# retraining them. If you run this cell, you can skip the training cell that follows.
#
# NOTE(security): unpickling executes arbitrary code stored in the file, so only
# load .sav files that you created yourself or that come from a trusted source.
def _load_saved_model(name):
    """Unpickle ``silver_standard/model_savefiles/<name>.sav`` and return the object.

    The file is opened in a ``with`` block so the handle is closed as soon as the
    model has been read, instead of being left open until garbage collection.
    """
    with open('silver_standard/model_savefiles/' + name + '.sav', 'rb') as model_file:
        return pickle.load(model_file)


linear_model_w = _load_saved_model('linear_model_w')
linear_model_ft = _load_saved_model('linear_model_ft')
mlp_model_w = _load_saved_model('mlp_model_w')
mlp_model_ft = _load_saved_model('mlp_model_ft')
knn_model_ft = _load_saved_model('knn_model_ft')
knn_model_w = _load_saved_model('knn_model_w')
svr_model_w = _load_saved_model('svr_model_w')
svr_model_ft = _load_saved_model('svr_model_ft')


In [3]:
# skip this cell if the models were already loaded from disk in the previous cell
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from tabulate import tabulate

# ---- build features and targets from the silver standard ----
x_w = []  # for embeddings w2vec
y = []  # for scores
x_ft = []  # for fasttext
words_text = []  # for words

with open('silver_standard/predictions/predictions_overall.txt', 'r') as f:  # load in silver standard file
    for line in f:
        if not line.strip():
            continue  # a blank line would otherwise break the two-field unpacking below
        word, v = line.split('\t')
        # w2vec embedding
        try:
            emb = w[word]
        except KeyError:
            emb = np.zeros(300)  # if embedding not found, use zero embedding
        x_w.append(emb)

        # fasttext embedding
        embedding = ft.get_word_vector(word)
        x_ft.append(embedding)

        y.append(float(v))  # use valency as target
        words_text.append(word)

# Everything below works on the in-memory lists only, so it runs after the
# input file has been closed rather than inside the `with` block.
X_w = np.array(x_w)
X_ft = np.array(x_ft)

# the shared random_state ensures that the split is identical for both embedding types
X_train_w, X_test_w, y_train_w, y_test_w = train_test_split(X_w, y, test_size=0.1, random_state=20, shuffle=True)
X_train_ft, X_test_ft, y_train_ft, y_test_ft = train_test_split(X_ft, y, test_size=0.1, random_state=20, shuffle=True)


# ---- w2vec models ----
# Linear Regression
linear_model_w = LinearRegression()
linear_model_w.fit(X_train_w, y_train_w)
y_pred_linear_w = linear_model_w.predict(X_test_w)

# MLP Regressor
mlp_model_w = MLPRegressor(random_state=1, max_iter=200)
mlp_model_w.fit(X_train_w, y_train_w)
y_pred_mlp_w = mlp_model_w.predict(X_test_w)

# KNN Regressor
knn_model_w = KNeighborsRegressor(n_neighbors=10)
knn_model_w.fit(X_train_w, y_train_w)

# SVR Regressor
svr_model_w = SVR(kernel='rbf', C=1e3, gamma=0.1)
svr_model_w.fit(X_train_w, y_train_w)


# ---- fasttext models ----
# Linear Regression
linear_model_ft = LinearRegression()
linear_model_ft.fit(X_train_ft, y_train_ft)
y_pred_linear_ft = linear_model_ft.predict(X_test_ft)

# MLP Regressor
mlp_model_ft = MLPRegressor(random_state=1, max_iter=200)
mlp_model_ft.fit(X_train_ft, y_train_ft)
y_pred_mlp_ft = mlp_model_ft.predict(X_test_ft)

# KNN Regressor (note: 5 neighbours here, 10 for the w2vec model above)
knn_model_ft = KNeighborsRegressor(n_neighbors=5)
knn_model_ft.fit(X_train_ft, y_train_ft)

# SVM Regressor
svr_model_ft = SVR(kernel='rbf', C=1e3, gamma=0.1)
svr_model_ft.fit(X_train_ft, y_train_ft)


# ---- report MSE and R2 on the held-out and the training split ----
def _metrics_table(named_models, X, y_true):
    """Build the rows of an MSE / R2 comparison table.

    named_models: sequence of (column label, fitted regressor) pairs.
    X, y_true:    features and targets to evaluate on.
    Returns a list of three rows (header, 'mse', 'r2 score') ready for
    `tabulate(..., headers='firstrow')`. Each model predicts only once.
    """
    predictions = [regressor.predict(X) for _, regressor in named_models]
    return [
        [''] + [label for label, _ in named_models],
        ['mse'] + [mean_squared_error(y_true, pred) for pred in predictions],
        ['r2 score'] + [r2_score(y_true, pred) for pred in predictions],
    ]


_models_ft = [('linear', linear_model_ft), ('mlp', mlp_model_ft), ('knn', knn_model_ft), ('svr', svr_model_ft)]
_models_w = [('linear', linear_model_w), ('mlp', mlp_model_w), ('knn', knn_model_w), ('svr', svr_model_w)]

print('FASTTEXT')
print('on test set')
table = _metrics_table(_models_ft, X_test_ft, y_test_ft)
print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))
print('on training set')
table = _metrics_table(_models_ft, X_train_ft, y_train_ft)
print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))

print('W2VEC')
print('on test set')
table = _metrics_table(_models_w, X_test_w, y_test_w)
print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))
print('on training set')
table = _metrics_table(_models_w, X_train_w, y_train_w)
print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))


In [3]:
# average embeddings, then produce score
# here, it would be possible to tweak the score the model gives, e.g. by inverting score if there's a negation
def process_phrase(phr, model, model_type='w', aggregate='avg'):
    """Score a phrase with `model` from the pooled embeddings of its words.

    The phrase is split on single spaces, every token is mapped to a vector,
    the vectors are pooled into one 300-dimensional vector, and that vector is
    passed to `model.predict` as a single-row matrix.

    Parameters
    ----------
    phr : str
        Phrase to score; tokens are separated by a single space.
    model : object with a `predict(X)` method
        Regressor that receives the pooled embedding with shape (1, 300).
    model_type : {'w', 'ft'}
        'w' looks tokens up in the global word2vec vectors `w` (tokens that
        raise KeyError contribute a zero vector); 'ft' uses the global
        fastText model `ft` via `get_word_vector`.
    aggregate : {'avg', 'sum'}
        'avg' (default, the previous behaviour) divides the summed embedding
        by the number of tokens; 'sum' feeds the plain sum to the model.

    Returns
    -------
    The first element of `model.predict(...)`.

    Raises
    ------
    ValueError
        If `model_type` or `aggregate` is not one of the supported values.
        (An unknown `model_type` used to surface as a TypeError from
        indexing the integer 0.)
    """
    if model_type == 'w':
        def lookup(token):
            try:
                return w[token]
            except KeyError:  # zero vector for unknown word
                return np.zeros(300, dtype=np.float32)
    elif model_type == 'ft':
        def lookup(token):
            return np.array(ft.get_word_vector(token))
    else:
        raise ValueError(f"model_type must be 'w' or 'ft', got {model_type!r}")

    if aggregate not in ('avg', 'sum'):
        raise ValueError(f"aggregate must be 'avg' or 'sum', got {aggregate!r}")

    # str.split(' ') always yields at least one token, so the division below is safe
    wrds = phr.split(' ')
    embedding_sum = np.zeros(300)
    for wrd in wrds:
        embedding_sum += lookup(wrd)

    pooled = embedding_sum / len(wrds) if aggregate == 'avg' else embedding_sum
    model_score = model.predict(pooled.reshape(1, -1))
    return model_score[0]

In [4]:
def pred_kendall(scores_phr_pred, original_scores_phr, outfile):
    """Write the predicted ranking to `outfile` and report rank correlations.

    Parameters
    ----------
    scores_phr_pred : sequence of (phrase, predicted_score, idx)
        Model predictions. The sequence is NOT modified; a sorted copy is used.
    original_scores_phr : sequence of (phrase, gold_score, idx)
        Reference entries; only the idx field (position 2) is read.
    outfile : writable text file object
        Receives one "<phrase>\\t<predicted_score>" line per entry, highest
        predicted score first.

    Returns
    -------
    str
        Formatted Kendall tau and Spearman rho with their p-values.

    NOTE(review): the gold score itself is never read here. The reference
    ordering is simply the idx sequence of `original_scores_phr` as given, so
    the correlations presumably assume that input is already ordered by gold
    score - confirm against the test files.
    """
    # sorted() keeps the caller's list untouched; like list.sort it is stable
    ranked = sorted(scores_phr_pred, key=lambda s: s[1], reverse=True)

    for triplet in ranked:
        outfile.write(triplet[0] + '\t' + str(triplet[1]) + '\n')

    scores_idx_split = [triplet[2] for triplet in ranked]  # idx order induced by the predictions
    y_idx = [triple[2] for triple in original_scores_phr]  # idx order of the reference

    # kendall tau
    tau, pvalue = stats.kendalltau(scores_idx_split, y_idx)
    # calculate spearman
    spearman, pvalue_s = stats.spearmanr(scores_idx_split, y_idx)
    return f'\nkendall: {tau}\t pvalue: {pvalue}\nspearman: {spearman}\tpvalue: {pvalue_s}'

def pred_write_only(scores_phr_pred, file):
    """Write predictions to `file`, highest predicted score first.

    Parameters
    ----------
    scores_phr_pred : sequence of tuples
        Element 0 is the phrase (trailing newlines are stripped before
        writing), element 1 the predicted score. The sequence is NOT
        modified; a sorted copy is used.
    file : writable text file object
        Receives one "<phrase>\\t<score>" line per entry.
    """
    # sorted() instead of list.sort() so the caller's list keeps its order
    for triplet in sorted(scores_phr_pred, key=lambda s: s[1], reverse=True):
        file.write(triplet[0].rstrip('\n') + '\t' + str(triplet[1]) + '\n')


In [8]:
def calculate_scores(model, filepath, model_type: str = 'w'):
    """Score every phrase of a tab-separated "<phrase>\\t<score>" file.

    Parameters
    ----------
    model : regressor handed on to `process_phrase`
    filepath : path of the input file; each non-blank line must contain
        exactly one tab separating the phrase from its reference score
    model_type : embedding type handed on to `process_phrase` ('w' or 'ft')

    Returns
    -------
    (scores, phrases_and_scores)
        scores             : list of (phrase, predicted_score, idx)
        phrases_and_scores : list of (phrase, reference_score_as_str, idx)
        idx counts the retained lines from 0 in file order.

    Blank lines (for example a trailing empty line) are skipped instead of
    raising on the two-field unpacking.
    """
    phrases_and_scores = []  # reference scores read from the file
    scores = []  # scores predicted by the model
    with open(filepath) as file:
        stripped = (line.rstrip() for line in file)
        for idx, row in enumerate(r for r in stripped if r):
            phrase, gold_score = row.split('\t')
            phrases_and_scores.append((phrase, gold_score, idx))
            predicted = process_phrase(phrase, model, model_type)
            scores.append((phrase, predicted, idx))
    return scores, phrases_and_scores


with open('silver_standard/rankings/scl_nma/lin_w.txt', 'w+') as lin_w, \
        open('silver_standard/rankings/scl_nma/mlp_w.txt', 'w+') as mlp_w, \
        open('silver_standard/rankings/scl_nma/knn_w.txt', 'w+') as knn_w, \
        open('silver_standard/rankings/scl_nma/svr_w.txt', 'w+') as svr_w, \
        open('silver_standard/rankings/scl_nma/lin_ft.txt', 'w+') as lin_ft, \
        open('silver_standard/rankings/scl_nma/mlp_ft.txt', 'w+') as mlp_ft, \
        open('silver_standard/rankings/scl_nma/knn_ft.txt', 'w+') as knn_ft, \
        open('silver_standard/rankings/scl_nma/svr_ft.txt', 'w+') as svr_ft:
    # File with reference scores to evaluate against. Alternatives to swap in:
    #   'silver_standard/scl_nma/SCL-NMA-single.txt'    # single words only
    #   'silver_standard/scl_nma/SCL-NMA-multiple.txt'  # multi word phrases
    #   'silver_standard/scl_nma/SCL-NMA.txt'           # overall
    testfilepath = 'silver_standard/scl_nma/SemEval2016-overall.txt' # overall

    # One section per embedding type; every entry is
    # (printed label, regressor, ranking output file, text printed after the result).
    evaluation_plan = [
        ('w2vec', 'w', [
            ('linear', linear_model_w, lin_w, '\n'),
            ('mlp', mlp_model_w, mlp_w, '\n'),
            ('knn', knn_model_w, knn_w, '\n'),
            ('svr', svr_model_w, svr_w, '\n\n'),
        ]),
        ('fasttext', 'ft', [
            ('linear', linear_model_ft, lin_ft, '\n'),
            ('mlp', mlp_model_ft, mlp_ft, '\n'),
            ('knn', knn_model_ft, knn_ft, '\n'),
            ('svr', svr_model_ft, svr_ft, '\n'),
        ]),
    ]

    for heading, embedding_kind, entries in evaluation_plan:
        print(heading, '\n')
        for label, regressor, ranking_file, trailer in entries:
            scores, phrases_and_scores = calculate_scores(regressor, testfilepath, embedding_kind)
            print(label, pred_kendall(scores, phrases_and_scores, ranking_file), trailer)



w2vec 

linear 
kendall: 0.5484872699097835	 pvalue: 0.0
spearman: 0.7524495245581392	pvalue: 0.0 

mlp 
kendall: 0.524574920942101	 pvalue: 0.0
spearman: 0.7232058536682534	pvalue: 0.0 

knn 
kendall: 0.3953533389464889	 pvalue: 8.318771747688187e-216
spearman: 0.5397130083620549	pvalue: 2.1192639751423615e-211 

svr 
kendall: 0.549138988421526	 pvalue: 0.0
spearman: 0.746536148761682	pvalue: 0.0 


fasttext 

linear 
kendall: 0.4214986920939037	 pvalue: 5.277350794893067e-245
spearman: 0.6040903897166524	pvalue: 4.4440540043726076e-278 

mlp 
kendall: 0.4455678416753048	 pvalue: 1.5633814014785723e-273
spearman: 0.6278847420270555	pvalue: 8.483362394529516e-307 

knn 
kendall: 0.4806778996174729	 pvalue: 0.0
spearman: 0.6671978858403246	pvalue: 0.0 

svr 
kendall: 0.4333210497673401	 pvalue: 8.079058769365877e-259
spearman: 0.6108247737819159	pvalue: 5.854223730161609e-286 



In [None]:
def calculate_score(infilepath, model, model_type: str = 'w'):
    """Predict a score for every line of a file that has no reference scores.

    Each line is right-stripped before it is handed to `process_phrase`, but
    the returned tuples carry the line exactly as read (including its line
    ending).

    Returns a list of (raw_line, predicted_score) in file order.
    """
    with open(infilepath, 'r') as file:
        return [
            (raw_line, process_phrase(raw_line.rstrip(), model, model_type))
            for raw_line in file
        ]


# Rank phrases that come without reference scores: every model scores the
# phrases of the input file and writes its own ranking file.
with open('silver_standard/rankings/bbc_news/lin_w.txt', 'w+') as lin_w, \
        open('silver_standard/rankings/bbc_news/mlp_w.txt', 'w+') as mlp_w, \
        open('silver_standard/rankings/bbc_news/knn_w.txt', 'w+') as knn_w, \
        open('silver_standard/rankings/bbc_news/svr_w.txt', 'w+') as svr_w, \
        open('silver_standard/rankings/bbc_news/lin_ft.txt', 'w+') as lin_ft, \
        open('silver_standard/rankings/bbc_news/mlp_ft.txt', 'w+') as mlp_ft, \
        open('silver_standard/rankings/bbc_news/knn_ft.txt', 'w+') as knn_ft, \
        open('silver_standard/rankings/bbc_news/svr_ft.txt', 'w+') as svr_ft:
    infilepath = 'silver_standard/np_extraction/nps_news_clean_300_examples.txt'

    # The three lists below are parallel: position i pairs a model with its
    # output file and with the embedding type it has to be fed.
    model_saves = [linear_model_w, mlp_model_w, knn_model_w, svr_model_w,
                   linear_model_ft, mlp_model_ft, knn_model_ft, svr_model_ft]
    save_paths = [lin_w, mlp_w, knn_w, svr_w, lin_ft, mlp_ft, knn_ft, svr_ft]
    model_types = ['w'] * 4 + ['ft'] * 4  # four *_w models first, then four *_ft models

    for model, savepath, model_type in zip(model_saves, save_paths, model_types):
        scores = calculate_score(infilepath, model, model_type)
        pred_write_only(scores, savepath)



In [10]:
# pickle.dump(linear_model_w, open('silver_standard/model_savefiles/linear_model_w.sav', 'wb'))
# pickle.dump(linear_model_ft, open('silver_standard/model_savefiles/linear_model_ft.sav', 'wb'))
# pickle.dump(mlp_model_ft, open('silver_standard/model_savefiles/mlp_model_ft.sav', 'wb'))
# pickle.dump(mlp_model_w, open('silver_standard/model_savefiles/mlp_model_w.sav', 'wb'))
# pickle.dump(knn_model_w, open('silver_standard/model_savefiles/knn_model_w.sav', 'wb'))
# pickle.dump(knn_model_ft, open('silver_standard/model_savefiles/knn_model_ft.sav', 'wb'))
# pickle.dump(svr_model_w, open('silver_standard/model_savefiles/svr_model_w.sav', 'wb'))
# pickle.dump(svr_model_ft, open('silver_standard/model_savefiles/svr_model_ft.sav', 'wb'))