In [1]:
import sent2vec
from nltk.tokenize import TweetTokenizer
import numpy as np
import random
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import re
from sklearn.metrics.pairwise import cosine_similarity as cos
from scipy.stats import pearsonr, spearmanr

In [2]:
# load sentence embedding model
# NOTE(review): presumably the pre-trained sent2vec Wikipedia-unigram model,
# judging by the file name -- confirm.
# The model file is addressed as '../models/wiki_unigrams.bin', i.e. relative
# to the directory the notebook is run from.
# model_wi_1 is the model later handed to STS_eval.
model_path = '../models/'
model_wi_1 = sent2vec.Sent2vecModel()
model_wi_1.load_model(model_path + 'wiki_unigrams.bin')

In [3]:
def sort_length_embedding(sents1, sents2, labels, model):
    """
    Clean and embed sentence pairs, ordered from the shortest pair to the longest.

    Punctuation is removed, every sentence is tokenized and lower-cased, and
    the pairs are ranked by the mean token count of their two sentences.

    sents1: array, first sentence of every pair
    sents2: array, second sentence of every pair, aligned with sents1
    labels: array, label of every sentence pair
    model: embedding model, must provide embed_sentences()
    return emb1: embedding vectors of the sorted first sentences
           emb2: embedding vectors of the sorted second sentences
           sorted_labels: labels in the same order as the embeddings
           length: mean token count of every pair, in ascending order
    """
    tokenizer = TweetTokenizer()

    def clean_tokens(text):
        # remove everything that is neither a word character nor whitespace,
        # then split what is left into tokens
        return tokenizer.tokenize(re.sub(r'[^\w\s]', '', text))

    cleaned1 = []
    cleaned2 = []
    mean_lengths = []
    for idx in range(len(sents1)):
        tokens1 = clean_tokens(sents1[idx])
        tokens2 = clean_tokens(sents2[idx])
        mean_lengths.append((len(tokens1) + len(tokens2)) / 2.0)
        cleaned1.append(' '.join(tokens1).lower())
        cleaned2.append(' '.join(tokens2).lower())

    # sorted() is stable, so pairs of equal length keep their input order
    order = sorted(range(len(mean_lengths)), key=mean_lengths.__getitem__)

    sorted_sents1 = [cleaned1[pos] for pos in order]
    sorted_sents2 = [cleaned2[pos] for pos in order]
    sorted_labels = [labels[pos] for pos in order]
    length = [mean_lengths[pos] for pos in order]

    emb1 = model.embed_sentences(sorted_sents1)
    emb2 = model.embed_sentences(sorted_sents2)

    return emb1, emb2, sorted_labels, length


In [46]:
def _similarity_correlations(emb_a, emb_b, gold):
    """
    Correlate per-pair cosine similarities with gold-standard scores.

    :param emb_a: embedding vectors of the first sentences
    :param emb_b: embedding vectors of the second sentences, aligned with emb_a
    :param gold: gold-standard scores, aligned with the embeddings
    :return: (pearson, spearman) coefficients; (nan, nan) when fewer than two
             pairs are given, because a correlation is undefined in that case
    """
    if len(gold) < 2:
        return float('nan'), float('nan')
    # cos() works on 2-D inputs, so each vector is wrapped into a one-row matrix
    sims = [cos([vec_a], [vec_b])[0][0] for vec_a, vec_b in zip(emb_a, emb_b)]
    return pearsonr(sims, gold)[0], spearmanr(sims, gold)[0]


# evaluate STS using cosine similarity
# and compare the results with the gold standard.
# sentsets: sentence datasets:
#           deft-forum, deft-news, headlines, images, OnWM, tweet-news
def STS_eval(sentset, model, boundaries=(81, 162, 227)):
    """
    Evaluate the cosine similarities of the sentence pairs of one dataset
    against the gold standard, separately for buckets of increasing
    sentence length.

    The pairs are embedded and ordered by sort_length_embedding, then cut into
    consecutive buckets at the positions given by `boundaries`.

    :param sentset: string, sentence dataset; the files
                    'sts-en-test-gs-2014/STS.input.<sentset>.txt' and
                    'sts-en-test-gs-2014/STS.gs.<sentset>.txt' are read
    :param model: sentence embedding model
    :param boundaries: ascending positions (in the length-sorted pair list)
                       at which a new bucket starts; k positions give k + 1
                       buckets, an empty sequence gives a single bucket.
                       The defaults are the cut points this notebook uses
                       for 'deft-news'.
    :return: pearsons, spearmanrs: two lists with one coefficient per bucket
             (nan for a bucket holding fewer than two pairs)
    :raises ValueError: if the number of sentence lines and gold-standard
                        scores differ
    """
    # 'with' closes each file even when reading or float parsing fails
    with open('sts-en-test-gs-2014/STS.input.'+sentset+'.txt') as sent_file:
        sent_data = sent_file.readlines()
    with open('sts-en-test-gs-2014/STS.gs.'+sentset+'.txt') as gs_file:
        gs_data = np.array(gs_file.readlines(), dtype=float)

    # a count mismatch would silently pair sentences with the wrong scores
    if len(sent_data) != len(gs_data):
        raise ValueError(
            'dataset %r: %d sentence lines but %d gold-standard scores'
            % (sentset, len(sent_data), len(gs_data)))

    # every input line holds the two sentences of a pair, separated by tab(s)
    splited_sent = np.array([re.split(r'\t+', line) for line in sent_data])
    sent_1 = splited_sent[:, 0]
    sent_2 = splited_sent[:, 1]
    x_1, x_2, y, _ = sort_length_embedding(sent_1, sent_2, gs_data, model)

    # [0, b1, ..., bk, None] -> slices [0:b1], [b1:b2], ..., [bk:]
    edges = [0] + list(boundaries) + [None]

    pearsons = []
    spearmanrs = []
    for start, stop in zip(edges[:-1], edges[1:]):
        pearson, spearman = _similarity_correlations(
            x_1[start:stop], x_2[start:stop], y[start:stop])
        pearsons.append(pearson)
        spearmanrs.append(spearman)

    return pearsons, spearmanrs
    
    

In [47]:
# Score model_wi_1 on the 'deft-news' dataset. STS_eval returns two lists
# (Pearson first, Spearman second) holding one correlation coefficient per
# sentence-length bucket; "gs" in the printed text stands for gold standard.
pearson_news, spearman_news = STS_eval('deft-news', model_wi_1)

print('spearman correlation with gs:',  spearman_news)
print('pearson correlation with gs:', pearson_news)

spearman correlation with gs: [0.7077972836577799, 0.721828545619389, 0.63255127702153, 0.29036327210484564]
pearson correlation with gs: [0.6940241048974188, 0.7504630506091284, 0.7369924567923407, 0.37552997030724206]
