In [None]:
# Mount Google Drive so the benchmark files stored there are reachable from the Colab VM.
from google.colab import drive
drive.mount('/content/drive')
# Work from the experiment folder: the scoring functions below open their data files by relative path.
%cd /content/drive/MyDrive/Experiments/check-standard-models
# List the folder contents as a quick check that the gold-standard files are present.
!ls

Mounted at /content/drive
/content/drive/MyDrive/Experiments/check-standard-models
combined.csv			      wordsim_similarity_goldstandard.txt
wordsim_relatedness_goldstandard.txt


In [None]:
import csv
import math
import os
from statistics import mean

import gensim.downloader as api
from scipy.stats import spearmanr

# Models

**Word2Vec Google News**: 'word2vec-google-news-300' is a pre-trained Word2Vec model released by Google. It was trained on part of the Google News dataset (about 100 billion words) and contains vectors for approximately 3 million words and phrases. Such a large volume of training data enables the model to capture many semantic relationships between words.

Word vectors are 300-dimensional.

In [None]:
# Load the pre-trained 'word2vec-google-news-300' vectors through gensim's downloader API.
google_model = api.load('word2vec-google-news-300')



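**FastText:** Developed by Facebook's AI Research lab (FAIR). The 'fasttext-wiki-news-subwords-300' vectors were trained on Wikipedia 2017, the UMBC webbase corpus, and the statmt.org news dataset. FastText is distinctive because it represents each word through its character n-grams (subword information) rather than only as a whole word, which in principle lets a full FastText model build vectors for out-of-vocabulary (OOV) words. Note that in this notebook the vectors are looked up by whole word, and a missing word is handled as OOV by the scoring functions.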

In [None]:
# Load the pre-trained 'fasttext-wiki-news-subwords-300' vectors through gensim's downloader API.
fasttext_model = api.load('fasttext-wiki-news-subwords-300')



**ConceptNet Numberbatch:** Numberbatch embeddings combine information from multiple sources, including ConceptNet, word2vec, GloVe, and OpenSubtitles 2016, to create word embeddings that have been demonstrated to perform strongly in bias evaluation and similarity tasks.

In [None]:
# Load the pre-trained 'conceptnet-numberbatch-17-06-300' vectors through gensim's downloader API.
# The scoring functions look words up in this model with a "/c/en/" prefix (see specify_lang).
numberbatch_model = api.load('conceptnet-numberbatch-17-06-300')



**GloVe:** Global Vectors for Word Representation. These embeddings were trained on various corpora (Wikipedia 2014 + Gigaword 5, Common Crawl, and Twitter) by the Stanford NLP Group; this notebook uses the 300-dimensional Wikipedia 2014 + Gigaword 5 variant. GloVe is a global log-bilinear regression model that combines the advantages of the two major model families in the literature: global matrix factorization and local context window methods.

In [None]:
# Load the pre-trained 'glove-wiki-gigaword-300' vectors through gensim's downloader API.
glove_model = api.load('glove-wiki-gigaword-300')



In [None]:
# Display name -> loaded embedding model. The names are used both as report
# labels and to decide whether lookups need the ConceptNet language prefix.
models = {
    "Word2Vec-Google-News": google_model,
    "FastText": fasttext_model,
    "ConceptNet-Numberbatch": numberbatch_model,
    "GloVe": glove_model,
}

# Functions

In [None]:
def cosine_similarity(v1, v2):
    """Return the cosine of the angle between two equal-length numeric vectors.

    Args:
        v1: Sequence of numbers (anything supporting ``len()`` and iteration).
        v2: Sequence of numbers with the same length as ``v1``.

    Returns:
        ``dot(v1, v2) / sqrt(dot(v1, v1) * dot(v2, v2))``, or ``0.0`` when
        either vector has zero norm (the angle is undefined in that case).

    Raises:
        ValueError: If the two vectors differ in length.
    """
    # Reject mismatched lengths explicitly instead of silently ignoring the
    # tail of the longer vector or failing with an unrelated IndexError.
    if len(v1) != len(v2):
        raise ValueError(
            "vectors must have the same length (got %d and %d)" % (len(v1), len(v2))
        )

    sumxx = sum(x * x for x in v1)
    sumyy = sum(y * y for y in v2)
    sumxy = sum(x * y for x, y in zip(v1, v2))

    denominator = math.sqrt(sumxx * sumyy)
    # A zero-norm vector would otherwise cause a division by zero.
    if denominator == 0:
        return 0.0
    return sumxy / denominator

In [None]:
def scorefunction1(embed, specify_lang):
  """Correlate embedding similarities with the human ratings in 'combined.csv'.

  The file is read from the current working directory. Its first row is
  treated as a header and skipped; every other row is read as
  ``word1, word2, human_score``.

  Args:
    embed: Mapping-like embedding model; ``embed[word]`` must return the word
      vector and raise ``KeyError`` for unknown words.
    specify_lang: When true, each word is prefixed with "/c/en/" before lookup.

  Returns:
    The Spearman rank correlation between the human scores and the cosine
    similarities of the word pairs.
  """
  consim = []
  humansim = []

  # The csv module expects files opened with newline='' so that it can handle
  # line endings (including newlines inside quoted fields) itself.
  with open('combined.csv', newline='') as csvfile:
    filein = csv.reader(csvfile)
    next(filein, None)  # skip the header row (no-op on an empty file)

    for eles in filein:
      if not eles:
        # csv.reader yields [] for blank lines; there is nothing to score.
        continue

      word1 = eles[0]
      word2 = eles[1]
      # Affine rescale of the rating (0..10 maps to -1..1). Spearman is
      # rank-based, so this does not change the returned correlation.
      humansim.append(float(eles[2]) / 10 * 2 - 1)

      if specify_lang:
        word1 = "/c/en/" + word1
        word2 = "/c/en/" + word2

      try:
        score = cosine_similarity(embed[word1], embed[word2])
      except KeyError:
        # Out-of-vocabulary pair: keep the pair and assign the lowest possible
        # cosine so that missing words count against the model.
        score = -1
      consim.append(score)

  cor1, _ = spearmanr(humansim, consim)

  return cor1

In [None]:
def scorefunction2(embed, specify_lang):
  """Correlate embedding similarities with 'wordsim_similarity_goldstandard.txt'.

  The file is read from the current working directory. Each non-blank line is
  split on whitespace and read as ``word1 word2 human_score``.

  Args:
    embed: Mapping-like embedding model; ``embed[word]`` must return the word
      vector and raise ``KeyError`` for unknown words.
    specify_lang: When true, each word is prefixed with "/c/en/" before lookup.

  Returns:
    The Spearman rank correlation between the human scores and the cosine
    similarities of the word pairs.
  """
  consim = []
  humansim = []

  # Use a context manager so the file handle is closed deterministically.
  with open('wordsim_similarity_goldstandard.txt', 'r') as goldfile:
    for line in goldfile:
      eles = line.strip().split()
      if not eles:
        # Tolerate blank lines (e.g. a trailing empty line) instead of crashing.
        continue

      word1 = eles[0]
      word2 = eles[1]
      # Affine rescale of the rating (0..10 maps to -1..1). Spearman is
      # rank-based, so this does not change the returned correlation.
      humansim.append(float(eles[2]) / 10 * 2 - 1)

      if specify_lang:
        word1 = "/c/en/" + word1
        word2 = "/c/en/" + word2

      try:
        score = cosine_similarity(embed[word1], embed[word2])
      except KeyError:
        # Out-of-vocabulary pair: keep the pair and assign the lowest possible
        # cosine so that missing words count against the model.
        score = -1
      consim.append(score)

  cor2, _ = spearmanr(humansim, consim)

  return cor2

In [None]:
def scorefunction3(embed, specify_lang):
  """Correlate embedding similarities with 'wordsim_relatedness_goldstandard.txt'.

  The file is read from the current working directory. Each non-blank line is
  split on whitespace and read as ``word1 word2 human_score``.

  Args:
    embed: Mapping-like embedding model; ``embed[word]`` must return the word
      vector and raise ``KeyError`` for unknown words.
    specify_lang: When true, each word is prefixed with "/c/en/" before lookup.

  Returns:
    The Spearman rank correlation between the human scores and the cosine
    similarities of the word pairs.
  """
  consim = []
  humansim = []

  # Use a context manager so the file handle is closed deterministically.
  with open('wordsim_relatedness_goldstandard.txt', 'r') as goldfile:
    for line in goldfile:
      eles = line.strip().split()
      if not eles:
        # Tolerate blank lines (e.g. a trailing empty line) instead of crashing.
        continue

      word1 = eles[0]
      word2 = eles[1]
      # Affine rescale of the rating (0..10 maps to -1..1). Spearman is
      # rank-based, so this does not change the returned correlation.
      humansim.append(float(eles[2]) / 10 * 2 - 1)

      if specify_lang:
        word1 = "/c/en/" + word1
        word2 = "/c/en/" + word2

      try:
        score = cosine_similarity(embed[word1], embed[word2])
      except KeyError:
        # Out-of-vocabulary pair: keep the pair and assign the lowest possible
        # cosine so that missing words count against the model.
        score = -1
      consim.append(score)

  cor3, _ = spearmanr(humansim, consim)

  return cor3

In [None]:
def get_scores(word_embeddings, model_name):
  """Run all three benchmark scorers on one embedding model.

  Args:
    word_embeddings: Embedding model handed to each scoring function.
    model_name: Display name of the model; "ConceptNet-Numberbatch" switches
      on the language-prefixed lookup in the scorers.

  Returns:
    A 4-tuple: the three correlations (scorefunction1, scorefunction2,
    scorefunction3, in that order) followed by their arithmetic mean.
  """
  needs_prefix = (model_name == "ConceptNet-Numberbatch")

  scorers = (scorefunction1, scorefunction2, scorefunction3)
  correlations = [scorer(word_embeddings, needs_prefix) for scorer in scorers]

  return (*correlations, mean(correlations))

# Testing

In [None]:
# Score every model on the three benchmarks and print one report per model.
for model_name, embeddings in models.items():

  sp1, sp2, sp3, avg = get_scores(embeddings, model_name)

  report_lines = [
    "Model: %s" % (model_name),
    "Correlation with Wordsim353: %.4f" % (sp1),
    "Correlation with Wordsim Similarity Goldstandard: %.4f" % (sp2),
    "Correlation with Wordsim Relatedness Goldstandard: %.4f" % (sp3),
    "Average of all: %.4f" % (avg),
    "----------",
  ]
  print("\n".join(report_lines))


Model: Word2Vec-Google-News
Correlation with Wordsim353: 0.7000
Correlation with Wordsim Similarity Goldstandard: 0.7717
Correlation with Wordsim Relatedness Goldstandard: 0.6355
Average of all: 0.7024
----------
Model: FastText
Correlation with Wordsim353: 0.6943
Correlation with Wordsim Similarity Goldstandard: 0.8235
Correlation with Wordsim Relatedness Goldstandard: 0.6206
Average of all: 0.7128
----------
Model: ConceptNet-Numberbatch
Correlation with Wordsim353: 0.7040
Correlation with Wordsim Similarity Goldstandard: 0.8015
Correlation with Wordsim Relatedness Goldstandard: 0.6109
Average of all: 0.7054
----------
Model: GloVe
Correlation with Wordsim353: 0.5109
Correlation with Wordsim Similarity Goldstandard: 0.6387
Correlation with Wordsim Relatedness Goldstandard: 0.4358
Average of all: 0.5285
----------


# Save models

In [None]:
# The directory listing at the top of the notebook shows no "models" folder,
# and saving into a missing directory fails, so create it first (idempotent).
os.makedirs("models", exist_ok=True)

# Persist each model under its display name, with spaces replaced by dashes.
for model_name, model in models.items():
  model.save("models/%s.model" % (model_name.replace(" ", "-")))

# Disconnect from the runtime

In [None]:
# Release the Colab runtime now that the notebook's work is finished.
# NOTE(review): anything not saved to Drive is presumably lost after this call — confirm before relying on local files.
from google.colab import runtime
runtime.unassign()