In [5]:
!pip install --upgrade pip setuptools
!pip install --no-cache-dir gensim
!pip install numpy scipy

!pip install nltk
!pip install fasttext

Collecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.8/68.8 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.11.1-py3-none-any.whl.metadata (9.5 kB)
Using cached pybind11-2.11.1-py3-none-any.whl (227 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25ldone
[?25h  Created wheel for fasttext: filename=fasttext-0.9.2-cp310-cp310-macosx_10_15_universal2.whl size=352899 sha256=a6d04ab93f0eaf67a3b9290f207bf3f0f831ec575fff7f1168716b944c45e18e
  Stored in directory: /Users/briannam/Library/Caches/pip/wheels/a5/13/75/f811c84a8ab36eedbaef977a6a58a98990e8e0f1967f98f394
Successfully built fasttext
Installing collected packages: pybind11, fasttext
Successfully installed fasttext-0.9.2 pybind11-2.11.1


In [14]:
from gensim.models import KeyedVectors
import gensim.downloader

# Locations of the pretrained word2vec binaries on the local machine.
gigaword500_path = '/Users/briannam/Downloads/11/gigaword500.bin'
engCoNLL17_path = '/Users/briannam/Downloads/40/model.bin'

# Gigaword model -- vocab size: 261794, vector size: 300
gigaword500_model = KeyedVectors.load_word2vec_format(
    gigaword500_path,
    binary=True,
)

# English CoNLL17 model -- vocab size: 4027169, vector size: 100
engCoNLL17_model = KeyedVectors.load_word2vec_format(
    engCoNLL17_path,
    binary=True,
)


In [29]:
import csv
import random
from nltk.tokenize import word_tokenize
import nltk
# Fetch the 'punkt' tokenizer data that nltk's word_tokenize relies on.
# NOTE(review): the captured output of this cell shows the download failing with an
# SSL certificate error, and none of the visible cells call word_tokenize --
# confirm whether this download is still required.
nltk.download('punkt')

#helper functions
def read_synonym_data(file_path):
    """Load a CSV file into a list of row dictionaries.

    The first line of the file is treated as the header; every following line
    becomes one dict keyed by those header names.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A list with one dict per data row, in file order.
    """
    with open(file_path, 'r', newline='') as source:
        rows = list(csv.DictReader(source))
    return rows

def find_best_synonym(word, answer_options, model, topn=1):
    """Pick the answer option most similar to ``word`` according to ``model``.

    Options that the model cannot score are skipped individually, so a single
    out-of-vocabulary distractor no longer throws away the whole question.

    Args:
        word: The question word; passed to the model exactly as given.
        answer_options: Iterable of candidate strings. Each candidate is
            lower-cased before it is handed to the model.
        model: Object exposing ``similarity(w1, w2)`` that raises ``KeyError``
            when a word cannot be looked up.
        topn: Unused; kept so callers that pass it keep working.

    Returns:
        The option (in its original casing) with the highest similarity; the
        earliest option wins ties. ``None`` when nothing could be scored,
        i.e. the question word is unknown, every option is unknown, or
        ``answer_options`` is empty.
    """
    scored = []
    for option in answer_options:
        try:
            score = model.similarity(word, option.lower())
        except KeyError:
            # Either `word` or this option is missing from the model.
            # If it is `word`, every iteration lands here and we return None.
            continue
        scored.append((option, score))

    if not scored:
        return None

    # max() returns the first maximal element, matching a stable descending sort.
    best_option, _ = max(scored, key=lambda pair: pair[1])
    return best_option
    
def generate_label(question_word, correct_answer, model_guess, model):
    """Classify one synonym-test outcome as "guess", "correct" or "wrong".

    An answer only counts as the model's own when the model knows both the
    question word and the guessed word. Otherwise the answer cannot have come
    from a similarity comparison and is labelled "guess", even if it happens
    to match the correct answer.

    Args:
        question_word: The word whose synonym was asked for.
        correct_answer: The expected synonym.
        model_guess: The answer that was given, or ``None`` if there is none.
        model: Object exposing a ``key_to_index`` container of known words.

    Returns:
        "guess" if ``model_guess`` is ``None`` or the model lacks the question
        word or the guessed word; "correct" if the guess equals
        ``correct_answer``; "wrong" otherwise.
    """
    vocabulary = model.key_to_index

    # find_best_synonym lower-cases options before querying the model, so the
    # vocabulary check on the guess uses the same normalisation.
    if (
        model_guess is None
        or question_word not in vocabulary
        or model_guess.lower() not in vocabulary
    ):
        return "guess"

    if model_guess == correct_answer:
        return "correct"
    return "wrong"


def process_synonym_test_data(data, model):
    """Run the synonym test over ``data`` and collect per-question results.

    For each entry the model's best option is obtained from
    ``find_best_synonym``. Only when that returns ``None`` (the model could
    not produce an answer) is a random option substituted, and that question
    is labelled "guess". A wrong answer from the model is kept as-is and
    labelled "wrong"; it is never replaced by a random pick.

    Args:
        data: Iterable of dicts with keys 'question', 'answer' and the four
            option columns '0', '1', '2', '3'.
        model: Word-embedding model passed through to ``find_best_synonym``
            and ``generate_label``.

    Returns:
        A tuple ``(results, correct_count, valid_count)`` where ``results`` is
        a list of dicts with keys 'question_word', 'correct_answer',
        'model_guess' and 'label'; ``correct_count`` is the number of
        "correct" labels; ``valid_count`` is the number of labels other than
        "guess".
    """
    correct_count = 0
    valid_count = 0
    results = []

    for entry in data:
        question_word = entry['question']
        correct_answer = entry['answer']
        guess_options = [entry[str(i)] for i in range(4)]  # Options are in columns 0 to 3

        model_guess = find_best_synonym(question_word, guess_options, model)

        if model_guess is None:
            # The model has no answer: fall back to a random option and
            # record it as a guess so it does not count towards accuracy.
            model_guess = random.choice(guess_options)
            label = 'guess'
        else:
            label = generate_label(question_word, correct_answer, model_guess, model)

        if label == 'correct':
            correct_count += 1
        if label != 'guess':
            valid_count += 1

        results.append({
            'question_word': question_word,
            'correct_answer': correct_answer,
            'model_guess': model_guess,
            'label': label
        })

    return results, correct_count, valid_count
    
def write_to_csv(results, file_name):
    """Write per-question result dicts to ``file_name`` as CSV.

    The file is created or overwritten and starts with a header row listing
    the columns question_word, correct_answer, model_guess and label.

    Args:
        results: Iterable of dicts holding exactly those four keys.
        file_name: Destination path of the CSV file.
    """
    columns = ['question_word', 'correct_answer', 'model_guess', 'label']
    with open(file_name, 'w', newline='') as out_file:
        sheet = csv.DictWriter(out_file, fieldnames=columns)
        sheet.writeheader()
        sheet.writerows(results)

[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:997)>


In [30]:
# Location of the synonym test set.
# home computer: 'Users/briannam/Downloads/A2-Dataset/synonym.csv'
# uni computer: 'C:/Users/b_malpar/Downloads/A2-DataSet/synonym.csv'
synonym_test_file = '/Users/briannam/Downloads/A2-DataSet/synonym.csv'
synonym_test_data = read_synonym_data(synonym_test_file)

# Run every question through the gigaword model.
results, correct_count, valid_count = process_synonym_test_data(
    synonym_test_data,
    gigaword500_model,
)

# Accuracy is C / V; fall back to 0 when no question was answerable.
if valid_count > 0:
    accuracy = correct_count / valid_count
else:
    accuracy = 0

# Per-question details.
write_to_csv(results, 'gigaword500_model-details.csv')

# One-row summary for this model.
model_name = 'gigaword500-5th-edition'
vocabulary_size = 261794
csv_info = ['model_name', 'vocab_size', 'C', 'V', 'accuracy']
analysis_row = dict(zip(
    csv_info,
    [model_name, vocabulary_size, correct_count, valid_count, accuracy],
))

with open('analysis_gigaword.csv', 'w', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=csv_info)
    writer.writeheader()
    writer.writerow(analysis_row)


In [31]:
# Run every question of the already-loaded test set through the CoNLL17 model.
results, correct_count, valid_count = process_synonym_test_data(
    synonym_test_data,
    engCoNLL17_model,
)

# Accuracy is C / V; fall back to 0 when no question was answerable.
if valid_count > 0:
    accuracy = correct_count / valid_count
else:
    accuracy = 0

# Per-question details.
write_to_csv(results, 'engCoNLL17_model-details.csv')

# One-row summary for this model.
model_name = 'English-CoNLL17'
vocabulary_size = 4027169
csv_info = ['model_name', 'vocab_size', 'C', 'V', 'accuracy']
analysis_row = dict(zip(
    csv_info,
    [model_name, vocabulary_size, correct_count, valid_count, accuracy],
))

with open('analysis_engCoNLL17.csv', 'w', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=csv_info)
    writer.writeheader()
    writer.writerow(analysis_row)