In [24]:
!pip install --upgrade pip setuptools
!pip install --no-cache-dir gensim
!pip install numpy scipy

!pip install nltk


Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m379.8 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2023.10.3-cp310-cp310-macosx_10_9_x86_64.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40.9 kB[0m [31m320.4 kB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hCollecting tqdm (from nltk)
  Downloading tqdm-4.66.1-py3-none-any.whl.metadata (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m129.7 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading regex-2023.10.3-cp310-cp310-macosx_10_9_x86_64.whl (296 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m296.4/296.4 kB[0m [31m204.2 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading tqdm-4.66.1-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━

In [27]:
from gensim.models import KeyedVectors
import gensim.downloader

# Alternative: obtain the vectors through the gensim downloader instead of a local file.
#word2vec_model = gensim.downloader.load('word2vec-google-news-300')

# Load the vectors from a local file in binary word2vec format.
word2vec_model = KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin 2',
    binary=True,
)

In [55]:
import csv
from nltk.tokenize import word_tokenize
import nltk
# Fetch the 'punkt' tokenizer data only when it is not already installed, so
# re-running this cell does not need network access (the unconditional download
# fails with an SSL certificate error in this cell's recorded output).
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

#helper functions
def read_synonym_data(file_path):
    """Load a CSV file into a list of row mappings.

    Args:
        file_path: Path of the CSV file; its first line is used as the header.

    Returns:
        A list with one mapping per data row, keyed by the header names.
    """
    with open(file_path, 'r', newline='') as handle:
        return list(csv.DictReader(handle))

def find_best_synonym(word, answer_options, model, topn=1):
    """Pick the answer option that the model rates as most similar to ``word``.

    Args:
        word: Question word; looked up in ``model`` exactly as given.
        answer_options: Iterable of candidate strings. Each candidate is
            lower-cased before it is compared against ``word``.
        model: Embedding model supporting ``model[key]`` and
            ``model.similarity(a, b)``; both are expected to raise
            ``KeyError`` for out-of-vocabulary keys.
        topn: Currently unused; kept so existing callers keep working.

    Returns:
        The candidate (in its original casing) with the highest similarity,
        the earliest one winning ties. ``None`` when ``word`` is out of
        vocabulary or when no candidate could be scored.
    """
    # Probe the vocabulary: an unknown question word cannot be answered at all.
    try:
        model[word]
    except KeyError:
        return None

    scored = []
    for option in answer_options:
        try:
            scored.append((option, model.similarity(word, option.lower())))
        except KeyError:
            # Skip only the unknown candidate instead of discarding the whole
            # question; the remaining candidates can still be ranked.
            continue

    # Covers both an empty option list and all candidates being unknown.
    if not scored:
        return None

    # max() returns the first maximal element, matching a stable descending sort.
    return max(scored, key=lambda pair: pair[1])[0]
    
def generate_label(question_word, correct_answer, model_guess):
    """Classify the outcome of one synonym question.

    Args:
        question_word: The word that was asked about. Not needed to decide
            the label; kept so existing callers keep working.
        correct_answer: The expected synonym.
        model_guess: The option chosen by the model, or ``None`` when the
            model could not produce an answer.

    Returns:
        ``'guess'`` when the model produced no answer, ``'correct'`` when its
        answer equals ``correct_answer``, and ``'wrong'`` otherwise.
    """
    # Only a missing answer counts as a guess. The previous membership test
    # also labelled every incorrect answer as 'guess', so wrong answers were
    # never counted as valid attempts.
    if model_guess is None:
        return 'guess'
    if model_guess == correct_answer:
        return 'correct'
    return 'wrong'

def process_synonym_test_data(data, model):
    """Run every synonym question through the model and tally the outcomes.

    Args:
        data: Iterable of row mappings with the keys ``'question'``,
            ``'answer'`` and the option columns ``'0'`` to ``'3'``.
        model: Embedding model handed on to ``find_best_synonym``.

    Returns:
        A tuple ``(results, correct_count, valid_count)``. ``results`` holds
        one dict per question with the keys ``question_word``,
        ``correct_answer``, ``model_guess`` and ``label``. ``correct_count``
        is the number of rows labelled ``'correct'``; ``valid_count`` is the
        number of rows whose label is anything other than ``'guess'``.
    """
    option_columns = [str(index) for index in range(4)]  # options are in columns 0 to 3
    results = []

    for row in data:
        target = row['question']
        expected = row['answer']
        candidates = [row[column] for column in option_columns]

        predicted = find_best_synonym(target, candidates, model)

        results.append({
            'question_word': target,
            'correct_answer': expected,
            'model_guess': predicted,
            'label': generate_label(target, expected, predicted),
        })

    # Derive both tallies from the collected labels in one place.
    labels = [item['label'] for item in results]
    correct_count = labels.count('correct')
    valid_count = len(labels) - labels.count('guess')

    return results, correct_count, valid_count
    
def write_to_csv(results, file_name):
    """Write the per-question results to a CSV file with a header row.

    Args:
        results: Iterable of dicts carrying the keys ``question_word``,
            ``correct_answer``, ``model_guess`` and ``label``.
        file_name: Destination path; an existing file is overwritten.
    """
    columns = ['question_word', 'correct_answer', 'model_guess', 'label']
    with open(file_name, 'w', newline='') as out_file:
        sheet = csv.DictWriter(out_file, fieldnames=columns)
        sheet.writeheader()
        sheet.writerows(results)
    
    

[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:997)>


In [56]:
# Location of the synonym test set. This is an absolute, machine-specific
# path; the alternatives used on other machines are kept below.
synonym_test_file = '/Users/briannam/Downloads/A2-DataSet/synonym.csv'
#home computer: 'Users/briannam/Downloads/A2-Dataset/synonym.csv'
#uni computer: 'C:/Users/b_malpar/Downloads/A2-DataSet/synonym.csv'
synonym_test_data = read_synonym_data(synonym_test_file)

results, correct_count, valid_count = process_synonym_test_data(synonym_test_data, word2vec_model)

# Accuracy over the questions counted as valid; 0 when there are none,
# which also avoids a division by zero.
accuracy = correct_count / valid_count if valid_count > 0 else 0

# write results to csv file
write_to_csv(results, 'word2vec-google-news-300-details.csv')

model_name = 'word2vec-google-news-300'
# Take the vocabulary size from the loaded model instead of hard-coding it, so
# the analysis row stays correct if a different vector file is loaded.
# `key_to_index` is the gensim 4.x vocabulary mapping.
vocabulary_size = len(word2vec_model.key_to_index)

#write analysis to csv file
with open('analysis.csv', 'w', newline='') as csvfile:
    csv_info = ['model_name', 'vocab_size', 'C', 'V', 'accuracy']
    writer = csv.DictWriter(csvfile, fieldnames=csv_info)
    writer.writeheader()

    writer.writerow({
        'model_name': model_name,
        'vocab_size': vocabulary_size,
        'C': correct_count,
        'V': valid_count,
        'accuracy': accuracy
    })

