In [33]:
!pip install --upgrade pip setuptools
!pip install --no-cache-dir gensim
!pip install numpy scipy

!pip install nltk
!pip install fasttext

!pip install requests
!pip install certifi



In [34]:
# WARNING: this does not configure a certificate file. It replaces the factory that
# the standard library uses to build its default HTTPS context with one that performs
# no certificate verification, so HTTPS requests made through that default context
# are no longer protected against impersonation of the remote server.
# NOTE(review): presumably a workaround for local certificate errors when downloading
# data; confirm, and prefer repairing the local CA bundle instead.
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

In [35]:
import gensim
from gensim import models
from gensim.models import Word2Vec

# Load word vectors stored in the binary word2vec format into `w`.
# The file name indicates the pretrained 300-dimensional GoogleNews vectors.
w = gensim.models.KeyedVectors.load_word2vec_format(
    "GoogleNews-vectors-negative300.bin.gz",
    binary=True,
)

In [36]:
import nltk
import ssl
from nltk.tokenize import sent_tokenize, word_tokenize
import requests
from bs4 import BeautifulSoup
import csv
import random

# WARNING: disables HTTPS certificate verification for the default context, which
# lets the NLTK download below proceed on machines with a broken CA bundle.
ssl._create_default_https_context = ssl._create_unverified_context

# Tokenizer data used by sent_tokenize / word_tokenize.
# 'punkt' is what older NLTK releases load; NLTK 3.9+ loads 'punkt_tab' instead.
# NLTK is installed unpinned, so fetch both to keep the notebook runnable on a
# fresh environment.
nltk.download('punkt')
nltk.download('punkt_tab')

#helper functions
def read_book_from_file(file_path):
    """Return the full text of a UTF-8 encoded file, or None on failure.

    Any exception raised while opening or reading the file is reported with
    ``print`` and swallowed; the caller receives None in that case.
    """
    try:
        with open(file_path, mode='r', encoding='utf-8') as book_file:
            content = book_file.read()
    except Exception as err:
        print(f"Error reading content from {file_path}: {err}")
        return None
    else:
        return content

def preprocess_text(text):
    """Split text into sentences and tokenize each one into lowercase words.

    Returns a list with one entry per sentence found by ``sent_tokenize``;
    each entry is the list of tokens ``word_tokenize`` produces for the
    lowercased sentence.
    """
    return [
        word_tokenize(raw_sentence.lower())
        for raw_sentence in sent_tokenize(text)
    ]

def read_synonym_data(file_path):
    """Load the synonym test set from a CSV file.

    The file is decoded as 'utf-8-sig' rather than with the platform default
    encoding, so the result is the same on every machine. A leading UTF-8
    byte-order mark (as written by some spreadsheet exports) is stripped
    instead of being glued onto the first column name.

    Args:
        file_path: Path of the CSV file; its first row is the header.

    Returns:
        A list with one dict per data row, keyed by the header names.
    """
    with open(file_path, 'r', newline='', encoding='utf-8-sig') as csvfile:
        return list(csv.DictReader(csvfile))

def find_best_synonym(word, answer_options, model, topn=1):
    """Pick the answer option whose vector is most similar to ``word``.

    Args:
        word: The question word, looked up exactly as given.
        answer_options: Candidate answers; each is lowercased for the lookup
            but returned in its original spelling.
        model: Either an object exposing its vectors as ``model.wv`` (e.g. a
            trained Word2Vec model) or the vectors object itself; it must
            provide ``similarity(a, b)`` that raises KeyError for unknown words.
        topn: Unused; kept so existing calls that pass it keep working.

    Returns:
        The option with the highest similarity (the earliest one on ties), or
        None when no option can be scored: the question word is unknown, every
        option is unknown, or ``answer_options`` is empty.
    """
    # Trained models keep their vectors under `.wv`; bare vector sets are used directly.
    vectors = getattr(model, 'wv', model)

    scored_options = []
    for option in answer_options:
        try:
            score = vectors.similarity(word, option.lower())
        except KeyError:
            # An unknown option must not discard the options that can be scored.
            # If `word` itself is unknown, every lookup lands here and nothing is scored.
            continue
        scored_options.append((option, score))

    if not scored_options:
        return None

    # max() returns the first maximal element, so ties resolve to the earliest option.
    best_option, _ = max(scored_options, key=lambda pair: pair[1])
    return best_option
    
def generate_label(question_word, correct_answer, model_guess, model):
    """Classify one synonym-test outcome as "correct", "wrong" or "guess".

    * "correct" - the guess equals the correct answer.
    * "wrong"   - the guess differs and either the correct answer equals the
                  question word, or both the guess and the question word are
                  keys of ``model.wv.key_to_index``.
    * "guess"   - the guess differs from the correct answer, the correct
                  answer differs from the question word, and the guess or the
                  question word is missing from ``model.wv.key_to_index``.
    """
    if model_guess == correct_answer:
        return "correct"

    if correct_answer == question_word:
        return "wrong"

    vocabulary = model.wv.key_to_index
    both_known = model_guess in vocabulary and question_word in vocabulary
    return "wrong" if both_known else "guess"


def process_synonym_test_data(data, model):
    """Run the synonym test over every question and label each outcome.

    Args:
        data: Iterable of dict rows with the keys 'question', 'answer' and the
            four option columns '0', '1', '2' and '3'.
        model: Passed through unchanged to ``find_best_synonym`` and
            ``generate_label``.

    Returns:
        A tuple ``(results, correct_count, valid_count)``:
        ``results`` is a list of dicts with the keys 'question_word',
        'correct_answer', 'model_guess' and 'label'; ``correct_count`` is the
        number of rows labelled 'correct'; ``valid_count`` is the number of
        rows whose label is not 'guess'.
    """
    correct_count = 0
    valid_count = 0
    results = []

    for entry in data:
        question_word = entry['question']
        correct_answer = entry['answer']
        guess_options = [entry[str(i)] for i in range(4)]  # Options are in columns 0 to 3

        model_guess = find_best_synonym(question_word, guess_options, model)

        if model_guess is None:
            # The model could not produce an answer: fall back to a random option and
            # label the row 'guess' so that a lucky pick is never counted as correct.
            model_guess = random.choice(guess_options)
            label = 'guess'
        else:
            # A real model answer is kept as-is (even when wrong) and labelled normally.
            label = generate_label(question_word, correct_answer, model_guess, model)

        if label == 'correct':
            correct_count += 1
        if label != 'guess':
            valid_count += 1

        results.append({
            'question_word': question_word,
            'correct_answer': correct_answer,
            'model_guess': model_guess,
            'label': label
        })

    return results, correct_count, valid_count
    
def write_to_csv(results, file_name):
    """Write the per-question result dicts to ``file_name`` as CSV.

    The file is created or overwritten; it gets a header row followed by one
    row per entry of ``results``, with the columns question_word,
    correct_answer, model_guess and label, in that order.
    """
    columns = ['question_word', 'correct_answer', 'model_guess', 'label']
    with open(file_name, 'w', newline='') as out_file:
        csv_writer = csv.DictWriter(out_file, fieldnames=columns)
        csv_writer.writeheader()
        csv_writer.writerows(results)


[nltk_data] Downloading package punkt to /Users/briannam/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [43]:
# Location of the synonym test set. NOTE(review): this is an absolute, machine-specific
# path; it must be edited before the notebook can run on another computer.
synonym_test_file = '/Users/briannam/Downloads/A2-DataSet/synonym.csv'
# Alternative location on the home computer: 'Users/briannam/Downloads/A2-Dataset/synonym.csv'
# Alternative location on the university computer: 'C:/Users/b_malpar/Downloads/A2-DataSet/synonym.csv'
synonym_test_data = read_synonym_data(synonym_test_file)

# Plain-text books that make up the training corpus (absolute local paths).
books = [
    '/Users/briannam/Downloads/pg58866-tmotl.txt', # The Murder on the Links
    '/Users/briannam/Downloads/pg2199-ti.txt', # The Iliad
    '/Users/briannam/Downloads/pg514-lw.txt', # Little Women
    '/Users/briannam/Downloads/pg161-ss.txt', # Sense and Sensibility
    '/Users/briannam/Downloads/pg1399-ak.txt', # Anna Karenina
    '/Users/briannam/Downloads/pg4078-tpodg.txt' # The Picture of Dorian Gray
]

# Tokenized sentences of all books combined; this is the Word2Vec training input.
all_sentences = []
# Read each book from disk and append its tokenized sentences to the corpus.
for book in books:
    book_content = read_book_from_file(book)  # falsy result (e.g. unreadable or empty file) is skipped below
    if book_content:
        tokenized_sentences = preprocess_text(book_content)
        all_sentences.extend(tokenized_sentences)
    
# Window sizes and embedding sizes to experiment with; every combination is trained.
window_sizes = [5, 10]  # W5 and W10
embedding_sizes = [100, 300]  # E100 and E300

# One summary dict per trained model is appended to this list by the training loop.
analysis_results = []

In [44]:
# Train one Word2Vec model per (window size, embedding size) combination, run the
# synonym test on it, save the per-question details and collect a summary row.
for window_size in window_sizes:
    for embedding_size in embedding_sizes:

        # min_count=1 keeps every token of the corpus in the vocabulary.
        our_model = Word2Vec(sentences=all_sentences, vector_size=embedding_size, window=window_size, min_count=1, workers=4)

        # Perform synonym test and get results
        results, correct_count, valid_count = process_synonym_test_data(synonym_test_data, our_model)

        # Save results to <model name>-details.csv
        model_name = f'word2vec_model_w{window_size}_e{embedding_size}'
        details_file = f'{model_name}-details.csv'
        write_to_csv(results, details_file)

        # Append analysis results
        analysis_results.append({
            'model_name': model_name,
            # Vocabulary size must come from the model trained in this iteration.
            'vocab_size': len(our_model.wv),
            'C': correct_count,
            'V': valid_count,
            # Accuracy over the questions the model actually answered; 0 when there are none.
            'accuracy': correct_count / valid_count if valid_count > 0 else 0
        })

# Write one summary row per trained model.
with open('analysis_ownmodel.csv', 'w', newline='') as csvfile:
    csv_info = ['model_name', 'vocab_size', 'C', 'V', 'accuracy']
    writer = csv.DictWriter(csvfile, fieldnames=csv_info)
    writer.writeheader()
    for result in analysis_results:
        writer.writerow(result)