## Setup

We recommend running this notebook on a GPU where possible. The following cells mount Google Drive and locate the submission folder so that the relevant data sources can be accessed.

In [0]:
from google.colab import drive # import drive from google colab

In [0]:
ROOT = "/content/drive"     # mount point used for Google Drive in this Colab session
print(ROOT)                 # echo the mount point (optional)

drive.mount(ROOT, force_remount=True)           # mount Google Drive at ROOT, requesting a forced remount

/content/drive
Mounted at /content/drive


In [0]:
# This is necessary to ensure that paths are correct for importing data from the google drive folder
# Feel free to edit this to the actual path that our WAKU submission folder lives

%cd "/content/drive"
WAKU_DIR = !find '/content/drive' -type d -iname "team13_waku_submission" -print -quit
WAKU_DIR = WAKU_DIR[0]
print(WAKU_DIR)

/content/drive


IndexError: ignored

In [0]:
# Make the submission folder the working directory so that relative paths used
# later (e.g. the model checkpoint file name) resolve against it.
%cd $WAKU_DIR

In [0]:
import numpy as np
import pickle
import torch
# One seed shared by every random number generator, so the three calls below
# cannot drift apart when the value is changed.
SEED = 100
torch.manual_seed(SEED)         # PyTorch default generator
torch.cuda.manual_seed(SEED)    # PyTorch CUDA generator
np.random.seed(SEED)            # NumPy legacy global generator

from waku.word_analogies.base import get_wordanalogy_scores
from waku.word_intrusion.base import evaluate as word_intrusion_experiment
from waku.word_similarity.base import load_sim_data, evaluate as ws_evaluate
from waku.sentiment_analysis.trainer import evaluate as sa_evaluate

# Use the GPU when PyTorch reports CUDA as available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Device used: ", device)

## Experiment Code

In [0]:
def sentiment_analysis_experiment(embedding_dict, embedding_weights):
    """Run the sentiment-analysis evaluation and return its result.

    Thin wrapper around ``sa_evaluate`` (imported from
    ``waku.sentiment_analysis.trainer``) that supplies the data directory under
    ``WAKU_DIR`` and the checkpoint file name.

    Args:
        embedding_dict: Forwarded to ``sa_evaluate`` as its third argument
            (this notebook passes the ``word2index`` mapping).
        embedding_weights: Forwarded to ``sa_evaluate`` as its first argument
            (this notebook passes the loaded embedding matrix).

    Returns:
        Whatever ``sa_evaluate`` returns; treated here as the test accuracy.
    """
    # Relative file name, so it is resolved against the current working directory.
    model_checkpoint = 'checkpoint.pth.tar'
    data_dir = WAKU_DIR + '/raw_data/'

    return sa_evaluate(embedding_weights, data_dir, embedding_dict, model_checkpoint)

In [0]:
def word_similarity_experiment(embedding_dict, embedding_weights):
    """Run the word-similarity evaluation on SimLex999 and return its score.

    Loads the similarity data from ``raw_data/SimLex999.txt`` under ``WAKU_DIR``
    with ``load_sim_data`` and passes it to ``ws_evaluate`` with ``verbose=False``.

    Args:
        embedding_dict: Forwarded to ``ws_evaluate`` as its third argument
            (this notebook passes the ``word2index`` mapping).
        embedding_weights: Forwarded to ``ws_evaluate`` as its first argument.

    Returns:
        The first element of the pair returned by ``ws_evaluate``.
    """
    simlex_data = load_sim_data(WAKU_DIR + '/raw_data/SimLex999.txt', 'SimLex999')

    # ws_evaluate yields two values; only the first one is reported.
    score, _unused = ws_evaluate(
        embedding_weights,
        simlex_data,
        embedding_dict,
        verbose=False,
    )
    return score

In [0]:
def word_analogy_experiment(embedding_dict, embedding_weights):
    """Run the word-analogy evaluation and return the overall accuracy.

    Calls ``get_wordanalogy_scores`` on ``raw_data/questions-words.txt`` under
    ``WAKU_DIR`` with ``verbose=False``.

    Args:
        embedding_dict: Forwarded as the second argument
            (this notebook passes the ``word2index`` mapping).
        embedding_weights: Forwarded as the third argument.

    Returns:
        The fourth of the five values returned by ``get_wordanalogy_scores``
        (named ``total_accuracy`` by the original author).
    """
    questions_file = WAKU_DIR + '/raw_data/questions-words.txt'

    # Per the original author's note on this flag: True means the embeddings are
    # for lowercase words, False means they are for uppercased words.
    lowercase_embeddings = True

    _scores, _n_correct, _n_found, accuracy, _elapsed = get_wordanalogy_scores(
        questions_file,
        embedding_dict,
        embedding_weights,
        lowercase_embeddings,
        verbose=False,
    )
    return accuracy

In [0]:
def run_tasks(embedding_path, embedding_dict):
    """Load an embedding matrix and print its score on each evaluation task.

    Runs, in order: word intrusion, sentiment analysis, word similarity and
    word analogy, printing one score line per task.

    Args:
        embedding_path: Path to an ``.npz`` archive whose array stored under the
            key ``'a'`` is the embedding matrix.
        embedding_dict: Passed through to the sentiment, similarity and analogy
            experiments (this notebook passes the ``word2index`` mapping).

    Returns:
        None. The scores are only printed.
    """
    # np.load on an .npz archive returns an NpzFile that keeps the file open;
    # the context manager closes it once the array has been read into memory.
    with np.load(embedding_path) as archive:
        embedding_weights = archive['a']

    # intrusion
    intrusion_score = word_intrusion_experiment(embedding_weights, k=5, N=10, acc_filepath=None, verbose=False)
    print("Intrusion Score: ", intrusion_score)

    # sentiment analysis
    sa_score = sentiment_analysis_experiment(embedding_dict, embedding_weights)
    print("Sentiment Score: ", sa_score)

    # similarity
    sim_score = word_similarity_experiment(embedding_dict, embedding_weights)
    print("Similarity Score: ", sim_score)

    # word analogy
    wa_score = word_analogy_experiment(embedding_dict, embedding_weights)
    print("Analogy score: ", wa_score)

    return

In [0]:
# read a pickle file and return the object stored in it
def load_pickle(filepath):
    """Deserialize and return the object stored in a pickle file.

    Args:
        filepath: Path to the pickle file. This notebook uses a ``.pickle``
            file, but the extension is not checked.

    Returns:
        The unpickled object (this notebook uses it for the word2index dict).

    Note:
        ``pickle.load`` can execute arbitrary code while deserializing; only
        call this on files from a trusted source.
    """
    # `with` guarantees the handle is closed, even if unpickling raises.
    with open(filepath, "rb") as pickle_in:
        return pickle.load(pickle_in)

# Load the word2index dictionary from the submission's embeddings folder using
# the load_pickle helper; it is passed to run_tasks as the embedding dictionary.
dict_path = WAKU_DIR  + '/embeddings/word2index.pickle'
word2index = load_pickle(dict_path)

## Run Experiments

We evaluate our Hoyer-Square embeddings with $\lambda=10$ at $90\%$ sparsity. The values here differ slightly from the results presented in the report. This is because of the state of the random number generator: for the report we evaluated the models sequentially, so each evaluation started from a different random state than the freshly seeded one used in this notebook.

In [0]:
# Path to the Hoyer-Square (lambda=10) embeddings archive (.npz) inside the submission folder.
# NOTE(review): the file name presumably encodes dimension 300, lambda 10.0, 90% sparsity - confirm.
hoyer_10_embedding_path = WAKU_DIR + '/embeddings/hoyer_10.0/300_10.0_90_embeddings.npz'

In [0]:
# run tasks for Hoyer-Square embeddings with lambda=10 at 90% sparsity
# (run_tasks prints one score line per task and returns nothing)
run_tasks(hoyer_10_embedding_path, word2index)