In [None]:
from google.colab import drive # import drive from google colab
from os.path import join  

In [None]:
# Mount Google Drive into the Colab runtime; later cells read their data from
# paths underneath this mount point.
ROOT = "/content/drive"     # mount point used for Google Drive
print(ROOT)                 # echo the mount point (optional sanity check)

drive.mount(ROOT)           # mount Google Drive at ROOT (/content/drive)

In [None]:
%cd '/content/drive'
directory_name = !find . -type d -name "ucl-nlp-finalproject"
directory_name = directory_name[0]
print(directory_name)

In [None]:
# Change the working directory to the path stored in `directory_name`.
%cd $directory_name

In [None]:
import sys
sys.path.append('..')
import numpy as np
import pickle
import waku
import time
from datetime import datetime
import re
from os import path

from waku.tasks.sentiment_analysis.sentiment_analysis_downstream import SST, Extrinsic_Sentiment_Analysis
from waku.tasks.word_intrusion.word_intrusion import top_ten_set, dist_ratio
from waku.tasks.word_similarity.word_sim import loadSimilarlityData, evaluate

In [None]:
# read pickle file into dictionary
def load_pickle(filepath):
    """Load and return the object stored in a pickle file.

    Args:
        filepath: Path of the pickle file to read. The original author notes
            that it should end with ``.pickle``; this is not enforced.

    Returns:
        The unpickled object (this notebook uses it as the embedding
        dictionary).

    Raises:
        OSError: If the file cannot be opened.
    """
    # NOTE: pickle.load can execute arbitrary code while deserialising, so
    # only point this at files from a trusted source.
    # The context manager guarantees the handle is closed, even on error.
    with open(filepath, "rb") as pickle_in:
        return pickle.load(pickle_in)

# --- Locations on the mounted drive ---------------------------------------
# Pickled dictionary for the embeddings (presumably word -> index, going by
# the file name), the human-annotated similarity pairs, and the folder that
# experiment results are saved into.
dict_path = "/content/drive/My Drive/UCL_ML/NLP Class/word2vec_data/word2index.pickle"
annotated_pairs_path = "/content/drive/My Drive/UCL_ML/NLP Class/word_sim_val/human_sim.txt"
EXPERIMENT_FOLDER_PATH = "/content/drive/My Drive/UCL_ML/NLP Class/kush_results"

# --- Inputs shared by the experiments --------------------------------------
# Dictionary is read with the pickle helper; similarity pairs with the
# project's own loader.
embedding_dict = load_pickle(dict_path)
pairs_data = loadSimilarlityData(annotated_pairs_path)

Sentiment analysis experiment:

In [None]:
def sentiment_analysis(embedding_dict, embedding_weights):
    """Train and test the downstream sentiment classifier for one embedding set.

    Args:
        embedding_dict: Dictionary handed to ``Extrinsic_Sentiment_Analysis``
            together with the weights.
        embedding_weights: Embedding weights to evaluate.

    Returns:
        The ``"test set"`` entry of the experiment's ``bestAccuracy`` mapping.
    """
    # Instantiate a *SST* to load Stanford Sentiment Treebank train/test/val data
    sst_data = SST()

    # Load an instance of the *Extrinsic_Sentiment_Analysis* class with the given
    # dictionary and weights, at which point the vocabulary is reduced to the
    # words present in SST
    experiment = Extrinsic_Sentiment_Analysis(sst_data, embedding_dict, embedding_weights)

    # Train the LSTM model with a fixed set of hyper-parameters
    experiment.train(
        epochs=50,
        learning_rate=0.001,
        batch_size=512,
        hidden_size=300,
        rnn_layers=2,
        mlp_layer_widths=100,
    )

    # Calculate accuracy on the test set and save it. The original call left
    # `file_path=` without a value (a syntax error); the shared results folder
    # is used here, matching the word intrusion experiment.
    experiment.test(print_accuracies=True, save_test_acc=True, file_path=EXPERIMENT_FOLDER_PATH)

    # Reset the data class before returning
    sst_data.reset()

    return experiment.bestAccuracy["test set"]

Word intrusion experiment:

In [None]:
def word_intrusion_experiment(embedding_weights):
    """Run the word intrusion evaluation on one set of embedding weights.

    Args:
        embedding_weights: Embedding weights to evaluate.

    Returns:
        The ``'mean'`` entry of the result returned by ``dist_ratio``.
    """
    # Build the top-ten set from the weights, then score it with dist_ratio.
    # The positional 5 and 10 are dist_ratio settings whose meaning is defined
    # in waku.tasks.word_intrusion (not visible here).
    neighbour_sets = top_ten_set(embedding_weights)
    intrusion_stats = dist_ratio(
        embedding_weights,
        neighbour_sets,
        5,
        10,
        print_result=True,
        save_acc=True,
        file_path=EXPERIMENT_FOLDER_PATH,
    )

    return intrusion_stats['mean']

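Word similarity experiment (the cell below also runs the sentiment analysis and word intrusion experiments for each embedding file):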

In [None]:
# Run all three evaluations for every embedding file and record, per file,
# each task's score and its wall-clock time.
#
# Column layout of `scores` (one row per entry of `percents`):
#   0: percent
#   1: sentiment analysis score      2: sentiment analysis seconds
#   3: word intrusion score          4: word intrusion seconds
#   5: word similarity score         6: word similarity seconds
def _timed(fn, *args, **kwargs):
    """Call ``fn(*args, **kwargs)`` and return ``(result, elapsed_seconds)``."""
    tic = time.perf_counter()
    result = fn(*args, **kwargs)
    return result, time.perf_counter() - tic


percents = [10, 30, 50, 70, 90]
# np.array((5, 7)) would create the 1-D array [5, 7]; allocate a real
# (len(percents) x 7) table to hold the results instead.
scores = np.zeros((len(percents), 7))
for i, percent in enumerate(percents):
    scores[i, 0] = percent

    # NOTE(review): the file name is built as '.../word2vec_data/300hs_300_<p>percent_embeddings.npz';
    # confirm that '300' is part of the file name and not a missing sub-folder.
    weights_filepath = '/content/drive/My Drive/UCL_ML/NLP Class/word2vec_data/300'+'hs_300_'+str(percent)+'percent_embeddings.npz'
    # Close the .npz archive once the array stored under key 'a' has been read.
    with np.load(weights_filepath) as archive:
        embedding_weights = archive['a']

    # sentiment analysis
    sa_score, sa_seconds = _timed(sentiment_analysis, embedding_dict, embedding_weights)
    scores[i, 1] = sa_score
    scores[i, 2] = sa_seconds

    # intrusion
    intrusion_score, intrusion_seconds = _timed(word_intrusion_experiment, embedding_weights)
    scores[i, 3] = intrusion_score
    scores[i, 4] = intrusion_seconds

    # similarity (evaluate returns a pair; only its first element is recorded)
    (similarity_score, _unused), similarity_seconds = _timed(
        evaluate, dict_path, weights_filepath, pairs_data, 'test', verbose=False
    )
    scores[i, 5] = similarity_score
    scores[i, 6] = similarity_seconds

# Join with a path separator so the file lands inside the results folder;
# plain string concatenation wrote '...kush_resultshoyer_300_sparse_scores.npy'
# next to it. np.save appends the '.npy' extension.
np.save(path.join(EXPERIMENT_FOLDER_PATH, 'hoyer_300_sparse_scores'), scores)