In [11]:
from gensim.test.utils import common_texts
from gensim.models import Word2Vec
import gensim.downloader
import numpy as np
import string
import os
import pandas as pd

In [12]:
def get_sentence_vector(sentence, model):
    # Preprocess the sentence
    sentence = sentence.lower()
    words = sentence.split()
    # Remove punctuation
    words = [word.strip(string.punctuation) for word in words]
    vector = np.zeros(len(model[0]))
    count = 0
    for word in words:
        if word in model:
            vector += model[word]
            count += 1
    # Return the average of the vectors
    return vector / count

In [13]:
def build_all_vectors(train, dev, test, model):
    # Build the vectors for the training set
    train_vectors = np.zeros((len(train), len(model[0])))
    for i, sentence in enumerate(train):
        train_vectors[i] = get_sentence_vector(sentence, model)
    # Build the vectors for the dev set
    dev_vectors = np.zeros((len(dev), len(model[0])))
    for i, sentence in enumerate(dev):
        dev_vectors[i] = get_sentence_vector(sentence, model)
    # Build the vectors for the test set
    test_vectors = np.zeros((len(test), len(model[0])))
    for i, sentence in enumerate(test):
        test_vectors[i] = get_sentence_vector(sentence, model)
    return train_vectors, dev_vectors, test_vectors

In [14]:
def save_vectors(train_vectors, dev_vectors, test_vectors, train_name, dev_name, test_name, save_path):
    np.save(os.path.join(save_path, train_name), train_vectors)
    np.save(os.path.join(save_path, dev_name), dev_vectors)
    np.save(os.path.join(save_path, test_name), test_vectors)

## Loading Data

In [15]:
parent_dir = os.path.join(os.path.abspath(os.path.join(os.getcwd(), os.pardir)))
folder_path = 'Data\PreprocessedData'
net_path = os.path.join(parent_dir, folder_path)

In [16]:
df_train_english = pd.read_csv(os.path.join(net_path, 'train_preprocessed.csv'))
df_dev_english = pd.read_csv(os.path.join(net_path, 'val_preprocessed.csv'))
df_test_english = pd.read_csv(os.path.join(net_path, 'test_preprocessed.csv'))

In [17]:
train_english_sentences = df_train_english['preprocessed_text'].tolist()
dev_english_sentences = df_dev_english['preprocessed_text'].tolist()
test_english_sentences = df_test_english['preprocessed_text'].tolist()

### glove-twitter-25

In [18]:
glove_vectors = gensim.downloader.load('glove-twitter-25')

In [19]:
gt_25_train_english, gt_25_dev_english, gt_25_test_english = build_all_vectors(train_english_sentences, dev_english_sentences, test_english_sentences, glove_vectors)

  return vector / count


In [20]:
gt25_save_path = os.path.join(parent_dir, 'WordEmbeddings\PreComputedWordEmbeddings')

# Save the vectors
save_vectors(gt_25_train_english, gt_25_dev_english, gt_25_test_english, 'gt_25_train_english.npy', 'gt_25_dev_english.npy', 'gt_25_test_english.npy', gt25_save_path)

### fasttext-wiki-news-subwords-300

In [21]:
fasttext_300_vectors = gensim.downloader.load('fasttext-wiki-news-subwords-300')

In [22]:
ft_300_train_english, ft_300_dev_enlish, ft_300_test_english = build_all_vectors(train_english_sentences, dev_english_sentences, test_english_sentences, fasttext_300_vectors)

In [23]:
ft_300_save_path = os.path.join(parent_dir, 'WordEmbeddings\PreComputedWordEmbeddings')

# Save the vectors
save_vectors(ft_300_train_english, ft_300_dev_enlish, ft_300_test_english, 'ft_300_train_english.npy', 'ft_300_dev_english.npy', 'ft_300_test_english.npy', ft_300_save_path)

### word2vec-google-news-300

In [24]:
w2v_300 = gensim.downloader.load('word2vec-google-news-300')

In [25]:
w2v_300_train_english, w2v_300_dev_english, w2v_300_test_english = build_all_vectors(train_english_sentences, dev_english_sentences, test_english_sentences, w2v_300)

  return vector / count


In [26]:
w2v_save_path = os.path.join(parent_dir, 'WordEmbeddings\PreComputedWordEmbeddings')

# Save the vectors
save_vectors(w2v_300_train_english, w2v_300_dev_english, w2v_300_test_english, 'w2v_300_train_english.npy', 'w2v_300_dev_english.npy', 'w2v_300_test_english.npy', w2v_save_path)