# Import Dependencies

In [1]:
import os
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.test.utils import get_tmpfile
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import download
import pandas as pd
import smart_open
import numpy as np



In [2]:
#download('stopwords')

# Load Data

In [3]:
os.chdir(r'C:\Users\aidan\OneDrive - University of Bath\1_Semester_2\Cm50175_dissertation_preparation\Data\consolidate\final_data')

In [4]:
abstracts_df = pd.read_csv('abstracts.csv')

before = abstracts_df.shape[0]
abstracts_df.dropna(subset=['description'], inplace=True)
abstracts_df.reset_index(drop=True, inplace=True)
after = abstracts_df.shape[0]

print(f'Dropped {before - after} rows due to missing abstracts.')

Dropped 60267 rows due to missing abstracts.


# Process Data

In [5]:
stop_words = stopwords.words('english')

In [6]:
def read_corpus(df, tokens_only=False):
    for i, line in df.iterrows():
        tokens = gensim.utils.simple_preprocess(line['description'])
        tokens = [token for token in tokens if token not in stop_words]
        if tokens_only:
            yield tokens
        else:
            yield gensim.models.doc2vec.TaggedDocument(tokens, [i])

In [7]:
train_corpus = list(read_corpus(abstracts_df))

In [8]:
len(train_corpus)

767968

# Train Model

In [9]:
vector_size = 10000  # hyperparameter - document embedding size
alpha = 0.025        # initial learning rate
min_alpha = 0.00025  # learning rate drops to this value as training progresses
min_count = 2        # ignores words in vocab with freq < value
epochs = 10          # reasonably low for computation
workers = 2          # number of processors on laptop

In [10]:
model = Doc2Vec(vector_size=vector_size,
               alpha=alpha,
               min_alpha=min_alpha,
               min_count=min_count,
               epochs=epochs,
               workers=workers)

In [11]:
model.build_vocab(train_corpus)

MemoryError: Unable to allocate 10.6 GiB for an array with shape (285363, 10000) and data type float32

In [None]:
% time model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

# Save Model

This automatically saves the document and word vectors as well.

In [None]:
fname = r'C:\Users\aidan\OneDrive - University of Bath\1_Semester_2\Cm50175_dissertation_preparation\Data\consolidate\embeddings\doc2vec'
os.chdir(fname)
model.save(f'my_d2v_{vector_size}.model')

In [None]:
loaded = np.load(fname + f'\my_d2v_{vector_size}.model' + '.dv.vectors.npy')

In [None]:
loaded.shape

In [None]:
abstracts_df.shape