## This code is used to:
Fine-tune a pretrained word2vec model on the pre-processed domain corpus (economic or tech articles).

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt 
from os import listdir
import os
import pickle
from tqdm import tqdm_notebook

In [2]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
import gensim
from gensim.models import KeyedVectors

In [3]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
from spacy.pipeline import merge_entities
# Load the small English spaCy pipeline and append the `merge_entities`
# component to it.
# NOTE(review): passing the component function itself to `add_pipe` looks like
# the spaCy 2.x calling convention — confirm against the installed version.
# NOTE(review): `nlp` is not referenced again in the visible part of this
# notebook — confirm it is still needed.
nlp = en_core_web_sm.load()
nlp.add_pipe(merge_entities)

In [4]:
# Root folder of the language-model corpora and its per-domain sub-folders.
corpus_path = "../data/lm/"
economic_path = "../data/lm/economic/"
tech_path = "../data/lm/tech/"

# Economic corpus. The two variants live next to the plain file and differ
# only by the suffix appended to its name.
all_economic_articles = os.path.join(corpus_path, "all_economic_articles.txt")
# "-ner": pre-processed text with named entities replaced by their tags
# (person, organisation only)
all_economic_articles_ner = all_economic_articles + "-ner.txt"
# "-blackout": pre-processed text with named entities removed
# (person, organisation only)
all_economic_articles_blackout = all_economic_articles + "-blackout.txt"

# Tech corpus: the same three variants.
all_tech_articles = os.path.join(corpus_path, "all_tech_articles.txt")
all_tech_articles_ner = all_tech_articles + "-ner.txt"
all_tech_articles_blackout = all_tech_articles + "-blackout.txt"

In [5]:
# Pretrained GoogleNews vectors (binary word2vec format) used as the starting
# point for fine-tuning.
pretrained_embeddings_path = '/MTP/finetune-word2vec/w2v-models/GoogleNews-vectors-negative300.bin'

# Passed as `lockf` when the pretrained vectors are intersected into the
# domain model.
lock_factor = 0.9

# Training hyperparameters.
# NOTE(review): nothing in the visible part of this notebook reads these five
# names — confirm whether `get_domain_model` was meant to use them.
window_size = 3
num_neg_samples = 10
num_epochs = 5
learning_rate = 0.001
min_count = 1

## Economic

Without NER

In [6]:
# Configuration: economic corpus, plain pre-processed text.
# `model_path` is where the fine-tuned vectors are saved and
# `policy_articles_path` is the corpus file that gets read. Every
# configuration cell rebinds these same two names, so the cell executed last
# decides the run.
model_path = '/MTP/finetune-word2vec/w2v-models/economic-word2vec'
policy_articles_path = all_economic_articles

NER

In [7]:
# Configuration: economic corpus, NER variant (the "-ner" corpus file).
# Rebinds `model_path` (save location of the fine-tuned vectors) and
# `policy_articles_path` (corpus file to read); the configuration cell
# executed last decides the run.
model_path = '/MTP/finetune-word2vec/w2v-models/economic-word2vec-ner'
policy_articles_path = all_economic_articles_ner

Blackout

In [8]:
# Configuration: economic corpus, blackout variant (the "-blackout" corpus file).
# Rebinds `model_path` (save location of the fine-tuned vectors) and
# `policy_articles_path` (corpus file to read); the configuration cell
# executed last decides the run.
model_path = '/MTP/finetune-word2vec/w2v-models/economic-word2vec-blackout'
policy_articles_path = all_economic_articles_blackout

## Tech

Without NER

In [9]:
# Configuration: tech corpus, plain pre-processed text.
# Rebinds `model_path` (save location of the fine-tuned vectors) and
# `policy_articles_path` (corpus file to read); the configuration cell
# executed last decides the run.
model_path = '/MTP/finetune-word2vec/w2v-models/tech-word2vec'
policy_articles_path = all_tech_articles

NER

In [10]:
# Configuration: tech corpus, NER variant (the "-ner" corpus file).
# Rebinds `model_path` (save location of the fine-tuned vectors) and
# `policy_articles_path` (corpus file to read); the configuration cell
# executed last decides the run.
model_path = '/MTP/finetune-word2vec/w2v-models/tech-word2vec-ner'
policy_articles_path = all_tech_articles_ner

Blackout

In [11]:
# Configuration: tech corpus, blackout variant (the "-blackout" corpus file).
# Rebinds `model_path` (save location of the fine-tuned vectors) and
# `policy_articles_path` (corpus file to read). When the notebook is run top
# to bottom this is the last configuration cell, so it is the one that takes
# effect.
model_path = '/MTP/finetune-word2vec/w2v-models/tech-word2vec-blackout'
policy_articles_path = all_tech_articles_blackout

In [12]:
# Read the selected corpus file into a list of lines (line endings kept).
# NOTE(review): no encoding is given, so the platform default is used.
with open(policy_articles_path, mode='r') as corpus_file:
    sentences = list(corpus_file)

In [14]:
# Split every corpus line into sentences and every sentence into word tokens;
# `sents` ends up as one flat list holding a token list per sentence.
sents = [
    word_tokenize(single_sentence)
    for corpus_line in sentences
    for single_sentence in sent_tokenize(corpus_line)
]

In [15]:
# Cache the tokenised sentences inside the corpus folder.
# The file name is derived from the selected `model_path`
# (e.g. '.../economic-word2vec-blackout' -> 'sents-economic-blackout.pkl')
# rather than hard-coded, so running another configuration no longer
# overwrites the economic-blackout cache under a misleading name.
sents_cache_name = 'sents-' + os.path.basename(model_path).replace('-word2vec', '') + '.pkl'
with open(os.path.join(corpus_path, sents_cache_name), 'wb') as f:
    pickle.dump(sents, f)

### Word2Vec


In [16]:
# Load the pretrained vectors (binary word2vec format) as KeyedVectors.
word2vec_model = KeyedVectors.load_word2vec_format(
    pretrained_embeddings_path,
    binary=True,
)

In [17]:
def combine_vocab(model, domain_vocab):
    """Return the model vocabulary extended with unseen domain words.

    Args:
        model: object exposing its vocabulary mapping as ``model.wv.vocab``.
        domain_vocab: mapping whose keys are the domain words.

    Returns:
        A new list holding every key of ``model.wv.vocab`` in its original
        order, followed by the keys of ``domain_vocab`` that were not
        already present, in their original order.
    """
    combined_vocab = list(model.wv.vocab)
    # Membership is tested against a set: scanning the vocabulary list once
    # per domain word is quadratic, which is prohibitive for a
    # multi-million-word pretrained vocabulary.
    known_words = set(combined_vocab)
    combined_vocab.extend(
        word for word in domain_vocab if word not in known_words
    )
    return combined_vocab

In [18]:
def get_domain_model(corpus, word2vec_model, *, window=5, min_count=2,
                     negative=5, epochs=1, alpha=0.025):
    """Fine-tune pretrained word2vec vectors on a domain corpus.

    Builds a fresh gensim ``Word2Vec`` model on ``corpus``, extends its
    vocabulary with the pretrained model's words, seeds it with the
    pretrained vectors and then trains it on ``corpus``.

    Reads the module-level ``pretrained_embeddings_path`` (binary word2vec
    file to intersect) and ``lock_factor`` (passed as ``lockf``).

    Args:
        corpus: iterable of tokenised sentences (each a list of tokens).
        word2vec_model: pretrained vectors exposing ``vectors`` and ``vocab``.
        window: context window size of the new model.
        min_count: minimum word frequency kept when building the vocabulary.
        negative: number of negative samples.
        epochs: number of training passes over ``corpus``.
        alpha: initial learning rate.

    Returns:
        The trained ``gensim.models.Word2Vec`` model.
    """
    # Use the pretrained vector width rather than a hard-coded 300 so the
    # intersected vectors always match the new model's dimensionality.
    embedding_dim = word2vec_model.vectors[0].shape[0]
    domain_model = gensim.models.Word2Vec(
        size=embedding_dim, alpha=alpha, window=window, min_count=min_count,
        max_vocab_size=None, sample=0.001, workers=4, min_alpha=0.0001,
        sg=0, hs=0, negative=negative, ns_exponent=0.75, cbow_mean=1,
    )
    domain_model.build_vocab(corpus)
    # Captured before the vocabulary update below, which scans a different
    # (single pseudo-sentence) corpus.
    total_examples = domain_model.corpus_count
    # Offer the whole pretrained vocabulary as one pseudo-sentence.
    # NOTE(review): every word occurs exactly once in that sentence, so with
    # min_count > 1 gensim may discard all of them during this update —
    # confirm against the installed gensim version.
    domain_model.build_vocab([list(word2vec_model.vocab.keys())], update=True)
    domain_model.intersect_word2vec_format(
        pretrained_embeddings_path, binary=True, lockf=lock_factor
    )
    domain_model.train(corpus, total_examples=total_examples, epochs=epochs)
    return domain_model

In [20]:
# Fine-tune on the tokenised corpus (`sents`), starting from the pretrained
# vectors loaded into `word2vec_model`.
domain_model = get_domain_model(sents, word2vec_model)

In [21]:
# Only the word vectors (`.wv`) are written, to the `model_path` chosen by the
# configuration cell that was run.
domain_model.wv.save(model_path) # save fine-tuned model

In [22]:
# NOTE(review): this loads '/MTP/w2v-models/all_w2v', which is not the
# `model_path` the fine-tuned vectors were saved to — confirm this is the
# intended file.
dm = KeyedVectors.load('/MTP/w2v-models/all_w2v') # load model