### Build word embeddings out of header and body data

This file builds word embeddings out of the separated header/title and body/prose data. 
The vector size of the title embedding is 30.
The vector size of the body/prose embedding is 60.

In [None]:
import os
import io

In [None]:
# Import and download stopwords from NLTK.
from nltk.corpus import stopwords
from nltk import download
download('stopwords')

In [None]:
from gensim.models import Word2Vec

In [None]:
# English stop-word list from the NLTK corpus downloaded above.
# NOTE(review): nothing visible in this notebook reads `stop_words`;
# preprocess() does not filter against it. Confirm whether stop-word
# removal was intended or whether this cell is leftover.
stop_words = stopwords.words('english')

In [None]:
from nltk import word_tokenize
download('punkt')

In [None]:
def get_data_line_by_line(filename):
    """
    Read a text file and return its lines as a list of strings.

    The file is decoded as UTF-8 and bytes that cannot be decoded are
    silently dropped. Each list item is one whole line (it is NOT split
    into words) and keeps its trailing newline character, if any.
    An empty file yields an empty list.
    """
    with open(filename, "r", encoding='UTF-8', errors='ignore') as handle:
        return handle.readlines()

In [None]:
def preprocess(doc):
    """Lower-case and tokenise raw text, keeping only alphabetic tokens.

    The text is lower-cased and split with NLTK's ``word_tokenize``. Any
    token that is not purely alphabetic (numbers, punctuation, tokens with
    embedded digits or symbols) is discarded. Stop-words are NOT removed.

    Returns the surviving tokens as a list of strings, in original order.
    """
    tokens = word_tokenize(doc.lower())
    return [token for token in tokens if token.isalpha()]

In [None]:
def train_word2vec_header(train_data, model_name='header_word_embedding', worker_no=3, vector_size=30):
    """Train a word2vec model on header/title text and save it to disk.

    Every item of ``train_data`` is run through ``preprocess`` and the
    resulting token lists are used as the training corpus. The model is
    trained with hierarchical softmax (``hs=1``, ``negative=0``), a context
    window of 1 and a minimum word count of 10.

    Args:
        train_data: Sequence of raw text strings, one document per item.
        model_name: Path the trained model is saved to via ``model.save``.
        worker_no: Number of worker threads handed to gensim.
        vector_size: Dimensionality of the learned word vectors.

    Returns:
        None. If ``train_data`` is empty or None, a message is printed and
        nothing is trained or saved.
    """
    import inspect

    if not train_data:
        print ("no training data")
        return

    w2v_corpus = [preprocess(doc) for doc in train_data]

    # gensim 4.0 renamed the ``size`` keyword to ``vector_size``; passing the
    # old name there raises TypeError. Use whichever keyword the installed
    # Word2Vec accepts so the function works on both 3.x and 4.x.
    init_params = inspect.signature(Word2Vec.__init__).parameters
    size_kwarg = 'vector_size' if 'vector_size' in init_params else 'size'

    # NOTE(review): alpha=0.0001 is far below gensim's default initial
    # learning rate (0.025) -- confirm this is intentional.
    model = Word2Vec(
        w2v_corpus,
        workers=worker_no,
        hs=1,
        negative=0,
        window=1,
        min_count=10,
        alpha=0.0001,
        **{size_kwarg: vector_size}
    )
    model.save(model_name)
    print ("Header Model Created Successfully")

In [None]:
def train_word2vec_body(train_data, model_name='body_word_embedding', worker_no=3, vector_size=60):
    """Train a word2vec model on body/prose text and save it to disk.

    Every item of ``train_data`` is run through ``preprocess`` and the
    resulting token lists are used as the training corpus. The model is
    trained with hierarchical softmax (``hs=1``, ``negative=0``), a context
    window of 5 and a minimum word count of 20.

    Args:
        train_data: Sequence of raw text strings, one document per item.
        model_name: Path the trained model is saved to via ``model.save``.
        worker_no: Number of worker threads handed to gensim.
        vector_size: Dimensionality of the learned word vectors.

    Returns:
        None. If ``train_data`` is empty or None, a message is printed and
        nothing is trained or saved.
    """
    import inspect

    if not train_data:
        print ("no training data")
        return

    w2v_corpus = [preprocess(doc) for doc in train_data]

    # gensim 4.0 renamed the ``size`` keyword to ``vector_size``; passing the
    # old name there raises TypeError. Use whichever keyword the installed
    # Word2Vec accepts so the function works on both 3.x and 4.x.
    init_params = inspect.signature(Word2Vec.__init__).parameters
    size_kwarg = 'vector_size' if 'vector_size' in init_params else 'size'

    # NOTE(review): alpha=0.0001 is far below gensim's default initial
    # learning rate (0.025) -- confirm this is intentional.
    model = Word2Vec(
        w2v_corpus,
        workers=worker_no,
        hs=1,
        negative=0,
        window=5,
        min_count=20,
        alpha=0.0001,
        **{size_kwarg: vector_size}
    )
    model.save(model_name)
    print ("Body Model Created Successfully")

In [None]:
# Placeholder: replace this string with the path to the header/title corpus
# text file before running the cell.
header_corpus = r"Pass the Headers/Title Corpus here"
# Load the corpus as a list of raw lines (one training document per line),
# then train the header embedding and save it under its default model name.
train_data = get_data_line_by_line(header_corpus)
train_word2vec_header(train_data=train_data)

In [None]:
# Placeholder: replace this string with the path to the body/prose corpus
# text file before running the cell.
body_corpus = r"Pass the Body/Prose Corpus here"
# Load the corpus as a list of raw lines (one training document per line),
# then train the body embedding and save it under its default model name.
# Note that this rebinds the module-level `train_data` name.
train_data = get_data_line_by_line(body_corpus)
train_word2vec_body(train_data=train_data)