In [None]:
import numpy
import gensim
from gensim.parsing.preprocessing import preprocess_documents
from gensim.models import Word2Vec
import os
import io
import unicodedata
import multiprocessing
import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

In [None]:
def get_data(file, encoding=None):
    """Read a text file and return its lines as NFKD-normalized strings.

    Args:
        file: Path of the text file to read.
        encoding: Text encoding handed to ``open``. ``None`` (the default)
            keeps the platform's default encoding, which is what this
            function always used; pass e.g. ``"utf-8"`` to make the read
            independent of the machine's locale.

    Returns:
        A list with one string per line of the file, in file order. The
        trailing newline is stripped from each line and the remaining text
        is Unicode-normalized with form NFKD.
    """
    # errors='ignore' drops bytes that cannot be decoded instead of raising,
    # so a badly encoded record never aborts the whole read.
    with open(file, encoding=encoding, errors='ignore') as f:
        return [
            unicodedata.normalize("NFKD", raw_line.rstrip('\n'))
            for raw_line in f
        ]

def gensim_preproc(document):
    """Run gensim's ``simple_preprocess`` over every text in ``document``.

    Despite the singular name, ``document`` is iterated: each item is passed
    to ``gensim.utils.simple_preprocess`` and the results are collected, in
    the same order, into a new list (one entry per input item).
    """
    return [gensim.utils.simple_preprocess(text) for text in document]

def train_word2vec(train_data,model_name="word2vec_model_optim"):
    """Train a Word2Vec model on raw text records and save it to disk.

    Args:
        train_data: Iterable of raw text strings. It is tokenized with
            ``gensim_preproc`` before training. A falsy value (``None``,
            empty list, ...) aborts with a message and nothing is trained.
        model_name: Path the trained model is saved to via ``model.save``.

    Returns:
        None. The trained model is only persisted to ``model_name``.
    """
    if not train_data:
        print("no training data")
        return
    print('data is ready for processing')
    w2v_corpus = gensim_preproc(train_data)
    # Leave one core free for the rest of the system, but never drop to zero:
    # cpu_count() - 1 is 0 on a single-core machine, which would leave gensim
    # without any training worker thread.
    cores = multiprocessing.cpu_count()
    workers = max(1, cores - 1)
    # NOTE(review): `iter` and `size` are the gensim 3.x keyword names; gensim
    # 4.x renamed them to `epochs` and `vector_size`. Kept as-is to match the
    # version this file was written against - confirm the installed version.
    model = Word2Vec(w2v_corpus, workers=workers, iter=30, size=150, sample=6e-5, alpha=0.03, min_alpha=0.0007, negative=20)
    model.save(model_name)
    print("Model Created Successfully")

if __name__ == "__main__":
    # Script entry point: load the raw records (one string per line of the
    # input file), then train and save the model under its default name.
    train_data = get_data('./data/training_data/Resume_data')
    print('data has been read successfully')
    # train_word2vec tokenizes the records itself, so the raw lines are
    # passed through unchanged.
    train_word2vec(train_data)