In [1]:
import numpy
import spacy
import gensim
from gensim.parsing.preprocessing import preprocess_documents
from gensim.models import Word2Vec
import os
import io
import unicodedata
import multiprocessing
import logging  # Configure logging so gensim's INFO-level messages are shown
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

In [None]:
def get_data(dirname, encoding=None):
    """Read every regular file in ``dirname`` and return all their lines as one flat list.

    Each line has its trailing newline removed and is Unicode-normalised
    with NFKD before being collected.

    Args:
        dirname: Path of the folder holding the text files to read.
        encoding: Text encoding passed to ``open``. ``None`` (the default)
            keeps the platform default. Undecodable bytes are dropped either
            way because files are opened with ``errors='ignore'``.

    Returns:
        A list of strings, one per line, with files visited in sorted name
        order; or ``None`` when the directory has no entries at all.
    """
    entries = os.listdir(dirname)  # list the directory once and reuse the result
    if not entries:
        print("Files not found-Empty Directory ")
        return None

    train_data = []
    # Sort so the line order is reproducible; os.listdir order is arbitrary.
    for name in sorted(entries):
        path = os.path.join(dirname, name)
        if not os.path.isfile(path):
            # Sub-directories cannot be opened as text files; skip them.
            continue
        with open(path, encoding=encoding, errors='ignore') as f:
            # extend() grows the list in place instead of rebuilding it per file.
            train_data.extend(
                unicodedata.normalize("NFKD", line.rstrip('\n')) for line in f
            )
    return train_data

def _keep_token(t):
    return (t.is_alpha and 
            not (t.is_space or t.is_punct or 
                 t.is_stop or t.like_num))
def _lemmatize_doc(doc):
    return [ t.lemma_ for t in doc if _keep_token(t)]

def spacy_preproc(document):
    """Lemmatize each item of ``document`` with ``_lemmatize_doc``.

    Returns a list with one lemma list per item, in input order.
    """
    return [_lemmatize_doc(parsed) for parsed in document]

def gensim_preproc(document):
    """Run gensim's ``simple_preprocess`` on each text in ``document``.

    Returns a list with one preprocessing result per input text, in input order.
    """
    return [gensim.utils.simple_preprocess(text) for text in document]

def train_word2vec(train_data,model_name="word2vec_model"):
    """Train a Word2Vec model on ``train_data`` and save it under ``model_name``.

    Args:
        train_data: Iterable of raw text strings; each one is tokenised by
            ``gensim_preproc`` before training.
        model_name: Path passed to ``model.save``.

    Returns:
        None. Prints "no training data" and returns early when ``train_data``
        is empty or ``None``.
    """
    if not train_data:
        print("no training data")
        return
    print('data is ready for processing')
    w2v_corpus = gensim_preproc(train_data)
    # Leave one core free, but never request zero workers:
    # on a single-core machine ``cpu_count() - 1`` evaluates to 0.
    workers = max(1, multiprocessing.cpu_count() - 1)
    # NOTE(review): ``iter`` and ``size`` are the gensim 3.x keyword names
    # (renamed ``epochs`` / ``vector_size`` in gensim 4) - confirm the installed version.
    model = Word2Vec(w2v_corpus, workers=workers, iter=30, size=150, sample=6e-5, alpha=0.03, min_alpha=0.0007, negative=20)
    model.save(model_name)
    print("Model Created Successfully")

if __name__ == "__main__":
    train_data = get_data('./data/training_data')
    # get_data returns None for an empty directory, so only report success
    # when something was actually read.
    if train_data:
        print('data has been read successfully')
    # train_word2vec prints its own "no training data" message when given nothing.
    train_word2vec(train_data)

data has been read successfully


In [2]:
# Load the model from 'word2vec_model', the default model_name that train_word2vec saves to.
model = Word2Vec.load('word2vec_model')

INFO - 10:26:43: loading Word2Vec object from word2vec_model
INFO - 10:26:45: loading wv recursively from word2vec_model.wv.* with mmap=None
INFO - 10:26:45: setting ignored attribute vectors_norm to None
INFO - 10:26:45: loading vocabulary recursively from word2vec_model.vocabulary.* with mmap=None
INFO - 10:26:45: loading trainables recursively from word2vec_model.trainables.* with mmap=None
INFO - 10:26:45: setting ignored attribute cum_table to None
INFO - 10:26:45: loaded word2vec_model


In [5]:
# Query through the KeyedVectors (`model.wv`): calling most_similar directly on
# the Word2Vec object is deprecated in gensim 3.x and removed in gensim 4.
model.wv.most_similar('experience')

  """Entry point for launching an IPython kernel.


[('knowledge', 0.6328538656234741),
 ('experienced', 0.6193768382072449),
 ('hands', 0.5870240926742554),
 ('gained', 0.5824126601219177),
 ('proficiency', 0.5733476281166077),
 ('familiarity', 0.5555065274238586),
 ('exposure', 0.5405657291412354),
 ('extensive', 0.5401183366775513),
 ('expertise', 0.5255299806594849),
 ('working', 0.5080225467681885)]