In [3]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.chunk import conlltags2tree, tree2conlltags
from nltk.tag import pos_tag
from nltk.corpus import stopwords
from collections import defaultdict
from bs4 import BeautifulSoup
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
import pprint
import os
import html2text
import re
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [18]:
data_directory = "data/"

def sanitize(line):
    stopset = set(stopwords.words('english'))
    # remove stop words
    
    line = line.strip()
    
    # remove more than one occurence of space
    line = " ".join(line.split())
    
    # remove special characters
    line = re.sub('[^A-Za-z0-9 ]+', '', line)
    return line
    

def readfile(file):
    filetype = ".htm" if file.split(".")[1] == "htm" else "txt"
    
    filedata = ""
    with open(data_directory + file) as f:
        filedata = f.read()
    
    if filetype == "htm":
        h2t = html2text.HTML2Text()
        filedata = h2t.handle(filedata)
        pageText = filedata.split("\n")
        filedata = []
    else:
        # use beautiful soup for getting the text
        soup = BeautifulSoup(filedata)
        pageText = soup.findAll(text=True)
        pageText = " ".join(pageText)
        pageText = pageText.split("\n")
        filedata = []
        
    sanitized_data = []
    # sanitize the data, remove stop words, extra spaces, 
    for l in pageText:
        tmp = sanitize(l)
        
        if tmp != "":
            sanitized_data.append(tmp)
    
    return sanitized_data

# now read all files in the directory and store them
# [[document1 array lines], [doc2], []...]
def getData():
    data = []
    files = os.listdir(data_directory)
    
    for f in files:
        data.append((f, readfile(f)))
        break
    return data

data = getData()

In [None]:
# load spacy
nlp = en_core_web_sm.load()

# for every document we store the corresponding NER
docs_ner = {}

for file in data:
    fname = file[0]
    fullText = " ".join(file[1])
    print(fullText)
    doc = nlp(fullText)
    ner = defaultdict(lambda: defaultdict(int))
    for X in doc.ents:
        ner[X.label_][X.text] += 1
    docs_ner[fname] = ner
print(docs_ner['3153_000009212203000074_ex10a116.txt'].keys())
print(docs_ner['3153_000009212203000074_ex10a116.txt']['ORG'])


In [22]:
# doc2vec similarity
d2w_data = [" ".join(e[1]) for e in data]
tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(d2w_data)]

max_epochs = 100
vec_size = 20
alpha = 0.025

model = Doc2Vec(size=vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=1,
                dm =1)
  
model.build_vocab(tagged_data)
for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=model.iter)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha

model.save("d2v.model")
print("Model Saved")



iteration 0
iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
iteration 11
iteration 12
iteration 13
iteration 14
iteration 15
iteration 16
iteration 17
iteration 18
iteration 19
iteration 20
iteration 21
iteration 22
iteration 23
iteration 24
iteration 25
iteration 26
iteration 27
iteration 28
iteration 29
iteration 30
iteration 31
iteration 32
iteration 33
iteration 34
iteration 35
iteration 36
iteration 37
iteration 38
iteration 39
iteration 40
iteration 41
iteration 42
iteration 43
iteration 44
iteration 45
iteration 46
iteration 47
iteration 48
iteration 49
iteration 50
iteration 51
iteration 52
iteration 53
iteration 54
iteration 55
iteration 56
iteration 57
iteration 58
iteration 59
iteration 60
iteration 61
iteration 62
iteration 63
iteration 64
iteration 65
iteration 66
iteration 67
iteration 68
iteration 69
iteration 70
iteration 71
iteration 72
iteration 73
iteration 74
iteration 75
iteration 76
iteration

In [23]:
# use the saved model
model= Doc2Vec.load("d2v.model")
#to find the vector of a document which is not in training data
test_data = word_tokenize("I love chatbots".lower())
v1 = model.infer_vector(test_data)
print("V1_infer", v1)

# to find most similar doc using tags
similar_doc = model.docvecs.most_similar('1')
print(similar_doc)


# to find vector of doc in training data using tags or in other words, printing the vector of document at index 1 in training data
print(model.docvecs['1'])

V1_infer [ 0.03355537  0.00561451  0.00020225 -0.00433872 -0.04306964 -0.01789657
 -0.02068405 -0.01003095  0.02720794  0.04106362  0.00428242 -0.04096734
  0.03124892 -0.00489728 -0.03912873  0.00664929  0.00429899 -0.00623959
 -0.02405189 -0.04032756]


TypeError: '<' not supported between instances of 'str' and 'int'