# Imports

In [742]:
import json
import gensim.downloader as api
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords

In [743]:
remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)
stopw = set(stopwords.words('english'))

def clean(text):
    text = text.lower()
    temp = ""
    for i in text.split():
        try:
            temp+=contraction[i]+' '
        except:
            temp+= i+' '
    text = temp.strip()
    text = text.lower().translate(remove_punctuation_map)
    text = re.sub("[^a-zA-Z#]"," ",text)
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is", text)
    text = re.sub(r",", "", text)
    text = re.sub(r"\.", "", text)
    text = re.sub(r"!", "!", text)
    text = re.sub(r"\/", "", text)
    text = re.sub(r"'", "", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", ":", text)
    text = re.sub(r' +',' ',text)
    return text.strip()

def stopwordremoval(text):
    text = word_tokenize(text)
    text = [i for i in text if i not in stopw]
    return " ".join(text)

In [774]:
text = """Machine learning is the scientific study of algorithms and statistical models that computer systems use to perform a specific task without using explicit instructions, relying on patterns and inference instead. It is seen as a subset of artificial intelligence. Machine learning algorithms build a mathematical model based on sample data, known as "training data", in order to make predictions or decisions without being explicitly programmed to perform the task. Machine learning algorithms are used in a wide variety of applications, such as email filtering and computer vision, where it is difficult to develop a conventional algorithm for effectively performing the task."""
alt = """ Machine learning is the  study of algorithms and statistical models that computer systems use to perform a specific task without using explicit instructions. It is seen as a subset of artificial intelligence. Machine learning algorithms build a mathematical model based on sample data in order to make predictions or decisions without being explicitly programmed to perform the task. Machine learning algorithms are used in a wide variety of applications, such as email filtering and computer vision, where it is difficult to develop a conventional algorithm for effectively performing the task."""
text2 = word_tokenize(text.lower().translate(remove_punctuation_map))
text2 = [i for i in text2 if i not in stopw]
alt2 = word_tokenize(alt.lower().translate(remove_punctuation_map))
alt2 = [i for i in alt2 if i not in stopw]

In [745]:
import gensim, smart_open

def read_corpus(fname, tokens_only=False):
    with smart_open.open(fname, encoding="iso-8859-1") as f:
        for i, line in enumerate(f):
            tokens = gensim.utils.simple_preprocess(line)
            print(line)
            if tokens_only:
                yield tokens
            else:
                # For training data, add tags
                yield gensim.models.doc2vec.TaggedDocument(tokens, [i])
                
def read_corpus_semeval(tokens_only=False):
    i = 0
    doc = api.load("semeval-2016-2017-task3-subtaskA-unannotated")
    for dictionary in doc:
        sentList = []
        for com in dictionary["RelComments"]:
            sentList.append(word_tokenize(clean(com["RelCText"])))
        sentList.append(word_tokenize(clean(dictionary["RelQuestion"]["RelQBody"])))
        for sent in sentList:
            if tokens_only:
                yield sent
            else:
                # For training data, add tags
                yield gensim.models.doc2vec.TaggedDocument(sent, [i])
                i += 1

def read_fakenews(tokens_only=False):
    doc = api.load("fake-news")
    i = 0
    for line in doc: 
        dictionary = eval(json.dumps(line))
        q = word_tokenize(clean(dictionary["title"]))
        t = [word_tokenize(clean(i)) for i in sent_tokenize(dictionary["text"])]
        t.append(q)
        for sent in t:
            if tokens_only:
                yield sent
            else:
                # For training data, add tags
                yield gensim.models.doc2vec.TaggedDocument(sent, [i])
                i += 1

In [746]:
tc1 = read_corpus_semeval()

In [747]:
try:
    model = Doc2Vec.load("dtv_semeval")   
except:
    model = Doc2Vec(vector_size=300, workers=8, epochs=10)
    model.build_vocab(tc1)
    model.train(tc1, total_examples=model.corpus_count, epochs=model.epochs)

In [748]:
model.save("dtv_semeval")

In [749]:
# try:
#     model = Doc2Vec.load("dtv_semeval_text8")
# except:
#     count = len(list(tc2))
#     print(count)
#     tc2 = read_corpus(api.load("text8", return_path=True))
#     model.build_vocab(tc2, update=True)
#     model.train(tc2, total_examples=len(list(tc2)), epochs=model.epochs)

In [750]:
# tc2 = read_corpus(api.load("text8", return_path=True))

In [751]:
# model.save("dtv_semeval_text8")

In [752]:
try:
    model = Doc2Vec.load("dtv_semeval_fn")
except:
    tc3 = read_fakenews()
    model.build_vocab(tc3, update=True)
    model.train(tc3, total_examples=len(list(tc3)), epochs=model.epochs)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [753]:
model.save("dtv_semeval_fn")

In [775]:
model.wv.n_similarity(alt2, text2)

0.9598763