In [7]:
import subprocess
import os
import sys
import codecs
import json
import importlib

import google_article_search
import newsExtractor
import processDoc
import wikitags

In [8]:
def runSub(pathToStanCoreNLP):
    currDir = os.getcwd()
    os.chdir(pathToStanCoreNLP)
    command = "java -Xmx5g -cp '*' edu.stanford.nlp.pipeline.StanfordCoreNLP "\
            + "-annotators tokenize,ssplit,pos,lemma,ner,parse,mention,coref -coref.algorithm neural -file out.txt -outputFormat json"
    subprocess.check_output(command, shell=True)
    os.chdir(currDir)

In [9]:
def putTextInFile(article, pathToStanCoreNLP):
    currDir = os.getcwd()
    os.chdir(pathToStanCoreNLP)
    try:
        with codecs.open("out.txt", "w", "ascii", errors="ignore") as outFile:
            outFile.write(article)
    except:
        os.chdir(currDir)
        print (sys.exc_info())
        return False
    os.chdir(currDir)
    return True

In [10]:
numPosts = 20
numComments = 200
metaFileName = "data/metadata.txt"
commentFilenames = ["comment" + str(i) + ".txt" for i in range(1,numPosts + 1)]

googleApiKey = "AIzaSyAvnSZCKCeHSZCWVNfQMLvq5XJiOMYYa88"
googleCseID = "006733671097832492705:vknyjefr9aa"
numWebDocs = 10

pathToStanCoreNLP = "/Users/bhvjain/Desktop/work/stanford-corenlp-full-2017-06-09"

In [11]:
metafile = open(metaFileName, "r")

In [None]:
importlib.reload(newsExtractor)
importlib.reload(google_article_search)
importlib.reload(processDoc)
importlib.reload(wikitags)

from newsExtractor import extractArticle
from google_article_search import google_search
from processDoc import replaceCorefs, cleanDoc
from wikitags import getAnchorTags

# format of each line in metadata => sr. no, link, title, facebook person tags
for currPostNum in range(1, numPosts + 1):
    metadata = metafile.readline().split(";;")
    articleLink = metadata[1]
    articleTitle = metadata[2]
    properNouns = metadata[3].split()
    try:
        articleContent = extractArticle(articleLink)
    except Exception as e:
        print (str(e))
    webArticleContent = {}
    for i, link in enumerate(google_search(articleTitle, googleApiKey, googleCseID, num=numWebDocs)):
        try:
            webArticleContent[i + 1] = extractArticle(link)
        except Exception as e:
            continue

    articleData = {}
    
    if putTextInFile(articleContent, pathToStanCoreNLP):
        runSub(pathToStanCoreNLP)
        NLPppn, doc = replaceCorefs("out.txt.json", pathToStanCoreNLP)

        properNouns = list(set(properNouns + NLPppn))
        try:
            wikiProperNoun = getAnchorTags(properNouns)
        except Exception as e:
            # Disambiguation error, timeout error
            wikiProperNoun = []
        properNouns = list(set(properNouns + wikiProperNoun))
        articleData[0] = [properNouns, doc]

        for idx, content in webArticleContent.items():
            # print (idx) -- for debugging purposes
            if putTextInFile(content, pathToStanCoreNLP):
                runSub(pathToStanCoreNLP)
                NLPppn, doc = replaceCorefs("out.txt.json", pathToStanCoreNLP)
                try:
                    wikiProperNoun = getAnchorTags(NLPppn)
                except:
                    wikiProperNoun = []
                properNouns = list(set(NLPppn + wikiProperNoun))
                articleData[idx] = [properNouns, doc]
    else:
        print ("Failed retrieving main article # %d" %currPostNum)
    break

In [270]:
# Process only the text of the article, add ppn list later
articleTexts = {i : articleData[i][1].lower() for i in articleData}

# Text Preprocessing Pipeline

In [359]:
import os, gensim, itertools
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords

In [360]:
stop_words = set(stopwords.words('english'))
stop_words.update(['.', ',', '"', '?', '%', '!', ':', ';', '(', ')', '[', ']', '{', '}'])

In [361]:
""" Here we define functions to find out collocations from the data. Use TextBlob to find out noun phrases"""

from textblob import TextBlob

def head(stream, n=10):
    """
    Convenience func: return the first `n` elements of the stream, as plain list.
    """
    return list(itertools.islice(stream, n))

def best_phrases(document_stream, top_percent = 10):
    """
    Return a set of `top_percent` most common noun phrases.
    @Params:
    document_stream: Dictionary containing (docid, raw doc)
    @Returns:
    collocDict: Dictionary containing (space separated np, underscore separated np) 
                For example, (north korea, north_korea)
    """
    np_counts = {}
    for docno, doc in document_stream.items():
        for np in TextBlob(doc).noun_phrases:
            # If it is just a one word noun phrase, ignore
            if u' ' not in np:
                continue
            # On splitting does it yield words with only alphabetical characters and each of length greater than 2?
            if all(word.isalpha() and len(word) > 2 for word in np.split()):
                np_counts[np] = np_counts.get(np, 0) + 1
                
    sorted_phrases = sorted(np_counts, key=lambda np: -np_counts[np])
    top_n = int(len(set(sorted_phrases)) * top_percent / 100.0)
    entities = set(head(sorted_phrases, top_n))
    global stop_words
    # Further prune the collocations set and populate mappings between space separated np and underscore separated np.
    collocDict = {}
    for entity in entities:
        token = u'_'.join(part for part in gensim.utils.tokenize(entity) if len(part) > 2)
        if len(token) < 4 or token in stop_words:
            continue
        collocDict[entity] = token

    return collocDict

In [362]:
class preprocessedData(object):
    def __init__(self, data_dict):
        self.data_dict = data_dict
        self.entities = best_phrases(self.data_dict)
        
    def __iter__(self):
        for _, item in self.data_dict.items():
            yield(self.process(item))
            
    def process(self, doc):      
        for x, y in self.entities.items():
            doc = doc.replace(x, y)
            
        ans = []
        lemmatizer = WordNetLemmatizer()
        global stop_words
        
        sents = sent_tokenize(doc)
        for sent in sents:
            tmp = []
            words = word_tokenize(sent)
            for word in words:
                tmp.append(lemmatizer.lemmatize(word))
            ans.append(tmp)
        ans = [x for sublist in ans for x in sublist] # Flatten the list of lists
        ans = [token for token in ans if token not in stop_words] # Remove stopwords
        ans = [token for token in ans if len(token) > 3] # Any token with length less or equal to 3 goes away 
        return ans  

In [363]:
# doc_stream: generator of token-list corresponding to each document
doc_stream = ([tokens for tokens in PreprocessData(articleTexts)]) 

# corpus dictionary
id2word = gensim.corpora.Dictionary(doc_stream)

In [364]:
class VectorizedCorpus(object):
    def __init__(self, data_dict, dictionary):
        self.data_dict = data_dict
        self.dictionary = dictionary
    
    def __iter__(self):
        for tokens in PreprocessData(self.data_dict):
            yield self.dictionary.doc2bow(tokens)
    
# create a stream of bag-of-words vectors
vect_corpus = VectorizedCorpus(articleTexts, id2word)

In [365]:
lda = gensim.models.ldamodel.LdaModel(vect_corpus, num_topics = 5)

In [366]:
from gensim.utils import simple_preprocess
def tokenize(text):
    return [token for token in simple_preprocess(text) if token not in stop_words]

In [367]:
post = articleTexts[0]
post_bow = id2word.doc2bow(tokenize(post))
post_lda = lda[post_bow]

# dense1 = gensim.matutils.sparse2full(post_lda, lda.num_topics)

In [368]:
comments = []
like_count = []
reply_count = []
with open('data/comment1.txt', 'r') as f:
    for line in f:
        splittedLine = line.split(';;')
        comments.append(splittedLine[0])
        like_count.append(int(splittedLine[1]))
        reply_count.append(int(splittedLine[2]))

In [369]:
import numpy as np

results = []
for comment in comments:
    if len(comment) < 10:
        results.append(0)
        continue
    comm_bow = id2word.doc2bow(tokenize(comment))
    comm_lda = lda[comm_bow]
    # dense2 = gensim.matutils.sparse2full(comm_lda, lda.num_topics)
    sim = gensim.matutils.cossim(post_lda, comm_lda)
    if (sim > 0.5):
        results.append(1)
    else:
        results.append(0)

In [370]:
groundTruth = [int(l) for l in open('data/groundTruth1.txt', 'r')]

In [371]:
acc = 0
for i, j in zip(results, groundTruth):
    if i == j:
        acc += 1

In [372]:
print (acc/2)

70.5
