In [111]:
import os 
import json
import gensim 
import pandas as pd
import multiprocessing 


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from nltk.corpus import stopwords
import string
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk import tokenize

from __future__ import print_function

# Stop words: union of NLTK's English list and scikit-learn's built-in list.
STOPLIST = set(stopwords.words('english')).union(ENGLISH_STOP_WORDS)
# string.punctuation contains no spaces, so it is used directly: a single
# string holding every ASCII punctuation character.
SYMBOLS = string.punctuation

In [35]:
def argmax(iterable):
    """Return the index of the largest item in ``iterable``.

    Ties resolve to the earliest index. An empty iterable raises
    ``ValueError`` (propagated from ``max``).
    """
    best_position, _best_value = max(enumerate(iterable), key=lambda pair: pair[1])
    return best_position

# Data Filtering

In [36]:
# Load the query configuration that drives the rest of the notebook.
with open("config.json") as config_handle:
    config = json.loads(config_handle.read())

In [37]:
def fetch_terms(item):
    """Build the search descriptor for one configuration entry.

    Parameters
    ----------
    item : dict
        Entry with a ``"name"`` string and a ``"keywords"`` value.

    Returns
    -------
    dict
        ``"name"`` and ``"keywords"`` copied from ``item``, plus ``"terms"``:
        a one-element list holding the third space-separated word of the
        name, or an empty list when the name has fewer than three words.
    """
    words = item["name"].split(" ")

    return {
        "name": item["name"],
        # Slicing (instead of indexing with [2]) keeps names shorter than
        # three words from raising IndexError.
        "terms": words[2:3],
        "keywords": item["keywords"]
    }

In [38]:
def filter_corpus(config):
    """Keep only the articles that mention at least one query keyword.

    For every entry in ``config`` the articles are read from
    ``CORPUS/<name>.json``. An article is kept when any of its
    space-separated, lower-cased body tokens equals a lower-cased keyword.
    One ``name,kept,total,ratio`` line per query is written to
    ``sanity_check_filtering.csv``.

    Parameters
    ----------
    config : iterable of dict
        Entries accepted by ``fetch_terms`` (each needs ``"name"`` and
        ``"keywords"``).

    Returns
    -------
    list of dict
        One ``{"query": name, "corpus": kept_articles}`` per entry, in the
        order of ``config``.
    """
    search_terms = list(map(fetch_terms, config))
    queries = []

    # The context manager closes the report even when a corpus file is
    # missing or malformed.
    with open("sanity_check_filtering.csv", "w") as result_file:
        for query in search_terms:
            filename = "CORPUS/%s.json" % (query["name"])

            with open(filename) as f:
                corpus_list = json.load(f)

            total_length = len(corpus_list)

            # The keyword set does not depend on the article, so build it
            # once per query instead of once per article.
            keyword_set = set(tok.lower() for tok in query["keywords"])

            filtered_corpus_list = [
                corpus for corpus in corpus_list
                if keyword_set.intersection(
                    tok.lower() for tok in corpus["body"].split(" "))
            ]
            keyword_relevant = len(filtered_corpus_list)

            queries.append({
                "query": query["name"],
                "corpus": filtered_corpus_list,
            })

            # float() forces true division (int/int truncates on Python 2);
            # an empty corpus file reports a ratio of 0 instead of raising
            # ZeroDivisionError.
            if total_length:
                ratio = float(keyword_relevant) / total_length
            else:
                ratio = 0.0

            print("%s,%d,%d,%f" % (query["name"], keyword_relevant, total_length, ratio),
                  file=result_file)

    # Proceed pipeline with the filtered corpus list
    return queries

# Topic Modeling

In [39]:
def clean_text(article):
    """Normalise one article body before tokenisation.

    Strips surrounding whitespace, turns newlines and carriage returns into
    spaces, rewrites the HTML entities ``&amp;``, ``&gt;`` and ``&lt;``, and
    finally lower-cases the result.
    """
    # Applied in order; entity matching happens before lower-casing, so it
    # is case-sensitive.
    substitutions = (
        ("\n", " "),
        ("\r", " "),
        ("&amp;", "and"),
        ("&gt;", ">"),
        ("&lt;", "<"),
    )

    cleaned = article.strip()
    for needle, replacement in substitutions:
        cleaned = cleaned.replace(needle, replacement)

    return cleaned.lower()

def tokenize(article):
    """Split ``article`` on whitespace and drop stop words and punctuation.

    A token is discarded when it is a member of ``STOPLIST`` or when it
    occurs inside ``SYMBOLS``.

    NOTE: this definition rebinds the name ``tokenize`` that the import cell
    also brings in from ``nltk``.
    """
    return [
        word for word in article.split()
        if word not in STOPLIST and word not in SYMBOLS
    ]

In [40]:
def fetch_topics_for_query(name, article_list, num_topics=50, passes=1,
                           workers=3, topics_shown=10):
    """Fit an LDA topic model on one query's articles and save the topics.

    Each article body is cleaned with ``clean_text`` and tokenised with
    ``tokenize``; the resulting bag-of-words corpus is fed to gensim's
    ``LdaMulticore``. The topics returned by ``show_topics`` are printed
    into ``<name>_topics.txt``.

    Parameters
    ----------
    name : str
        Query name, used as the prefix of the output file.
    article_list : iterable of dict
        Articles; each must have a ``"body"`` string.
    num_topics : int, optional
        Number of topics the model is trained with (default 50).
    passes : int, optional
        Training passes over the corpus (default 1).
    workers : int, optional
        Worker processes handed to ``LdaMulticore`` (default 3).
    topics_shown : int, optional
        Number of topics requested from ``show_topics`` (default 10).

    Returns
    -------
    None
    """
    texts = [tokenize(clean_text(doc["body"])) for doc in article_list]

    dictionary = gensim.corpora.Dictionary(texts)
    if len(dictionary) == 0:
        # Nothing survived filtering/tokenisation, so there is no vocabulary
        # to train on; skip this query rather than abort the whole run.
        print("Skipping topics for %s: no tokens to model" % name)
        return

    corpus = [dictionary.doc2bow(text) for text in texts]

    ldaModel = gensim.models.ldamulticore.LdaMulticore(corpus=corpus,
                                                       num_topics=num_topics,
                                                       id2word=dictionary,
                                                       passes=passes,
                                                       workers=workers)

    topics = ldaModel.show_topics(num_topics=topics_shown, log=True, formatted=True)

    # `with` guarantees the handle is closed even if printing fails, and the
    # handle no longer shadows the builtin name `file`.
    with open("%s_topics.txt" % name, 'w') as topics_file:
        print(topics, file=topics_file)

In [41]:
def fetch_topics_for_queries(query_list):
    """Run topic modelling for every query in ``query_list``.

    Parameters
    ----------
    query_list : iterable of dict
        Each entry needs ``"query"`` (the query name) and ``"corpus"`` (its
        article list); both are passed to ``fetch_topics_for_query``.

    Returns
    -------
    None
    """
    for query in query_list:
        fetch_topics_for_query(query["query"], query["corpus"])

# Date Binning 

In [85]:
# First calendar year of the date histogram.
BASE = 1996
# Number of consecutive years covered, starting at BASE (max range: 22 years).
RANGE = 22

In [83]:
def bin_dates_for_query(name, article_list):
    """Count one query's articles per calendar year.

    Parameters
    ----------
    name : str
        Query name. Not used in the computation; kept so existing callers
        keep working.
    article_list : iterable of dict
        Articles; each ``"date"`` value must start with a four-digit year.

    Returns
    -------
    list of int
        ``RANGE`` counts, where position ``i`` is the number of articles
        dated in year ``BASE + i``.

    Raises
    ------
    ValueError
        If the first four characters of a date are not an integer.

    Notes
    -----
    Articles dated outside ``BASE .. BASE + RANGE - 1`` are left out of the
    histogram; previously a single such article raised ``KeyError``.
    """
    counts = [0] * RANGE

    for article in article_list:
        # The year is taken from the first four characters of the date.
        offset = int(article["date"][:4]) - BASE
        if 0 <= offset < RANGE:
            counts[offset] += 1

    return counts

In [94]:
def bin_dates(query_list):
    """Build the per-year article histogram for every query.

    Parameters
    ----------
    query_list : iterable of dict
        Each entry needs ``"query"`` (the query name) and ``"corpus"`` (its
        article list).

    Returns
    -------
    pandas.DataFrame
        One row per query: a ``"Name"`` column followed by one column per
        year from ``BASE`` to ``BASE + RANGE - 1``. The same frame is
        written to ``date_binning.csv``.

    Notes
    -----
    The frame is built in one step from a list of rows. Growing it with
    ``DataFrame.append`` in a loop is quadratic, gives every row the index
    label 0, fails on an empty ``query_list``, and is unavailable in
    pandas >= 2.0. Rows are now indexed 0..n-1.
    """
    cols = ["Name"] + [BASE + offset for offset in range(RANGE)]

    rows = [
        [query["query"]] + bin_dates_for_query(query["query"], query["corpus"])
        for query in query_list
    ]

    df = pd.DataFrame(rows, columns=cols)
    df.to_csv("date_binning.csv")
    return df

# Sentiment Analysis 

In [112]:
def get_sentiment_of_query(args):
    """Average the sentiment polarity scores over one query's articles.

    Takes a single tuple so it can be dispatched through ``Pool.map``.

    Parameters
    ----------
    args : sequence
        ``args[0]`` is the query name, ``args[1]`` the list of articles
        (dicts with a ``"body"`` string).

    Returns
    -------
    list
        ``[name, compound, pos, neu, neg]`` where the four scores are the
        means of ``SentimentIntensityAnalyzer.polarity_scores`` over all
        articles. When the query has no articles the four scores are NaN
        (previously this raised ``ZeroDivisionError``).
    """
    name = args[0]
    articles_list = args[1]

    print("Processing", name)

    length = len(articles_list)
    if length == 0:
        # No articles survived filtering: there is nothing to average.
        nan = float("nan")
        return [name, nan, nan, nan, nan]

    sid = SentimentIntensityAnalyzer()
    totals = {"compound": 0.0, "pos": 0.0, "neu": 0.0, "neg": 0.0}

    for article in articles_list:
        polarity = sid.polarity_scores(article["body"])
        for key in totals:
            totals[key] += polarity[key]

    return [name,
            totals["compound"] / length,
            totals["pos"] / length,
            totals["neu"] / length,
            totals["neg"] / length]

In [1]:
def get_sentiment(query_list, cores=4):
    """Compute mean sentiment scores for every query in parallel.

    Parameters
    ----------
    query_list : iterable of dict
        Each entry needs ``"query"`` (the query name) and ``"corpus"`` (its
        article list).
    cores : int, optional
        Number of worker processes in the pool (default 4).

    Returns
    -------
    pandas.DataFrame
        Columns ``Name, Compound, Pos, Neu, Neg``, one row per query. The
        same frame is written to ``category_sentiments.csv``.
    """
    cols = ["Name", "Compound", "Pos", "Neu", "Neg"]
    args = [(query["query"], query["corpus"]) for query in query_list]

    pool = multiprocessing.Pool(cores)
    try:
        dfrows = pool.map(get_sentiment_of_query, args)
    finally:
        # Shut the workers down even if a task raised, so no processes are
        # left behind in the notebook kernel.
        pool.close()
        pool.join()

    # Build the frame in one step; DataFrame.append is unavailable in
    # pandas >= 2.0 and assigning columns afterwards fails on an empty result.
    df = pd.DataFrame(dfrows, columns=cols)
    df.to_csv("category_sentiments.csv")
    return df


# Pipeline

In [47]:
# Read each query's corpus and keep only the articles that match a keyword;
# this also writes sanity_check_filtering.csv.
filtered_list = filter_corpus(config)

In [None]:
# Fit a topic model per query; each one is saved to <query>_topics.txt.
fetch_topics_for_queries(filtered_list)

In [96]:
# Per-year article counts for every query; also saved to date_binning.csv.
data_binning_df = bin_dates(filtered_list)

In [116]:
# Mean sentiment scores per query, computed in a process pool; also saved to
# category_sentiments.csv.
sentiment_df = get_sentiment(filtered_list)

Processing SURVEILLANCE AND LEAKS
Processing AUSTRIA AND PRIVACY AND EITHER HNA AGENCIES
Processing BULGARIA AND PRIVACY AND EITHER AGENCIES
Processing CYPRUS AND PRIVACY AND EITHER AGENCIES
Processing AUSTRIA AND SURVEILLANCE AND EITHER AGENCIES
Processing BELGIUM AND PRIVACY AND EITHER AGENCIES
Processing CYPRUS AND SURVEILLANCE AND EITHER AGENCIES
Processing CZECH REPUBLIC AND PRIVACY AND EITHER AGENCIES
Processing BELGIUM AND SURVEILLANCE AND EITHER AGENCIES
Processing DENMARK AND PRIVACY AND EITHER AGENCIES
Processing DENMARK AND SURVEILLANCE AND EITHER AGENCIES
Processing ESTONIA AND PRIVACY AND EITHER AGENCIES
Processing CZECH REPUBLIC AND SURVEILLANCE AND EITHER AGENCIES
Processing BULGARIA AND SURVEILLANCE AND EITHER AGENCIES
Processing CROATIA AND PRIVACY AND EITHER AGENCIES
Processing FINLAND AND PRIVACY AND EITHER AGENCIES
Processing ESTONIA AND SURVEILLANCE AND EITHER AGENCIES
Processing FINLAND AND SURVEILLANCE AND EITHER AGENCIES
Processing FRANCE AND PRIVACY AND EITHER 