In [11]:
import os 
import json
import gensim 


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from nltk.corpus import stopwords
import string



# Stop words: the union of NLTK's English list and scikit-learn's English list.
STOPLIST = set(stopwords.words('english')).union(ENGLISH_STOP_WORDS)
# All ASCII punctuation characters as ONE string (string.punctuation contains
# no spaces, so no splitting/joining is needed). Because this is a str,
# `tok in SYMBOLS` is a substring test, not a per-character set lookup.
SYMBOLS = string.punctuation



In [132]:
def argmax(iterable):
    """Return the position of the largest element of ``iterable``.

    If several elements tie for the maximum, the lowest index is returned.
    Raises ``ValueError`` when ``iterable`` is empty.
    """
    # Materialise first so that one-shot iterators can be indexed.
    values = list(iterable)
    return max(range(len(values)), key=values.__getitem__)

# Data Filtering

In [12]:
# Read the pipeline configuration from config.json in the working directory.
# The parsed value is later handed to filter_corpus().
with open("config.json") as json_file:
    config = json.loads(json_file.read())

In [13]:
def fetch_terms(item):
    """Build the search descriptor for one configuration entry.

    Parameters
    ----------
    item : dict
        Entry providing a ``"name"`` string and a ``"keywords"`` value.

    Returns
    -------
    dict
        * ``"name"``     -- the original name, unchanged.
        * ``"terms"``    -- a list holding the third space-separated word of
          the name, or an empty list when the name has fewer than three words.
        * ``"keywords"`` -- ``item["keywords"]`` passed through as-is.
    """
    name = item["name"]

    # Only the third word is kept. Slicing (rather than indexing with [2])
    # means a name with fewer than three words yields [] instead of raising
    # IndexError.
    third_word = name.split(" ")[2:3]

    return {
        "name": name,
        "terms": third_word,
        "keywords": item["keywords"]
    }

In [14]:
def filter_corpus(config):
    """Keep only the documents of each query that mention one of its keywords.

    For every entry of ``config`` the file ``CORPUS/<name>.json`` is loaded.
    A document is kept when at least one of the space-separated tokens of its
    ``"body"`` equals (case-insensitively) one of the entry's keywords.

    Side effect: writes ``sanity_check_filtering.csv`` in the working
    directory with one line per query:
    ``name,kept_count,total_count,kept_ratio``.

    Parameters
    ----------
    config : iterable of dict
        Entries accepted by ``fetch_terms`` (``"name"`` and ``"keywords"``).

    Returns
    -------
    list of dict
        One ``{"query": name, "corpus": kept_documents}`` per entry, in the
        same order as ``config``.
    """
    search_terms = list(map(fetch_terms, config))

    queries = []

    # The context manager guarantees the report is flushed and closed even if
    # a corpus file is missing or malformed halfway through the loop.
    with open("sanity_check_filtering.csv", "w") as result_file:
        for query in search_terms:
            # No trailing space after ".json": the previous pattern
            # ("CORPUS/%s.json ") only resolves on platforms that silently
            # strip trailing blanks from file names.
            filename = "CORPUS/%s.json" % (query["name"])

            with open(filename) as f:
                corpus_list = json.load(f)

            total_length = len(corpus_list)

            # Loop-invariant: build the lower-cased keyword set once per query.
            keyword_set = {kw.lower() for kw in query["keywords"]}

            # NOTE(review): tokens are split on single spaces only, so a word
            # glued to a newline or punctuation mark will not match a keyword.
            filtered_corpus_list = [
                corpus
                for corpus in corpus_list
                if not keyword_set.isdisjoint(
                    tok.lower() for tok in corpus["body"].split(" ")
                )
            ]
            keyword_relevant = len(filtered_corpus_list)

            queries.append({
                "query": query["name"],
                "corpus": filtered_corpus_list,
            })

            # An empty corpus file would otherwise raise ZeroDivisionError.
            ratio = keyword_relevant / total_length if total_length else 0.0

            print("%s,%d,%d,%f" % (query["name"], keyword_relevant, total_length, ratio), file=result_file)

    # Proceed pipeline with the filtered corpus list
    return queries

# Topic Modeling

In [3]:
def clean_text(article):
    """Normalise raw article text before tokenisation.

    Steps, in order: trim surrounding whitespace, turn newlines and carriage
    returns into spaces, decode three HTML entities (``&amp;`` becomes the
    word "and"), then lower-case everything.
    """
    # Applied top to bottom; each pair is (text to find, replacement).
    substitutions = (
        ("\n", " "),
        ("\r", " "),
        ("&amp;", "and"),
        ("&gt;", ">"),
        ("&lt;", "<"),
    )

    cleaned = article.strip()
    for needle, replacement in substitutions:
        cleaned = cleaned.replace(needle, replacement)

    return cleaned.lower()

def tokenize(article):
    """Split ``article`` on whitespace and discard unwanted tokens.

    A token is dropped when it is a member of ``STOPLIST`` or when it is
    found in ``SYMBOLS``. ``SYMBOLS`` is a string, so that second check is a
    substring test: any token that appears as a contiguous run inside the
    punctuation string is removed.

    Returns the surviving tokens as a list, in their original order.
    """
    return [
        word
        for word in article.split()
        if not (word in STOPLIST or word in SYMBOLS)
    ]

In [17]:
def fetch_topics_for_query(name, article_list, num_topics=50, passes=1,
                           workers=3, topics_to_show=10, random_state=None):
    """Fit an LDA topic model on one query's articles and save its topics.

    Each article's ``"body"`` is cleaned with ``clean_text`` and tokenised
    with ``tokenize``; the tokens are turned into a gensim dictionary and
    bag-of-words corpus on which ``LdaMulticore`` is trained. The topics
    returned by ``show_topics`` are written to ``<name>_topics.txt`` in the
    working directory.

    Parameters
    ----------
    name : str
        Query name; used as the prefix of the output file name.
    article_list : iterable of dict
        Articles, each with a ``"body"`` string.
    num_topics : int, optional
        Number of topics to fit (default 50).
    passes : int, optional
        Training passes over the corpus (default 1).
    workers : int, optional
        Worker processes used by ``LdaMulticore`` (default 3).
    topics_to_show : int, optional
        How many topics to request from ``show_topics`` (default 10).
    random_state : optional
        Forwarded to ``LdaMulticore``; pass a fixed value to make runs
        repeatable. The default ``None`` leaves the model unseeded.

    Returns
    -------
    None
    """
    texts = [tokenize(clean_text(doc["body"])) for doc in article_list]

    dictionary = gensim.corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    lda_model = gensim.models.ldamulticore.LdaMulticore(
        corpus=corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=passes,
        workers=workers,
        random_state=random_state,
    )

    # log=True makes gensim also emit the topics through its logger.
    topics = lda_model.show_topics(num_topics=topics_to_show, log=True, formatted=True)

    # The context manager closes the file even if writing fails.
    with open("%s_topics.txt" % name, 'w') as topics_file:
        print(topics, file=topics_file)

In [5]:
def fetch_topics_for_queries(query_list):
    """Run ``fetch_topics_for_query`` once for every entry of ``query_list``.

    Each entry must provide a ``"corpus"`` (the articles) and a ``"query"``
    (the name passed on as the first argument).
    """
    for entry in query_list:
        articles, label = entry["corpus"], entry["query"]
        fetch_topics_for_query(label, articles)

# Date Binning 

In [6]:
# Consecutive years from 1996 up to and including 2016.
years = list(range(1996, 2017))

In [None]:
def bin_dates_for_query(name, corpus):
    """Placeholder for per-query date binning; not implemented yet.

    Both arguments are currently ignored and the function returns ``None``.
    """
    # TODO: implement the binning logic.
    return None

In [None]:
def bin_dates(query_list):
    """Run ``bin_dates_for_query`` once for every entry of ``query_list``.

    Each entry must provide a ``"query"`` (the name) and a ``"corpus"`` (its
    documents), mirroring ``fetch_topics_for_queries``. Previously the loop
    looked up the corpus and then discarded it without doing anything.
    """
    for query in query_list:
        corpus = query["corpus"]
        bin_dates_for_query(query["query"], corpus)

# Pipeline

In [15]:
# Step 1: keep, for every query in config, only the keyword-matching documents.
filtered_list = filter_corpus(config)

In [18]:
# Step 2: fit a topic model per filtered query and write its topics to disk.
fetch_topics_for_queries(filtered_list)