# Retrieve data from MongoDB

In [None]:
import pymongo
from pymongo import MongoClient
# Connect to the local MongoDB server and grab the two Reddit collections
client = MongoClient(host='localhost', port=27017)
db = client.reddit_database
collection_top, collection_contro = db.reddit_top, db.reddit_contro

In [None]:
def _load_submissions(collection):
    """Read every document of *collection* into a 1-based dict.

    Each value is ``{'title': <title>, 'comments': [<first item of each
    stored comment entry>, ...]}``.  Only element 0 of every comment entry
    is kept (presumably the comment text -- confirm against the scraper).
    """
    submissions = {}
    for index, doc in enumerate(collection.find(), 1):
        submissions[index] = {
            'title': doc['title'],
            'comments': [entry[0] for entry in doc['comments']],
        }
    return submissions


# Titles and comments of the 'controversial' submissions, keyed 1..N
contro = _load_submissions(collection_contro)

# Titles and comments of the 'top' submissions, keyed 1..N
top = _load_submissions(collection_top)

# Clean text

In [None]:
import re
import string


# Patterns are compiled once because clean_text() runs on every title/comment.
_NON_ASCII_RE = re.compile(r'[^\x00-\x7F]+')
_LINK_RE = re.compile(r'http\S+')
# All punctuation except the apostrophe (kept so contractions survive).
# re.escape() is required: string.punctuation contains '\', ']', '^' and
# '-', which are special inside a character class.  Without it the '\'
# merely escaped the following ']' and backslashes were never removed.
_PUNCTUATION_RE = re.compile(
    '[' + re.escape(string.punctuation.replace("'", '')) + ']+')
_DIGIT_RE = re.compile(r'\d')


def clean_text(text):
    """Return a normalised, ASCII-only, lower-case version of *text*.

    Steps, in order: drop non-ASCII characters, lower-case, turn newlines
    into spaces, remove ``http...`` links, remove punctuation (apostrophes
    are kept) and remove digits.  Whitespace is not collapsed, so removed
    pieces can leave consecutive spaces behind.
    """
    # Remove non-ASCII characters
    text = _NON_ASCII_RE.sub('', text)
    # Convert to a plain ASCII string.  On Python 2 encode() already yields
    # `str`; on Python 3 it yields bytes, which must be decoded again
    # (str(bytes) would produce the literal "b'...'").
    ascii_bytes = text.encode('ascii')
    if isinstance(ascii_bytes, str):
        text = ascii_bytes
    else:
        text = ascii_bytes.decode('ascii')
    # Lower-case and replace newline characters
    text = text.lower().replace('\n', ' ')
    # Remove links
    text = _LINK_RE.sub('', text)
    # Remove punctuation
    text = _PUNCTUATION_RE.sub('', text)
    # Remove numbers
    return _DIGIT_RE.sub('', text)
        

def clean_dictionary(dictionary):
    """Run clean_text() over the title and every comment of each entry.

    The entries (and their comment lists) are modified in place; the same
    dictionary object is returned for convenience.
    """
    for entry in dictionary.values():
        entry['title'] = clean_text(entry['title'])
        cleaned = entry['comments']
        # Slice assignment keeps the original list object
        cleaned[:] = [clean_text(comment) for comment in cleaned]
        entry['comments'] = cleaned
    return dictionary

In [None]:
# Clean titles and comments of both corpora (controversial first, then top)
contro, top = clean_dictionary(contro), clean_dictionary(top)

In [None]:
test = {key: top[key] for key in top.keys()[:1]}
print test

### Remove Contractions

In [None]:
def replace_contractions(contraction_dict, corpus_dict):
    """Substitute every contraction pattern in all titles and comments.

    Each key of *contraction_dict* is used as a regular expression and each
    value as its replacement (both passed straight to re.sub).  Patterns
    are applied in the dict's iteration order.  *corpus_dict* is updated in
    place and also returned.
    """
    replacements = list(contraction_dict.items())

    def expand(text):
        for pattern, full_form in replacements:
            text = re.sub(pattern, full_form, text)
        return text

    for entry in corpus_dict.values():
        entry['comments'][:] = [expand(comment) for comment in entry['comments']]
        entry['title'] = expand(entry['title'])
    return corpus_dict

In [None]:
import pickle

# Load the contraction -> expansion mapping that was pickled beforehand.
# (pickle.load executes arbitrary code from the file; only use trusted pickles.)
with open('pickles/contractions.pickle','rb') as pickle_file:
    contraction_dict = pickle.load(pickle_file)

In [None]:
# Expand contractions in both corpora (controversial first, then top)
contro, top = (replace_contractions(contraction_dict, contro),
               replace_contractions(contraction_dict, top))

In [None]:
test = {key: top[key] for key in top.keys()[:1]}
print test

# Tokenize

In [None]:
from nltk.tokenize import WhitespaceTokenizer, word_tokenize

In [None]:
def get_tokens(corpus_dict):
    """Tokenize every comment and title of *corpus_dict* with word_tokenize.

    Each comment string and each title string is replaced by the token
    list word_tokenize returns for it.  The dictionary is modified in place
    and also returned.
    """
    for value in corpus_dict.values():
        # Use the imported word_tokenize directly: the bare name `nltk` is
        # not imported in this notebook, so `nltk.word_tokenize` raised a
        # NameError.
        value['comments'][:] = [word_tokenize(comment)
                                for comment in value['comments']]
        value['title'] = word_tokenize(value['title'])
    return corpus_dict

In [None]:
# Tokenize both corpora (controversial first, then top)
contro, top = get_tokens(contro), get_tokens(top)

In [None]:
def counter(corpus_dict):
    """Return the summed length of all titles and comments in the corpus.

    For a tokenized corpus this is the total number of token occurrences.
    """
    total = 0
    for entry in corpus_dict.values():
        total += sum(len(comment) for comment in entry['comments'])
        total += len(entry['title'])
    return total

In [None]:
print 'There are %d word occurances in the "top" corpus.' % counter(top)
print 'There are %d word occurances in the "controversial" corpus.' % counter(contro)

In [None]:
test = {key: top[key] for key in top.keys()[:1]}
print test

# Remove Stop Words

In [None]:
from nltk.corpus import stopwords

In [None]:
# Combine the pickled Fox stop list with NLTK's English stop words into one
# set for fast membership tests.
with open('pickles/fox_stoplist.pickle','rb') as pickle_file:
    fox_stoplist = pickle.load(pickle_file)
stoplist = set(
    fox_stoplist + stopwords.words('english')
)

In [None]:
def remove_stop(corpus_dict, stoplist):
    """Drop every token found in *stoplist* from all titles and comments.

    Expects tokenized titles/comments (sequences of words).  The corpus is
    updated in place and also returned.
    """
    def keep(tokens):
        return [word for word in tokens if word not in stoplist]

    for entry in corpus_dict.values():
        entry['comments'][:] = [keep(tokens) for tokens in entry['comments']]
        entry['title'] = keep(entry['title'])
    return corpus_dict

In [None]:
# Strip stop words from both corpora (controversial first, then top)
contro, top = remove_stop(contro, stoplist), remove_stop(top, stoplist)

In [None]:
print 'There are %d non-stopword occurances in the "top" corpus.' % counter(top)
print 'There are %d non-stopword occurances in the "controversial" corpus.' % counter(contro)

### Remove contraction fragments not caught before (e.g. 's, 'm, etc.)

In [None]:
def remove_contractions(corpus_dict):
    """Remove tokens that begin with an apostrophe followed by a word character.

    This targets leftover contraction fragments such as "'s" or "'m".
    Expects tokenized titles/comments.  The corpus is updated in place and
    also returned.
    """
    fragment = re.compile("\'\w+")

    def keep(tokens):
        # match() anchors at the start, so only leading apostrophes count
        return [word for word in tokens if fragment.match(word) is None]

    for entry in corpus_dict.values():
        entry['comments'][:] = [keep(tokens) for tokens in entry['comments']]
        entry['title'] = keep(entry['title'])
    return corpus_dict

In [None]:
# Remove leftover contraction fragments from both corpora
contro, top = remove_contractions(contro), remove_contractions(top)

In [None]:
print 'There are %d non-stopword occurances in the "top" corpus.' % counter(top)
print 'There are %d non-stopword occurances in the "controversial" corpus.' % counter(contro)

In [None]:
test = {key: contro[key] for key in contro.keys()[:1]}
print test

# Stemming (not necessary)

In [None]:
from nltk.stem.porter import *

def stem_tokens(corpus_dict, stemmer):
    """Replace every token by ``stemmer.stem(token)``.

    Works on tokenized titles and comments.  The token lists themselves are
    overwritten in place; the corpus dictionary is also returned.
    """
    stem = stemmer.stem
    for entry in corpus_dict.values():
        for tokens in entry['comments']:
            tokens[:] = [stem(token) for token in tokens]
        title = entry['title']
        title[:] = [stem(token) for token in title]
    return corpus_dict

In [None]:
# One Porter stemmer instance is shared by both corpora
stemmer = PorterStemmer()
contro, top = stem_tokens(contro, stemmer), stem_tokens(top, stemmer)

In [None]:
test = {key: top[key] for key in top.keys()[:1]}
print test

# Test Uniqueness

In [None]:
# RUN TWICE: once to eliminate non unique, and twice to check the updated overlap count is 0

count = 0
for _, value_contro in contro.items():
    for key, value_top in top.items():
        if value_contro['title'] == value_top['title']:
            del top[key]
            count += 1
print count

# Word Frequencies

In [None]:
from collections import Counter

def count_words(corpus_dict):
    """Return a Counter of token frequencies over all titles and comments."""
    frequencies = Counter()
    for entry in corpus_dict.values():
        for tokens in entry['comments']:
            frequencies.update(tokens)
        frequencies.update(entry['title'])
    return frequencies

In [None]:
# Token frequency tables for each corpus
contro_count, top_count = count_words(contro), count_words(top)

In [None]:
# Top 20 words: each row pairs the k-th most common (token, count) of `top`
# with the k-th most common (token, count) of `contro`.
zip(top_count.most_common(20), contro_count.most_common(20))

In [None]:
# Number of distinct tokens in each frequency table (controversial, then top)
print len(contro_count)
print len(top_count)

### Remove words that appear 5 or fewer times (not currently working)

In [None]:
top_count = {word: top_count[word] for word in top_count.keys() 
             if top_count[word] > 5}
contro_count = {word: contro_count[word] for word in contro_count.keys() 
             if contro_count[word] > 5}

print len(contro_count)
print len(top_count)

# Document-Term Matrix

In [None]:
def make_doc(corpus_dict):
    """Flatten the corpus into one list of documents.

    For every entry the title comes first, followed by each of its comments
    as separate documents.
    """
    documents = []
    for entry in corpus_dict.values():
        documents.append(entry['title'])
        documents.extend(entry['comments'])
    return documents

In [None]:
# Flat document lists (titles and comments) for each corpus
top_doc, contro_doc = make_doc(top), make_doc(contro)

### Not used....

In [None]:
def make_doc_title_only(corpus_dict):
    """Return the list of titles of all corpus entries (comments ignored)."""
    return [entry['title'] for entry in corpus_dict.values()]

In [None]:
# Title-only document lists for each corpus.
# (The second call previously used the misspelled name `make_dic_title_only`,
# which raised a NameError.)
top_doc_title = make_doc_title_only(top)
contro_doc_title = make_doc_title_only(contro)

# Part of Speech Tagging - eliminates certain parts of speech

In [None]:
def pos_tag(doc):
    """Return ``nltk.pos_tag(d)`` for every token list ``d`` in *doc*.

    The result has one entry per input document, in the same order.
    """
    # Imported locally: the notebook only imports names *from* nltk
    # submodules, so the bare `nltk` name was otherwise undefined here.
    import nltk

    return [nltk.pos_tag(tokens) for tokens in doc]

def eliminate_pos(pos, chosen):
    """Keep only the words whose tag is listed in *chosen*.

    *pos* is a list of tagged documents, each a sequence of items whose
    first element is the word and second element the tag.  Returns one
    list of kept words per document, in order.
    """
    return [
        [tagged[0] for tagged in document if tagged[1] in chosen]
        for document in pos
    ]

In [None]:
# Only include nouns: the list holds the four noun tags and no adjective
# tags, so adjectives are dropped along with everything else.
# NOTE(review): top_doc_short / contro_doc_short are not referenced again in
# the visible part of this notebook.
chosen = ['NN','NNS','NNP','NNPS']
 
top_doc_short = eliminate_pos(pos_tag(top_doc), chosen)
contro_doc_short = eliminate_pos(pos_tag(contro_doc), chosen)

# Remove words that appear in more than X% of the documents

In [None]:
def document_word_count(doc):
    """Return a Counter mapping each word to the number of documents containing it.

    A word is counted at most once per document, however often it occurs
    inside that document.
    """
    return Counter(word for tokens in doc for word in set(tokens))

In [None]:
# Per-word document frequencies for each corpus, computed on the unfiltered
# document lists (contro_doc / top_doc).
contro_doc_count = document_word_count(contro_doc)
top_doc_count = document_word_count(top_doc)
# Display the 20 words found in the most documents, top vs. controversial
zip(top_doc_count.most_common(20), contro_doc_count.most_common(20))

In [None]:
def eliminate_common_words(doc, count, num_docs=1000.0 + 859.0, max_fraction=.10):
    """Drop words that occur in too large a share of the documents.

    Parameters:
        doc: list of documents, each a sequence of words.
        count: mapping word -> number of documents containing the word
            (looked up as ``count[word]``).
        num_docs: total used as the denominator of the share.  The default
            (1000 + 859) is the value that was previously hard-coded;
            presumably the number of submissions in the two collections --
            confirm it matches the documents actually counted.
        max_fraction: a word is kept only while
            ``count[word] / num_docs < max_fraction``.

    Returns a new list of new word lists; *doc* is not modified.
    """
    total = float(num_docs)  # force true division on Python 2 as well
    return [
        [word for word in words if count[word] / total < max_fraction]
        for words in doc
    ]

In [None]:
# Document frequencies over both corpora combined
total_count = Counter(contro_doc_count)
total_count.update(top_doc_count)

# Filter each corpus against the combined document frequencies
contro_doc_no_common, top_doc_no_common = (
    eliminate_common_words(contro_doc, total_count),
    eliminate_common_words(top_doc, total_count),
)

In [None]:
# Recompute the per-word document frequencies on the filtered documents
contro_doc_count_no_common = document_word_count(contro_doc_no_common)
top_doc_count_no_common = document_word_count(top_doc_no_common)
# Display the 20 words found in the most documents after filtering
zip(top_doc_count_no_common.most_common(20), contro_doc_count_no_common.most_common(20))

# Topic Modeling

## LDA

In [None]:
from gensim import corpora, models

def make_corpus(doc):
    """Build a gensim dictionary and bag-of-words corpus from token lists.

    Returns ``(corpus, dictionary)`` where *corpus* holds one
    ``dictionary.doc2bow(...)`` result per document of *doc*.
    """
    vocabulary = corpora.Dictionary(doc)
    bags_of_words = [vocabulary.doc2bow(tokens) for tokens in doc]
    return bags_of_words, vocabulary

### Train LDA model on entire corpus

In [None]:
# Combine top and controversial documents into one list ("top" documents
# first, then "controversial") and build the shared dictionary / corpus.
total_doc = top_doc_no_common + contro_doc_no_common
corpus_total, dict_total = make_corpus(total_doc)

In [None]:
# Train a 15-topic LDA model on the combined corpus (10 passes over the data)
LDA_model_total = models.ldamodel.LdaModel(
    corpus=corpus_total,
    id2word=dict_total,
    num_topics=15,
    passes=10,
)

In [None]:
# Show all 15 topics of the combined model, 4 words per topic
print LDA_model_total.print_topics(num_topics=15, num_words=4)

### Look at categorization based on topics

In [None]:
def assign_topic(lda, documents, corpus, dictionary):
    """Return the highest-scoring topic id for each document.

    Parameters:
        lda: model providing ``get_document_topics(bow)``, which must return
            a sequence of ``(topic_id, score)`` pairs.
        documents: iterable of token lists.
        corpus: unused; kept so existing callers keep working.
        dictionary: object providing ``doc2bow(tokens)``.

    Documents for which the model returns no topics at all are skipped, so
    the result can be shorter than *documents*.  On ties the first pair
    returned by the model wins.
    """
    topics = []
    for doc in documents:
        topic_dist = lda.get_document_topics(dictionary.doc2bow(doc))
        # An explicit emptiness check replaces the former bare `except`,
        # which also hid unrelated errors.
        if not topic_dist:
            continue
        best_topic = max(topic_dist, key=lambda pair: pair[1])
        topics.append(best_topic[0])
    return topics

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

topics_given_top = assign_topic(LDA_model_total, top_doc_no_common, 
                                corpus_total, dict_total)
topics_given_contro = assign_topic(LDA_model_total, contro_doc_no_common, 
                                corpus_total, dict_total)

topic_count_top = Counter(topics_given_top)
topic_count_contro = Counter(topics_given_contro)

y_top = [(topic_count_top[topic] / float(len(topics_given_top))) for topic in topic_count_top.keys()]
y_contro = [(topic_count_contro[topic] / float(len(topics_given_contro))) for topic in topic_count_contro.keys()]
xs = np.arange(0,len(y_top))

plt.figure(figsize=(20,7))
plt.bar(xs, y_top, 0.35, label = 'top', color='b')
plt.bar(xs + 0.35, y_contro, 0.35, label = 'controversial', color='r')
plt.xlabel('Topic')
plt.ylabel('Percent of Posts')
plt.xticks(xs + 0.4,xs)
plt.legend()
plt.xlim(0,len(y_top))

print 'Total variance: ', round(sum(np.array(y_top) - np.array(y_contro)),20)

In [None]:
# NOTE(review): `LDA_model_top` is never assigned in the visible part of this
# notebook, so this cell raises a NameError unless it is defined elsewhere.
print LDA_model_top.print_topics(num_topics=10, num_words=4)

In [None]:
# Train a 20-topic LDA model for the controversial corpus.
# NOTE(review): `corpus_contro` and `dict_contro` are never assigned in the
# visible part of this notebook -- presumably they should come from
# make_corpus(contro_doc_no_common); confirm before running.
LDA_model_contro = models.ldamodel.LdaModel(corpus_contro, num_topics=20, 
                                     id2word = dict_contro)

In [None]:
# Show 10 topics of the controversial model, 4 words per topic
print LDA_model_contro.print_topics(num_topics=10, num_words=4)

## Find Optimal Topic Number

In [None]:
# Use sigma?

# Named Entity Recognition

In [None]:
import os

from nltk.tag import StanfordNERTagger

# Python does not expand '~' in paths by itself, so resolve the home
# directory explicitly; otherwise the classifier file is looked up under a
# literal '~' directory.
path = os.path.expanduser('~/Downloads/stanford-ner-2015-12-09/classifiers/')
# NOTE(review): StanfordNERTagger usually also needs the stanford-ner jar
# (path_to_jar argument or CLASSPATH) -- confirm the local setup.
st = StanfordNERTagger(os.path.join(path, 'english.muc.7class.distsim.crf.ser.gz'))
# Smoke test on a sample sentence
st.tag('Rami Eid is studying at Stony Brook University in NY'.split())

# Create TFIDF Vector

In [None]:
# Connect to the local MongoDB server and grab the two Reddit collections
client = MongoClient(host='localhost', port=27017)
db = client.reddit_database
collection_top, collection_contro = db.reddit_top, db.reddit_contro

In [None]:
# Number of documents stored in the 'controversial' collection
collection_contro.count()

In [None]:
def create_docs(collection, how):
    """Build cleaned text documents from every submission in *collection*.

    how == 'sep': the cleaned title and each cleaned comment become
        separate documents.
    anything else: one document per submission -- the cleaned title, a
        space, then every cleaned comment each preceded by a space.

    Only element 0 of each stored comment entry is used.
    """
    separate = how == 'sep'
    docs = []
    for submission in collection.find():
        title = clean_text(submission['title'])
        comment_texts = [clean_text(entry[0]) for entry in submission['comments']]
        if separate:
            docs.append(title)
            docs.extend(comment_texts)
        else:
            joined = ''.join(' ' + text for text in comment_texts)
            docs.append(title + ' ' + joined)
    return docs

In [None]:
# One combined document (title plus all comments) per submission
contro_tfidf, top_tfidf = (create_docs(collection_contro, how ='together'),
                           create_docs(collection_top, how ='together'))

### Get rid of posts in both document lists

In [None]:
# RUN TWICE
count = 0
for c in contro_tfidf:
    for t in top_tfidf:
        if c == t:
            count += 1
            top_tfidf.remove(t)
print count

In [None]:
# Number of 'top' documents left after removing the overlap
len(top_tfidf)

# LDA with Count / TFIDF Vectorizer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams and bigrams of lower-case alphabetic tokens (two or
# more letters); terms must occur in at least 2 documents and in at most 2%
# of them.
tfidf_vectorizer = TfidfVectorizer(
    analyzer='word',
    lowercase=True,
    stop_words='english',
    token_pattern='\\b[a-z][a-z]+\\b',
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.02,
)
# "top" documents first, then "controversial"; transposed after fitting
dtm_tfidf = tfidf_vectorizer.fit_transform(top_tfidf + contro_tfidf).transpose()

In [None]:
dtm_tfidf.shape   # Transposed matrix: num features / terms x num docs

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Raw term counts over unigrams of lower-case alphabetic tokens (two or more
# letters); terms must occur in at least 2 documents and in at most 5% of
# them.
count_vectorizer = CountVectorizer(
    analyzer='word',
    stop_words='english',
    token_pattern='\\b[a-z][a-z]+\\b',
    ngram_range=(1, 1),
    min_df=2,
    max_df=0.05,
)
# "top" documents first, then "controversial"; transposed after fitting
dtm_count = count_vectorizer.fit_transform(top_tfidf + contro_tfidf).transpose()

In [None]:
# Shape of the transposed count matrix
dtm_count.shape

In [None]:
# Only `corpora` and `models` were imported from gensim, so the bare name
# `gensim` was undefined here; import matutils explicitly.
from gensim import matutils

# Wrap the transposed TF-IDF matrix (documents as columns, the layout
# Sparse2Corpus reads by default) as a gensim corpus.
corpus = matutils.Sparse2Corpus(dtm_tfidf)

In [None]:
# Map each feature index of the TF-IDF vectorizer to its term
id2word = dict(enumerate(tfidf_vectorizer.get_feature_names()))

In [None]:
# Train a 90-topic LDA model on the TF-IDF corpus (single pass)
lda = models.LdaModel(
    corpus=corpus,
    num_topics=90,
    id2word=id2word,
    passes=1,
)

In [None]:
# Show all 90 topics, 6 words per topic
lda.print_topics(num_words=6, num_topics=90)

In [None]:
# Apply the trained model to the corpus it was trained on
lda_corpus = lda[corpus]

In [None]:
# Materialise the per-document results so they can be sliced later
lda_docs = list(lda_corpus)

In [None]:
def assign_topic_id2word(lda_docs):
    """Return, for each document, the topic id with the highest score.

    *lda_docs* is a sequence of lists of ``(topic_id, score)`` pairs.  On
    ties the pair listed first wins.  An empty pair list raises IndexError.
    """
    return [
        sorted(prob_list, key=lambda pair: pair[1], reverse=True)[0][0]
        for prob_list in lda_docs
    ]

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

topics_top = assign_topic_id2word(lda_docs[:len(top_tfidf)])
topics_contro = assign_topic_id2word(lda_docs[len(top_tfidf):])

topic_count_top2 = Counter(topics_top)
topic_count_contro2 = Counter(topics_contro)

y_top2 = [(topic_count_top2[topic] / float(len(topics_top))) for topic in topic_count_top2.keys()]
y_contro2 = [(topic_count_contro2[topic] / float(len(topics_contro))) for topic in topic_count_contro2.keys()]
xs = np.arange(0,len(y_top2))

plt.figure(figsize=(20,7))
plt.bar(xs, y_top2, 0.35, label = 'top', color='b')
plt.bar(xs + 0.35, y_contro2, 0.35, label = 'controversial', color='r')
plt.xlabel('Topic')
plt.ylabel('Percent of Posts')
plt.xticks(xs + 0.4,xs)
plt.legend()
plt.xlim(0,len(y_top2))

print 'Total variance: ', round(sum(np.array(y_top2) - np.array(y_contro2)),20)