### Project 4
### Brendon Happ
### NLP

In [3]:
import pandas as pd
import numpy as np
import os
import pickle
import re
from smart_open import smart_open

import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

import spacy

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim import corpora, models, similarities, matutils
from gensim.models.phrases import Phraser, Phrases
from gensim.models.ldamulticore import LdaMulticore

In [87]:
# Collect every pickled DataFrame stored under `clean_pickles` (sub-folders included).
full_df_list = []

for (dirname, dirs, files) in os.walk('clean_pickles'):
    for filename in files:
        # Join with the directory currently being walked, not the hard-coded root:
        # os.walk descends into sub-directories, and files found there would
        # otherwise be looked up in the wrong place.
        # NOTE(review): unpickling executes arbitrary code; only load trusted files.
        with open(os.path.join(dirname, filename), 'rb') as f:
            full_df_list.append(pd.read_pickle(f))

In [88]:
# Stack the per-file frames row-wise (the default axis) and renumber the index.
full_df = pd.concat(full_df_list, ignore_index=True)

In [89]:
# Show the column names of the combined frame.
full_df.columns

Index(['author', 'content', 'region', 'language', 'publish_date', 'following',
       'followers', 'updates', 'post_type', 'account_type', 'retweet',
       'account_category', 'new_june_2018', 'tweet_id'],
      dtype='object')

In [6]:
#def remove_stopwords(df):
#    df = ' '.join(word for word in df.split() if word not in words)
#    return df

In [7]:
#clean_col = full_df['content'].apply(remove_stopwords)
#full_df.loc[:, 'content'] = clean_col.values


In [5]:
# Keep only the tweet text column for the NLP steps below.
tweets = full_df['content']

### Tokenization

First, tokenize the documents and remove common words.

In [90]:
# Sanity check: shape of the tweet text Series.
tweets.shape

(2116866,)

In [91]:
# Load the spaCy pipeline registered as 'en' with the parser and NER components disabled.
# NOTE(review): the 'en' shortcut name is presumably only valid on spaCy 2.x — confirm
# the installed version before re-running.
nlp = spacy.load('en', disable=['parser', 'ner'])

In [92]:
# English Snowball stemmer.
# NOTE(review): `stemmer` is not referenced by any other cell visible in this notebook.
stemmer = SnowballStemmer("english")

In [93]:
_STOPWORDS_EN = None  # lazily-built set of NLTK English stop words, shared across calls


def _english_stopwords():
    '''Return the NLTK English stop-word set, reading the word list on first use only.'''
    global _STOPWORDS_EN
    if _STOPWORDS_EN is None:
        _STOPWORDS_EN = frozenset(stopwords.words("english"))
    return _STOPWORDS_EN


def preprocess_text(text, pipeline=None, stop_words=None):
    '''
    Tokenise and lemmatise `text` with spaCy; return a string of space-separated lemmas.

    Tokens are dropped when they are one or two characters long or when their
    lower-cased text is a stop word. Pronouns, which spaCy lemmatises to the
    placeholder "-PRON-", are kept in their original surface form.

    Parameters
    ----------
    text : str
        Raw document. Anything that is not a string (e.g. a missing value)
        yields an empty string so that callers keep one output per input.
    pipeline : callable, optional
        Callable mapping a string to an iterable of token objects exposing
        `lower_`, `lemma_`, `text` and `len()`. Defaults to the notebook-level
        `nlp` pipeline.
    stop_words : collection of str, optional
        Lower-case words to discard. Defaults to NLTK's English stop words.

    Returns
    -------
    str
        The retained lemmas joined by single spaces.
    '''
    if not isinstance(text, str):
        return ""
    if pipeline is None:
        pipeline = nlp
    if stop_words is None:
        # Built once and reused; rebuilding the set per call is wasted work.
        stop_words = _english_stopwords()

    kept = []
    for token in pipeline(text):
        # Tokens of length 1 or 2 carry little topical signal.
        if len(token) <= 2:
            continue
        # Compare the token's *text*: a spaCy Token is not a str, so testing the
        # Token object itself against a set of strings never matches.
        if token.lower_ in stop_words:
            continue
        # No memo dict: the lemma is a plain attribute of the token, and a cache
        # keyed by Token objects never hits because tokens are unique per document.
        lemma = token.lemma_
        kept.append(token.text if lemma == "-PRON-" else str(lemma))
    return " ".join(kept)

In [None]:
# Start from an empty module-level `cache`, then preprocess every tweet in order.
cache = {}
tokenized_tweets = [preprocess_text(raw_tweet) for raw_tweet in tweets]

In [131]:
#def ngrams_split(lst, n):
#    return [' '.join(lst[i:i+n]) for i in range(len(lst)-n)]

In [132]:
#bigram_tweets = []
#for tweet in tokenized_tweets:
#    bigram_tweets.append(ngrams_split(tweet.split(), 2))

### Topic/Concept Modeling

To convert documents to vectors, use a **bag of words** representation. In this representation, each document is represented by one vector where a vector element i represents the number of times the ith word appears in the document.

It is advantageous to represent the words only by their (integer) ids. The mapping between the words and ids is called a dictionary:

In [133]:
# gensim expects each document as a list of tokens, so split the
# space-joined strings back into token lists.
gensim_tweets = [joined.split() for joined in tokenized_tweets]

**Add bigrams to gensim formatted tweets**

In [194]:
#phrases = Phrases(gensim_tweets, min_count=3, threshold=100)
#bigram = Phraser(phrases)

In [195]:
#bi_tweets = []
#for sent in bigram[gensim_tweets]:
#    bi_tweets.append(sent)

In [4]:
# Reload the tweets stored in 'bi_tweets.pickle'.
# NOTE(review): presumably written by the (now commented-out) Phraser cells — confirm.
# Unpickling executes arbitrary code; only load files you trust.
bi_tweets = pd.read_pickle('bi_tweets.pickle')

**Remove Stops**

In [5]:
# NLTK's English stop-word list as a set, for fast membership tests.
stops = set(stopwords.words("english"))

In [16]:
# Drop stop words from every tweet while keeping one (possibly empty) list per tweet.
filtered_tweets = [
    [token for token in sentence if token not in stops]
    for sentence in bi_tweets
]

In [18]:
# Checkpoint the stop-word-filtered tweets so later sessions can skip the steps above.
with open('filtered_bi_tweets.pickle', 'wb') as out_fh:
    pickle.dump(filtered_tweets, out_fh, protocol=pickle.HIGHEST_PROTOCOL)

In [19]:
# Sanity check: number of tweets after stop-word filtering.
len(filtered_tweets)

2116866

In [20]:
# Build the token -> integer id mapping over all filtered tweets.
dictionary = corpora.Dictionary(filtered_tweets)
# Make sure the output folder exists; saving would otherwise fail on a fresh checkout.
os.makedirs('temp_folder', exist_ok=True)
dictionary.save(os.path.join('temp_folder', 'tweets.dict'))  # store the dictionary, for future reference
print(dictionary)

Dictionary(396124 unique tokens: ['attack', 'bank', 'home', 'israeli', 'news']...)


**Filter out words that appear in fewer than 100 documents or in more than 50% of the documents**

In [21]:
# Vocabulary pruning thresholds: minimum number of documents a token must
# appear in, and maximum fraction of documents it may appear in.
NO_BELOW = 100
NO_ABOVE = 0.5

# NOTE: this mutates `dictionary` in place, so re-running the cell prunes the
# already-pruned vocabulary again.
dictionary.filter_extremes(no_below=NO_BELOW, no_above=NO_ABOVE)
dictionary.compactify()
# Filtering reassigns token ids, so the dictionary saved before this step no
# longer matches any corpus built afterwards. Persist the filtered mapping too,
# under a separate name so the unfiltered file is left untouched.
dictionary.save(os.path.join('temp_folder', 'tweets_filtered.dict'))

**The function doc2bow() simply counts the number of occurrences of each distinct word, converts the word to its integer word id and returns the result as a bag-of-words--a sparse vector, in the form of [(word_id, word_count), ...].**

**doc2bow() behaves similarly to calling transform() on CountVectorizer. With `allow_update=True` it also adds unseen words to the dictionary, which makes it behave like fit_transform().**

In [22]:
def bag_of_words_generator(lst, dictionary):
    '''
    Lazily convert tokenised documents to bag-of-words vectors.

    Yields `dictionary.doc2bow(doc)` for each document in `lst`, in order,
    without materialising the whole corpus in memory.
    '''
    yield from (dictionary.doc2bow(document) for document in lst)

In [23]:
# Stream the bag-of-words vectors to disk, then reopen the corpus from that
# file so it can be iterated later without holding it in memory.
corpus_path = os.path.join('temp_folder', 'tweets.mm')
bow_stream = bag_of_words_generator(filtered_tweets, dictionary)
corpora.MmCorpus.serialize(corpus_path, bow_stream)
corpus = corpora.MmCorpus(corpus_path)


In [24]:
# Sanity check: number of terms recorded in the serialized corpus.
corpus.num_terms

10667

In [25]:
# Sanity check: number of documents recorded in the serialized corpus.
corpus.num_docs

2116866

In [26]:
# LDA training is stochastic; fix the seed so the topics printed below can be
# regenerated after a kernel restart.
# NOTE(review): with several worker processes the result may still vary
# slightly from run to run — confirm against the gensim documentation.
LDA_RANDOM_STATE = 42
lda = LdaMulticore(corpus, num_topics=5, id2word=dictionary, workers=7,
                   random_state=LDA_RANDOM_STATE)


In [27]:
# Print every topic (-1 = all) with its highest-weighted words.
topic_template = "Topic: {} \nWords: {}"
for topic_id, topic_terms in lda.print_topics(-1):
    print(topic_template.format(topic_id, topic_terms))
    print("\n")

Topic: 0 
Words: 0.017*"hillary" + 0.016*"video" + 0.010*"love" + 0.010*"say" + 0.009*"see" + 0.009*"make" + 0.009*"one" + 0.008*"medium" + 0.008*"world" + 0.008*"know"


Topic: 1 
Words: 0.026*"get" + 0.016*"people" + 0.015*"black" + 0.013*"like" + 0.010*"good" + 0.008*"life" + 0.007*"need" + 0.007*"think" + 0.007*"work" + 0.007*"white"


Topic: 2 
Words: 0.031*"world" + 0.026*"sport" + 0.012*"news" + 0.012*"new" + 0.009*"day" + 0.008*"win" + 0.007*"play" + 0.007*"game" + 0.006*"year" + 0.006*"first"


Topic: 3 
Words: 0.046*"news" + 0.018*"police" + 0.016*"man" + 0.015*"kill" + 0.014*"topnew" + 0.013*"world" + 0.009*"say" + 0.008*"shoot" + 0.007*"woman" + 0.006*"charge"


Topic: 4 
Words: 0.043*"trump" + 0.015*"obama" + 0.011*"topnew" + 0.010*"news" + 0.010*"president" + 0.009*"say" + 0.008*"world" + 0.008*"vote" + 0.007*"amp" + 0.007*"politic"


