In [1]:
import pickle
import gensim
#import pyLDAvis
#import pyLDAvis.gensim
import spacy
import pandas as pd
import nltk; nltk.download('stopwords')
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import re
import warnings
from pprint import pprint

unable to import 'smart_open.gcs', disabling that module
[nltk_data] Downloading package stopwords to /home/amy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Note: this notebook runs an HDP model to estimate the best number of topics. It is not strictly part of the main flow and can be considered optional. It also uses a smaller sample of the data so that it runs faster.

In [2]:
# English stop-word list from NLTK; read by remove_stopwords() further down.
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

#### Remove new lines

In [3]:
def strip_newline(series, replacement=''):
    """Return a list with the newline characters of each text replaced.

    Args:
        series: iterable of strings (for example a pandas Series of tweets).
        replacement: text substituted for every newline character. The
            default '' keeps the historical behaviour, which glues the last
            word of one line to the first word of the next ("a<newline>b"
            becomes "ab"); pass ' ' to keep them as separate tokens.

    Returns:
        list of str, one entry per input text, in input order.
    """
    return [review.replace('\n', replacement) for review in series]

In [4]:
# Alternative input file, kept for reference:
#tweet_df = pd.read_csv("current-tweets_cleaned.csv")
tweet_df = pd.read_csv("crawled_83k_cleaned.csv")
# The "text" column holds the tweet text (presumably already cleaned, going by
# the file name - confirm). The later cells read tweet_df.text directly.
tweets = tweet_df["text"]

#### Tokenize and remove punctuation

In [5]:
def sent_to_words(sentences):
    """Tokenize each text with gensim's simple_preprocess.

    Args:
        sentences: iterable of raw texts (for example a pandas Series).

    Yields:
        One list of str tokens per input, in input order. Missing values
        yield an empty list so the output stays aligned with the input rows.
    """
    for sentence in sentences:
        # None, or the float NaN that pandas.read_csv produces for empty
        # cells, would otherwise be stringified to "None"/"nan" and leak into
        # the vocabulary as real tokens. NaN is the only float that is not
        # equal to itself.
        if sentence is None or (isinstance(sentence, float) and sentence != sentence):
            yield []
            continue
        yield gensim.utils.simple_preprocess(str(sentence), deacc=True)

In [6]:
# Tokenize every tweet in the "text" column into a list of word tokens.
words_tr = list(sent_to_words(tweet_df["text"]))

In [7]:
# Peek at the first 10 tokens of document 21.
words_tr[21][:10]

[]

In [8]:
def remove_stopwords(texts):
    """Drop stop words from every document.

    Args:
        texts: iterable of documents. Each one is passed through str() and
            re-tokenized with simple_preprocess before filtering, so the
            elements may be raw strings or token lists.

    Returns:
        list of token lists, one per input document, without any token that
        appears in the module-level stop_words.
    """
    # Build the lookup set once per call: stop_words comes from
    # stopwords.words('english'), a list, so testing membership against it
    # directly is a linear scan for every single token.
    stop_set = set(stop_words)
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_set] for doc in texts]

In [9]:
# Rebinds words_tr: from here on it holds the stop-word-free token lists.
words_tr = remove_stopwords(words_tr)

In [10]:
def bigrams(words, bi_min=15, tri_min=10):
    """Train bigram and trigram phrase detectors on tokenized documents.

    Args:
        words: tokenized documents (a list of token lists).
        bi_min: min_count passed to the bigram Phrases model.
        tri_min: min_count passed to the trigram Phrases model.

    Returns:
        (bigram_mod, trigram_mod): Phraser wrappers around the two trained
        Phrases models, in that order.
    """
    bigram_phrases = gensim.models.Phrases(words, min_count=bi_min)
    # The trigram model is trained on the bigram-transformed documents.
    trigram_phrases = gensim.models.Phrases(bigram_phrases[words], min_count=tri_min)
    make_phraser = gensim.models.phrases.Phraser
    return make_phraser(bigram_phrases), make_phraser(trigram_phrases)

In [11]:
# Train both phrase models on the stop-word-free documents (default min counts).
bigram_tr, trigram_tr = bigrams(words_tr)

#### Check some items

In [12]:
# Document 7000 after the bigram and then the trigram transform (at most 200 tokens).
print(trigram_tr[bigram_tr[words_tr[7000]]][:200])
# Prints the repr of the bigram model object.
print(bigram_tr)

['wait', 'go', 'around', 'corner', 'apartment', 'building', 'liz', 'miss', 'seeing', 'old', 'man', 'pee', 'sidewalk']
<gensim.models.phrases.Phraser object at 0x7f053de3e198>


#### Remove stopwords and lemmatize

In [13]:
# Alternative model name, kept for reference:
#nlp = spacy.load('en', disable=['parser', 'ner'])
# Load the small English pipeline with the parser and NER components disabled;
# lemmatization() below only reads token.pos_ and token.lemma_.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
#spacy.load('en_core_web_sm')

In [14]:
def lemmatization(texts, allowed_postags=['NOUN','ADJ','VERB','ADV']):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Args:
        texts: iterable of token lists; each list is joined with single
            spaces and run through the module-level spaCy pipeline `nlp`.
        allowed_postags: coarse part-of-speech tags (token.pos_) to keep.
            The default list is only read, never mutated.

    Returns:
        list of lemma lists, one per input document, in input order.
    """
    # nlp.pipe streams the documents through the pipeline in batches, which
    # avoids the per-call overhead of invoking nlp() once per document.
    # Results come back in the same order as the inputs.
    joined_docs = (" ".join(sent) for sent in texts)
    return [
        [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        for doc in nlp.pipe(joined_docs)
    ]

#### Apply the trained bigram/trigram models to the training data - test data will later be run through the same trained models here

In [15]:
# Run every document through the bigram model first, then the trigram model.
trigrams_tr = list(map(lambda review: trigram_tr[bigram_tr[review]], words_tr))

In [16]:
# Lemmatize the phrase-merged documents, keeping the default POS tags.
lemma_lg = lemmatization(trigrams_tr)

In [17]:
# Persist the lemmatized documents to disk.
with open('tweets_lg.pkl', 'wb') as out_file:
    pickle.dump(lemma_lg, out_file)

#### Note the difference from the un-lemmatized, un-stop-worded output above

In [18]:
# Peek at the first 20 lemmas of document 8811.
lemma_lg[8811][:20]

[]

## LDA

#### Dictionary and Corpus creation for LDA

In [19]:
# Build the token <-> id dictionary from the tokenized documents.
# NOTE(review): this uses words_tr (stop words removed only), not the
# phrase-merged, lemmatized lemma_lg computed above - confirm that is intended.
id2word_lg = gensim.corpora.Dictionary(words_tr)
# Prune the vocabulary (presumably: drop tokens seen in fewer than 10 documents
# or in more than 35% of them - check the gensim filter_extremes docs).
id2word_lg.filter_extremes(no_below=10, no_above=0.35)
id2word_lg.compactify()
# Saved after pruning, so the file on disk holds the filtered dictionary.
id2word_lg.save('train_dict_lg')
# Bag-of-words vector for every document, using the pruned dictionary.
corpus_lg = [id2word_lg.doc2bow(text) for text in words_tr]

In [20]:
# Persist the bag-of-words corpus to disk.
with open('83k_tweets_lg.pkl', 'wb') as out_file:
    pickle.dump(corpus_lg, out_file)

In [21]:
# First two bag-of-words entries of document 21.
corpus_lg[21][:2]

[]

In [22]:
# Human-readable view of document 21: (token, count) pairs instead of (id, count).
[(id2word_lg[token_id], count) for token_id, count in corpus_lg[21]]

[]

## HDP Model - automatically determines the best number of topics

In [None]:
from gensim.models import HdpModel
# HDP training is stochastic; fixing random_state makes the inferred topics
# reproducible across kernel restarts.
hdp = HdpModel(corpus_lg, id2word_lg, chunksize=5000, random_state=42)

In [None]:
# Number of topics returned by print_topics().
# NOTE(review): gensim's HdpModel.print_topics appears to default to
# num_topics=20, so this length is likely capped at 20 and may not reflect the
# number of topics HDP actually inferred - confirm against the gensim docs.
len(hdp.print_topics())

In [None]:
# Display 20 of the HDP topics.
hdp.print_topics(num_topics=20)