### 1) Compare additional topic modelling with TF-IDF (Term Frequency-Inverse Document Frequency) and dimensionality reduction with NMF (Non-negative Matrix Factorization)
### 2) Visualize intertopic distance
### 3) Sum token counts, take the top 100 words, and write them to a JavaScript file (`gword.js`) for a D3 word cloud visualization

In [7]:
import pickle
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

In [2]:
# Read the pickled corpus object from disk (it is used as a pandas DataFrame below).
# NOTE: unpickling can execute arbitrary code -- only load pickle files you trust.
with open('corpus_final.pkl', 'rb') as corpus_file:
    df = pickle.load(corpus_file)

In [3]:
# Reorder the rows by the 'date' column (ascending is the sort_values default).
df = df.sort_values(by='date')

In [15]:
# Settings shared by the NMF and LDA runs below.
n_samples = 2000  # NOTE(review): not referenced by any code visible in this notebook
n_features = 1000  # passed as max_features to both vectorizers
n_topics = 25  # number of topics/components requested from both NMF and LDA
n_top_words = 20  # how many top-weighted words print_top_words shows per topic


def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    For each row of ``model.components_`` a ``Topic #<index>:`` header line is
    printed, followed by one line holding the ``n_top_words`` terms with the
    largest weights, in descending weight order and separated by single
    spaces. A single blank line is printed after the last topic.

    Args:
        model: Object exposing ``components_``, an iterable of per-topic
            weight arrays that support ``argsort``.
        feature_names: Sequence mapping a column index to its term.
        n_top_words: Number of terms to print per topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        # argsort is ascending: reverse it, then keep the leading entries.
        ranked_columns = weights.argsort()[::-1][:n_top_words]
        top_terms = [feature_names[col] for col in ranked_columns]
        print(" ".join(top_terms))
    print()


# Both vectorizers are configured identically: max_df/min_df document-frequency
# cut-offs, a vocabulary capped at n_features, and English stop words removed.
vectorizer_options = dict(max_df=0.65, min_df=.01,
                          max_features=n_features,
                          stop_words='english')

# TF-IDF weighted document-term matrix: the input for NMF.
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = TfidfVectorizer(**vectorizer_options)
tfidf = tfidf_vectorizer.fit_transform(df.corpus)

# Raw term-count document-term matrix: the input for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(**vectorizer_options)
tf = tf_vectorizer.fit_transform(df.corpus)

# Factorize the TF-IDF matrix into n_topics components.
nmf = NMF(n_components=n_topics, random_state=1, alpha=.1, l1_ratio=.5)
nmf.fit(tfidf)

print("\nTopics in NMF model:")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

# Fit online LDA on the raw counts so its topics can be compared with NMF's.
lda = LatentDirichletAllocation(n_topics=n_topics,
                                max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
lda.fit(tf)

print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)

Extracting tf-idf features for NMF...
Extracting tf features for LDA...

Topics in NMF model:
Topic #0:
love feel story ask man great guy old experience write walk person hand believe friend leave end turn moment god
Topic #1:
woman men girl sex gender female violence male man young daughter mother story boy black mom voice sister medium father
Topic #2:
sound voice noise listen hear song sing play record wave color visual baby bird note light eye speech affect example
Topic #3:
cancer patient disease doctor health drug treatment medical care surgery hospital medicine heart treat blood die trial test hiv body
Topic #4:
ocean water fish sea animal ice specie planet surface earth deep area island foot creature ship place plastic blue land
Topic #5:
music play song piece sing video end dance hear note audience okay brother sorry dream excite century lady eye ted
Topic #6:
kid school teacher student education teach learn class college math girl high graduate parent community university pro

In [17]:
# from sklearn.externals import joblib
# joblib.dump(nmf, 'nmf_model_1_25_topics_65_df.pkl') 

['nmf_model_1_25_topics_65_df.pkl']

### Visualize NMF topics. NMF produces a greater range of intertopic distance/variation than previous LDA with count-vectorizer. This will be the final model. 

In [58]:
import pyLDAvis
import pyLDAvis.sklearn
from IPython.display import display

# Setup to run in Jupyter notebook
pyLDAvis.enable_notebook()

# Create the visualization.
# The document-term matrix handed to prepare() must be the one the model was
# fitted on. `nmf` was fitted on the TF-IDF matrix `tfidf` (built by
# `tfidf_vectorizer`), so pass `tfidf` here rather than the raw-count matrix
# `tf`, which belongs to the LDA model.
vis = pyLDAvis.sklearn.prepare(nmf, tfidf, tfidf_vectorizer)

# Export as a standalone HTML web page
pyLDAvis.save_html(vis, 'nfm_65_df.html')

# Render the interactive visualization inline in the notebook.
display(vis)

  kernel = (topic_given_term * np.log((topic_given_term.T / topic_proportion).T))
  log_lift = np.log(topic_term_dists / term_proportion)
  log_ttd = np.log(topic_term_dists)
  relevance = lambda_ * log_ttd + (1 - lambda_) * log_lift
  relevance = lambda_ * log_ttd + (1 - lambda_) * log_lift


### Sum vectorized token counts, convert to a dictionary, and write the top 100 words out to a JavaScript file (`gword.js`) for the word cloud visualization

In [22]:
# Term-by-document count table: after the transpose there is one row per
# vocabulary term and one column per document. The labels are taken from
# tf_vectorizer because that is the vectorizer that produced `tf`; labelling
# with tfidf_vectorizer only lines up while both vectorizers share settings.
df_counts = pd.DataFrame(tf.toarray(),
                         columns=tf_vectorizer.get_feature_names()).T

In [28]:
# Number of rows in df_counts, i.e. one per vocabulary term after the transpose.
len(df_counts)

5391

In [24]:
# Sum every row (one term) across all of its columns and store it as 'total'.
# NOTE(review): re-running this cell folds the existing 'total' column into the
# sum, doubling the totals -- rebuild df_counts first if the cell is re-executed.
df_counts['total'] = df_counts.sum(axis=1)
# Per-term totals ordered from most to least frequent.
df_total = df_counts['total'].sort_values(ascending=False)
# Plain {term: total} mapping consumed by the word-cloud export below.
counts = dict(df_total)

In [27]:
# Rank all terms by total count (largest first) and keep the 100 most frequent.
words = sorted(counts, key=counts.get, reverse=True)
top_words = words[:100]  # top 100 words

# Count range of the selected words, used to normalise the font sizes.
top_counts = [counts[w] for w in top_words]
highest = max(top_counts) if top_counts else None
lowest = min(top_counts) if top_counts else None

# Spread the font sizes across 20-100 based on the count
bigsize = 80
smallsize = 20

# A zero range (a single word, or every selected word sharing one count) would
# divide by zero below; in that case every word gets the same mid-range size.
count_range = (highest - lowest) if top_counts else 0

entries = []
for word in top_words:
    if count_range:
        scaled = (counts[word] - lowest) / float(count_range)
    else:
        scaled = 0.5
    size = int((scaled * bigsize) + smallsize)
    # Escape backslashes and single quotes so the word stays a valid
    # single-quoted JavaScript string literal.
    js_text = word.replace("\\", "\\\\").replace("'", "\\'")
    entries.append("{text: '" + js_text + "', size: " + str(size) + "}")

# Write `gword = [ ... ];` with one entry per line. The context manager closes
# the file, and UTF-8 lets non-ASCII words be written instead of being skipped
# (skipping used to leave a dangling separator in the output).
with open('gword.js', 'w', encoding='utf-8') as fhand:
    fhand.write("gword = [")
    fhand.write(",\n".join(entries))
    fhand.write("\n];\n")

4

### Count total words

In [48]:
# Materialize df.text as a plain Python list (one entry per row of df).
text = list(df.text)

In [49]:
# Flatten one level: every element of every entry in `text` lands in one list.
flat = [piece for entry in text for piece in entry]

In [52]:
# Concatenate all pieces into one string with no separator.
# NOTE(review): because nothing is inserted between pieces, the last token of one
# piece and the first token of the next fuse unless the pieces carry their own
# surrounding whitespace -- confirm against the structure of df.text.
everything = ''.join(flat)

In [55]:
# Split on runs of whitespace to get the list of tokens (rebinds `everything`).
everything = everything.split()

In [56]:
# Total number of whitespace-separated tokens.
len(everything) # almost 5 million words from the original unprocessed corpus

4759385