In [2]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
from tqdm import tqdm_notebook as tqdm

from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

# Single source of truth for the notebook's seed; NumPy's global RNG is
# seeded from it so the value is not repeated as a bare literal.
rand_seed = 101
np.random.seed(rand_seed)

In [3]:
%%time
ROOT = "./csv"
df = pd.read_csv(
    ROOT + "/propertweets.csv", 
    encoding='utf-8',
    nrows = 100000,
    low_memory=False, 
    parse_dates=False
)

Wall time: 798 ms


## NMF: Non-negative Matrix Factorization

In [4]:
# TF-IDF over word unigrams and bigrams, dropping English stop words and any
# term that appears in fewer than 5 documents.
vectorizer = TfidfVectorizer(
    min_df=5,
    analyzer='word',
    ngram_range=(1, 2),
    stop_words='english',
)

# pandas reads empty CSV fields back as NaN, and TfidfVectorizer raises on NaN
# documents. Filling with '' (rather than dropping rows) keeps one row of `vz`
# per row of `df`, so positional alignment is preserved.
vz = vectorizer.fit_transform(df['tokens'].fillna(''))

In [5]:
# Hyper-parameters for the 50-component NMF topic model, gathered in one place.
nmf_params = {
    'n_components': 50,
    'random_state': 1,
    'alpha': 0.1,
    'l1_ratio': 0.5,
    'init': 'nndsvd',
}
nmf = NMF(**nmf_params)

# Fit on the TF-IDF matrix; left as the cell's last expression so the fitted
# estimator's repr is displayed.
nmf.fit(vz)

NMF(alpha=0.1, beta_loss='frobenius', init='nndsvd', l1_ratio=0.5,
  max_iter=200, n_components=50, random_state=1, shuffle=False,
  solver='cd', tol=0.0001, verbose=0)

In [6]:
# Print the highest-weighted terms for the first few NMF topics.
#
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(). Use the new accessor when it exists and
# fall back to the old one otherwise, so the cell runs on either version.
# The result is normalised to a list so `feature_names` has the same type in
# both cases.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = list(vectorizer.get_feature_names())

no_top_words = 10   # terms shown per topic
no_top_topics = 10  # topics shown (previously a bare [:10] slice)

for topic_idx, topic in enumerate(nmf.components_[:no_top_topics]):
    # argsort is ascending: reverse it and keep the leading entries to get
    # the indices of the largest weights, biggest first.
    top_term_ids = topic.argsort()[::-1][:no_top_words]
    print("Topic %d:"% (topic_idx + 1))
    print(", ".join([feature_names[i] for i in top_term_ids]))
    print()

Topic 1:
deficits, deficits isn, senate spending, spending president, paul senate, obama deficits, deficits republican, republican deficits, definition hypocrisy, hypocrisy

Topic 2:
obama watergate, turn obama, memo turn, fisa memo, turn, watergate, donaldjtrumpjr fisa, donaldjtrumpjr, fisa, memo

Topic 3:
dollar deficits, hand, trillion dollar, dollar, trillion, deficits, critical president, office critical, obama trillion, hand hand

Topic 4:
media, role spying, media panicking, desperate cover, panicking desperate, cover role, scandal timed, timed, sw rats, timed strategic

Topic 5:
ago, weeks ago, weeks, week, ago trump, week ago, spousal abuse, mueller lawyer, nfive, nsi weeks

Topic 6:
edited fbi, edited, obama edited, gun obama, gun, fbi documents, moking gun, moking, documents, fbi

Topic 7:
post, read, better read, post hear, wash post, read ny, ll read, read folks, times wash, post huff

Topic 8:
sen rand, sen, rand paul, paul, rand, trillion deficit, ta scam, scam, voted tr