In [2]:
import nltk
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups

# Initialize variables
n_samples = 2000
n_features = 1000
n_topics = 10

dataset = fetch_20newsgroups(shuffle=True, random_state=1,
                             remove=('headers', 'footers', 'quotes'))
data_samples = dataset.data[:n_samples]

# use tf feature for LDA model
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=n_features,
                                stop_words='english')
tf = tf_vectorizer.fit_transform(data_samples)

lda = LatentDirichletAllocation(n_components=n_topics, max_iter=5,
                                learning_method='online', learning_offset=50.,
                                random_state=0)
lda.fit(tf)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=50.0,
             max_doc_update_iter=100, max_iter=5, mean_change_tol=0.001,
             n_components=10, n_jobs=1, n_topics=None, perp_tol=0.1,
             random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [4]:
lda.transform(tf)

array([[  3.44893409e-03,   6.28598196e-01,   3.44908103e-03, ...,
          3.44868917e-03,   3.44943514e-03,   3.44884274e-03],
       [  3.33391110e-03,   3.33467317e-03,   9.69993827e-01, ...,
          3.33448122e-03,   3.33413230e-03,   3.33356505e-03],
       [  3.03086351e-03,   6.71061551e-01,   3.03059022e-03, ...,
          3.03054785e-03,   3.03076742e-03,   3.03062719e-03],
       ..., 
       [  2.08357969e-03,   8.84390569e-02,   2.08344836e-03, ...,
          2.08376930e-03,   2.08360955e-03,   2.08370167e-03],
       [  6.53656983e-04,   6.53705973e-04,   6.53712556e-04, ...,
          6.53665594e-04,   7.88066707e-01,   6.53729463e-04],
       [  2.00001102e-02,   2.00062768e-02,   2.21624245e-01, ...,
          2.00018814e-02,   6.18358729e-01,   2.00023251e-02]])

In [5]:
n_topics

10

In [15]:
import numpy as np

In [17]:
import gensim



In [33]:
def display_topics(model, feature_names, no_top_words=10):
    for topic_idx, topic in enumerate(model.components_):
        print ("Topic %d:" % (topic_idx))
        print (" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [34]:
display_topics(lda, tf_vectorizer.get_feature_names(), 10)

Topic 0:
edu com mail send graphics ftp pub available contact university
Topic 1:
don like just know think ve way use right good
Topic 2:
christian think atheism faith pittsburgh new bible radio games alt
Topic 3:
drive disk windows thanks use card drives hard version pc
Topic 4:
hiv health aids disease april medical care research 1993 light
Topic 5:
god people does just good don jesus say israel way
Topic 6:
55 10 11 18 15 team game 19 period play
Topic 7:
car year just cars new engine like bike good oil
Topic 8:
people said did just didn know time like went think
Topic 9:
key space law government public use encryption earth section security


In [35]:
doc1 = "Sugar is bad to consume. My sister likes to have sugar, but not my father."
doc2 = "My father spends a lot of time driving my sister around to dance practice."
doc3 = "Doctors suggest that driving may cause increased stress and blood pressure."
doc4 = "Sometimes I feel pressure to perform well at school, but my father never seems to drive my sister to do better."
doc5 = "Health experts say that Sugar is not good for your lifestyle."

# compile documents
doc_complete = [doc1, doc2, doc3, doc4, doc5]

In [73]:
def standardize_text(df, text_field):
    df[text_field] = df[text_field].str.replace(r"http\S+", "")
    df[text_field] = df[text_field].str.replace(r"http", "")
    df[text_field] = df[text_field].str.replace(r"@\S+", "")
    df[text_field] = df[text_field].str.replace(r"[^A-Za-z0-9(),!?@\'\`\"\_\n]", " ")
    df[text_field] = df[text_field].str.replace(r"@", "at")
    df[text_field] = df[text_field].str.lower()
    df[text_field] = df[text_field].str.replace('\s+', ' ')
    df[text_field] = df[text_field].str.replace("\'", "")
    return df

In [38]:
import pandas as pd

In [42]:
data=pd.DataFrame(doc_complete,columns=['text'])

In [43]:
data=standardize_text(data,'text')

In [44]:
data

Unnamed: 0,text
0,sugar is bad to consume my sister likes to ha...
1,my father spends a lot of time driving my sist...
2,doctors suggest that driving may cause increas...
3,sometimes i feel pressure to perform well at s...
4,health experts say that sugar is not good for ...


In [45]:
from nltk import word_tokenize,WordNetLemmatizer

In [46]:
stopwords=nltk.corpus.stopwords.words('english')
wn=WordNetLemmatizer()

In [47]:
def clean_text(text):
    text = "".join([word for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [wn.lemmatize(word) for word in tokens if word not in stopwords]
    return text

In [52]:
data['body_text_clean'] = data['text'].apply(lambda x: clean_text(x))

In [51]:
import string,re

In [53]:
data

Unnamed: 0,text,body_text_clean
0,sugar is bad to consume my sister likes to ha...,"[sugar, bad, consume, sister, like, sugar, fat..."
1,my father spends a lot of time driving my sist...,"[father, spends, lot, time, driving, sister, a..."
2,doctors suggest that driving may cause increas...,"[doctor, suggest, driving, may, cause, increas..."
3,sometimes i feel pressure to perform well at s...,"[sometimes, feel, pressure, perform, well, sch..."
4,health experts say that sugar is not good for ...,"[health, expert, say, sugar, good, lifestyle, ]"


In [58]:
tfidf_vect = CountVectorizer(analyzer=clean_text,ngram_range=(1,2))
tfidf_vect_fit = tfidf_vect.fit(data['text'])
tfidf_train = tfidf_vect_fit.transform(data['text'])

In [65]:
lda = LatentDirichletAllocation(n_components=3, max_iter=5,
                                learning_method='online', learning_offset=50.,
                                random_state=0)
lda.fit(tfidf_train)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=50.0,
             max_doc_update_iter=100, max_iter=5, mean_change_tol=0.001,
             n_components=3, n_jobs=1, n_topics=None, perp_tol=0.1,
             random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [66]:
display_topics(lda, tfidf_vect_fit.get_feature_names(), )

Topic 0:
 good increased
Topic 1:
father sister sugar
Topic 2:
lot dance spends


In [67]:
from gensim import corpora

In [None]:
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

In [69]:
df = pd.read_json('https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json')
print(df.target_names.unique())
df.head()

['rec.autos' 'comp.sys.mac.hardware' 'rec.motorcycles' 'misc.forsale'
 'comp.os.ms-windows.misc' 'alt.atheism' 'comp.graphics'
 'rec.sport.baseball' 'rec.sport.hockey' 'sci.electronics' 'sci.space'
 'talk.politics.misc' 'sci.med' 'talk.politics.mideast'
 'soc.religion.christian' 'comp.windows.x' 'comp.sys.ibm.pc.hardware'
 'talk.politics.guns' 'talk.religion.misc' 'sci.crypt']


Unnamed: 0,content,target,target_names
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7,rec.autos
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4,comp.sys.mac.hardware
10,From: irwin@cmptrc.lonestar.org (Irwin Arnstei...,8,rec.motorcycles
100,From: tchen@magnus.acs.ohio-state.edu (Tsung-K...,6,misc.forsale
1000,From: dabl2@nlm.nih.gov (Don A.B. Lindbergh)\n...,2,comp.os.ms-windows.misc


In [74]:
df=standardize_text(df,'content')

In [75]:
df.head()

Unnamed: 0,content,target,target_names
0,from lerxst (wheres my thing) subject what car...,7,rec.autos
1,from guykuo (guy kuo) subject si clock poll fi...,4,comp.sys.mac.hardware
10,from irwin (irwin arnstein) subject re recomme...,8,rec.motorcycles
100,from tchen (tsung kun chen) subject software f...,6,misc.forsale
1000,from dabl2 (don a b lindbergh) subject diamond...,2,comp.os.ms-windows.misc


In [76]:
df['body'] = df['content'].apply(lambda x: clean_text(x))

In [77]:
df.head()

Unnamed: 0,content,target,target_names,body
0,from lerxst (wheres my thing) subject what car...,7,rec.autos,"[lerxst, wheres, thing, subject, car, nntp, po..."
1,from guykuo (guy kuo) subject si clock poll fi...,4,comp.sys.mac.hardware,"[guykuo, guy, kuo, subject, si, clock, poll, f..."
10,from irwin (irwin arnstein) subject re recomme...,8,rec.motorcycles,"[irwin, irwin, arnstein, subject, recommendati..."
100,from tchen (tsung kun chen) subject software f...,6,misc.forsale,"[tchen, tsung, kun, chen, subject, software, f..."
1000,from dabl2 (don a b lindbergh) subject diamond...,2,comp.os.ms-windows.misc,"[dabl2, b, lindbergh, subject, diamond, ss24x,..."




<gensim.interfaces.TransformedCorpus object at 0x0000000029616A90>


In [100]:
data = df.body.values.tolist()

In [102]:
bigram = gensim.models.Phrases(data, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data], threshold=100)  

bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data]])



<gensim.interfaces.TransformedCorpus object at 0x0000000029616828>


In [103]:
def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

In [104]:
data_big=make_bigrams_bigrams(data)

In [106]:
id2word = corpora.Dictionary(data_big)

In [107]:
corpus = [id2word.doc2bow(text) for text in data_big]

In [83]:
id2word.

[[(0, 2), (1, 1), (2, 1), (3, 1), (4, 5), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 2), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 2), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 1)]]


In [108]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [96]:
import pprint

In [110]:
print(lda_model.print_topics())
#doc_lda = lda_model[corpus]

[(0, '0.035*"max" + 0.015*"card" + 0.013*"system" + 0.012*"drive" + 0.009*"software" + 0.008*"use" + 0.007*"standard" + 0.007*"info" + 0.007*"available" + 0.006*"help"'), (1, '0.018*"would" + 0.016*"one" + 0.012*"dont" + 0.010*"time" + 0.010*"know" + 0.009*"like" + 0.008*"think" + 0.008*"people" + 0.007*"writes" + 0.007*"well"'), (2, '0.023*"god" + 0.020*"jesus" + 0.015*"christian" + 0.009*"life" + 0.007*"word" + 0.007*"book" + 0.006*"believe" + 0.006*"law" + 0.006*"bible" + 0.006*"belief"'), (3, '0.123*"q" + 0.074*"mr_stephanopoulos" + 0.018*"r" + 0.013*"gr" + 0.010*"cub_suck" + 0.008*"4m" + 0.008*"anger" + 0.008*"h" + 0.007*"l" + 0.006*"postscript"'), (4, '0.030*"00" + 0.026*"26" + 0.022*"ca" + 0.019*"religion" + 0.012*"king" + 0.011*"34" + 0.009*"hour" + 0.008*"directory" + 0.008*"mission" + 0.008*"st"'), (5, '0.019*"game" + 0.017*"team" + 0.014*"year" + 0.010*"player" + 0.008*"win" + 0.008*"play" + 0.007*"season" + 0.007*"jew" + 0.006*"first" + 0.005*"nhl"'), (6, '0.022*"car" + 0.0

In [93]:
doc_lda

<gensim.interfaces.TransformedCorpus at 0x29616f60>