In [3]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils  import simple_preprocess
from gensim.models import CoherenceModel

#spacy
import spacy

#plotting tools
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [5]:
# Stop-word list: NLTK's English stop words plus a few extra tokens
# (e.g. the header words 'from' and 'subject').
from nltk.corpus import stopwords
stop_words = stopwords.words('english') + ['from', 'subject', 're', 'edu', 'use']

In [7]:
# Load the newsgroups dataset from a remote JSON file and list its distinct labels.
df = pd.read_json(
    'https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json'
)
print(df['target_names'].unique())

['rec.autos' 'comp.sys.mac.hardware' 'rec.motorcycles' 'misc.forsale'
 'comp.os.ms-windows.misc' 'alt.atheism' 'comp.graphics'
 'rec.sport.baseball' 'rec.sport.hockey' 'sci.electronics' 'sci.space'
 'talk.politics.misc' 'sci.med' 'talk.politics.mideast'
 'soc.religion.christian' 'comp.windows.x' 'comp.sys.ibm.pc.hardware'
 'talk.politics.guns' 'talk.religion.misc' 'sci.crypt']


In [8]:
# Overview of the text data: preview the first rows of the raw frame
# (columns: content, target, target_names).
df.head()

Unnamed: 0,content,target,target_names
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7,rec.autos
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4,comp.sys.mac.hardware
10,From: irwin@cmptrc.lonestar.org (Irwin Arnstei...,8,rec.motorcycles
100,From: tchen@magnus.acs.ohio-state.edu (Tsung-K...,6,misc.forsale
1000,From: dabl2@nlm.nih.gov (Don A.B. Lindbergh)\n...,2,comp.os.ms-windows.misc


In [10]:
# Data cleaning: remove e-mail addresses, newline characters and apostrophes.
# The patterns are raw strings so that the regex escapes (\S, \s) are not parsed
# as (invalid) Python string escapes, which emits a warning on recent Pythons.
data = df.content.values.tolist()
data = [re.sub(r'\S*@\S*\s?', '', sent) for sent in data]  # e-mail addresses
data = [re.sub(r'\s+', ' ', sent) for sent in data]        # newlines / whitespace runs -> one space
data = [re.sub("'", "", sent) for sent in data]            # apostrophes
pprint(data[:1])

['From: (wheres my thing) Subject: WHAT car is this!? Nntp-Posting-Host: '
 'rac3.wam.umd.edu Organization: University of Maryland, College Park Lines: '
 '15 I was wondering if anyone out there could enlighten me on this car I saw '
 'the other day. It was a 2-door sports car, looked to be from the late 60s/ '
 'early 70s. It was called a Bricklin. The doors were really small. In '
 'addition, the front bumper was separate from the rest of the body. This is '
 'all I know. If anyone can tellme a model name, engine specs, years of '
 'production, where this car is made, history, or whatever info you have on '
 'this funky looking car, please e-mail. Thanks, - IL ---- brought to you by '
 'your neighborhood Lerxst ---- ']


In [12]:
# Tokenize each document into a list of word tokens.
def sent_to_words(sentences):
    """Lazily yield one token list per item of ``sentences``.

    Every item is converted with ``str`` and handed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True``.
    """
    yield from (
        gensim.utils.simple_preprocess(str(text), deacc=True)
        for text in sentences
    )

# Materialise the token generator so the documents can be indexed and reused.
data_words = [tokens for tokens in sent_to_words(data)]
pprint(data_words[0:1])

[['from',
  'wheres',
  'my',
  'thing',
  'subject',
  'what',
  'car',
  'is',
  'this',
  'nntp',
  'posting',
  'host',
  'rac',
  'wam',
  'umd',
  'edu',
  'organization',
  'university',
  'of',
  'maryland',
  'college',
  'park',
  'lines',
  'was',
  'wondering',
  'if',
  'anyone',
  'out',
  'there',
  'could',
  'enlighten',
  'me',
  'on',
  'this',
  'car',
  'saw',
  'the',
  'other',
  'day',
  'it',
  'was',
  'door',
  'sports',
  'car',
  'looked',
  'to',
  'be',
  'from',
  'the',
  'late',
  'early',
  'it',
  'was',
  'called',
  'bricklin',
  'the',
  'doors',
  'were',
  'really',
  'small',
  'in',
  'addition',
  'the',
  'front',
  'bumper',
  'was',
  'separate',
  'from',
  'the',
  'rest',
  'of',
  'the',
  'body',
  'this',
  'is',
  'all',
  'know',
  'if',
  'anyone',
  'can',
  'tellme',
  'model',
  'name',
  'engine',
  'specs',
  'years',
  'of',
  'production',
  'where',
  'this',
  'car',
  'is',
  'made',
  'history',
  'or',
  'whatever',
  

In [15]:
# Build the bigram and trigram phrase models.
# The trigram model is trained on documents whose bigrams were already merged.
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)

# Wrap the trained models in Phraser objects, which are used to apply them.
bigram_mode = gensim.models.phrases.Phraser(bigram)
trigram_mode = gensim.models.phrases.Phraser(trigram)

# See the result: apply both models to the first document.
# (Printing the Phraser object itself only shows its repr, not any phrases.)
print(trigram_mode[bigram_mode[data_words[0]]])



NameError: name 'bigram_mod' is not defined

In [20]:
# Helper functions: remove stop words, merge bigrams and trigrams, and lemmatize
def remove_stopwords(texts):
    """Tokenize each document and drop every token listed in ``stop_words``.

    Args:
        texts: iterable of documents; each one is converted with ``str`` and
            tokenized with ``simple_preprocess``.

    Returns:
        A list with one list of remaining tokens per input document.
    """
    # Build the lookup set once: membership tests on the module-level
    # ``stop_words`` list are linear, on a set they are constant-time.
    stop_set = set(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stop_set]
        for doc in texts
    ]

def make_bigrams(texts):
    """Apply the module-level ``bigram_mode`` phraser to every document in ``texts``."""
    merged_docs = []
    for tokens in texts:
        merged_docs.append(bigram_mode[tokens])
    return merged_docs

def make_trigrams(texts):
    """Apply ``bigram_mode`` and then ``trigram_mode`` to every document in ``texts``."""
    merged_docs = []
    for tokens in texts:
        with_bigrams = bigram_mode[tokens]
        merged_docs.append(trigram_mode[with_bigrams])
    return merged_docs

def lemmatization(texts, allowed_postages=["NOUN","ADJ","VERB","ADV"]):
    """Lemmatize each document, keeping only tokens with an allowed POS tag.

    Each token list in ``texts`` is joined with single spaces and run through
    the module-level spaCy pipeline ``nlp``; the result for a document is the
    list of ``lemma_`` values of the tokens whose ``pos_`` is in
    ``allowed_postages``.
    """
    def keep_lemmas(tokens):
        parsed = nlp(" ".join(tokens))
        return [tok.lemma_ for tok in parsed if tok.pos_ in allowed_postages]

    return [keep_lemmas(sent) for sent in texts]

In [22]:
# Drop stop words, then merge detected bigrams into single tokens.
data_words_nostops = remove_stopwords(data_words)
data_words_bigrams = make_bigrams(data_words_nostops)

# Load the English spaCy pipeline by its full package name: the 'en' shortcut
# link no longer exists in spaCy 3, while the package name works in both
# major versions. The parser and NER components are disabled on load.
# NOTE(review): assumes the 'en' shortcut pointed at en_core_web_sm -- confirm.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Lemmatize, keeping only nouns, adjectives, verbs and adverbs
# (the default allowed_postages of lemmatization).
data_lemmatized = lemmatization(data_words_bigrams)

  return f(*args, **kwds)
  return f(*args, **kwds)


In [24]:
# Build the dictionary and corpus needed for topic modelling.
# Dictionary built from the lemmatized documents.
id2word = corpora.Dictionary(data_lemmatized)
# The lemmatized documents are the texts the corpus is built from.
texts = data_lemmatized
# Bag-of-words corpus: one doc2bow result per document.
corpus = list(map(id2word.doc2bow, texts))


In [32]:
# Inspect the bag-of-words encoding of the first document.
print(corpus[:1])

[[(0, 1), (1, 2), (2, 1), (3, 1), (4, 1), (5, 1), (6, 5), (7, 1), (8, 1), (9, 2), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 2), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1)]]


In [36]:
# Extract 20 topics with gensim's Mallet wrapper.
# The Mallet binary is expected at this path, relative to the working directory.
mallel_path = './mallet-2.0.6/bin/mallet'
ldamallet = gensim.models.wrappers.LdaMallet(
    mallet_path=mallel_path,
    corpus=corpus,
    num_topics=20,
    id2word=id2word,
)

In [41]:
# Show topics as (topic id, [(word, weight), ...]) pairs rather than formatted strings.
pprint(ldamallet.show_topics(formatted=False))


[(14,
  [('gun', 0.020417240587695132),
   ('state', 0.01493629476584022),
   ('law', 0.013874540863177227),
   ('people', 0.012009297520661157),
   ('article', 0.007848370064279155),
   ('make', 0.007475321395775942),
   ('write', 0.007417929292929293),
   ('fire', 0.00726010101010101),
   ('crime', 0.0071883608815426995),
   ('weapon', 0.006815312213039486)]),
 (1,
  [('ax', 0.8532514838840757),
   ('max', 0.06244648584323799),
   ('tm', 0.002278433250613145),
   ('qax', 0.0020317239177442058),
   ('giz', 0.0010158619588721029),
   ('mf', 0.00100134964517393),
   ('_', 0.000870738821890374),
   ('ml', 0.0007836649397013366),
   ('gq', 0.0007691526260031636),
   ('wm_wm', 0.0007691526260031636)]),
 (2,
  [('window', 0.01775762572135202),
   ('line', 0.01749381698268755),
   ('file', 0.016224237427864798),
   ('program', 0.012844187963726298),
   ('set', 0.011838417147568014),
   ('problem', 0.010255564715581204),
   ('entry', 0.00852431986809563),
   ('write', 0.008375927452596868),
 

In [43]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step =3):
    """Train one Mallet LDA model per topic count and score each with c_v coherence.

    Args:
        dictionary: gensim dictionary, used as ``id2word`` for the model and as
            ``dictionary`` for the coherence model.
        corpus: bag-of-words corpus the models are trained on.
        texts: tokenized documents, used by the coherence model.
        limit: exclusive upper bound of the topic counts tried.
        start: first topic count tried.
        step: increment between consecutive topic counts.

    Returns:
        ``(model_list, coherence_values)``: the trained models and their
        coherence scores, both in the order of ``range(start, limit, step)``.
    """
    model_list = []
    coherence_values = []
    for num_topics in range(start, limit, step):
        # LdaMallet trains on the bag-of-words corpus and accepts no ``texts``
        # argument; the token lists are only needed by CoherenceModel below.
        # ``mallel_path`` is the module-level path to the Mallet binary.
        model = gensim.models.wrappers.LdaMallet(
            mallet_path=mallel_path,
            corpus=corpus,
            num_topics=num_topics,
            id2word=dictionary,
        )
        model_list.append(model)
        coherencemodel = CoherenceModel(
            model=model, texts=texts, dictionary=dictionary, coherence='c_v'
        )
        coherence_values.append(coherencemodel.get_coherence())
    return model_list, coherence_values

In [50]:
# Find the dominant topic of each document
def format_topics_sentences(ldamodel=ldamallet, corpus=corpus, texts=data):
    """Build a table with the dominant topic of every document.

    Args:
        ldamodel: trained topic model. Indexing it with ``corpus`` must yield,
            per document, a sequence of (topic id, weight) pairs, and
            ``show_topic(topic id)`` must return (word, weight) pairs.
        corpus: bag-of-words corpus, one entry per document.
        texts: the documents, in the same order as ``corpus``.

    Returns:
        pandas.DataFrame with one row per document and the columns
        ``Dominant_Topic``, ``Perc_Contribution`` (weight rounded to 4
        decimals), ``Topic_Keywords`` (comma-joined topic words) and ``Text``.
        A document with no topic pairs gets empty values in the first three
        columns so the rows stay aligned with ``texts``.
    """
    records = []
    keywords_by_topic = {}  # topic id -> joined keywords, computed once per topic

    for row in ldamodel[corpus]:
        if len(row) == 0:
            records.append((None, None, ""))
            continue
        # Dominant topic = pair with the largest weight (first one on ties).
        topic_num, prop_topic = max(row, key=lambda pair: pair[1])
        if topic_num not in keywords_by_topic:
            wp = ldamodel.show_topic(topic_num)
            keywords_by_topic[topic_num] = ",".join(word for word, _ in wp)
        records.append(
            (int(topic_num), round(prop_topic, 4), keywords_by_topic[topic_num])
        )

    # Build the frame once instead of appending row by row, which is quadratic
    # and relies on DataFrame.append (no longer available in pandas 2).
    sent_topics_df = pd.DataFrame(
        records,
        columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'],
    )

    # Add the original text as the last column; list() discards any index
    # carried by ``texts`` so rows are matched by position.
    contents = pd.Series(list(texts), name='Text')
    return pd.concat([sent_topics_df, contents], axis=1)
        

In [53]:
# Apply the trained model to the whole corpus; this is the same expression
# that format_topics_sentences iterates over, one entry per document.
th = ldamallet[corpus]

In [57]:
# Number of entries returned for the first document
# (the model above was trained with num_topics=20).
len(th[0])

20