In [3]:
import nltk
import pandas as pd
import re
import string
from pprint import pprint
from wordcloud import WordCloud
import gensim
import pyLDAvis.gensim_models
import pickle 
import pyLDAvis
import os

from nltk import PorterStemmer
from nltk import WordNetLemmatizer

# Fetch the NLTK resources this notebook relies on: the stopword corpus and
# the WordNet data (plus its multilingual companion package 'omw-1.4').
for _resource in ('stopwords', 'omw-1.4', 'wordnet'):
    nltk.download(_resource)

# Show up to 100 characters per DataFrame column when frames are displayed.
pd.set_option('display.max_colwidth', 100)

# English stopword list consulted by clean_text, with 'from' appended as an
# extra term to filter out.
stopwords = nltk.corpus.stopwords.words('english')
stopwords.append('from')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Akshay\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Akshay\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Akshay\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
# clean text by removing punctuation, stopwords and very short tokens
def clean_text(text, stop_words=None, min_length=4):
    """Tokenize ``text`` into lower-cased words, dropping noise tokens.

    Punctuation characters (``string.punctuation``) are deleted outright, so
    a hyphenated word such as ``well-known`` becomes ``wellknown``. The
    remainder is lower-cased and split on runs of non-word characters.

    Args:
        text: The raw document. Anything that is not a ``str`` (for example
            ``None`` or the float NaN pandas uses for missing values) is
            treated as an empty document.
        stop_words: Iterable of words to discard. Defaults to the
            module-level ``stopwords`` list.
        min_length: Minimum number of characters a token must have to be
            kept. The default of 4 keeps only tokens longer than 3 characters.

    Returns:
        list[str]: The surviving tokens, in their original order.
    """
    # Missing values would otherwise fail when iterated character by character.
    if not isinstance(text, str):
        return []

    if stop_words is None:
        stop_words = stopwords
    # A set gives constant-time membership tests instead of scanning the list
    # once per token.
    excluded = set(stop_words)

    without_punctuation = text.translate(str.maketrans('', '', string.punctuation))
    tokens = re.split(r'\W+', without_punctuation.lower())
    # `tok` guards against the empty strings re.split yields at the edges.
    return [tok for tok in tokens
            if tok and len(tok) >= min_length and tok not in excluded]

In [5]:
# stemming of given tokens
def stemming_text(tokenized_text):
    """Reduce every token to its Porter stem.

    Args:
        tokenized_text: Iterable of word tokens.

    Returns:
        list: One stemmed token per input token, in the same order.
    """
    stemmer = PorterStemmer()
    return list(map(stemmer.stem, tokenized_text))

In [6]:
# lemmatization of given tokens
def lemmetizing_text(tokenized_text):
    """Lemmatize every token with NLTK's WordNet lemmatizer.

    Each token is passed to ``WordNetLemmatizer.lemmatize`` with its default
    arguments.

    Args:
        tokenized_text: Iterable of word tokens.

    Returns:
        list: One lemmatized token per input token, in the same order.
    """
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token in tokenized_text:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

In [7]:
# visualize a word cloud from a given list of sentences
def visualize_word_cloud(document_list):
    """Render a word cloud for a collection of text documents.

    The documents are joined with commas into a single string, a word cloud
    is generated from it, and the resulting image is opened through
    ``Image.show()``. Nothing is returned.

    Args:
        document_list: Iterable of strings to combine into the cloud.
    """
    combined_text = ','.join(document_list)
    cloud = WordCloud(
        background_color="white",
        max_words=5000,
        contour_width=3,
        contour_color='steelblue',
    )
    cloud.generate(combined_text)
    image = cloud.to_image()
    image.show()

In [8]:
# create bag-of-words corpus for tokenized text
def create_bow_corpus(data_list):
    """Build a gensim dictionary and the matching bag-of-words corpus.

    Args:
        data_list: Collection of tokenized documents (each a list of tokens).
            It is handed to ``Dictionary`` and then iterated a second time,
            so it should be re-iterable rather than a one-shot generator.

    Returns:
        tuple: ``(corpus, dictionary)`` where ``corpus`` is a list holding the
        ``doc2bow`` result for each document and ``dictionary`` is the
        ``gensim.corpora.Dictionary`` built from ``data_list``.
    """
    id2word = gensim.corpora.Dictionary(data_list)
    bow_corpus = list(map(id2word.doc2bow, data_list))
    return bow_corpus, id2word

In [9]:
# create lda model
def create_lda_model(corpus, dictionary, num_topics, random_state=None):
    """Train a gensim ``LdaMulticore`` topic model.

    Args:
        corpus: Bag-of-words corpus, e.g. the first value returned by
            ``create_bow_corpus``.
        dictionary: Token-id mapping passed to gensim as ``id2word``.
        num_topics: Number of latent topics to fit.
        random_state: Optional seed forwarded to ``LdaMulticore``. Training
            is stochastic, so pass a fixed integer to make the topics
            repeatable across runs. ``None`` (the default) keeps gensim's
            unseeded behaviour.

    Returns:
        gensim.models.LdaMulticore: The trained model.
    """
    return gensim.models.LdaMulticore(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_topics,
        random_state=random_state,
    )

In [10]:
# visualize lda topics
def visualize_topics_pyldavis(lda_model, corpus, id2word, num_topics, use_cache=False):
    """Prepare a pyLDAvis visualization and persist it to ``lda_vis_files/``.

    Two files are written, both named after ``num_topics``:
    ``ldavis_prepeared_<num_topics>`` (a pickle of the prepared data) and
    ``ldavis_prepeared_<num_topics>.html`` (the standalone HTML page).

    Args:
        lda_model: Trained gensim LDA model.
        corpus: Bag-of-words corpus the model was trained on.
        id2word: Dictionary mapping token ids to words.
        num_topics: Topic count, used only to name the output files.
        use_cache: When True and the pickle for this ``num_topics`` already
            exists, load it instead of recomputing. The cache is keyed on
            ``num_topics`` alone, so it may belong to a different model or
            corpus. Defaults to False, which always recomputes.

    Returns:
        The prepared pyLDAvis data object; displaying it in a notebook cell
        renders the interactive view.
    """
    pyLDAvis.enable_notebook()

    # Create the output directory up front so a fresh checkout does not fail
    # with FileNotFoundError on the first write.
    output_dir = 'lda_vis_files'
    os.makedirs(output_dir, exist_ok=True)
    LDAvis_data_filepath = os.path.join(output_dir, 'ldavis_prepeared_' + str(num_topics))

    if use_cache and os.path.exists(LDAvis_data_filepath):
        # NOTE: pickle.load can execute arbitrary code; only load files that
        # this notebook produced itself.
        with open(LDAvis_data_filepath, 'rb') as f:
            LDAvis_prepared = pickle.load(f)
    else:
        # The preparation step is the time-consuming part.
        LDAvis_prepared = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
        with open(LDAvis_data_filepath, 'wb') as f:
            pickle.dump(LDAvis_prepared, f)

    pyLDAvis.save_html(LDAvis_prepared, LDAvis_data_filepath + '.html')
    return LDAvis_prepared


In [12]:
# Topic count for the papers dataset. Defined once so the trained model and
# the visualization output filenames cannot drift apart.
PAPERS_NUM_TOPICS = 10

# read the data (first 100 rows only)
data = pd.read_csv('data/papers.csv', nrows=100)
# keep only important columns
data = data.drop(columns=['id', 'event_type', 'pdf_name', 'year', 'title', 'abstract'])
# tokenize and clean the paper text
data['cleaned_text'] = data['paper_text'].apply(clean_text)
# lemmatize text
data['cleaned_text_lemma'] = data['cleaned_text'].apply(lemmetizing_text)
# create corpus with bag of words
corpus, dictonary = create_bow_corpus(data['cleaned_text_lemma'])
# create lda model
lda_model = create_lda_model(corpus, dictonary, num_topics=PAPERS_NUM_TOPICS)
# visualize topics; the bare expression on the last line renders the view
plda = visualize_topics_pyldavis(lda_model, corpus, dictonary, PAPERS_NUM_TOPICS)
plda

  by='saliency', ascending=False).head(R).drop('saliency', 1)


In [14]:
# Topic count for the Yelp dataset. Defined once so the trained model and
# the visualization output filenames cannot drift apart.
YELP_NUM_TOPICS = 5

# read the data (first 100 rows only)
data = pd.read_csv('data/yelp.csv', nrows=100)
# drop the metadata columns that are not needed for topic modelling
data = data.drop(columns=['business_id', 'date', 'review_id', 'stars', 'type', 'user_id'])
# tokenize and clean the review text
data['cleaned_text'] = data['text'].apply(clean_text)
# lemmatize text
data['cleaned_text_lemma'] = data['cleaned_text'].apply(lemmetizing_text)
# create corpus with bag of words
corpus, dictonary = create_bow_corpus(data['cleaned_text_lemma'])
# create lda model
lda_model = create_lda_model(corpus, dictonary, num_topics=YELP_NUM_TOPICS)
# visualize topics; the bare expression on the last line renders the view
plda = visualize_topics_pyldavis(lda_model, corpus, dictonary, YELP_NUM_TOPICS)
plda

  by='saliency', ascending=False).head(R).drop('saliency', 1)
