In [None]:
# Mount Google Drive at /content/drive; the dataset read later in this
# notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import spacy
from spacy.lang.en import English

# A blank English pipeline is all this notebook uses from spaCy: it provides
# the tokenizer plus lexical attributes (like_url, lower_, orth_).
# The former `spacy.load('en')` call was removed: its return value was
# discarded, so it only cost load time, and the 'en' shortcut name cannot be
# resolved on spaCy v3 and later.
parser = English()

def tokenize(text):
    """Split ``text`` into normalised tokens using the module-level ``parser``.

    Normalisation rules, applied per token in this order:
      * whitespace-only tokens are skipped;
      * URL-like tokens become the placeholder ``'URL'``;
      * tokens starting with ``'@'`` become the placeholder ``'SCREEN_NAME'``;
      * every other token is lower-cased.

    Returns:
        list[str]: the normalised tokens, in document order.
    """
    normalised = []
    for tok in parser(text):
        surface = tok.orth_
        if surface.isspace():
            # Pure whitespace carries no signal for the topic model.
            continue
        if tok.like_url:
            replacement = 'URL'
        elif surface.startswith('@'):
            replacement = 'SCREEN_NAME'
        else:
            replacement = tok.lower_
        normalised.append(replacement)
    return normalised

In [None]:
# Download the WordNet corpus, which the lemmatisation helpers in the next
# cell rely on.
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
from nltk.corpus import wordnet as wn
def get_lemma(word):
    """Return the WordNet base form of ``word`` via ``wn.morphy``.

    When ``wn.morphy`` yields ``None`` the input word is returned unchanged.
    """
    base_form = wn.morphy(word)
    return word if base_form is None else base_form
    
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    """Lemmatise ``word`` with NLTK's ``WordNetLemmatizer``.

    No part-of-speech tag is passed, so the lemmatizer's default is used.
    A fresh lemmatizer instance is created on every call.
    """
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(word)

In [None]:
# Compare both lemmatisation helpers side by side on a few sample words:
# columns are the word, get_lemma's result, then get_lemma2's result.
for w in ('dogs', 'ran', 'discouraged'):
    morphy_form, lemmatizer_form = get_lemma(w), get_lemma2(w)
    print(w, morphy_form, lemmatizer_form)

dogs dog dog
ran run ran
discouraged discourage discouraged


In [None]:
# Download NLTK's stop-word lists and keep the English one as a set, which
# prepare_text_for_lda uses for membership tests.
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
def prepare_text_for_lda(text):
    """Turn raw ``text`` into the list of lemmas fed to the topic model.

    Steps: tokenize, keep only tokens longer than three characters, drop
    English stop words, then lemmatise each survivor with ``get_lemma``.

    Note: the ``'URL'`` placeholder emitted by ``tokenize`` is exactly three
    characters long, so the length filter always removes it.

    Returns:
        list[str]: lemmas in document order.
    """
    return [
        get_lemma(tok)
        for tok in tokenize(text)
        if len(tok) > 3 and tok not in en_stop
    ]

In [None]:
import random

# Fraction of input lines kept for topic modelling; 1.0 keeps every line.
SAMPLE_FRACTION = 0.01
# Dedicated seeded generator: the same lines are sampled on every run and the
# global `random` state is left untouched.
sample_rng = random.Random(42)

text_data = []
# NOTE(review): the CSV is consumed as raw text lines, not parsed with a CSV
# reader, so any header row or delimiters are tokenized like review text.
# Confirm this is intended.
# The encoding is given explicitly because the data contains non-ASCII
# characters; assumes the file is UTF-8 -- TODO confirm.
with open('/content/drive/MyDrive/sentisum/sentisum-assessment-dataset.csv',
          encoding='utf-8') as f:
    for line in f:
        # Decide first, so lines that are not sampled are never tokenized.
        if sample_rng.random() >= SAMPLE_FRACTION:
            continue
        tokens = prepare_text_for_lda(line)
        print(tokens)
        text_data.append(tokens)

['redact', 'recommend', 'family', 'member', 'good', 'experience', 'well', 'would', 'happy', 'also']
['fast', 'organise', 'fitting']
['good', 'price', 'exact', 'tyre', 'want', 'available', 'could', 'fit', 'locally', 'site', 'issue', 'pass', 'phone', 'call']
['website', 'easy', 'quick', 'tyre', 'specs', 'list', 'along', 'grade', 'efficiency', 'fuel', 'consumption', 'along', 'noise', 'tyre', 'mobile', 'tyre', 'fitting', 'option', 'also', 'available', 'perfect', 'situation']
['excellent', 'tyre', 'discount', 'easy', 'navigation', 'website', 'great', 'discount', 'well', 'know', 'tyres•', 'wide', 'range', 'tyre', 'size', 'match', 'vehicle', 'choice', 'deliver', 'garage', 'fit', 'fitted•', 'quick', 'response', 'date', 'time', 'booking', 'local', 'garage', 'easy', 'payment', 'method']
['broad', 'choice', 'tyre', 'size', 'require', 'competitive', 'price', 'price', 'band', 'good', 'communication', 'purchase', 'fitting', 'arrangement', 'choice', 'local', 'tyre', 'fitter']
['order', 'pay', 'line',

In [None]:
from gensim import corpora

# Map every distinct token in the sampled documents to an integer id.
dictionary = corpora.Dictionary(documents=text_data)

In [None]:
# Convert each tokenised document into its bag-of-words form
# (a list of (token_id, count) pairs) using the dictionary built above.
corpus = list(map(dictionary.doc2bow, text_data))

In [None]:
import pickle

# Persist the bag-of-words corpus and the dictionary so later cells can
# reload them. The `with` block guarantees the pickle file is flushed and
# closed; the previous bare `open(...)` left the handle open.
with open('corpus.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)
dictionary.save('dictionary.gensim')

In [None]:
import gensim

NUM_TOPICS = 15
# Fixed seed so re-running the notebook on the same corpus reproduces the
# same topics; without it every training run yields different topics.
LDA_RANDOM_STATE = 42

ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=15,
    random_state=LDA_RANDOM_STATE,
)
# NOTE: the file name says "5" although NUM_TOPICS is 15; it is kept as-is
# because later cells load 'model5.gensim' by this exact name.
ldamodel.save('model5.gensim')

In [None]:
# Fetch the topics with their 15 highest-weighted terms each; every entry is
# an (id, '<weight>*"<term>" + ...') pair.
topics = ldamodel.print_topics(num_words=15)

# Print one topic per line.
for topic in topics:
    print(topic)

[(0, '0.055*"tyre" + 0.030*"always" + 0.029*"discount" + 0.025*"easy" + 0.022*"choice" + 0.022*"garage" + 0.022*"fit" + 0.022*"clear" + 0.022*"website" + 0.015*"excellent" + 0.015*"order" + 0.015*"look" + 0.015*"quick" + 0.015*"range" + 0.015*"fully"'), (1, '0.038*"tyre" + 0.033*"garage" + 0.020*"great" + 0.019*"redact" + 0.019*"would" + 0.018*"hassle" + 0.015*"price" + 0.015*"excellent" + 0.015*"fitting" + 0.014*"recommend" + 0.014*"charge" + 0.014*"select" + 0.014*"online" + 0.014*"order" + 0.014*"experience"'), (2, '0.048*"easy" + 0.034*"following" + 0.034*"morning" + 0.017*"value" + 0.017*"competitive" + 0.017*"convenient" + 0.017*"ordering" + 0.017*"option" + 0.017*"money" + 0.017*"awesome" + 0.017*"start" + 0.017*"fitting" + 0.017*"finish" + 0.017*"reliable" + 0.017*"good"'), (3, '0.047*"bolt" + 0.035*"wheel" + 0.024*"tyre" + 0.024*"side" + 0.024*"good" + 0.024*"torque" + 0.024*"wrench" + 0.024*"tight" + 0.015*"service" + 0.012*"1hour" + 0.012*"long" + 0.012*"little" + 0.012*"all

In [None]:
# Score an unseen document against the trained topics. The text goes through
# the same preprocessing as the training data, so tokens of three characters
# or fewer (e.g. 'low') are dropped before the bag-of-words conversion.
new_doc = prepare_text_for_lda('garage good tyre low cost')
new_doc_bow = dictionary.doc2bow(new_doc)
print(new_doc_bow)

# Topic distribution of the document as (topic_id, probability) pairs.
new_doc_topics = ldamodel.get_document_topics(new_doc_bow)
print(new_doc_topics)

[(3, 1), (24, 1), (48, 1), (309, 1)]
[(0, 0.013333383), (1, 0.013333353), (2, 0.013333337), (3, 0.013333345), (4, 0.30809122), (5, 0.013333334), (6, 0.013333337), (7, 0.01333334), (8, 0.013333361), (9, 0.013333356), (10, 0.013333346), (11, 0.01333335), (12, 0.013333334), (13, 0.51857525), (14, 0.013333334)]


In [None]:
# Install pyLDAvis, which the visualisation cell further down imports.
# NOTE(review): the install is unpinned; consider pinning a version so the
# notebook stays reproducible.
!pip install pyLDAvis



In [None]:
# Reload the artefacts persisted earlier in this notebook.
dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
# pickle.load is acceptable here only because corpus.pkl is written by this
# notebook; never unpickle files from an untrusted source.
# The `with` block closes the file handle, which the bare open() call leaked.
with open('corpus.pkl', 'rb') as corpus_file:
    corpus = pickle.load(corpus_file)
lda = gensim.models.ldamodel.LdaModel.load('model5.gensim')

In [None]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()

# `lda`, `corpus` and `dictionary` come from the preceding cell, which loads
# them from disk. The verbatim copy of those three load statements that used
# to sit here was removed: it repeated the work and left another file handle
# open.

# feed the LDA model into the pyLDAvis instance
# sort_topics=False is passed so pyLDAvis does not reorder the topics.
lda_display = gensimvis.prepare(lda, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

  from collections import Iterable
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  EPS = np.finfo(np.float).eps
  by='saliency', ascending=False).head(R).drop('saliency', 1)
