In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import os

# Move the working directory up one level. Relative paths used later in the
# notebook (for example "final.csv" and "Hemlo.csv") are resolved against the
# directory this call leaves the kernel in.
os.chdir('..')

# Read the review dataset from the mounted Drive into `papers`
# (the preview below shows an index column and a `text` column)
papers = pd.read_csv('/content/drive/MyDrive/sentisum/sentisum-assessment-dataset.csv')
# Show the first rows as a quick sanity check
papers.head()

Unnamed: 0,text
0,Tires where delivered to the garage of my choi...
1,"Easy Tyre Selection Process, Competitive Prici..."
2,Very easy to use and good value for money.
3,Really easy and convenient to arrange
4,It was so easy to select tyre sizes and arrang...


In [None]:
# Load the regular expression library
import re

# Replace sentence punctuation with a space so neighbouring words stay separated.
# The patterns must be raw strings: in a plain literal "\." , "\(" and "\[" are
# invalid escape sequences, which Python reports with a warning and plans to
# reject in a future release. The compiled regexes are unchanged.
papers['paper_text_processed'] = papers['text'].map(lambda x: re.sub(r'[,\.!?]', ' ', x))
# Drop bracketed or parenthesised spans such as the "[REDACTED]" placeholders
papers['paper_text_processed'] = papers['paper_text_processed'].map(lambda x: re.sub(r"[\(\[].*?[\)\]]", "", x))
# Convert the review text to lowercase
papers['paper_text_processed'] = papers['paper_text_processed'].map(lambda x: x.lower())

# Print out the first rows of the processed text
papers['paper_text_processed'].head()


0    tires where delivered to the garage of my choi...
1    easy tyre selection process  competitive prici...
2           very easy to use and good value for money 
3                really easy and convenient to arrange
4    it was so easy to select tyre sizes and arrang...
Name: paper_text_processed, dtype: object

In [None]:
fg = papers['paper_text_processed']

In [None]:
fg[10129]

'i ordered the tyre i needed on line  booked a specified time at a local garage and i had the tyre fitted  all worked very well  to time  and i would use  again  good price for the tyre  too  as i did a quick search on-line '

In [None]:
import gensim
from gensim.utils import simple_preprocess

def sent_to_words(sentences):
    """Lazily tokenise each sentence with gensim's ``simple_preprocess``.

    Every item of ``sentences`` is converted with ``str()`` and tokenised with
    ``deacc=True``. One token list is yielded per input item, in input order.
    """
    tokenise = gensim.utils.simple_preprocess
    for raw in sentences:
        # deacc=True strips accent marks from the tokens
        tokens = tokenise(str(raw), deacc=True)
        yield tokens

data = papers.paper_text_processed.values.tolist()
data_words = list(sent_to_words(data))

print(data_words[:1][0][:30])

['tires', 'where', 'delivered', 'to', 'the', 'garage', 'of', 'my', 'choice', 'the', 'garage', 'notified', 'me', 'when', 'they', 'had', 'been', 'delivered', 'day', 'and', 'time', 'was', 'arranged', 'with', 'the', 'garage', 'and', 'went', 'and', 'had']


In [None]:
# Build the bigram and trigram models
# (min_count=5 is passed for the bigram model only; both use threshold=100)
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
# The trigram model is trained on the token lists after bigram merging is applied
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram:
# wrap each trained model in a Phraser; these are the objects that
# make_bigrams() and make_trigrams() index into.
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)



In [None]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Tokenise every document and drop the words listed in ``stop_words``.

    Each element of ``texts`` is converted with ``str()`` and tokenised with
    ``simple_preprocess`` before filtering.

    Returns:
        A list with one list of kept tokens per input document, in order.
    """
    # Build the lookup once: membership tests on a set are constant time,
    # whereas scanning the stop-word list for every token is linear.
    stops = frozenset(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stops]
        for doc in texts
    ]

def make_bigrams(texts):
    """Apply the bigram phrase model to every tokenised document.

    Returns a list holding ``bigram_mod[doc]`` for each ``doc`` in ``texts``,
    in the same order.
    """
    merged = []
    for tokens in texts:
        merged.append(bigram_mod[tokens])
    return merged

def make_trigrams(texts):
    """Apply the bigram model, then the trigram model, to every document.

    Returns a list holding ``trigram_mod[bigram_mod[doc]]`` for each ``doc``
    in ``texts``, in the same order.
    """
    merged = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged.append(trigram_mod[with_bigrams])
    return merged

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatise tokenised documents, keeping only selected parts of speech.

    Tag reference: https://spacy.io/api/annotation

    Args:
        texts: iterable of token lists. Each list is joined with single spaces
            and run through the notebook-level spaCy pipeline ``nlp``.
        allowed_postags: coarse part-of-speech tags (``token.pos_``) to keep.

    Returns:
        A list with one list of lemmas (``token.lemma_``) per input document,
        in input order.
    """
    # Immutable default plus a set lookup: avoids a shared mutable default
    # argument and a list scan for every token.
    keep = frozenset(allowed_postags)
    joined = (" ".join(sent) for sent in texts)
    # nlp.pipe streams the documents through the pipeline in batches instead
    # of invoking the full pipeline once per document.
    return [
        [token.lemma_ for token in doc if token.pos_ in keep]
        for doc in nlp.pipe(joined)
    ]

In [None]:
!python -m spacy download en_core_web_sm

Collecting en_core_web_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 5.1 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [None]:
import spacy

# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Load the en_core_web_sm pipeline with the parser and NER components disabled.
# lemmatization() reads this module-level `nlp`, so it must exist before the call below.
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

# Preview up to the first 30 lemmas of the first document
print(data_lemmatized[:1][0][:30])

['tire', 'deliver', 'garage', 'choice', 'garage', 'notify', 'deliver', 'day', 'time', 'arrange', 'garage', 'go', 'fit', 'free', 'experience']


In [None]:
import gensim.corpora as corpora

# Create Dictionary from the lemmatised documents
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus (`texts` is just another name for the lemmatised documents)
texts = data_lemmatized

# Term Document Frequency: one bag-of-words per document
# (the printed sample shows (token_id, count) pairs)
corpus = [id2word.doc2bow(text) for text in texts]

# View up to the first 30 entries of the first document's bag-of-words
print(corpus[:1][0][:30])

[(0, 1), (1, 1), (2, 1), (3, 2), (4, 1), (5, 1), (6, 1), (7, 3), (8, 1), (9, 1), (10, 1), (11, 1)]


In [None]:
# Build the baseline LDA model: 30 topics, 10 passes over the corpus in
# chunks of 100 documents, with random_state fixed at 100.
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=30, 
                                       random_state=100,
                                       chunksize=100,
                                       passes=10,
                                       per_word_topics=True)
                                       

In [None]:
from pprint import pprint

# Print the top keywords per topic. The model above was built with
# num_topics=30; print_topics() is called with its defaults, which in gensim
# list only 20 topics, so this output is a subset.
pprint(lda_model.print_topics())
# Apply the trained model to the training corpus
doc_lda = lda_model[corpus]

[(26,
  '0.137*"tyre" + 0.110*"day" + 0.069*"fit" + 0.067*"new" + 0.049*"garage" + '
  '0.044*"order" + 0.040*"car" + 0.033*"get" + 0.029*"later" + 0.023*"go"'),
 (8,
  '0.073*"much" + 0.041*"complaint" + 0.039*"already" + 0.036*"supply" + '
  '0.034*"agree" + 0.032*"else" + 0.032*"run" + 0.028*"morning" + 0.025*"late" '
  '+ 0.025*"usual"'),
 (14,
  '0.399*"use" + 0.227*"easy" + 0.106*"website" + 0.078*"always" + '
  '0.025*"star" + 0.020*"garage" + 0.017*"clear" + 0.013*"pricing" + '
  '0.011*"full" + 0.010*"premium"'),
 (15,
  '0.524*"good" + 0.204*"price" + 0.047*"tyre" + 0.047*"fitting" + '
  '0.024*"local" + 0.021*"garage" + 0.019*"experience" + 0.017*"centre" + '
  '0.006*"specify" + 0.005*"michelin"'),
 (13,
  '0.262*"choice" + 0.236*"customer" + 0.111*"service" + 0.055*"front" + '
  '0.041*"perfect" + 0.019*"satisfied" + 0.017*"progress" + 0.016*"wide" + '
  '0.014*"various" + 0.014*"manufacturer"'),
 (23,
  '0.101*"time" + 0.095*"first" + 0.058*"know" + 0.053*"even" + 0.050*"

In [None]:
from gensim.models import CoherenceModel

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)

Coherence Score:  0.45971321723251796


In [None]:
# supporting function
def compute_coherence_values(corpus, dictionary, k, a, b, texts=None):
    """Train one LDA model and return its ``c_v`` coherence score.

    Args:
        corpus: bag-of-words corpus the model is trained on.
        dictionary: gensim dictionary used for training and for scoring.
        k: number of topics.
        a: value forwarded to ``LdaMulticore`` as ``alpha``.
        b: value forwarded to ``LdaMulticore`` as ``eta``.
        texts: tokenised documents handed to ``CoherenceModel``. Defaults to
            the notebook-level ``data_lemmatized``.

    Returns:
        The value of ``CoherenceModel.get_coherence()`` for the trained model.
    """
    if texts is None:
        texts = data_lemmatized

    model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=dictionary,
                                       num_topics=k,
                                       random_state=100,
                                       chunksize=100,
                                       passes=10,
                                       alpha=a,
                                       eta=b)

    # Score with the dictionary that was passed in rather than the global
    # id2word, so the `dictionary` argument is honoured for both steps.
    coherence_model = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')

    return coherence_model.get_coherence()

In [None]:
import numpy as np
import tqdm

grid = {}
grid['Validation_Set'] = {}

# Topics range (range() excludes max_topics, so k runs from 12 to 29)
min_topics = 12
max_topics = 30
step_size = 1
topics_range = range(min_topics, max_topics, step_size)

# Alpha parameter: a few numeric values plus gensim's named presets
alpha = list(np.arange(0.01, 1, 0.3))
alpha.append('symmetric')
alpha.append('asymmetric')

# Beta parameter
beta = list(np.arange(0.01, 1, 0.3))
beta.append('symmetric')

# Validation sets: the first 75% of the documents and the full corpus
num_of_docs = len(corpus)
corpus_sets = [gensim.utils.ClippedCorpus(corpus, int(num_of_docs*0.75)),
               corpus]

corpus_title = ['75% Corpus', '100% Corpus']

model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }

# Can take a long time to run: one LDA model is trained per combination.
pbar = tqdm.tqdm(total=(len(beta)*len(alpha)*len(topics_range)*len(corpus_title)))
try:
    # iterate through validation corpuses
    for title, corpus_set in zip(corpus_title, corpus_sets):
        # iterate through number of topics
        for k in topics_range:
            # iterate through alpha values
            for a in alpha:
                # iterate through beta values
                for b in beta:
                    # get the coherence score for the given parameters
                    cv = compute_coherence_values(corpus=corpus_set, dictionary=id2word,
                                                  k=k, a=a, b=b)
                    # Save the model results
                    model_results['Validation_Set'].append(title)
                    model_results['Topics'].append(k)
                    model_results['Alpha'].append(a)
                    model_results['Beta'].append(b)
                    model_results['Coherence'].append(cv)
                    print(cv)
                    pbar.update(1)
finally:
    # Persist whatever has been computed even if the sweep is interrupted or
    # fails part-way, so hours of completed runs are not thrown away.
    # Columns are trimmed to a common length in case the interruption landed
    # between two of the appends above.
    completed = min(len(values) for values in model_results.values())
    pd.DataFrame({name: values[:completed] for name, values in model_results.items()}).to_csv(
        '/content/drive/MyDrive/sentisum/lda_tuning_results.csv', index=False)
    pbar.close()


  diff = np.log(self.expElogbeta)
Process ForkPoolWorker-2:
Traceback (most recent call last):
  File "/usr/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/usr/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.7/multiprocessing/pool.py", line 105, in worker
    initializer(*initargs)
  File "/usr/local/lib/python3.7/dist-packages/gensim/models/ldamulticore.py", line 333, in worker_e_step
    worker_lda.do_estep(chunk)  # TODO: auto-tune alpha?
  File "/usr/local/lib/python3.7/dist-packages/gensim/models/ldamodel.py", line 725, in do_estep
    gamma, sstats = self.inference(chunk, collect_sstats=True)
  File "/usr/local/lib/python3.7/dist-packages/gensim/models/ldamodel.py", line 677, in inference
    Elogthetad = dirichlet_expectation(gammad)
KeyboardInterrupt


KeyboardInterrupt: ignored

In [None]:
# Retrain the model with 10 topics, 30 passes, symmetric alpha and eta=0.91.
# This rebinds `lda_model`, so every later cell uses this model, not the
# 30-topic one built earlier.
# NOTE(review): presumably these values were picked from the tuning sweep — confirm.
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=10, 
                                           random_state=100,
                                           chunksize=100,
                                           passes=30,
                                           alpha='symmetric',
                                           eta=0.91)

In [None]:
from pprint import pprint

# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.086*"process" + 0.042*"communication" + 0.024*"whole" + 0.019*"perfect" + '
  '0.014*"ordering" + 0.008*"smooth" + 0.006*"regard" + '
  '0.004*"straightforward" + 0.004*"simple" + 0.002*"stuff"'),
 (1,
  '0.044*"money" + 0.038*"professional" + 0.035*"start" + 0.032*"finish" + '
  '0.031*"save" + 0.012*"easily" + 0.009*"amazing" + 0.009*"value" + '
  '0.005*"faultless" + 0.004*"painless"'),
 (2,
  '0.119*"recommend" + 0.071*"would" + 0.056*"definitely" + 0.038*"helpful" + '
  '0.034*"highly" + 0.032*"staff" + 0.024*"friendly" + 0.019*"service" + '
  '0.013*"excellent" + 0.012*"friend"'),
 (3,
  '0.019*"rate" + 0.012*"continue" + 0.010*"motor" + 0.006*"guarantee" + '
  '0.006*"performance" + 0.005*"grip" + 0.004*"sport" + 0.004*"independent" + '
  '0.004*"noise" + 0.004*"vast"'),
 (4,
  '0.042*"deal" + 0.020*"smoothly" + 0.016*"transaction" + 0.008*"go" + '
  '0.007*"impressed" + 0.006*"anywhere_else" + 0.006*"tyer" + 0.005*"serious" '
  '+ 0.003*"contract" + 0.003*"pleasure"')

In [None]:
# Copy the (topic_id, keyword string) entries returned by print_topics()
# into a plain list
topics = lda_model.print_topics()
tot = list(topics)

In [None]:
import pandas as pd
ee = pd.DataFrame(tot)
ee.head()

Unnamed: 0,0,1
0,0,"0.086*""process"" + 0.042*""communication"" + 0.02..."
1,1,"0.044*""money"" + 0.038*""professional"" + 0.035*""..."
2,2,"0.119*""recommend"" + 0.071*""would"" + 0.056*""def..."
3,3,"0.019*""rate"" + 0.012*""continue"" + 0.010*""motor..."
4,4,"0.042*""deal"" + 0.020*""smoothly"" + 0.016*""trans..."


In [None]:
ee.to_csv("final.csv")

In [None]:
df = pd.read_csv("/content/drive/MyDrive/sentisum/sentisum-assessment-dataset.csv")

In [None]:
# Raw review texts in dataset order. The former `td = []` placeholder was
# overwritten immediately, so it is dropped.
td = df["text"]
td

0        Tires where delivered to the garage of my choi...
1        Easy Tyre Selection Process, Competitive Prici...
2               Very easy to use and good value for money.
3                    Really easy and convenient to arrange
4        It was so easy to select tyre sizes and arrang...
                               ...                        
10127    I ordered the wrong tyres, however [REDACTED] ...
10128    Good experience, first time I have used [REDAC...
10129    I ordered the tyre I needed on line, booked a ...
10130    Excellent service from point of order to fitti...
10131    Seamless, well managed at both ends. I would r...
Name: text, Length: 10132, dtype: object

In [None]:
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
def prepare_text_for_lda(text):
    """Turn one raw text into the list of lemmas used for topic inference.

    The text is split with ``tokenize``; tokens of three characters or fewer
    and tokens found in ``en_stop`` are dropped; each remaining token is
    passed through ``get_lemma``.
    """
    return [
        get_lemma(tok)
        for tok in tokenize(text)
        if len(tok) > 3 and tok not in en_stop
    ]

In [None]:
import spacy
from spacy.lang.en import English

# tokenize() only uses this blank English pipeline. No trained model is loaded
# here: the result of spacy.load('en') was never kept, and the 'en' shortcut
# name is not supported by spaCy v3, where the call raises an error.
parser = English()

def tokenize(text):
    """Split ``text`` with the module-level ``parser`` and normalise each token.

    Whitespace-only tokens are skipped. Tokens flagged ``like_url`` become
    ``'URL'``, tokens starting with ``'@'`` become ``'SCREEN_NAME'``, and every
    other token is emitted in its lower-cased form.
    """
    def _normalise(token):
        # The URL check takes precedence over the '@' check
        if token.like_url:
            return 'URL'
        if token.orth_.startswith('@'):
            return 'SCREEN_NAME'
        return token.lower_

    return [
        _normalise(token)
        for token in parser(text)
        if not token.orth_.isspace()
    ]

In [None]:
from nltk.corpus import wordnet as wn
def get_lemma(word):
    """Return ``wn.morphy(word)``, or ``word`` unchanged when that is ``None``."""
    base = wn.morphy(word)
    return word if base is None else base
    
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    """Lemmatise ``word`` with a freshly constructed ``WordNetLemmatizer``."""
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(word)

In [None]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
# Infer a topic distribution for every raw review text.
# NOTE(review): these texts are prepared with prepare_text_for_lda (spaCy
# tokeniser + WordNet lemmas), whereas id2word was built from data_lemmatized
# (simple_preprocess + bigrams + spaCy lemmas with a POS filter). Confirm the
# different preprocessing is intended.
pred = []
for review in td:
    review_bow = id2word.doc2bow(prepare_text_for_lda(review))
    pred.append(lda_model.get_document_topics(review_bow))

In [None]:
rows = zip(td, pred)

In [None]:
import csv

# newline="" hands line-ending control to the csv module, as its documentation
# requires; otherwise fields containing line breaks can be written incorrectly
# on some platforms. The encoding is pinned so review text is written the same
# way regardless of the kernel's locale.
with open("Hemlo.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    # `rows` is a one-shot zip iterator: re-running this cell without
    # re-creating `rows` first writes an empty file.
    writer.writerows(rows)

In [None]:
pip install pyLDAvis



In [None]:
num_topics=10

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import pickle

# Visualize the topics
pyLDAvis.enable_notebook()

# Absolute Drive path, matching the other cells that read and write under
# /content/drive/MyDrive/sentisum. The relative "./content/..." form only
# resolved when the working directory happened to be "/".
LDAvis_data_filepath = '/content/drive/MyDrive/sentisum/ldavis_tuned_' + str(num_topics)

# Preparing the visualisation is a bit time consuming - set this flag to
# False to reuse the pickle written by an earlier run instead.
prepare_visualisation = True
if prepare_visualisation:
    LDAvis_prepared = gensimvis.prepare(lda_model, corpus, id2word)
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)

# load the pre-prepared pyLDAvis data from disk
# (only unpickle files this notebook wrote itself; pickle is unsafe on untrusted data)
with open(LDAvis_data_filepath, 'rb') as f:
    LDAvis_prepared = pickle.load(f)

pyLDAvis.save_html(LDAvis_prepared, LDAvis_data_filepath + '.html')

LDAvis_prepared

  from collections import Iterable
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  EPS = np.finfo(np.float).eps
  by='saliency', ascending=False).head(R).drop('saliency', 1)
