In [8]:
# Commands to run

#nltk.download('stopwords')
#!pip install glove_python
#!pip3 install glove
#!python -m spacy download en # one time run
#!python -m pip install --upgrade pip

Requirement already up-to-date: pip in c:\users\nb24634\appdata\local\continuum\miniconda3\envs\master_thesis\lib\site-packages (18.1)


In [5]:
import gensim
from glove import Glove
from glove import Corpus
import matplotlib.pyplot as plt
import nltk
import numpy as np
import os
import pandas as pd
import pyLDAvis.gensim
import spacy
import warnings
from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel, Phrases
from gensim.models.wrappers import LdaMallet
from gensim.corpora import Dictionary
from nltk import FreqDist
from nltk.corpus import stopwords
from spacy.lang.en import English

# Notebook-wide setup shared by the cells below
stop_words = stopwords.words('english')  # NLTK's English stop-word list
warnings.filterwarnings('ignore')  # suppress all warnings from here on
nlp = spacy.load('en', disable=['parser', 'ner'])  # spaCy 'en' pipeline without the parser and NER components
%matplotlib inline

ModuleNotFoundError: No module named 'glove'

In [None]:
# Read the review spreadsheet and keep a reproducible random 25% of its rows
# (frac=0.25, fixed random_state).
df_sampled = (
    pd.read_excel('excel_for_topic_modeling.xlsx', sheet_name="Sheet1")
    .sample(frac=0.25, random_state=1)
)

In [None]:
# Number of rows in the sampled frame
len(df_sampled)

## Functions

In [None]:
# function to plot most frequent terms
def freq_words(x, terms = 30):
  """Plot a bar chart of the most frequent words in a collection of texts.

  Parameters
  ----------
  x : iterable of str
      Texts to analyse; they are joined with spaces and split on whitespace.
  terms : int, default 30
      How many of the most frequent words to show.

  Returns
  -------
  None
      The chart is displayed with ``plt.show()``. Nothing is drawn when `x`
      contains no words.
  """
  all_words = ' '.join(x).split()
  if not all_words:
    # an empty frame has no numeric 'count' column for nlargest to rank
    return

  fdist = FreqDist(all_words)
  words_df = pd.DataFrame({'word': list(fdist.keys()), 'count': list(fdist.values())})

  # selecting the `terms` most frequent words
  d = words_df.nlargest(columns="count", n=terms)

  # Plain matplotlib is used because seaborn is not imported in this notebook.
  fig, ax = plt.subplots(figsize=(20, 5))
  ax.bar(d['word'], d['count'])
  ax.set(xlabel='word', ylabel='Count')
  plt.show()
    
def remove_stopwords(rev):
    """Join the tokens of `rev` that are not in the global `stop_words` with single spaces.

    `rev` is an iterable of tokens; the comparison against `stop_words` is
    exact (case-sensitive).
    """
    kept = []
    for token in rev:
        if token in stop_words:
            continue
        kept.append(token)
    return " ".join(kept)

def pos(texts, tags=['ADV', 'ADJ']):
    """Keep only the lemmas whose part-of-speech tag is in `tags`.

    Each element of `texts` is a list of tokens; it is joined with spaces,
    run through the global spaCy pipeline `nlp`, and reduced to the lemmas of
    the tokens whose ``pos_`` is listed in `tags` (adverbs and adjectives by
    default). Returns one list of lemmas per input element, in order.
    """
    output = []
    for sent in texts:
        parsed = nlp(" ".join(sent))
        lemmas = []
        for token in parsed:
            if token.pos_ in tags:
                lemmas.append(token.lemma_)
        output.append(lemmas)
    return output

## Preprocessing ##

def createUniqueText(clean_text):
    """Flatten tokenised documents into one newline-delimited string.

    Parameters
    ----------
    clean_text : iterable of iterables of str
        One inner iterable of words per document.

    Returns
    -------
    str
        Every word preceded by a single space and every document terminated
        by a newline, e.g. ``[["a", "b"], ["c"]]`` -> ``" a b\n c\n"``.
        An empty input yields an empty string.
    """
    # str.join builds the result in one pass; repeated `text = text + ...`
    # re-copies the growing string on every word.
    return "".join(
        "".join(" " + word for word in sentence) + "\n"
        for sentence in clean_text
    )

## Glove Functions ##
                         
def read_corpus(filename):
    """
    Read corpus from regular text file.

    Yields one list of tokens per line of `filename`. Each line is
    lower-cased, stripped of every non-alphanumeric character in the first
    256 code points except the space, and split on whitespace (so blank lines
    yield an empty list and repeated spaces produce no empty tokens).
    """
    # Characters to delete: everything non-alphanumeric below code point 256,
    # but NOT the space, which is the token separator.
    candidates = map(chr, range(256))
    delchars = ''.join(ch for ch in candidates if not ch.isalnum() and ch != ' ')
    # The characters are joined with '' on purpose: joining with ' ' would
    # put the space back into the delete set and glue all words of a line
    # into a single token.
    table = str.maketrans('', '', delchars)

    with open(filename, 'r') as datafile:
        for line in datafile:
            yield line.lower().translate(table).split()

## Data Cleaning

In [None]:
# drop rows without review text first (and keep the result: dropna returns a
# new frame); otherwise astype(str) would turn missing values into 'nan'
df_sampled = df_sampled.dropna(subset=['review']).copy()

# remove unwanted characters, numbers and symbols
# regex=True is explicit: newer pandas versions treat the pattern as a literal
# string by default, which would make this replacement a no-op
df_sampled['review'] = df_sampled['review'].astype(str).str.replace("[^a-zA-Z#]", " ", regex=True)

# lower-case BEFORE stop-word removal: the stop-word comparison is
# case-sensitive, so capitalised stop words would otherwise survive
df_sampled['review_modified'] = df_sampled['review'].str.lower()

# remove short words (3 characters or fewer)
df_sampled['review_modified'] = df_sampled['review_modified'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))

# removing stop words
df_sampled['review_modified'] = [remove_stopwords(r.split()) for r in df_sampled['review_modified']]

# one list of tokens per review
clean_text = df_sampled['review_modified'].apply(lambda x: x.split())

## Preprocessing Data

In [None]:
# Part of speech
# text_pos retains all sentences(reviews) with tokens
%time
text_pos = pos(clean_text, tags=['ADV', 'ADJ'])

for x in text_pos:
    if 'alexa' in x:
        x.remove("alexa")

In [None]:
# Reload the spaCy 'en' pipeline (this time with all components) and flag a few
# extra words as stop words in its vocabulary; spaCy's stop-word handling is
# used from here on instead of NLTK's.
nlp = spacy.load('en')
my_stop_words = ['say', "'s", 'Mr', 'be', 'said', 'says', 'saying']
for extra_word in my_stop_words:
    nlp.vocab[extra_word].is_stop = True

In [None]:
# create unique text for gensim: one review per line
uniqueString = createUniqueText(text_pos)

# The string can be very large. The two halves are kept available for
# inspection; note that the cut is by character count and may fall inside a word.
firsthalf, secondhalf = uniqueString[:len(uniqueString)//2], uniqueString[len(uniqueString)//2:]

# spaCy rejects texts longer than nlp.max_length, so raise the limit enough to
# process the whole string in one call.
nlp.max_length = max(nlp.max_length, len(uniqueString) + 1)

# Only token-level attributes (text, lemma, stop/punct/number flags) are read
# from the result, so the memory-hungry parser and NER are switched off while
# the long text is processed.
heavy_pipes = [name for name in ('parser', 'ner') if name in nlp.pipe_names]
with nlp.disable_pipes(*heavy_pipes):
    doc = nlp(uniqueString)
doc1 = doc  # alias: the name this cell assigned before

In [None]:
# Rebuild one list of lemmas per review from the spaCy doc, dropping stop
# words, punctuation, numbers and whitespace.
texts, article = [], []
for w in doc:
    if '\n' in w.text:
        # A newline means we're onto our next document. The test is `in`
        # rather than `==` because the tokenizer may merge a newline with
        # adjacent whitespace into a single token such as '\n '
        # (NOTE(review): confirm on the installed spaCy version).
        texts.append(article)
        article = []
    elif not (w.is_space or w.is_stop or w.is_punct or w.like_num):
        # we add the lemmatized version of the word
        article.append(w.lemma_)

# keep a last document that is not terminated by a newline
if article:
    texts.append(article)

In [None]:
# Fit a gensim Phrases model on the documents, then pass every document
# through it (replaces `texts` with the transformed documents).
bigram = gensim.models.Phrases(texts)
texts = list(map(bigram.__getitem__, texts))

In [None]:
# Build the gensim dictionary from the documents, then convert each document
# to its bag-of-words representation.
dictionary = Dictionary(texts)
corpus = list(map(dictionary.doc2bow, texts))

In [None]:
# Fit an LSI model with 10 topics on the bag-of-words corpus
lsimodel = LsiModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=10,
)

In [None]:
# Display five of the ten fitted LSI topics
lsimodel.show_topics(num_topics=5)  # Showing only the top 5 topics

In [None]:
# For every LSI topic, keep only the words of its (word, weight) pairs
lsitopics = [
    [entry[0] for entry in topic]
    for _topic_id, topic in lsimodel.show_topics(formatted=False)
]

In [None]:
# Counterpart of `lsitopics`: for every LSI topic, the numeric weight of each
# of its words, in the same order. These are the values reported by
# show_topics, not topic-coherence scores.
lsitopicsImportance = [
    [entry[1] for entry in topic]
    for _topic_id, topic in lsimodel.show_topics(formatted=False)
]

In [None]:
# TOPIC 1: bar chart of the weight of each word in lsitopics[0]
plt.figure(figsize=(15, 10))
plt.xlabel('Words')
plt.bar(lsitopics[0], lsitopicsImportance[0], align='center')
plt.ylabel('Importance')

In [None]:
# TOPIC 2: bar chart of the weight of each word in lsitopics[1]
plt.figure(figsize=(15,10))
plt.bar(lsitopics[1], lsitopicsImportance[1], align='center')
plt.xlabel('Words')
plt.ylabel('Importance')  # the bars are word weights, not coherence values

In [None]:
# TOPIC 3: bar chart of the weight of each word in lsitopics[2]
plt.figure(figsize=(15,10))
plt.bar(lsitopics[2], lsitopicsImportance[2], align='center')
plt.xlabel('Words')
plt.ylabel('Importance')  # the bars are word weights, not coherence values

In [None]:
# TOPIC 4: bar chart of the weight of each word in lsitopics[3]
plt.figure(figsize=(15,10))
plt.bar(lsitopics[3], lsitopicsImportance[3], align='center')
plt.xlabel('Words')
plt.ylabel('Importance')  # the bars are word weights, not coherence values

In [None]:
# TOPIC 5: bar chart of the weight of each word in lsitopics[4]
plt.figure(figsize=(15,10))
plt.bar(lsitopics[4], lsitopicsImportance[4], align='center')
plt.xlabel('Words')
plt.ylabel('Importance')  # the bars are word weights, not coherence values

## Glove

In [None]:
# Preview the first few documents only; displaying the whole corpus floods
# the notebook output.
texts[:5]

In [None]:
# Export the corpus to disk: one document per line, tokens separated by
# spaces, so words of the same document stay together. The `with` block
# closes the file even if a write fails.
with open("glove_Textfile.txt", "w") as file:
    for sentence in texts:
        file.write(' '.join(sentence) + '\n')

# Train on the in-memory token lists rather than re-reading the export:
# Corpus.fit only counts co-occurrences between words of the same document,
# and this keeps the tokens (e.g. phrases joined with '_') exactly as they
# appear in `texts`.
get_data = texts

corpus_model = Corpus()

corpus_model.fit(get_data, window=10)

epochs = 10
no_threads = 8

glove = Glove(no_components=100, learning_rate=0.05)
glove.fit(corpus_model.matrix, epochs=epochs, no_threads=no_threads, verbose=True)
glove.add_dictionary(corpus_model.dictionary)

In [None]:
# Inspect the dictionary built by corpus_model.fit
# (presumably a word -> id mapping; verify against the glove package)
corpus_model.dictionary

In [None]:
# Ask the trained GloVe model for the words most similar to 'safe'
glove.most_similar('safe')