### Loading of datasets

##### 1st set

In [1]:
from os import listdir
from os.path import isfile, join

print("Retrieving file names of product reviews...")

reviews_files = ['product_reviews/' + f for f in listdir('product_reviews/') if isfile(join('product_reviews/', f))]
num_files = len(reviews_files)

print("Finished...")
print("Total files: ", num_files)

Retrieving file names of product reviews...
Finished...
Total files:  25000


In [2]:
def getTextFromFiles(file_list):
    reviews = []
    for file in file_list:
        with open(file, "r") as f:
            text = f.read()
            reviews.append(text) 
    return reviews

In [3]:
print("Loading all reviews from files...")
reviews_dataset = getTextFromFiles(reviews_files)
print("Retrieved all text reviews from ", num_files, "files")

Loading all reviews from files...
Retrieved all text reviews from  25000 files


In [4]:
#print out sample reviews from the datasets
counter = 0
for r in reviews_dataset:
    print(r + "\n" * 1)
    counter += 1
    if counter == 5:
        break

This is a great tutu and at a really great price. It doesn't look cheap at all. I'm so glad I looked on Amazon and found such an affordable tutu that isn't made poorly. A++

I bought this for my 4 yr old daughter for dance class, she wore it today for the first time and the teacher thought it was adorable. I bought this to go with a light blue long sleeve leotard and was happy the colors matched up great. Price was very good too since some of these go for over $15.00 dollars.

Wonder my niece wears it every single day, yellow is her favorite color right now an this cute little tutu made he da. It is well built and we hope she gets lots of wear out of it.

It might just be me.  Although it seems well made and sized right.  It just seems a bit flimsy to me.  I debate if I really should not give it 5 stars as it does everything it advertises itself.  When I travel out of country, I have this around my neck and under one layer of clothing.  I have my &#34;throw away&#34; wallet in my pocke

#### Cleaning the texts

In [5]:
#maximum of 100 per review
def get_reviews(review, batch):
    return " ".join(review.split()[:batch]) 


from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string
stop = set(stopwords.words('english'))
exclude = set(string.punctuation) 
lemma = WordNetLemmatizer()
num_of_words = 150

def clean(doc):
    doc = get_reviews(doc, num_of_words)
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized 

In [6]:
print("Cleaning all reviews..\n")
#cleaning the documents
#doc_clean = [clean(doc).split() for doc in sample]

cleaned = []
counter = 0
for doc in reviews_dataset:
    cleaned.append(clean(doc).split())
    if(counter % 5000 == 0 and counter != 0):
        print("Cleaned", counter, "reviews..")
    counter += 1
        
print("\nCleaning process finished.")
    


Cleaning all reviews..

Cleaned 5000 reviews..
Cleaned 10000 reviews..
Cleaned 15000 reviews..
Cleaned 20000 reviews..

Cleaning process finished.


In [4]:
# Importing Genism
import gensim
from gensim import corpora



In [9]:
print("Converting text reviews to it's vector representation..")

# Creating the term dictionary of our corpus, wheere every unique term is assigned an index
dictionary = corpora.Dictionary(cleaned)
# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above
doc_term_matrix = [dictionary.doc2bow(doc) for doc in cleaned]

print("Finished..")

Converting text reviews to it's vector representation..
Finished..


### Network Building

In [5]:
# Creatng the object for LDA model using gensim library
Lda = gensim.models.ldamodel.LdaModel

In [33]:
print("Training the model..")
# Running and Training LDA model on the document term matrix.
ldamodel = Lda(doc_term_matrix, num_topics=15, id2word = dictionary, passes=100)

print("LDA model is successfully trained.")

Training the model..
LDA model is successfully trained.


In [9]:
#saving the trained model to disk
#save model
from gensim.test.utils import datapath
temp_file = datapath("model")

In [None]:
# Save model to disk.
ldamodel.save(temp_file)

In [10]:
# Load a potentially pretrained model from disk.
ldaModel = Lda.load(temp_file)

In [11]:
for idx, topic in ldaModel.print_topics(num_topics=15, num_words=8):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.057*"fit" + 0.035*"good" + 0.034*"great" + 0.030*"well" + 0.023*"look" + 0.022*"like" + 0.021*"pant" + 0.020*"wear"
Topic: 1 
Words: 0.024*"glass" + 0.016*"mom" + 0.015*"case" + 0.013*"lens" + 0.012*"hose" + 0.008*"suspender" + 0.007*"ugly" + 0.007*"drive"
Topic: 2 
Words: 0.040*"jean" + 0.032*"pair" + 0.026*"year" + 0.022*"color" + 0.019*"fit" + 0.015*"levi" + 0.012*"one" + 0.012*"bought"
Topic: 3 
Words: 0.011*"insole" + 0.011*"501" + 0.010*"immediately" + 0.010*"polish" + 0.008*"e" + 0.008*"section" + 0.008*"cushioned" + 0.007*"spirit"
Topic: 4 
Words: 0.015*"like" + 0.015*"one" + 0.012*"get" + 0.009*"would" + 0.009*"im" + 0.008*"look" + 0.008*"time" + 0.007*"really"
Topic: 5 
Words: 0.035*"bag" + 0.033*"pocket" + 0.016*"use" + 0.014*"wallet" + 0.013*"one" + 0.012*"hold" + 0.010*"carry" + 0.009*"strap"
Topic: 6 
Words: 0.040*"love" + 0.024*"great" + 0.020*"bought" + 0.019*"look" + 0.018*"color" + 0.017*"it" + 0.014*"them" + 0.014*"like"
Topic: 7 
Words: 0.093*"bra

### Testing the model

In [102]:
def returnMaxPrediction(predictions):
    topic_id = 0
    prediction = 0.00
    for p in predictions:
        id, p_weight = p
        if(p_weight > prediction):
            topic_id = id
            prediction = p_weight
    return topic_id, prediction

In [136]:
def predictTopics(text):
    
    test_doc_complete = [text]

    #cleaning the documents
    test_doc_clean = [clean(doc).split() for doc in test_doc_complete]

    # Creating the term dictionary of our corpus, wheere every unique term is assigned an index
    test_dictionary = corpora.Dictionary(test_doc_clean)

    # Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above
    test_doc_term_matrix = [test_dictionary.doc2bow(doc) for doc in test_doc_clean]
    
    for index, score in sorted(ldaModel[test_doc_term_matrix[0]], key=lambda tup: -1*tup[1]):
        print("\nScore: {}\t \nTopic: {}".format(score, ldaModel.print_topic(index, 10)))    

In [1]:
input = "This is a great tutu and at a really great price. It doesn't look cheap at all. I'm so glad I looked on Amazon and found such an affordable tutu that isn't made poorly. A++"
predictTopics(input)

NameError: name 'predictTopics' is not defined