In [107]:
import pandas as pd
import numpy as np
import spacy 
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.classify import NaiveBayesClassifier
import text_processor as tp
import pickle

In [124]:
# Load spaCy's small English pipeline. The former parse/tag/entity keyword
# arguments are legacy options; newer spaCy releases reject unknown keyword
# arguments and no longer ship the 'en' shortcut name.
try:
    sp = spacy.load('en_core_web_sm')
except OSError:
    # Older spaCy 2.x installs may only expose the model through the 'en' shortcut link.
    sp = spacy.load('en')

In [10]:
def lemmatize_text(text):
    """
    Replace every word of the text with its spaCy lemma.

    Tokens whose lemma is the '-PRON-' placeholder (pronouns such as he,
    she, my, it) keep their original surface form. The pieces are joined
    back together with single spaces.

    If the pipeline raises a TypeError the value held at that point is
    returned as-is instead of failing.
    """
    try:
        text = sp(text)
        pieces = []
        for token in text:
            # Keep the surface form for pronouns, the lemma for everything else.
            pieces.append(token.text if token.lemma_ == '-PRON-' else token.lemma_)
        text = ' '.join(pieces)
    except TypeError:
        pass
    return text

In [30]:
text = "Everything is great except that i can't read external pdfs properly. Either it zooms too much of too little. For kindle books it provides great experience. Battery life is awesome!"

In [31]:
doc = sp(tp.clean_text(text))

In [32]:
for token in doc:
    print(token, token.pos_, token.lemma_)

everything PRON everything
great ADJ great
except SCONJ except
read VERB read
external ADJ external
pdfs NOUN pdfs
properly ADV properly
either CCONJ either
zoom VERB zoom
much ADJ much
little ADJ little
kindle NOUN kindle
book NOUN book
provide VERB provide
great ADJ great
experience NOUN experience
battery NOUN battery
life NOUN life
awesome ADJ awesome


In [33]:
# Show only the tokens that spaCy tagged as nouns or adjectives.
for token in doc:
    if token.pos_ in ('NOUN', 'ADJ'):
        print(token, token.pos_)

great ADJ
external ADJ
pdfs NOUN
much ADJ
little ADJ
kindle NOUN
book NOUN
great ADJ
experience NOUN
battery NOUN
life NOUN
awesome ADJ


In [2]:
df = pd.read_csv('datasets/AllProductReviews.csv')

In [4]:
df.count()

ReviewTitle    14337
ReviewBody     14337
ReviewStar     14337
Product        14337
dtype: int64

In [5]:
df.groupby(df['ReviewStar']).count()

Unnamed: 0_level_0,ReviewTitle,ReviewBody,Product
ReviewStar,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,2493,2493,2493
2,939,939,939
3,1503,1503,1503
4,3189,3189,3189
5,6213,6213,6213


In [6]:
df1 = df[df['ReviewStar'] < 3]

In [7]:
df1.head()

Unnamed: 0,ReviewTitle,ReviewBody,ReviewStar,Product
1,Unreliable earphones with high cost\n,"This earphones are unreliable, i bought it be...",1,boAt Rockerz 255
3,stopped working in just 14 days\n,Its sound quality is adorable. overall it was ...,1,boAt Rockerz 255
5,Charging port not working\n,"After 11 days, the charging port isn't working...",1,boAt Rockerz 255
9,Very very bad Durabity\n,The product durability is 1 month..I just lost...,1,boAt Rockerz 255
11,Disappointed\n,What on earth is the use of buying such produc...,1,boAt Rockerz 255


In [12]:
# Keep only the review body and rating. errors='ignore' makes the cell safe to
# re-run: df1 is rebound to itself, so a second run no longer has these columns.
df1 = df1.drop(columns=['ReviewTitle', 'Product'], errors='ignore')

In [13]:
df1.head()

Unnamed: 0,ReviewBody,ReviewStar
1,"This earphones are unreliable, i bought it be...",1
3,Its sound quality is adorable. overall it was ...,1
5,"After 11 days, the charging port isn't working...",1
9,The product durability is 1 month..I just lost...,1
11,What on earth is the use of buying such produc...,1


In [14]:
df1 = df1.rename(columns={'ReviewBody':'content','ReviewStar':'rating'})

In [15]:
df1.head()

Unnamed: 0,content,rating
1,"This earphones are unreliable, i bought it be...",1
3,Its sound quality is adorable. overall it was ...,1
5,"After 11 days, the charging port isn't working...",1
9,The product durability is 1 month..I just lost...,1
11,What on earth is the use of buying such produc...,1


In [16]:
df1.count()

content    3432
rating     3432
dtype: int64

In [6]:
# NOTE: pickle.load can execute arbitrary code; only load files you trust.

# Restore the pickled object that later cells use as `tfidf`.
with open('pickles/amazon/tfidf.pickle', mode='rb') as tfidf_file:
    tfidf = pickle.load(tfidf_file)

# Restore the pickled object that later cells use as `mnb_model`.
with open('pickles/amazon/mnb_classifier.pickle', mode='rb') as model_file:
    mnb_model = pickle.load(model_file)

In [117]:
text = "stopped working after 2 months"

In [156]:
text = "I love the color"

In [70]:
text = "it would not load my books proper. took a dozen tries erasing an dreregistering. screen too dark"

In [262]:
text = "Everything is great except that i can't read external pdfs properly. Either it zooms too much of too little. For kindle books it provides great experience. Battery life is awesome!"

In [253]:
cleaned_text = tp.clean_text(text)

In [254]:
tfidf_vector = tfidf.transform([cleaned_text])

In [255]:
category_id = mnb_model.predict(tfidf_vector)
print(category_id)

[1.]


In [256]:
feature_array = np.array(tfidf.get_feature_names())

In [257]:
tfidf_sorting = np.argsort(tfidf_vector.toarray()).flatten()[::-1]

In [258]:
n=4

In [259]:
top_n = feature_array[tfidf_sorting][:n]

In [260]:
print(top_n)

['though' 'fast' 'turn' 'show']


In [181]:
def find_top(text, tfidf_vector, n=5, vectorizer=None):
    """
    Return the (at most) n highest-weighted features of one transformed document.

    Parameters:
        text: unused; kept so existing calls keep working.
        tfidf_vector: single-row result of vectorizer.transform([...]);
            anything exposing toarray() works.
        n: maximum number of features to return (default 5).
        vectorizer: fitted vectorizer that produced tfidf_vector. Defaults to
            the notebook-level `tfidf` object.

    Returns:
        numpy array of feature names ordered from highest to lowest weight.
        Features with zero weight are never returned, so the result can hold
        fewer than n entries.
    """
    if vectorizer is None:
        vectorizer = tfidf
    # get_feature_names() was replaced by get_feature_names_out() in newer
    # scikit-learn releases; support whichever one the vectorizer provides.
    get_names = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
    feature_array = np.asarray(get_names())
    weights = np.asarray(tfidf_vector.toarray()).ravel()
    order = np.argsort(weights)[::-1]
    # Drop features that do not occur in the document before truncating,
    # otherwise short texts get padded with arbitrary zero-weight terms.
    order = order[weights[order] > 0][:n]
    return feature_array[order]
    

In [233]:
def get_pos_tags(text):
    """
    Split the words of the text into nouns, descriptive words and verbs.

    The text is tokenized with ToktokTokenizer and tagged with nltk.pos_tag.
    Words keep their original order and duplicates are preserved.

    Returns a tuple (nouns, adjectives, verbs):
        nouns      - words tagged NN, NNS, NNP or NNPS
        adjectives - words tagged JJ, JJR, JJS and also the adverb tags
                     RB, RBR, RBS
        verbs      - words tagged VB, VBD, VBG, VBN, VBP or VBZ
    Words carrying any other tag are dropped.

    NOTE(review): the tokenizer appears to leave mid-text sentence periods
    attached to the preceding word (e.g. 'properly.'), which can skew the
    tags for multi-sentence input - consider sentence-splitting first.
    """
    noun_tags = {"NN", "NNS", "NNP", "NNPS"}
    verb_tags = {"VB", "VBD", "VBG", "VBN", "VBP", "VBZ"}
    descriptive_tags = {"JJ", "JJR", "JJS", "RB", "RBR", "RBS"}

    tokens = ToktokTokenizer().tokenize(text)
    nouns = []
    adjectives = []
    verbs = []
    for word, tag in nltk.pos_tag(tokens):
        if tag in noun_tags:
            nouns.append(word)
        elif tag in verb_tags:
            verbs.append(word)
        elif tag in descriptive_tags:
            adjectives.append(word)
    return nouns, adjectives, verbs
    

In [234]:
text = "Item stopped working after 2 months"

In [264]:
nouns, adjectives, verbs = get_pos_tags(text)

In [265]:
print(nouns)
print(adjectives)
print(verbs)

['Everything', 'i', 'pdfs', 'properly.', 'Either', 'books', 'Battery', 'life']
['great', 'read', 'external', 'too', 'much', 'too', 'little.', 'kindle', 'great', 'experience.', 'awesome']
['is', 't', 'zooms', 'provides', 'is']


In [268]:
# Guard against empty lists: short texts may have no noun, or neither an
# adjective nor a verb, and indexing [0] would raise IndexError.
cause = nouns[0] if nouns else None
positive_word = next(iter(adjectives or verbs), None)
print(f'Cause of positivity: {cause}')
print(f'Positive word: {positive_word}')

Cause of positivity: Everything
Positive word: great


In [204]:
tops = find_top(text, tfidf_vector)

In [205]:

['item' 'stop work' 'stop' 'month']

['itemstop workstopmonth']

In [263]:
text = "Everything is great except that i can't read external pdfs properly. Either it zooms too much of too little. For kindle books it provides great experience. Battery life is awesome!"

In [248]:
text = "It's beyond my expectation, and it can even show music score. Not fast turning though."

In [238]:
text = "the stocks fell quickly"

In [245]:
text = "very good color"

In [147]:
tokenizer = ToktokTokenizer()

In [229]:
tokens = tokenizer.tokenize(text)

In [230]:
print(tokens)

['the', 'stocks', 'fell', 'quickly']


In [231]:
pos_tags = nltk.pos_tag(tokens)

In [232]:
print(pos_tags)

[('the', 'DT'), ('stocks', 'NNS'), ('fell', 'VBD'), ('quickly', 'RB')]


In [119]:
def pos_tagging(text):
    """
    Return the nouns and verbs of the text as 'word_TAG' strings.

    The text is split on whitespace and tagged with nltk.pos_tag. Only words
    tagged as nouns (NN, NNS, NNP, NNPS) or verbs (VB, VBD, VBG, VBN, VBP,
    VBZ) are kept, in their original order.
    """
    noun_verb_tags = {
        "NN", "NNS", "NNP", "NNPS",
        "VB", "VBD", "VBG", "VBN", "VBP", "VBZ",
    }
    return [
        f'{word}_{tag}'
        for word, tag in nltk.pos_tag(text.split())
        if tag in noun_verb_tags
    ]

In [152]:
pos_tagging(text)

['love_VBP', 'color_NN']

In [153]:
text = sp(text)

In [154]:
for word in text:
    print(word,word.lemma_, word.pos_, word.tag_)
    print()

I -PRON- PRON PRP

love love VERB VBP

the the DET DT

color color NOUN NN



In [144]:
text = ' '.join([(word.lemma_+'_'+word.tag_) if word.lemma_ != '-PRON-' else (word.text+'_'+word.tag_) for word in text])

In [145]:
print(text)

I_PRP love_VBP the_DT color_NN
