In [None]:
import pandas as pd
import nltk
import numpy as np
from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import re
from nltk.sentiment import SentimentIntensityAnalyzer
import operator
from sklearn.metrics.pairwise import cosine_similarity
nltk.download('averaged_perceptron_tagger')

nltk.download('vader_lexicon')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk_stopwords= set(stopwords.words('english'))

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', -1)

In [None]:
df= pd.read_json('/content/drive/MyDrive/amazon_data_scrapping/Data files/Extension_Cord_Reviews.json', orient='index')
df.head()

In [3]:
df['reviewText']=df['reviewText'].str.encode('ascii', 'ignore').str.decode('ascii')
df['reviewText']= df['reviewText'].apply(lambda review: review.lower())
df['reviewTitle']= df['reviewTitle'].apply(lambda title: title.lower())
df['reviewText']= df['reviewText'].apply(lambda review: re.sub('\n', ' ', review))

df['reviewDate']= df['reviewDate'].apply(lambda date:re.sub('Reviewed in the United States on ',"", date).strip())
df['reviewRating']= df['reviewRating'].apply(lambda rating: re.sub(' out of 5 stars',"", rating).strip()) 
df['reviewRating']= df.reviewRating.astype('float')
df['reviewClass']=df.apply(lambda row: 1 if row.reviewRating>=4.0 else 0, axis=1)

df.head()

Unnamed: 0,reviewDate,reviewRating,reviewText,reviewTitle,reviewClass
0,"January 8, 2018",5.0,"now now now these are some awesome power cords. upon opening the package and feeling the build quality of these you know they are built to last. 1600+ watts mean even a fridge would do nothing to these. 13 amps can pass through these and a fridge can use 3-6 amps! the cable is flexible but has stiffness. i hooked on one (got 4 of these) a 55' 4k tv from tcl, a roku model, my gaming pc (i5 6500 and rx 470 in a 600 watts power supply) and an apple tv switching sometimes for a nintendo switch. this thing is awesome. the only thing that you can see is the grey cable piece and the beautiful squared angled plug in the wall. looks so well done and does not get in the way making the setup look ugly or something. looks well build, done and designed. will definitely buy more of this same one when needed.","looks well done, build and designed. awesome performance.",1
1,"March 24, 2019",4.0,"this cord gives plenty of room and the wire feels both strong and flexible. i didnt realize that the cord comes out to the right of the switch, which isnt a big deal, since you have a lot of room with the cord, but it is something to take not of if you will need all 8 feet for your purposes. also, it only has 3 outlets, which is fine, but if you are plugging in a bulky laptop cord, you have more like 2 outlets, unless your other 2 plugs are very small.",great product. wire comes out and to the right only.,1
2,"April 6, 2018",5.0,"it's funny how a little thing like a cloth wrap on the cord can make a product seem so much nicer and higher end. it does slide over itself and across shelf corners where a rubber/plastic cord would get stuck or tangled, so it's practical too. really, the flat, angled plug on this this is half of what makes it so excellent. i use it behind my turntable/record shelf, and it would probably not fit if it were a standard straight plug. i do really wish that this thing had a master on/off switch like bigger power bars do, but that's a pretty minor gripe. all in all, really nice product.",really very nice.,1
3,"June 12, 2018",5.0,"an extension cord with surge protection, what a terrific idea! no longer do you need to buy a huge surge protector, you can just get this and it gives you three convenient outlets. the low-profile prong is great when you want this plugged flat against the wall, and the braided cord is good at keeping this tangle-free. i think my only question is: how do we know this cord has done its job of protecting your devices from a surge? does it stop working? it would be great if the manufacturer could clarify. i got two of these and they are very handy around the house. recommended.",extension cord with surge protection,1
4,"May 1, 2018",5.0,"these cords are a must have, i don't have anything bad to say about them after having used them for at least a year now. i recently bought a second one for the bedroom. what i like: -well braided - no cheap feel. -firm plug-in, not too hard to plug in or remove power cords -the flat plug helps greatly if behind furniture. i use two of these by my bed and one goes to each side so that me and my girlfriend can plug in our phones, smart watches, tablets, etc. i leave the brick nearby and can pull it further onto the bed for those times you lay in bed all day and play on your tablet or laptop and need power. there are no con's to this item, i've had no issues for the year or so i've owned it and honestly that's how it should be with something as simple as an extension cord.","great quality, i bought another!",1


In [4]:
review_comments=''
# reviews=df[df['reviewClass']==0]['reviewText']
reviews= df['reviewText']
for review in reviews:
  review_comments= review_comments + ' '+ review
review_comments



In [5]:
sentences= sent_tokenize(review_comments)
print( len(sentences))
sentences[:5]

2212


[' now now now these are some awesome power cords.',
 'upon opening the package and feeling the build quality of these you know they are built to last.',
 '1600+ watts mean even a fridge would do nothing to these.',
 '13 amps can pass through these and a fridge can use 3-6 amps!',
 'the cable is flexible but has stiffness.']

In [6]:
import string
def remove_punctuations(text):
  regex = re.compile('[' + re.escape(string.punctuation) + '\\r\\t\\n]')
  nopunctuated_text = regex.sub(" ", str(text))
  return nopunctuated_text

In [7]:
# nltk.pos_tag(word_tokenize(sample_review))
# acceptable_tag_combos= [('JJ', 'NN'), ('JJ'), ('NN', 'IN', 'NN'), ('VBZ', 'JJ'), ('RB', 'RB', 'VBN'), ('RB', 'JJ')]

#accepted formats
accepted_unigram_combinations=[('JJ')]
accpeted_bigram_combinations= [('JJ', 'NN'), ('VBZ', 'JJ'),('RB', 'JJ')]
accpeted_trigram_combinations= [('NN', 'IN', 'NN'), ('RB', 'RB', 'VBN')]

In [8]:
#text cleaning
cleaned_sentences= [ remove_punctuations(sentence) for sentence in sentences]
review_comments_tokens= [word_tokenize(sentence) for sentence in cleaned_sentences]
review_comments_tokens= [token for token_list in review_comments_tokens for token in token_list]
review_comments_tokens[:5]

['now', 'now', 'now', 'these', 'are']

In [9]:
# bigram creation
bigram_finder= nltk.collocations.BigramCollocationFinder.from_words(review_comments_tokens)
df_bigram_frequency = pd.DataFrame(bigram_finder.ngram_fd.items(), columns=['keywords','frequency']).sort_values(by='frequency', ascending=False)
df_bigram_frequency.reset_index(inplace=True, drop=True)
# df_bigram_frequency.head()

In [11]:
def right_tags_bigrams(bigram):
  # first_type = ('JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNP', 'NNPS')
  # second_type = ('NN', 'NNS', 'NNP', 'NNPS')
  first_type= ('JJ', 'VBZ', 'RB' )
  second_type= ('NN', 'JJ', 'JJ','VB')
  tags = nltk.pos_tag(bigram)
  if 'i' in bigram or 'much' in bigram or 'is' in bigram:
    return False
  if tags[0][1] in first_type and tags[1][1] in second_type:
    return True
  else:
    return False
df_bigram_frequency['pos_tag']= df_bigram_frequency['keywords'].apply(lambda bigram: right_tags_bigrams(bigram))
df_bigram_frequency[df_bigram_frequency['pos_tag']== True][['keywords', 'frequency']][:10]

Unnamed: 0,keywords,frequency
19,"(good, quality)",52
22,"(flat, plug)",50
60,"(high, quality)",32
62,"(great, product)",32
74,"(very, nice)",28
75,"(great, extension)",28
95,"(great, quality)",23
97,"(low, profile)",22
118,"(good, product)",20
138,"(looks, great)",18


In [13]:
# trigram creation
trigram_finder= nltk.collocations.TrigramCollocationFinder.from_words(review_comments_tokens)
df_trigram_frequency = pd.DataFrame(trigram_finder.ngram_fd.items(), columns=['keywords','frequency']).sort_values(by='frequency', ascending=False)
df_trigram_frequency.reset_index(inplace=True, drop=True)
# df_trigram_frequency.head()

In [14]:
def right_tags_trigram(trigram):
  first_type= ('NN', 'RB','VB', 'JJ')
  second_type= ('IN', 'RB', 'DT', 'RB', 'JJ')
  third_type= ('NN', 'VBN','NN', 'RB', 'IN')
  tags = nltk.pos_tag(trigram)
  if 'i' in trigram or 'so' in trigram or 'is' in trigram or 'the' in trigram:
    return False
  if tags[0][1] in first_type and tags[1][1] in second_type and tags[2][1] in third_type:
    return True
  else:
    return False
df_trigram_frequency['pos_tag']= df_trigram_frequency['keywords'].apply(lambda trigram: right_tags_trigram(trigram))
# df_trigram_frequency[df_trigram_frequency['pos_tag']== True][:10]

In [15]:
trigrams=df_trigram_frequency[df_trigram_frequency['pos_tag']== True][['keywords', 'frequency']][:10]
bigrams=df_bigram_frequency[df_bigram_frequency['pos_tag']== True][['keywords', 'frequency']][:10]
keywords= bigrams.append(trigrams).reset_index(drop=True)

In [19]:
keywords

Unnamed: 0,keywords,frequency
0,"(good, quality)",52
1,"(flat, plug)",50
2,"(high, quality)",32
3,"(great, product)",32
4,"(very, nice)",28
5,"(great, extension)",28
6,"(great, quality)",23
7,"(low, profile)",22
8,"(good, product)",20
9,"(looks, great)",18


In [None]:
bigrams = nltk.collocations.BigramAssocMeasures()
bigram_likelihood_ratio = pd.DataFrame(list(bigram_finder.score_ngrams(bigrams.likelihood_ratio)), columns=['bigram','likelihood ratio']).sort_values(by='likelihood ratio', ascending=False)
bigram_likelihood_ratio[:10]

In [None]:
def right_tags(bigram):
  # first_type = ('JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNP', 'NNPS')
  # second_type = ('NN', 'NNS', 'NNP', 'NNPS')
  first_type= ('JJ', 'VBZ', 'RB')
  second_type= ('NN', 'JJ', 'JJ')
  tags = nltk.pos_tag(bigram)
  if tags[0][1] in first_type and tags[1][1] in second_type:
    return True
  else:
    return False
filtered_likelihood_for_bigram = bigram_likelihood_ratio[bigram_likelihood_ratio.bigram.map(lambda x: right_tags(x))]
filtered_likelihood_for_bigram[:10]

In [None]:
trigrams = nltk.collocations.TrigramAssocMeasures()
trigram_likelihood_ratio = pd.DataFrame(list(trigram_finder.score_ngrams(trigrams.likelihood_ratio)), columns=['trigram','likelihood ratio']).sort_values(by='likelihood ratio', ascending=False)
trigram_likelihood_ratio[:10]

In [None]:
def right_tags_trigram(trigram):
  first_type= ('NN', 'RB')
  second_type= ('IN', 'RB')
  third_type= ('NN', 'VBN')
  tags = nltk.pos_tag(trigram)
  if tags[0][1] in first_type and tags[1][1] in second_type and tags[2][1] in third_type:
    return True
  else:
    return False
filtered_likelihood_for_trigram = trigram_likelihood_ratio[trigram_likelihood_ratio.trigram.map(lambda x: right_tags_trigram(x))]
filtered_likelihood_for_trigram[:10]

Unnamed: 0,trigram,likelihood ratio
878,"(waste, of, money)",794.970356
924,"(pot, of, coffee)",770.743873
1066,"(cup, of, coffee)",697.311304
1160,"(bottom, of, coffee)",651.441234
1701,"(lot, of, coffee)",568.89679
1827,"(kind, of, coffee)",539.243005
1943,"(instead, of, coffee)",509.208066
2086,"(amount, of, coffee)",493.225021
2272,"(ton, of, coffee)",470.190093
2598,"(brand, of, coffee)",433.516903


In [None]:
def remove_stopwords(sentence):
  new_sentence = " ".join([word for word in sentence if word not in nltk_stopwords])
  return new_sentence
cleaned_sentences = [remove_stopwords(sentence.split()) for sentence in sentences]

In [None]:

cleaned_sentences= [ lemmatize(sentence) for sentence in cleaned_sentences]

In [None]:
cleaned_sentences= [token for token_list in cleaned_sentences for token in token_list]
cleaned_sentences[:10]

['last',
 'coffee',
 'maker',
 'cheapest',
 'one',
 'saw',
 'store.',
 'lasted',
 'almost',
 '5']

In [None]:
bigram_finder= nltk.collocations.BigramCollocationFinder.from_words(cleaned_sentences)
dictionary_bigram_frequency= bigram_finder.ngram_fd
dictionary_bigram_frequency

In [None]:
df_bigram_frequency = pd.DataFrame(dictionary_bigram_frequency.items(), columns=['bigram','frequency']).sort_values(by='frequency', ascending=False)
df_bigram_frequency.reset_index(inplace=True, drop=True)
df_bigram_frequency.head()

Unnamed: 0,bigram,frequency
0,"(knife, block)",48
1,"(knife, holder)",21
2,"(hold, knife)",18
3,"(look, great)",13
4,"(counter, space)",12


In [None]:
sample_review='This product is good quality, well-made and available at a good price! Lovely product. Amazon provided a strong good packaging and a value for money product. High quality and it looks expensive too, very well made, very stylish, very sturdy'
sample_review= sample_review.lower()
sample_review_comments=''
for review in sample_review:
  sample_review_comments= sample_review_comments + ''+ review
sample_review_sentences= sent_tokenize(sample_review_comments)
# cleaned_sample_review = [remove_stopwords(sentence.split()) for sentence in sample_review_sentences]
cleaned_sample_review= [ remove_punctuations (sentence)for sentence in cleaned_sample_review]
cleaned_sample_review= [ lemmatize(sentence) for sentence in cleaned_sample_review]
cleaned_sentences= [token for token_list in cleaned_sentences for token in token_list]

In [None]:
df_bigram_frequency[df_bigram_frequency['accepted_bigrams']==1]

Unnamed: 0,bigram,frequency,pos_tag,accepted_bigrams


In [None]:
#change to camelcase :Done
#apply spell check
#clean up extra spaces and periods
