# Page for trying out the data

In [1]:
import pandas as pd
import nltk
import re
import tensorflow as tf
import tensorflow_hub as hub

  from ._conv import register_converters as _register_converters
W0405 10:18:47.189100 139972654188288 __init__.py:56] Some hub symbols are not available because TensorFlow version is less than 1.14


In [2]:
df1 = pd.read_csv('amazon_reviews_us_Mobile_Electronics_v1_00.tsv',sep="\t", error_bad_lines=False)
# bad lines exist......

b'Skipping line 35246: expected 15 fields, saw 22\n'
b'Skipping line 87073: expected 15 fields, saw 22\n'


### DATA COLUMNS:
marketplace       - 2 letter country code of the marketplace where the review was written.  
customer_id       - Random identifier that can be used to aggregate reviews written by a single author.  
review_id         - The unique ID of the review.  
product_id        - The unique Product ID the review pertains to. In the multilingual dataset the reviews
                    for the same product in different countries can be grouped by the same product_id.  
product_parent    - Random identifier that can be used to aggregate reviews for the same product.  
product_title     - Title of the product.  
product_category  - Broad product category that can be used to group reviews  
                    (also used to group the dataset into coherent parts).  
star_rating       - The 1-5 star rating of the review.  
helpful_votes     - Number of helpful votes.  
total_votes       - Number of total votes the review received.  
vine              - Review was written as part of the Vine program.  
verified_purchase - The review is on a verified purchase.  
review_headline   - The title of the review.  
review_body       - The review text.  
review_date       - The date the review was written.  

Filter out the ones not versified and get a subset of the data

In [3]:
df1 = df1.loc[df1['verified_purchase']=='Y',['review_id', 'product_id', 'product_title', 'helpful_votes','review_headline', 'review_body']]
df1['product_id'].nunique()

22299

Get a list of products that are with at least 200 reviews

In [4]:
count_df = df1.groupby('product_id').count()
count_df = count_df['review_id']

count_df.mean()
count_df.max()

count_df = count_df.loc[lambda x: x>=200]
count_df = count_df.sort_values(ascending=False)

product_list = count_df.index.values.tolist()

In [5]:
df1 = df1[df1['product_id'].isin(product_list)]

df1[0:5]

Unnamed: 0,review_id,product_id,product_title,helpful_votes,review_headline,review_body
2,R2Y0MM9YE6OP3P,B00QERR5CY,iXCC Multi pack Lightning cable,0.0,great cables,These work great and fit my life proof case fo...
4,R26I2RI1GFV8QG,B0067XVNTG,Generic Car Dashboard Video Camera Vehicle Vid...,0.0,Cameras has battery issues,"Be careful with these products, I have bought ..."
48,R2WGDZBMIMZ1HK,B00LAG4HN4,"iXCC Element II Lightning Cable 6ft, iPhone Ch...",0.0,"Good, strong, and 6 feet long!","Good, strong, and 6 feet long."
60,RRPOCULNRBGQ,B00LAG4HN4,"iXCC Element II Lightning Cable 6ft, iPhone Ch...",0.0,made with excellent materials at the joints be...,Apple makes their charging products with infer...
77,R2K2WK38XR5FKZ,B00QERR5CY,iXCC Multi pack Lightning cable,0.0,One Star,Two failed


Test on one product. Get the reviews for one product

In [6]:
reviews = df1[df1['product_id']==product_list[0]]#.drop(columns=['review_id','product_title','product_id'])
reviews = reviews['review_body']

reviews = reviews.str.replace('<br />','')

def remove_consecutive(text):
    one = re.sub(r"([eoEO])\1\1+",r"\1\1",text)
    return re.sub(r"([^eoEO])\1\1+",r"\1",one)

reviews = pd.Series(list(map(remove_consecutive,reviews)))

In [7]:
reviews = reviews.str.split('.',expand=True).stack().reset_index()

In [8]:
reviews['word_count'] = reviews[0].str.split().apply(len)#.strip().apply(len)
reviews[0:10]

Unnamed: 0,level_0,level_1,0,word_count
0,0,0,Very good quality,3
1,0,1,So far so good,4
2,0,2,,0
3,1,0,Good product and good seller,5
4,2,0,Great product!,2
5,3,0,They charge my wife's phone,5
6,4,0,Works like a charm!,4
7,5,0,"I've ordered a ton of these white and black, l...",12
8,5,1,I keep buying them because they are made so ...,10
9,5,2,"I need more to buy for the office, my car, t...",12


In [9]:
reviews = reviews.loc[reviews['word_count']>0,[0]]

In [10]:
reviews['text'] = reviews[0]
reviews = reviews.drop(columns=0)

In [11]:
reviews['POS'] = reviews['text'].apply(nltk.word_tokenize).apply(nltk.pos_tag)
reviews[0:5]

Unnamed: 0,text,POS
0,Very good quality,"[(Very, RB), (good, JJ), (quality, NN)]"
1,So far so good,"[(So, RB), (far, RB), (so, RB), (good, JJ)]"
3,Good product and good seller,"[(Good, JJ), (product, NN), (and, CC), (good, ..."
4,Great product!,"[(Great, JJ), (product, NN), (!, .)]"
5,They charge my wife's phone,"[(They, PRP), (charge, VBP), (my, PRP$), (wife..."


In [12]:
def getMainWords(pos):
    result = '';
    for x in pos:
        if x[1][0:2] in ['NN','VB']:
            result= result+x[0]+' '
    if len(result)>0:
        result= result[0:len(result)-1]
    return result

reviews['main'] = list(map(getMainWords,reviews['POS']))

reviews[0:5]

Unnamed: 0,text,POS,main
0,Very good quality,"[(Very, RB), (good, JJ), (quality, NN)]",quality
1,So far so good,"[(So, RB), (far, RB), (so, RB), (good, JJ)]",
3,Good product and good seller,"[(Good, JJ), (product, NN), (and, CC), (good, ...",product seller
4,Great product!,"[(Great, JJ), (product, NN), (!, .)]",product
5,They charge my wife's phone,"[(They, PRP), (charge, VBP), (my, PRP$), (wife...",charge wife phone


In [13]:
reviews = reviews.loc[reviews['main'].apply(len)>0,['text','main']]

In [14]:
reviews[0:20]

Unnamed: 0,text,main
0,Very good quality,quality
3,Good product and good seller,product seller
4,Great product!,product
5,They charge my wife's phone,charge wife phone
6,Works like a charm!,Works charm
7,"I've ordered a ton of these white and black, l...",'ve ordered ton
8,I keep buying them because they are made so ...,keep buying are made
9,"I need more to buy for the office, my car, t...",need buy office car house
10,etc,etc
11,awesome stuff!,stuff


Word Embedding / Sentence Embedding

In [16]:
# Load google Universal Sentence Encoder
module_dir = "downloads/encoder"
embed = hub.Module(module_dir)

Instructions for updating:
Colocations handled automatically by placer.


W0405 10:20:18.091065 139972654188288 deprecation.py:323] From /home/betty35/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/control_flow_ops.py:3632: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Colocations handled automatically by placer.


In [20]:
with tf.Session() as session:
  session.run([tf.global_variables_initializer(), tf.tables_initializer()])
  sentence_embedding= session.run(embed(list(reviews['text'])))

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0405 10:27:43.774258 139972654188288 saver.py:1483] Saver not created because there are no variables in the graph to restore


In [21]:
sentence_embedding[0:5]

array([[ 0.06433529, -0.05392856,  0.02364377, ..., -0.03504702,
        -0.00971428, -0.04463368],
       [ 0.02487145, -0.01953173,  0.00984184, ..., -0.01322363,
        -0.03477967,  0.05130831],
       [ 0.03877192, -0.03623104,  0.01428011, ..., -0.02759716,
        -0.00387881,  0.04328242],
       [ 0.02674303, -0.06447621, -0.02215756, ...,  0.03415874,
        -0.04212781, -0.00147286],
       [-0.00213155,  0.02355909,  0.05261142, ..., -0.01222009,
         0.04687894, -0.00129713]], dtype=float32)

In [24]:
reviews.size

4584

In [28]:
len(sentence_embedding)

2292

In [31]:
len(sentence_embedding[0])

512

In [33]:
#sentence embedding
reviews['s_emb'] = pd.Series(list(sentence_embedding))

In [34]:
reviews[0:10]

Unnamed: 0,text,main,s_emb
0,Very good quality,quality,"[0.064335294, -0.05392856, 0.023643766, 0.0281..."
3,Good product and good seller,product seller,"[0.026743034, -0.064476214, -0.022157555, 0.06..."
4,Great product!,product,"[-0.0021315496, 0.023559086, 0.05261142, -0.03..."
5,They charge my wife's phone,charge wife phone,"[0.033525135, -0.031799648, -0.012317592, 0.01..."
6,Works like a charm!,Works charm,"[0.01742137, -0.02121579, 0.021088673, 0.01316..."
7,"I've ordered a ton of these white and black, l...",'ve ordered ton,"[0.009239573, -0.05190375, -0.009112191, -0.02..."
8,I keep buying them because they are made so ...,keep buying are made,"[0.03568201, -0.008994566, 0.045553826, -0.023..."
9,"I need more to buy for the office, my car, t...",need buy office car house,"[0.021971293, -0.05539746, 0.012881672, 0.0043..."
10,etc,etc,"[0.024454864, -0.030930609, 0.010781822, 0.052..."
11,awesome stuff!,stuff,"[0.04618356, -0.0048306207, 0.011567681, 0.066..."
