# Page for trying out the data

In [1]:
import pandas as pd
import time
import nltk
import re
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np  
from sklearn.cluster import KMeans  

  from ._conv import register_converters as _register_converters
W0409 17:16:06.296614 139967205250816 __init__.py:56] Some hub symbols are not available because TensorFlow version is less than 1.14


In [2]:
df1 = pd.read_csv('amazon_reviews_us_Mobile_Electronics_v1_00.tsv',sep="\t", error_bad_lines=False)
# bad lines exist......

b'Skipping line 35246: expected 15 fields, saw 22\n'
b'Skipping line 87073: expected 15 fields, saw 22\n'


### DATA COLUMNS:
marketplace       - 2 letter country code of the marketplace where the review was written.  
customer_id       - Random identifier that can be used to aggregate reviews written by a single author.  
review_id         - The unique ID of the review.  
product_id        - The unique Product ID the review pertains to. In the multilingual dataset the reviews
                    for the same product in different countries can be grouped by the same product_id.  
product_parent    - Random identifier that can be used to aggregate reviews for the same product.  
product_title     - Title of the product.  
product_category  - Broad product category that can be used to group reviews  
                    (also used to group the dataset into coherent parts).  
star_rating       - The 1-5 star rating of the review.  
helpful_votes     - Number of helpful votes.  
total_votes       - Number of total votes the review received.  
vine              - Review was written as part of the Vine program.  
verified_purchase - The review is on a verified purchase.  
review_headline   - The title of the review.  
review_body       - The review text.  
review_date       - The date the review was written.  

Filter out the ones not versified and get a subset of the data

In [3]:
df1 = df1.loc[df1['verified_purchase']=='Y',['review_id', 'product_id', 'product_title', 'helpful_votes','review_headline', 'review_body']]
df1['product_id'].nunique()

22299

Get a list of products that are with at least 200 reviews

In [4]:
count_df = df1.groupby('product_id').count()
count_df = count_df['review_id']

print('mean count:', count_df.mean(),'; max count:',count_df.max())

count_df = count_df.loc[lambda x: x>=200]
count_df = count_df.sort_values(ascending=False)

product_list = count_df.index.values.tolist()

mean count: 3.964034261626082 ; max count: 1044


In [5]:
df1 = df1[df1['product_id'].isin(product_list)]

df1[0:5]

Unnamed: 0,review_id,product_id,product_title,helpful_votes,review_headline,review_body
2,R2Y0MM9YE6OP3P,B00QERR5CY,iXCC Multi pack Lightning cable,0.0,great cables,These work great and fit my life proof case fo...
4,R26I2RI1GFV8QG,B0067XVNTG,Generic Car Dashboard Video Camera Vehicle Vid...,0.0,Cameras has battery issues,"Be careful with these products, I have bought ..."
48,R2WGDZBMIMZ1HK,B00LAG4HN4,"iXCC Element II Lightning Cable 6ft, iPhone Ch...",0.0,"Good, strong, and 6 feet long!","Good, strong, and 6 feet long."
60,RRPOCULNRBGQ,B00LAG4HN4,"iXCC Element II Lightning Cable 6ft, iPhone Ch...",0.0,made with excellent materials at the joints be...,Apple makes their charging products with infer...
77,R2K2WK38XR5FKZ,B00QERR5CY,iXCC Multi pack Lightning cable,0.0,One Star,Two failed


Test on one product. Get the reviews for one product

In [6]:
reviews = df1[df1['product_id']==product_list[0]]#.drop(columns=['review_id','product_title','product_id'])
reviews = reviews['review_body']

reviews = reviews.str.replace('<br />','')

def remove_consecutive(text):
    one = re.sub(r"([eoEO])\1\1+",r"\1\1",text)
    return re.sub(r"([^eoEO])\1\1+",r"\1",one)

reviews = pd.Series(list(map(remove_consecutive,reviews)))

In [7]:
reviews = reviews.str.split('.',expand=True).stack().reset_index()

In [8]:
reviews['word_count'] = reviews[0].str.split().apply(len)#.strip().apply(len)
reviews[0:10]

Unnamed: 0,level_0,level_1,0,word_count
0,0,0,Very good quality,3
1,0,1,So far so good,4
2,0,2,,0
3,1,0,Good product and good seller,5
4,2,0,Great product!,2
5,3,0,They charge my wife's phone,5
6,4,0,Works like a charm!,4
7,5,0,"I've ordered a ton of these white and black, l...",12
8,5,1,I keep buying them because they are made so ...,10
9,5,2,"I need more to buy for the office, my car, t...",12


In [9]:
reviews = reviews.loc[reviews['word_count']>3,[0]]

In [10]:
reviews['text'] = reviews[0]
reviews = reviews.drop(columns=0)

In [11]:
reviews

Unnamed: 0,text
1,So far so good
3,Good product and good seller
5,They charge my wife's phone
6,Works like a charm!
7,"I've ordered a ton of these white and black, l..."
8,I keep buying them because they are made so ...
9,"I need more to buy for the office, my car, t..."
12,I've bought a total of 3 of these cables I've ...
13,Lasted longer than the ones I got from Apple
14,I needed one that worked with my lifeproof ca...


In [12]:
reviews['POS'] = reviews['text'].apply(nltk.word_tokenize).apply(nltk.pos_tag)
reviews[0:5]

Unnamed: 0,text,POS
1,So far so good,"[(So, RB), (far, RB), (so, RB), (good, JJ)]"
3,Good product and good seller,"[(Good, JJ), (product, NN), (and, CC), (good, ..."
5,They charge my wife's phone,"[(They, PRP), (charge, VBP), (my, PRP$), (wife..."
6,Works like a charm!,"[(Works, NNS), (like, IN), (a, DT), (charm, NN..."
7,"I've ordered a ton of these white and black, l...","[(I, PRP), ('ve, VBP), (ordered, VBN), (a, DT)..."


In [13]:
def getMainWords(pos):
    result = '';
    for x in pos:
        if x[1][0:2] in ['NN','VB']:
            result= result+x[0]+' '
    if len(result)>0:
        result= result[0:len(result)-1]
    return result

reviews['main'] = list(map(getMainWords,reviews['POS']))

reviews[0:5]

Unnamed: 0,text,POS,main
1,So far so good,"[(So, RB), (far, RB), (so, RB), (good, JJ)]",
3,Good product and good seller,"[(Good, JJ), (product, NN), (and, CC), (good, ...",product seller
5,They charge my wife's phone,"[(They, PRP), (charge, VBP), (my, PRP$), (wife...",charge wife phone
6,Works like a charm!,"[(Works, NNS), (like, IN), (a, DT), (charm, NN...",Works charm
7,"I've ordered a ton of these white and black, l...","[(I, PRP), ('ve, VBP), (ordered, VBN), (a, DT)...",'ve ordered ton


In [14]:
reviews = reviews.loc[reviews['main'].apply(len)>0,['text','main']]

In [15]:
reviews

Unnamed: 0,text,main
3,Good product and good seller,product seller
5,They charge my wife's phone,charge wife phone
6,Works like a charm!,Works charm
7,"I've ordered a ton of these white and black, l...",'ve ordered ton
8,I keep buying them because they are made so ...,keep buying are made
9,"I need more to buy for the office, my car, t...",need buy office car house
12,I've bought a total of 3 of these cables I've ...,'ve bought total cables 've had month works
13,Lasted longer than the ones I got from Apple,Lasted ones got Apple
14,I needed one that worked with my lifeproof ca...,needed one worked case does
15,This cable works lightening-fast,cable works lightening-fast


Word Embedding / Sentence Embedding

In [16]:
df2 = reviews.reset_index(drop=True)
df2

Unnamed: 0,text,main
0,Good product and good seller,product seller
1,They charge my wife's phone,charge wife phone
2,Works like a charm!,Works charm
3,"I've ordered a ton of these white and black, l...",'ve ordered ton
4,I keep buying them because they are made so ...,keep buying are made
5,"I need more to buy for the office, my car, t...",need buy office car house
6,I've bought a total of 3 of these cables I've ...,'ve bought total cables 've had month works
7,Lasted longer than the ones I got from Apple,Lasted ones got Apple
8,I needed one that worked with my lifeproof ca...,needed one worked case does
9,This cable works lightening-fast,cable works lightening-fast


In [17]:
reviews = df2

In [18]:
# Load google Universal Sentence Encoder
module_dir = "downloads/encoder"
embed = hub.Module(module_dir)

Instructions for updating:
Colocations handled automatically by placer.


W0409 17:17:06.682441 139967205250816 deprecation.py:323] From /home/betty35/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/control_flow_ops.py:3632: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Colocations handled automatically by placer.


In [19]:
with tf.Session() as session:
    session.run([tf.global_variables_initializer(), tf.tables_initializer()])
    start_time=time.time()
    reviews['sentence_embedding'] = pd.Series(list(session.run(embed(list(reviews['text'])))))
    end_time1=time.time()
    reviews['words_embedding'] = pd.Series(list(session.run(embed(list(reviews['main'])))))
    end_time2=time.time()
    print('time1:',(end_time1-start_time),' time2:',(end_time2-end_time1))

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0409 17:17:10.252007 139967205250816 saver.py:1483] Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0409 17:18:24.350903 139967205250816 saver.py:1483] Saver not created because there are no variables in the graph to restore


time1: 74.21088242530823  time2: 25.8619282245636


In [20]:
reviews

Unnamed: 0,text,main,sentence_embedding,words_embedding
0,Good product and good seller,product seller,"[0.024871454, -0.019531729, 0.009841841, 0.039...","[0.02862117, -0.026557572, 0.086525634, -0.043..."
1,They charge my wife's phone,charge wife phone,"[0.026743034, -0.064476214, -0.022157555, 0.06...","[0.028404895, -0.07067544, -0.0010278885, -0.0..."
2,Works like a charm!,Works charm,"[-0.0021315496, 0.023559086, 0.05261142, -0.03...","[0.021985114, 0.03418878, 0.055464674, -0.0456..."
3,"I've ordered a ton of these white and black, l...",'ve ordered ton,"[0.033525135, -0.031799648, -0.012317592, 0.01...","[0.036727097, -0.06557699, -0.0017807593, -0.0..."
4,I keep buying them because they are made so ...,keep buying are made,"[0.01742137, -0.02121579, 0.021088673, 0.01316...","[0.021574466, 0.00933606, 0.06083059, 0.005575..."
5,"I need more to buy for the office, my car, t...",need buy office car house,"[0.009239573, -0.05190375, -0.009112191, -0.02...","[-0.014269216, -0.04391358, 0.009408188, -0.05..."
6,I've bought a total of 3 of these cables I've ...,'ve bought total cables 've had month works,"[0.024454864, -0.030930609, 0.010781822, 0.052...","[0.04113747, -0.021795552, 0.016251244, 0.0519..."
7,Lasted longer than the ones I got from Apple,Lasted ones got Apple,"[0.04618356, -0.0048306207, 0.011567681, 0.066...","[0.038532823, -0.017767135, 0.011550971, 0.040..."
8,I needed one that worked with my lifeproof ca...,needed one worked case does,"[0.031861648, -0.008585543, 0.0022507284, 0.01...","[0.054487396, 0.017203376, -0.0057408614, -0.0..."
9,This cable works lightening-fast,cable works lightening-fast,"[0.056313805, 0.0034046846, 0.027560946, 0.016...","[0.05655513, 0.00032599864, 0.015015736, 0.002..."


In [21]:
X = np.array(reviews['sentence_embedding'].tolist())
X

array([[ 0.02487145, -0.01953173,  0.00984184, ..., -0.01322363,
        -0.03477967,  0.05130831],
       [ 0.02674303, -0.06447621, -0.02215756, ...,  0.03415874,
        -0.04212781, -0.00147286],
       [-0.00213155,  0.02355909,  0.05261142, ..., -0.01222009,
         0.04687894, -0.00129713],
       ...,
       [ 0.0052726 ,  0.00868901,  0.02693957, ..., -0.08453654,
         0.02345428, -0.08567521],
       [-0.02718677,  0.03136307,  0.02826305, ..., -0.04052224,
         0.03029509, -0.02978418],
       [ 0.01579842,  0.01172941,  0.03938049, ..., -0.01770573,
        -0.00143518, -0.02493763]], dtype=float32)

In [22]:
kmeans = KMeans(n_clusters=10)  
kmeans.fit(X)  

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=10, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [23]:
reviews['label_km_sentence'] = kmeans.labels_.tolist()

In [25]:
reviews

Unnamed: 0,text,main,sentence_embedding,words_embedding,label_km_sentence
0,Good product and good seller,product seller,"[0.024871454, -0.019531729, 0.009841841, 0.039...","[0.02862117, -0.026557572, 0.086525634, -0.043...",2
1,They charge my wife's phone,charge wife phone,"[0.026743034, -0.064476214, -0.022157555, 0.06...","[0.028404895, -0.07067544, -0.0010278885, -0.0...",9
2,Works like a charm!,Works charm,"[-0.0021315496, 0.023559086, 0.05261142, -0.03...","[0.021985114, 0.03418878, 0.055464674, -0.0456...",5
3,"I've ordered a ton of these white and black, l...",'ve ordered ton,"[0.033525135, -0.031799648, -0.012317592, 0.01...","[0.036727097, -0.06557699, -0.0017807593, -0.0...",6
4,I keep buying them because they are made so ...,keep buying are made,"[0.01742137, -0.02121579, 0.021088673, 0.01316...","[0.021574466, 0.00933606, 0.06083059, 0.005575...",7
5,"I need more to buy for the office, my car, t...",need buy office car house,"[0.009239573, -0.05190375, -0.009112191, -0.02...","[-0.014269216, -0.04391358, 0.009408188, -0.05...",7
6,I've bought a total of 3 of these cables I've ...,'ve bought total cables 've had month works,"[0.024454864, -0.030930609, 0.010781822, 0.052...","[0.04113747, -0.021795552, 0.016251244, 0.0519...",3
7,Lasted longer than the ones I got from Apple,Lasted ones got Apple,"[0.04618356, -0.0048306207, 0.011567681, 0.066...","[0.038532823, -0.017767135, 0.011550971, 0.040...",6
8,I needed one that worked with my lifeproof ca...,needed one worked case does,"[0.031861648, -0.008585543, 0.0022507284, 0.01...","[0.054487396, 0.017203376, -0.0057408614, -0.0...",9
9,This cable works lightening-fast,cable works lightening-fast,"[0.056313805, 0.0034046846, 0.027560946, 0.016...","[0.05655513, 0.00032599864, 0.015015736, 0.002...",3


In [26]:
result_save = '';

for i in range(0,9):
    temp = reviews.loc[reviews['label_km_sentence']==i,['text']]
    file_name = 'km_'+str(i)+'.txt'
    np.savetxt(r'/home/betty35/桌面/Capstone/workspace/data_testing/output/'+file_name, temp.values, fmt='%s')