# Page for trying out the data

In [None]:
import pandas as pd
import time
import nltk
import re
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np  
from sklearn.cluster import KMeans  

In [None]:
# Load the raw review dump (tab-separated). Some rows in the file are malformed;
# they are skipped with a warning instead of aborting the whole load.
reviews_path = 'amazon_reviews_us_Mobile_Electronics_v1_00.tsv'
try:
    # pandas >= 1.3: `on_bad_lines` supersedes `error_bad_lines`, which no longer
    # exists in pandas 2.x. 'warn' keeps the old skip-and-warn behaviour.
    df1 = pd.read_csv(reviews_path, sep="\t", on_bad_lines='warn')
except TypeError:
    # Older pandas rejects the unknown keyword; fall back to the legacy flag.
    df1 = pd.read_csv(reviews_path, sep="\t", error_bad_lines=False)

### DATA COLUMNS:
marketplace       - 2 letter country code of the marketplace where the review was written.  
customer_id       - Random identifier that can be used to aggregate reviews written by a single author.  
review_id         - The unique ID of the review.  
product_id        - The unique Product ID the review pertains to. In the multilingual dataset the reviews
                    for the same product in different countries can be grouped by the same product_id.  
product_parent    - Random identifier that can be used to aggregate reviews for the same product.  
product_title     - Title of the product.  
product_category  - Broad product category that can be used to group reviews  
                    (also used to group the dataset into coherent parts).  
star_rating       - The 1-5 star rating of the review.  
helpful_votes     - Number of helpful votes.  
total_votes       - Number of total votes the review received.  
vine              - Review was written as part of the Vine program.  
verified_purchase - The review is on a verified purchase.  
review_headline   - The title of the review.  
review_body       - The review text.  
review_date       - The date the review was written.  

Filter out the reviews that are not verified purchases and keep only a subset of the columns

In [None]:
# Keep verified purchases only, and only the columns used further down.
kept_columns = ['review_id', 'product_id', 'product_title', 'helpful_votes','review_headline', 'review_body']
is_verified = df1['verified_purchase'] == 'Y'
df1 = df1.loc[is_verified, kept_columns]

# Number of distinct products left after filtering (shown as the cell output).
df1['product_id'].nunique()

Get a list of products with at least 200 reviews

In [None]:
# Number of (non-null) review ids per product.
reviews_per_product = df1.groupby('product_id')['review_id'].count()

print('mean count:', reviews_per_product.mean(),'; max count:',reviews_per_product.max())

# Keep products with at least 200 reviews, most-reviewed first.
well_reviewed = reviews_per_product[reviews_per_product >= 200]
count_df = well_reviewed.sort_values(ascending=False)

# Product ids in the same (descending review count) order.
product_list = count_df.index.tolist()

In [None]:
# Restrict the reviews to the products selected in product_list.
in_product_list = df1['product_id'].isin(product_list)
df1 = df1[in_product_list]

# Preview the first five rows.
df1.head(5)

Test on one product. Get the reviews for one product

In [None]:
# Review texts of a single product: index 10 of product_list, i.e. the 11th entry
# of the list sorted by descending review count.
reviews = df1.loc[df1['product_id'] == product_list[10], 'review_body']

# A missing review body is NaN (a float); the regex clean-up applied to each
# element afterwards only accepts strings, so drop those rows up front.
reviews = reviews.dropna()

# Replace HTML line breaks with a space rather than nothing, so that the words on
# either side of a break are not glued together. regex=False: match the tag literally.
reviews = reviews.str.replace('<br />', ' ', regex=False)

# Patterns are compiled once here instead of being looked up on every call.
_REPEATED_E_OR_O = re.compile(r"([eoEO])\1\1+")
# Digits are excluded so that numbers such as "1000" are left intact.
_REPEATED_OTHER = re.compile(r"([^eoEO\d])\1\1+")

def remove_consecutive(text):
    """Collapse exaggerated character repetition in a piece of text.

    A run of three or more identical characters is shortened:
    - 'e' / 'o' (either case) are reduced to two characters ("sooooo" -> "soo"),
      since those letters can legitimately appear doubled;
    - any other non-digit character is reduced to one ("!!!" -> "!").
    Runs of digits are not touched, so numeric values keep their magnitude.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string.
    """
    squeezed = _REPEATED_E_OR_O.sub(r"\1\1", text)
    return _REPEATED_OTHER.sub(r"\1", squeezed)

# Clean every review body; the result is a fresh Series with a default 0..n-1 index.
reviews = pd.Series([remove_consecutive(body) for body in reviews])

In [None]:
# Split each review on runs of '.', '!' or '?' and stack the pieces into one
# row per fragment (reset_index leaves the fragment text in column 0).
sentences = reviews.str.split("[.!?]+",expand=True).stack().reset_index()

# Keep only fragments with more than four whitespace-separated words; this also
# discards the empty fragments produced by the split.
word_counts = sentences[0].str.split().apply(len)
long_enough = word_counts > 4

# Final frame: a single 'text' column holding the stripped fragment.
reviews = pd.DataFrame({'text': sentences.loc[long_enough, 0].str.strip()})

In [None]:
# Part-of-speech tagging: tokenize each sentence, then tag the tokens.
tokenized = reviews['text'].apply(nltk.word_tokenize)
reviews['POS'] = tokenized.apply(nltk.pos_tag)

def getMainWords(pos):
    """Return the nouns and verbs of a tagged sentence as one space-separated string.

    `pos` is an iterable of (word, tag) items. A word is kept when the first two
    characters of its tag are 'NN' or 'VB'. Returns '' when no word qualifies.
    """
    kept = [item[0] for item in pos if item[1][:2] in ('NN', 'VB')]
    return ' '.join(kept)

# Noun/verb-only version of every sentence, assigned row by row in order.
reviews['main'] = [getMainWords(tagged) for tagged in reviews['POS']]

In [None]:
# Drop sentences that contain no nouns or verbs and keep only the two text columns.
has_main_words = reviews['main'].map(len) > 0
reviews = reviews.loc[has_main_words, ['text', 'main']]

# Renumber the rows 0..n-1.
reviews = reviews.reset_index(drop=True)

Word Embedding / Sentence Embedding

In [None]:
# Load Google's Universal Sentence Encoder as a TensorFlow Hub module.
# The module is read from a local, relative directory instead of being fetched by URL.
# NOTE(review): the directory is presumably a previously downloaded copy of the
# encoder - confirm it exists before running this cell.
module_dir = "downloads/encoder"
embed = hub.Module(module_dir)

In [None]:
with tf.Session() as sess:
    # Run the global-variable and table initialisers before evaluating the encoder.
    sess.run([tf.global_variables_initializer(), tf.tables_initializer()])

    # Embed the full sentences.
    t_start = time.time()
    sentence_vectors = sess.run(embed(list(reviews['text'])))
    reviews['sentence_embedding'] = pd.Series(list(sentence_vectors))
    t_sentences = time.time()

    # Embed the noun/verb-only versions.
    word_vectors = sess.run(embed(list(reviews['main'])))
    reviews['words_embedding'] = pd.Series(list(word_vectors))
    t_words = time.time()

    # Elapsed seconds for each of the two embedding passes.
    print('time1:',(t_sentences-t_start),' time2:',(t_words-t_sentences))

In [None]:
# Preview only the first rows: each embedding cell holds a whole vector, so
# rendering the entire frame floods the cell output.
reviews.head()

In [None]:
# Preparing datasets: turn each embedding column (one vector per row) into a 2-D array.
embedding_columns = ('sentence_embedding', 'words_embedding')
X1, X2 = [np.array(list(reviews[column])) for column in embedding_columns]

In [None]:
# Clustering
# K-means starts from randomly chosen centroids, so the seed is pinned to make the
# cluster assignments (and everything derived from them) reproducible across kernel
# restarts. n_init is spelled out because its default differs between scikit-learn
# releases.
km_s = KMeans(n_clusters=10, n_init=10, random_state=0)
km_s.fit(X1)  # clusters of the full-sentence embeddings

km_w = KMeans(n_clusters=10, n_init=10, random_state=0)
km_w.fit(X2)  # clusters of the noun/verb-only embeddings



In [None]:
# Get labels: attach each model's cluster assignment to the rows it was fitted on.
for label_column, model in (('label_km_sentence', km_s), ('label_km_word', km_w)):
    reviews[label_column] = model.labels_.tolist()

In [None]:
# Save results: one text file per cluster, one sentence per line.
result_save = ''  # not read in this cell; kept so code relying on the name still finds it

# Destination root. Files are written to km10/sentence/<label>.txt and
# km10/word/<label>.txt below it; the directories must already exist.
# NOTE(review): absolute, machine-specific path - consider making it configurable.
output_root = r'/home/betty35/桌面/Capstone/workspace/data_testing/output/'

for label_column, subdir in (('label_km_sentence', 'km10/sentence/'),
                             ('label_km_word', 'km10/word/')):
    # Iterate over the labels actually present instead of a hard-coded range:
    # range(0, 9) stopped at label 8 and never wrote the last of the 10 clusters.
    for label in sorted(reviews[label_column].unique()):
        cluster_text = reviews.loc[reviews[label_column] == label, ['text']]
        np.savetxt(output_root + subdir + str(label) + '.txt', cluster_text.values, fmt='%s')