In [None]:
import numpy as np
import pandas as pd
import pickle
import warnings
warnings.filterwarnings('ignore')

import os
import sys
np.set_printoptions(threshold=sys.maxsize)

from time import time
import spacy
nlp = spacy.load("en_core_web_sm")


from utils import preprocess, preprocess1, pd_explode, sentiment
from stopwords import get_stopwords
STOPWORDS = get_stopwords()

# Wall-clock reference point; every progress message below reports the
# number of seconds elapsed since this line ran.
t = time()

# READ IN REVIEWS
print('Loading Dataset...')
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0
# (it now raises TypeError). `on_bad_lines='warn'` is the replacement with the
# same effect: rows with the wrong number of fields are skipped and a warning
# is issued instead of aborting the load.
reviews = pd.read_csv(
    'data/amazon_reviews_us_Electronics_v1_00.tsv',
    sep='\t',
    on_bad_lines='warn',
)
print('Dataset Loaded: ', round(time()-t,2),'s')
print("Full Size:", reviews.shape[0], ' reviews')
# NOTE: the former bare `reviews.head()` was removed -- only the last
# expression of a cell is displayed, so mid-cell it produced no output.

print('Cleaning dataframe...')
# KEEP ONLY THE COLUMNS USED DOWNSTREAM (this selects columns, not rows)
E_simple = reviews[['product_id', 'product_title', 'star_rating', 'review_headline','review_body',
                    'review_id', 'review_date']]

# DROP NULLS + RENAME COLUMNS FOR CONVENIENCE
# Chained, non-inplace calls return a fresh frame instead of mutating a slice
# of `reviews`, which avoids the SettingWithCopy pattern and makes this step
# safe to re-run.
print('Pre-.dropna size: ', E_simple.shape)
E_simple = E_simple.dropna().rename(columns={'star_rating':'stars'})
print("Post-.dropna size:", E_simple.shape)
print('Clean: ', round(time()-t,2),'s')

# DROP SHORT REVIEWS, see EDA below
# Reviews with fewer than this many whitespace-separated words are discarded.
MIN_REVIEW_WORDS = 5
has_enough_words = E_simple.review_body.apply(lambda body: len(body.split()) >= MIN_REVIEW_WORDS)
E_simple = E_simple[has_enough_words]
print('Dropped short reviews: ', round(time()-t,2),'s')
print('Reviews remaining: ', E_simple.shape[0])
# Checkpoint the cleaned frame to disk.
E_simple.to_json('data/E_clean.json')

# #SAVE
# print('Saving clean dataframe...')
# # E_simple.to_json('data/Electronics_cleaned.json')
# print('Saved: ', round(time()-t,2),'s')

# Summary statistics over the cleaned review frame.
# Per-product tally: non-null count of the first remaining column in each group.
per_product_counts = E_simple.groupby('product_id').count().iloc[:,0]
total_products = len(E_simple.product_id.unique())
count100_500 = per_product_counts.between(100,500).sum()
total_reviews = len(E_simple)
# Count of reviews with fewer than 5 words (only used by the commented-out print below).
is_short_review = E_simple.review_body.apply(lambda body: len(body.split())<5)
short_reviews = E_simple[is_short_review].shape[0]

for label, value in (
    ('# Total products: ', total_products),
    ('# Total products with 100-500 reviews: ', count100_500),
    ('# Total reviews: ', total_reviews),
):
    print(label, value)

# print('# Total reviews w/ <5 words: ', short_reviews)
elapsed = round(time()-t,2)
print(elapsed,'s')
print()

# BREAK INTO SENTENCES
print('Breaking into sentences...')
# Build E_sentences as a *separate* frame. The original `E_sentences = E_simple`
# was only an alias, so dropping `review_body` in place also removed it from
# E_simple and the tokenization step further down failed on the missing column.
E_sentences = E_simple.drop(columns='review_body')
# Iterating a spaCy Doc yields tokens; `Doc.sents` yields the sentence spans
# this step is meant to produce. `nlp.pipe` streams the texts through the
# pipeline in batches rather than calling `nlp` once per review.
E_sentences['sentences'] = pd.Series(
    [[sent.text for sent in doc.sents] for doc in nlp.pipe(E_simple.review_body)],
    index=E_simple.index,
)
E_sentences.to_json('data/E_sentences.json')
print('Done', round(time()-t,2),'s')

# APPLY SENTIMENT ANALYSIS
#----------->>> GO BACK AND ADJUST POLARITY SCORE
# GET polarity of each sentence #let stars skew polarity
print('Analyzing sentiment...')
# Fixed column name: the frame has `sentences`; `row.sentence` raised AttributeError.
# NOTE(review): `sentiment` comes from utils and is not visible here -- confirm it
# accepts the per-review list of sentences rather than a single sentence string.
E_sentences['sentence_polarity'] = E_sentences.apply(
    lambda row: sentiment(row.sentences, row.stars), axis=1
)
print('Done', round(time()-t,2),'s')
# Overwrites the checkpoint written above, now including the polarity column.
E_sentences.to_json('data/E_sentences.json')

# #.........VALIDATION - WHAT REVIEWS ARE IN 123 STARS AND POSITIVE? 45 STARS AND NEGATIVE? NEUTRAL?
# #Negative 
# stars12 = one_sentences[(one_sentences.stars<3) & (one_sentences.sentence_polarity>0)]
# stars12

# #Positive
# stars45 = one_sentences[(one_sentences.stars>3) & (one_sentences.sentence_polarity<0)]
# stars45

# #Neutral
# stars4 = one_sentences[(one_sentences.stars==4) & (one_sentences.sentence_polarity==0)]
# stars4
# # ^^^^^^^^^^VALIDATION

# DROP ROWS


# TOKENIZE TEXT
print('Tokenizing Text. Grab a coffee. This may take a while....')
# Tokenize the raw review text from E_simple; E_sentences no longer carries
# `review_body`, and both frames share the same index and row order.
processed_corpus, bigrammer, trigrammer = preprocess(E_simple.review_body, stopwords=STOPWORDS, max_gram=3)
print('Tokenized Text: ', round(time()-t,2),'s')

print('Saving Data & Trained Tokenizer....')
E_sentences['tokened'] = processed_corpus
E_sentences.to_json('data/Electronics_tokenized_2.json')

# Persist the two fitted n-gram models returned by `preprocess`.
with open('models/bigrammer.pkl', 'wb') as f:
    pickle.dump(bigrammer,f)
with open('models/trigrammer.pkl', 'wb') as f:
    pickle.dump(trigrammer,f)
    
    #//////////// new code to get number of sentences
# SAVE A SMALLER SET FOR PRODUCTS THAT HAVE BETWEEN 20 AND 1000 REVIEWS FOR DEMO
# Fixed: the original read `E_sentence` (missing trailing "s"), an undefined
# name that raised NameError. `.size()` counts rows per product directly
# instead of counting non-null values in every column and keeping the first.
review_counts = E_sentences.groupby('product_id').size()
products20_1000 = list(review_counts[review_counts.between(20,1000)].index)
E_small = E_sentences[E_sentences.product_id.isin(products20_1000)]
E_small.to_json('data/Electronics_tokened_20to1000_2.json')

print('Saved', round(time()-t,2),'s')
print('Please run vectorize.py next :)')


-------- EDA/VALIDATION --------#
## Electronics DF Validation 
> dropping rows with nulls lost 147 of 3,091,024 rows, leaving 3,090,877 <br>
> all review_id are unique, no duplicates <br>
> star rating ranges from 1 to 5 <br>
> US marketplace only<br>
> All in Electronic category<br>
> No '' in product_title, review_headline, review_body <br>
> Reviews from June 9 1999 to Aug 31 2015<br>
> 2,315,686 4-5 stars<br>
>  775,191 1-3 stars<br>
> Somehow reduced file size down to 188.7MB from 377.3MB, but actual filesize 1.85GB


## Data EDA
### Prior to dropping short reviews
<br> num Total products:  (185766,)
<br> num Total products with 100-500 reviews:  4778
<br> num Total reviews:  3090877
<br> num Total reviews w/ <5 words:  254733
<br> reviews start being useful around 5 words long

### Post dropping short reviews
> <br> num Total products:  (176878,)
<br> num Total products with 100-500 meaningful reviews:  4406
<br> num Total reviews:  2836144
<br> num Total reviews w/ <5 words:  0


Loading Dataset...


b'Skipping line 9076: expected 15 fields, saw 22\nSkipping line 19256: expected 15 fields, saw 22\nSkipping line 24313: expected 15 fields, saw 22\nSkipping line 47211: expected 15 fields, saw 22\nSkipping line 54295: expected 15 fields, saw 22\nSkipping line 56641: expected 15 fields, saw 22\nSkipping line 63067: expected 15 fields, saw 22\n'
b'Skipping line 93796: expected 15 fields, saw 22\n'
b'Skipping line 132806: expected 15 fields, saw 22\nSkipping line 164631: expected 15 fields, saw 22\nSkipping line 167019: expected 15 fields, saw 22\nSkipping line 167212: expected 15 fields, saw 22\n'
b'Skipping line 198103: expected 15 fields, saw 22\nSkipping line 199191: expected 15 fields, saw 22\nSkipping line 202841: expected 15 fields, saw 22\nSkipping line 218228: expected 15 fields, saw 22\nSkipping line 235900: expected 15 fields, saw 22\n'
b'Skipping line 277761: expected 15 fields, saw 22\nSkipping line 304582: expected 15 fields, saw 22\nSkipping line 312029: expected 15 fields,

Dataset Loaded:  35.12 s
Full Size: 3091024  reviews
Cleaning dataframe...
Pre-.dropna size:  (3091024, 7)
Post-.dropna size: (3090877, 7)
Clean:  38.48 s
Dropped short reviews:  52.55 s
Reviews remaining:  2836144
# Total products:  176878
# Total products with 100-500 reviews:  4406
# Total reviews:  2836144
79.12 s

Breaking into sentences...


## EDA - all ngrams

In [None]:
# Reload a previously saved tokenized frame from disk. This rebinds `E_simple`,
# replacing the cleaned frame built in the first cell.
# NOTE(review): this file name differs from the one the pipeline cell writes
# ('data/Electronics_tokenized_2.json') -- confirm it is the intended input.
tokenized_json_path = 'data/D11Electronics_tokenized.json'
E_simple = pd.read_json(tokenized_json_path)

In [5]:
# Gather every token that contains an underscore, across all reviews in the
# processed corpus (duplicates included).
ngrams = [tok for review in processed_corpus for tok in review if '_' in tok]

# De-duplicate once and show (number of distinct tokens, the tokens themselves).
unique_ngrams = np.array(list(set(ngrams)))
len(unique_ngrams), unique_ngrams

(1710, array(['pd_cp_p_', 'isolation_____', 'woot_deals', 'just___',
        'dave_brubeck', 'afqjcnghd_s', 'xw_nas', 'blue_ray', 'races_',
        'olp_cg_pop', 'dihle_lvsve', 'fcc_gov', 'v_moda', 'w___',
        'chaper_', 'dell_latitude', 't___t', 'mech_mods', 'dd_wrt',
        'cm_cr_pr_perm', 'strain_relief', 'sr_ph_', 'braven_lux',
        'buyers_remorse', 'f_ucking', 'brand_', 'nb_sb_noss', 'purple__',
        'vw_jetta', 'half_', 'b_', 'boston_acoustics', 'andrew_jones',
        'ntsc_m', 'southern_california', 'avr_power', 'atonline_com',
        'player_it', 'pp_play', 'iapp_controlapp', 'da_ned', 'bronco_',
        'extension_cords', 'pct_amp', 'cat_id', 'x_xywksbeiqa', 'fine_',
        'i______', 'lvxaohg_', 'bare_bone', 'you_', 'horror_stories',
        'jury_rig', 'bee_dac', 'rome_florence', 'shopping_help', 'girls_',
        'soundmatters_foxl', 'rp_hje', 'adrian_landeira', 'cricket_post',
        'u_u', 'rockford_fosgate', 'list_satish', 'page_headphones',
        'up_

### Convert tokens to keyword counts

In [1]:
from utils import kw2counts

# Turn the tokenized corpus into keyword counts, then keep only the entries
# whose first element contains an underscore.
counts = kw2counts(processed_corpus)
underscore_entries = [entry for entry in counts if '_' in entry[0]]
ngram_counts = np.array(underscore_entries)
ngram_counts.shape, ngram_counts

NameError: name 'processed_corpus' is not defined

## Plotting a distribution - some seaborn issue

In [None]:
# `plt` is used throughout this cell but is never imported anywhere else in
# the notebook, so on a fresh kernel the cell failed with NameError.
import matplotlib.pyplot as plt

t = time()

#SORTED/ NOT SORTED
# Per-product tally (non-null count of the first column), largest first.
rev_counts = E_simple.groupby('product_id').count().iloc[:,0].sort_values(ascending=False)

# One x position per product. Derived from rev_counts itself rather than the
# global `total_products`, which was computed from an earlier version of
# E_simple and can disagree in length with the frame loaded since then.
x = np.arange(len(rev_counts))

fig, ax = plt.subplots(figsize=(26,8))
# Plot every 20th product only, to reduce process time.
ax.bar(x[::20], rev_counts.to_numpy()[::20], color='r', linewidth=0)
ax.set_title('Qty Reviews for each product', fontdict={'fontsize':40})
ax.tick_params(labelsize='xx-large')
ax.set_ylim(0,250)

# Summary text box. The position is given in DATA coordinates (x=0.05, y=200),
# which lands near the upper-left corner for the y-limits set above.
textstr = '\n'.join((
                    'mean: %.1f' % np.mean(rev_counts),
                    'median: %.1f' % np.median(rev_counts)
                    ))
props = dict(boxstyle='round', facecolor='wheat', alpha=0.8)
ax.text(0.05, 200, textstr, fontsize=24,
        verticalalignment='top', bbox=props)

fig.savefig('saved_data_models/D11Review_count_distribution.png')
plt.show()
plt.close(fig)
print(time()-t,'s')