# Amazon review

Reference : http://jmcauley.ucsd.edu/data/amazon/

Data : <br>
Per-category files -> download the 'reviews' file for the category that you wish to load.

Sample review (the quoted field names are the available "key" values) :

{ <br>
  "reviewerID": "A2SUAM1J3GNN3B", <br>
  "asin": "0000013714", <br>
  "reviewerName": "J. McDonald",  <br>
  "helpful": [2, 3],  <br>
  "reviewText": "I bought this for my husband who plays the piano.  He is having a wonderful time playing these old hymns.  The music  is at times hard to read because we think the book was published for singing from more than playing from.  Great purchase though!",  <br>
  "overall": 5.0,  <br>
  "summary": "Heavenly Highway Hymns",  <br>
  "unixReviewTime": 1252800000, <br>
  "reviewTime": "09 13, 2009" <br>
} 

In [1]:
import ast
import gzip
import json
import os

import numpy as np

In [2]:
def parse(path):
    """Lazily yield one decoded review record per line of a gzipped file.

    Each non-blank line is expected to hold a single record, written either
    as a Python literal (the syntax the former ``eval`` call accepted) or as
    strict JSON.

    Parameters
    ----------
    path : str
        Location of the gzip-compressed file, one record per line.

    Yields
    ------
    object
        The decoded record for each non-blank line, in file order
        (a dict for records shaped like the sample review).

    Raises
    ------
    ValueError
        If a line is neither a Python literal nor valid JSON
        (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    # The ``with`` block closes the handle when iteration finishes, when a
    # line fails to decode, or when the generator itself is closed.
    with gzip.open(path, 'rt', encoding='utf-8') as g:
        for l in g:
            l = l.strip()
            if not l:
                # A blank line carries no record.
                continue
            # SECURITY: this used to be ``eval(l)``, which runs any code that
            # appears in the data file. ``ast.literal_eval`` accepts the same
            # literal syntax without executing anything.
            try:
                record = ast.literal_eval(l)
            except (ValueError, SyntaxError):
                # Strict-JSON tokens (true / false / null) are not Python
                # literals; let the JSON decoder handle those lines.
                record = json.loads(l)
            yield record
        
def extract(path, key):
    """Collect one field and the rating from every review in a file.

    Parameters
    ----------
    path : str
        File to read; handed straight to ``parse``.
    key : str
        Name of the review field to gather (e.g. ``'reviewText'``).

    Returns
    -------
    tuple of (list, list)
        The ``key`` value of every review and, aligned with it, the
        ``'overall'`` value of every review.
    """
    # Read both fields of a review together so the two lists stay aligned.
    pairs = [(review[key], review['overall']) for review in parse(path)]
    corpus = [field for field, _ in pairs]
    y = [rating for _, rating in pairs]
    return corpus, y

def extract_yelp(path, key='text', label_key='stars'):
    """Collect a text field and a rating field from every record in a file.

    The previous body read an undefined ``key`` and appended to a ``y`` list
    it never created, so it could not run on its own. Both are now explicit.

    NOTE(review): the defaults assume the Yelp review schema uses ``'text'``
    for the review body and ``'stars'`` for the rating; the earlier code read
    the Amazon-style ``'overall'`` field. Confirm against the actual file and
    pass ``label_key='overall'`` if that is what the data contains.

    Parameters
    ----------
    path : str
        File to read; handed straight to ``parse``.
    key : str, optional
        Name of the field holding the review text.
    label_key : str, optional
        Name of the field holding the rating.

    Returns
    -------
    tuple of (list, list)
        The ``key`` values and the aligned ``label_key`` values.
    """
    corpus = []
    y = []  # local list, so a notebook-level `y` is never modified
    for review in parse(path):
        corpus.append(review[key])
        y.append(review[label_key])
    return corpus, y

In [3]:
# Location of the gzipped review file. Set the AMAZON_REVIEWS_PATH environment
# variable to point at your own copy; the original machine-specific location
# is kept as the fallback so existing runs behave as before.
path = os.environ.get(
    "AMAZON_REVIEWS_PATH",
    r"C:\Users\Anneke\Documents\GitHub\data\reviews_Amazon_Instant_Video_5.json.gz",
)

In [4]:
# X holds the 'reviewText' of every review, y the matching 'overall' rating.
X, y = extract(path, 'reviewText')

In [5]:
# Number of reviews loaded.
len(X)

37126

In [6]:
# Peek at the first two review texts.
X[:2]

["I had big expectations because I love English TV, in particular Investigative and detective stuff but this guy is really boring. It didn't appeal to me at all.",
 'I highly recommend this series. It is a must for anyone who is yearning to watch "grown up" television. Complex characters and plots to keep one totally involved. Thank you Amazin Prime.']

In [7]:
# First ten ratings (the 'overall' field of each review).
y[:10]

[2.0, 5.0, 1.0, 4.0, 5.0, 5.0, 3.0, 3.0, 5.0, 3.0]

### Playground

In [8]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
# train_test_split is taken from model_selection only: the old
# sklearn.cross_validation module was removed in scikit-learn 0.20, so
# re-importing the same name from it fails on current releases.
from sklearn.model_selection import train_test_split, ShuffleSplit
from sklearn.linear_model import LogisticRegression



In [9]:
# Tokens are runs of word characters, apostrophes and slashes that start and
# end on a word boundary, so contractions such as "didn't" stay in one piece.
token = r"(?u)\b[\w\'/]+\b"
# binary=True records only whether a term occurs in a document, not how often;
# min_df=5 drops terms that appear in fewer than five documents.
tf_vectorizer = CountVectorizer(lowercase=True, max_df=1.0, min_df=5, binary=True, token_pattern=token)
# Unigrams only. Keeping this call last makes the cell display the final configuration.
tf_vectorizer.set_params(ngram_range=(1,1))

CountVectorizer(analyzer='word', binary=True, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=5,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern="(?u)\\b[\\w\\'/]+\\b",
        tokenizer=None, vocabulary=None)

In [10]:
# Hold out 33% of the reviews for testing; the fixed random_state makes the split repeatable.
X_train_split, X_test_split, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [11]:
# Learn the vocabulary from the training texts only, then encode them.
X_train = tf_vectorizer.fit_transform(X_train_split)

In [12]:
# Encode the held-out texts with the vocabulary fitted on the training split.
X_test = tf_vectorizer.transform(X_test_split)

In [13]:
# (number of training documents, vocabulary size)
X_train.shape

(24874, 15149)

In [14]:
# (number of test documents, vocabulary size); the columns come from the same fitted vectorizer.
X_test.shape

(12252, 15149)

In [15]:
# solver='liblinear' is named explicitly because it supports the L1 penalty;
# scikit-learn 0.22+ defaults to 'lbfgs', which rejects penalty='l1'.
clf = LogisticRegression(random_state=42, penalty='l1', solver='liblinear')

In [16]:
# Fit the classifier on the training split.
clf.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [17]:
# Mean accuracy on the held-out test split.
clf.score(X_test,y_test)

0.6145935357492655

In [18]:
# Mean accuracy on the training split; a large gap to the test score points to overfitting.
clf.score(X_train, y_train)

0.8239527217174559

### Statistics

In [41]:
import spacy
import en_core_web_sm as en

In [42]:
# Load the en_core_web_sm spaCy pipeline.
nlp = en.load()

In [43]:
# Smoke test: run the full pipeline on a single sentence.
doc = nlp(u'This is a sentence.')

In [44]:
from spacy.lang.en import English
# Reuse the tokenizer that belongs to the loaded pipeline.
# `Defaults.create_tokenizer` was removed in spaCy v3, whereas
# `nlp.tokenizer` is available in both v2 and v3.
tokenizer = nlp.tokenizer

In [55]:
# Smoke test: tokenize a single sentence with the tokenizer alone.
tokens = tokenizer(u'This is a sentence')

In [61]:
# Token count of every review. The comprehension has its own loop variable,
# so the `doc` object created earlier is no longer overwritten.
tokens_len = [len(tokenizer(review)) for review in X]

In [63]:
# Convert the per-review token counts to a NumPy array.
tokens_len = np.asarray(tokens_len)

In [64]:
# Longest review, in tokens.
np.max(tokens_len)

3564

In [69]:
# Shortest review, in tokens.
np.min(tokens_len)

1

In [71]:
# Sentence count of every review. `nlp.pipe` streams the texts through the
# pipeline in batches instead of calling `nlp` once per review, and the
# comprehension leaves no stray loop variables behind.
sent_len = [len(list(parsed.sents)) for parsed in nlp.pipe(X)]

In [72]:
# Convert the per-review sentence counts to a NumPy array.
sent_len = np.asarray(sent_len)

In [73]:
# Largest number of sentences detected in a single review.
np.max(sent_len)

185

In [74]:
# Smallest number of sentences detected in a single review.
np.min(sent_len)

1

In [75]:
# Show the review with the most detected sentences.
X[np.argmax(sent_len)]

