In [1]:
import pandas as pd
import json

In [25]:
%%time 

# Each line of reviews.json is parsed as one standalone JSON document and
# becomes one row of `dat`.
# The encoding is pinned to UTF-8 so decoding does not depend on the
# platform's default locale, and whitespace-only lines are skipped because
# json.loads raises on empty input.
with open("reviews.json", encoding="utf-8") as f:
    dat = pd.DataFrame(json.loads(line) for line in f if line.strip())

CPU times: user 7.06 s, sys: 1.68 s, total: 8.74 s
Wall time: 10.8 s


In [3]:
# Show four randomly chosen rows as a quick look at the loaded columns.
dat.sample(4)

Unnamed: 0,author,date,product_name,rating,text,title
79817,Danny V.,"on October 16, 2013",SanDisk Cruzer Fit CZ33 16GB USB 2.0 Low-Profi...,2.0 out of 5 stars,I have to keep tweaking the registry in Window...,Disappointed
179801,jrdraga,"on November 1, 2015",SanDisk Ultra Fit CZ43 16GB USB 3.0 Low-Profi...,5.0 out of 5 stars,Works exactly as expected. It is nice to have...,Two thumbs up
310227,nana n,"on September 5, 2014",SanDisk Cruzer CZ36 128GB USB 2.0 Flash Drive-...,5.0 out of 5 stars,works good,Five Stars
73478,Deogracias G.,"on June 7, 2015",Samsung 850 EVO 1 TB 2.5-Inch SATA III Interna...,5.0 out of 5 stars,It was the upgrade that my laptop needed. Boot...,It was the upgrade that my laptop needed. Boot...


## Cleaning

In [44]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfTransformer

import nltk

In [26]:
%%time

def prepare_file(raw_dat):
    '''Return a cleaned copy of the raw reviews frame.

    The input frame is not modified. Two columns are converted on the copy:

    * ``rating`` -- the text before the first space (e.g. the ``2.0`` in
      ``"2.0 out of 5 stars"``) is cast to ``float``.
    * ``date`` -- values such as ``"on October 16, 2013"`` are converted to
      datetimes.

    Parameters
    ----------
    raw_dat : pandas.DataFrame
        Frame containing string ``rating`` and ``date`` columns.

    Returns
    -------
    pandas.DataFrame
        Copy of ``raw_dat`` with ``rating`` as float and ``date`` as datetime.
    '''

    clean_data = raw_dat.copy()

    # Parse ratings: keep the leading token and cast it to float
    clean_data['rating'] = clean_data['rating'].str.split(' ').str[0].astype(float)

    # Parse review dates.
    # Fast path: try the explicit "on <Month> <day>, <year>" layout first, so
    # pandas does not have to infer the format of every value separately.
    raw_dates = clean_data['date']
    parsed = pd.to_datetime(raw_dates, format='on %B %d, %Y', errors='coerce')

    # Fallback: if any non-null value did not match that layout, re-parse the
    # whole column with the generic parser, exactly as the original code did.
    if (parsed.isnull() & raw_dates.notnull()).any():
        parsed = pd.to_datetime(raw_dates)
    clean_data['date'] = parsed

    return clean_data

# Wrap the cleaning function as the single step of a scikit-learn pipeline,
# then run the raw frame through it to obtain the cleaned frame.
cleaning_step = FunctionTransformer(func=prepare_file, validate=False)
clean_pipeline = make_pipeline(cleaning_step)
cdat = clean_pipeline.fit_transform(dat)

CPU times: user 1min 25s, sys: 693 ms, total: 1min 25s
Wall time: 1min 35s


## Process a single product

"WD My Passport Ultra 2 TB Portable External USB 3.0 Hard Drive with Auto Backup, Black"

In [53]:
from sklearn import linear_model
from sklearn.cross_validation import train_test_split

In [148]:
# Select the cleaned reviews that belong to the one product named below.
target_product = "WD My Passport Ultra 2 TB Portable External USB 3.0 Hard Drive with Auto Backup, Black"
is_target = cdat["product_name"] == target_product
passp = cdat.loc[is_target]
# Optional down-sampling for quicker experiments (left disabled):
#passp = passp.sample(1000)

In [149]:
# Text -> n-gram counts -> ridge regression.
# The vectorizer counts 1- to 3-word n-grams, drops NLTK's English stop words,
# ignores terms seen in fewer than 2 documents and keeps at most 2000 features.
english_stopwords = nltk.corpus.stopwords.words('english')
ngram_counter = CountVectorizer(
    ngram_range=(1, 3),
    min_df=2,
    max_features=2000,
    stop_words=english_stopwords,
)

# Fixed regularisation strength; a cross-validated alternative that was tried:
#   linear_model.RidgeCV(alphas=[80, 100, 200])
ridge_model = linear_model.Ridge(alpha=100)

p = make_pipeline(ngram_counter, ridge_model)

In [150]:
# Hold out 10% of the reviews for evaluation. The seed is fixed so the split,
# and therefore the fitted model and its score, is the same on every run.
# NOTE(review): this splits the full `cdat` frame rather than the
# single-product subset `passp` -- confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(
    cdat.text, cdat.rating, test_size=0.1, random_state=42)

In [151]:
%%time 

# Fit the vectorizer + ridge pipeline on the training texts and ratings.
p.fit(X_train, y_train)

CPU times: user 2min 34s, sys: 9.27 s, total: 2min 43s
Wall time: 2min 58s


Pipeline(steps=[('countvectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=2000, min_df=2,
        ngram_range=(1, 3), preprocessor=None,
        stop_words=...it_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001))])

In [153]:
# Score the fitted pipeline on the held-out split (for a scikit-learn
# regressor, `score` reports R^2).
p.score(X_test, y_test)  # value from an earlier run: 0.457

0.45772001286789132

In [154]:
# Get the vocabulary from CountVectorizer (`vocabulary_` maps each term to its
# feature column index), then reshape it into a frame that is indexed by that
# feature index and has the term as its only column.
vocab = p.steps[0][1].vocabulary_
vocab = pd.DataFrame([vocab]).T.reset_index().set_index(0)

# Get the coefficients from Ridge regression; the frame gets a default
# 0..n-1 integer index, which is what lines it up with the feature indices.
rcoefs = pd.DataFrame(p.steps[1][1].coef_)

# Combine into single result: concat with axis=1 aligns rows on the index,
# pairing each term with the coefficient of its feature column.
res = pd.concat([vocab, rcoefs], axis=1)

# NOTE(review): the "correlation" column holds the ridge coefficients read
# from `coef_` above, not correlation coefficients.
res.columns=["term", "correlation"]

In [155]:
# Worst terms: the 20 terms with the lowest (most negative) coefficients.
res.sort_values("correlation").head(20)

Unnamed: 0_level_0,term,correlation
0,Unnamed: 1_level_1,Unnamed: 2_level_1
1884,waste,-0.814415
1455,returned,-0.796286
1456,returning,-0.760478
429,died,-0.736852
232,broke,-0.673532
436,disappointing,-0.655579
708,garbage,-0.647042
435,disappointed,-0.644655
1712,terrible,-0.638958
402,dead,-0.59524


In [156]:
# Best terms: the 20 terms with the highest coefficients.
res.sort_values("correlation", ascending=False).head(20)

Unnamed: 0_level_0,term,correlation
0,Unnamed: 1_level_1,Unnamed: 2_level_1
572,excelente,0.401236
571,excelent,0.369205
573,excellent,0.354688
155,awesome,0.323583
1233,perfect,0.304605
733,go wrong,0.298034
1007,love,0.271021
1267,pleased,0.26833
108,amazing,0.267103
1547,several months,0.262605
