#### Read Data

In [1]:
import graphlab
# Load the dataset stored under 'data/' into an SFrame; later cells read its
# 'name', 'review' and 'rating' columns.
products = graphlab.SFrame('data/')

[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1504339077.log


This non-commercial license of GraphLab Create for academic use is assigned to B140007@e.ntu.edu.sg and will expire on August 27, 2018.


In [20]:
# Add a 'word_count' column by running count_words over the 'review' column.
# Later cells treat each entry as a mapping from word to its count in that review.
products["word_count"] = graphlab.text_analytics.count_words(products['review'])


In [22]:
# Words whose per-review counts become individual columns on `products`
# (these column names are later used as the feature list of a classifier).
selected_words = ['awesome', 'great', 'fantastic', 'amazing', 'love', 'horrible', 'bad', 'terrible', 'awful', 'wow', 'hate']
for word in selected_words:
    # `word=word` freezes the current word inside the lambda, so the function
    # stays correct even if it is evaluated after the loop variable has moved on.
    # dict.get returns 0 when the word does not occur in the review.
    products[word] = products["word_count"].apply(
        lambda counts, word=word: counts.get(word, 0))

In [23]:
# Inspect the per-review counts of the word "great" (one of the columns
# created from selected_words).
products["great"]

dtype: int
Rows: 166752
[0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 3, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 2, 0, 0, 0, 0, 1, 0, 0, ... ]

In [24]:
for word in selected_words:
    print word,products[word].sum()

awesome 2002
great 42420
fantastic 873
amazing 1305
love 40277
horrible 659
bad 3197
terrible 673
awful 345
wow 131
hate 1057


In [16]:
# Open the interactive Canvas view of the 'name' column.
products['name'].show()

Canvas is accessible via web browser at the URL: http://localhost:59538/index.html
Opening Canvas in default web browser.


In [32]:
# Select every row whose product name is 'Baby Trend Diaper Champ'.
# NOTE(review): the variable is called giraffe_reviews, but the rows it holds
# are for the Diaper Champ; the name is kept because later cells reference it.
diaper_champ_mask = products['name'] == 'Baby Trend Diaper Champ'
giraffe_reviews = products[diaper_champ_mask]

# Show how the ratings of this product are distributed (categorical view).
giraffe_reviews['rating'].show(view='Categorical')

Canvas is accessible via web browser at the URL: http://localhost:59538/index.html
Opening Canvas in default web browser.


In [7]:
# Show the 'rating' column of all products in Canvas using the categorical view.
products['rating'].show(view='Categorical')

Canvas is updated and available in a tab in the default browser.


In [26]:
# Drop every review rated exactly 3, then label the remaining ones:
# 'sentiment' is true for ratings of 4 and above, false otherwise.
not_three_star = products['rating'] != 3
products = products[not_three_star]
products['sentiment'] = (products['rating'] >= 4)

In [27]:
# NOTE(review): this repeats the 'sentiment' assignment already made in the
# previous cell; re-running it produces the same column, so this cell looks
# redundant and could probably be removed.
products['sentiment'] = products['rating'] >=4

In [28]:
# Randomly split products into train_data and test_data. seed=0 fixes the
# random split so it is repeatable; presumably about 80% of the rows land in
# train_data (fraction .8) and the rest in test_data.
train_data,test_data = products.random_split(.8, seed=0)

In [33]:
# Fit a logistic classifier that predicts 'sentiment' from the full
# 'word_count' column.
# NOTE(review): test_data is passed as the validation set and is also the set
# used by evaluate() further down; confirm this reuse is intended.
sentiment_model = graphlab.logistic_classifier.create(train_data,
                                                     target='sentiment',
                                                     features=['word_count'],
                                                     validation_set=test_data)

In [34]:
# Display the fitted coefficients of sentiment_model (one row per word feature
# plus the intercept).
sentiment_model['coefficients']

name,index,class,value,stderr
(intercept),,1,0.729182482603,
word_count,it.,1,0.0923459975112,
word_count,recommend,1,0.351653944839,
word_count,highly,1,0.991999758017,
word_count,leak.,1,-0.318440373791,
word_count,moist,1,0.216022960857,
word_count,osocozy,1,-0.189259825548,
word_count,keps,1,4.17152894138,
word_count,was,1,-0.0426503643629,
word_count,now,1,0.0433690239388,


In [35]:
# Evaluate sentiment_model on test_data; the result includes accuracy, AUC,
# the confusion matrix, F1, log loss, precision, recall and the ROC curve.
sentiment_model.evaluate(test_data)

{'accuracy': 0.916256305548883,
 'auc': 0.9446492867438502,
 'confusion_matrix': Columns:
 	target_label	int
 	predicted_label	int
 	count	int
 
 Rows: 4
 
 Data:
 +--------------+-----------------+-------+
 | target_label | predicted_label | count |
 +--------------+-----------------+-------+
 |      0       |        1        |  1328 |
 |      0       |        0        |  4000 |
 |      1       |        1        | 26515 |
 |      1       |        0        |  1461 |
 +--------------+-----------------+-------+
 [4 rows x 3 columns],
 'f1_score': 0.9500349343413533,
 'log_loss': 0.26106698432422665,
 'precision': 0.9523039902309378,
 'recall': 0.9477766657134686,
 'roc_curve': Columns:
 	threshold	float
 	fpr	float
 	tpr	float
 	p	int
 	n	int
 
 Rows: 100001
 
 Data:
 +-----------+----------------+----------------+-------+------+
 | threshold |      fpr       |      tpr       |   p   |  n   |
 +-----------+----------------+----------------+-------+------+
 |    0.0    |      1.0       | 

In [14]:
# Open the 'Evaluation' view of sentiment_model in Canvas.
sentiment_model.show(view='Evaluation')

Canvas is accessible via web browser at the URL: http://localhost:59538/index.html
Opening Canvas in default web browser.


In [44]:
# Store sentiment_model's predicted probability for each selected review in a
# new 'predicted_sentiment' column.
# NOTE(review): giraffe_reviews holds the 'Baby Trend Diaper Champ' rows
# selected earlier, despite its name.
giraffe_reviews['predicted_sentiment'] = sentiment_model.predict(giraffe_reviews, output_type='probability')

In [45]:
# Order the reviews from highest to lowest predicted_sentiment.
giraffe_reviews = giraffe_reviews.sort('predicted_sentiment', ascending=False)

In [36]:
# Fit a second logistic classifier for 'sentiment' that uses only the
# per-word count columns named in selected_words as features.
# NOTE(review): as with the first model, test_data doubles as the validation
# set and as the evaluation set; confirm this reuse is intended.
selected_words_model = graphlab.logistic_classifier.create(train_data,
                                                     target='sentiment',
                                                     features=selected_words,
                                                     validation_set=test_data)

In [51]:

# Print 12 rows of the coefficient table (the intercept plus the 11 selected
# words) instead of the default truncated preview.
selected_words_model['coefficients'].print_rows(num_rows=12)

+-------------+-------+-------+------------------+------------------+
|     name    | index | class |      value       |      stderr      |
+-------------+-------+-------+------------------+------------------+
| (intercept) |  None |   1   |  1.36728315229   | 0.00861805467824 |
|   awesome   |  None |   1   |  1.05800888878   |  0.110865296265  |
|    great    |  None |   1   |  0.883937894898  | 0.0217379527921  |
|  fantastic  |  None |   1   |  0.891303090304  |  0.154532343591  |
|   amazing   |  None |   1   |  0.892802422508  |  0.127989503231  |
|     love    |  None |   1   |  1.39989834302   | 0.0287147460124  |
|   horrible  |  None |   1   |  -1.99651800559  | 0.0973584169028  |
|     bad     |  None |   1   | -0.985827369929  | 0.0433603009142  |
|   terrible  |  None |   1   |  -2.09049998487  | 0.0967241912229  |
|    awful    |  None |   1   |  -1.76469955631  |  0.134679803365  |
|     wow     |  None |   1   | -0.0541450123333 |  0.275616449416  |
|     hate    |  Non

In [39]:
# Evaluate selected_words_model on test_data; the result has the same metrics
# as for sentiment_model (accuracy, AUC, confusion matrix, ROC curve, ...).
selected_words_model.evaluate(test_data)

{'accuracy': 0.8431419649291376,
 'auc': 0.6648096413721418,
 'confusion_matrix': Columns:
 	target_label	int
 	predicted_label	int
 	count	int
 
 Rows: 4
 
 Data:
 +--------------+-----------------+-------+
 | target_label | predicted_label | count |
 +--------------+-----------------+-------+
 |      0       |        0        |  234  |
 |      0       |        1        |  5094 |
 |      1       |        1        | 27846 |
 |      1       |        0        |  130  |
 +--------------+-----------------+-------+
 [4 rows x 3 columns],
 'f1_score': 0.914242563530107,
 'log_loss': 0.4054747110365649,
 'precision': 0.8453551912568306,
 'recall': 0.9953531598513011,
 'roc_curve': Columns:
 	threshold	float
 	fpr	float
 	tpr	float
 	p	int
 	n	int
 
 Rows: 100001
 
 Data:
 +-----------+-----+-----+-------+------+
 | threshold | fpr | tpr |   p   |  n   |
 +-----------+-----+-----+-------+------+
 |    0.0    | 1.0 | 1.0 | 27976 | 5328 |
 |   1e-05   | 1.0 | 1.0 | 27976 | 5328 |
 |   2e-05   | 

In [42]:
# Collect the reviews of the 'Baby Trend Diaper Champ' product.
champ_mask = products['name'] == 'Baby Trend Diaper Champ'
diaper_champ_reviews = products[champ_mask]

# Score each review with selected_words_model and keep the probability in a
# 'predicted_sentiment' column.
champ_probabilities = selected_words_model.predict(
    diaper_champ_reviews, output_type='probability')
diaper_champ_reviews['predicted_sentiment'] = champ_probabilities

# Put the reviews with the highest predicted sentiment first.
diaper_champ_reviews = diaper_champ_reviews.sort(
    'predicted_sentiment', ascending=False)

In [53]:
print selected_words_model.predict(diaper_champ_reviews[0:1], output_type='probability')
print sentiment_model.predict(diaper_champ_reviews[0:1], output_type='probability')

[0.9984234145936198]
[0.9999936520356157]
