# Classifying Food Reviews

Data: __Amazon Food Reviews__
* We are using a transformed version of this 500k dataset
* Reference: https://snap.stanford.edu/data/web-FineFoods.html

In [None]:
# Show what is currently in the working directory.
!ls

In [None]:
# Run this if you do not already have Reviews.csv downloaded
!wget https://www.dropbox.com/s/fxtgg2v2r8lua01/Reviews.csv?dl=0 -O Reviews.csv  

In [None]:
# List the working directory again; Reviews.csv should now be present.
!ls

In [1]:
# Install the small English pipeline that spacy.load('en_core_web_sm') needs below.
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.0.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0-py3-none-any.whl (13.7 MB)
[K     |████████████████████████████████| 13.7 MB 48 kB/s  eta 0:00:01
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [None]:
import pandas as pd
import numpy as np
import spacy
from spacy import displacy
from spacy.util import minibatch, compounding

import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Read the raw reviews into a DataFrame and show its (rows, columns).
food_reviews_df = pd.read_csv('Reviews.csv')
food_reviews_df.shape

In [None]:
# Preview the first rows, transposed so every column is readable as a row.
food_reviews_df.head().transpose()

In [None]:
# Keep only the review text and its score, dropping rows that miss either one.
food_reviews_df = food_reviews_df.loc[:, ['Text', 'Score']].dropna()

In [None]:
# Bar chart of how many reviews carry each raw score, saved as score.png.
score_counts = food_reviews_df['Score'].value_counts()
ax = score_counts.plot.bar()
fig = ax.get_figure()
fig.savefig("score.png");

In [None]:
# Collapse the raw scores into a binary label: 4 and above -> 1, 3 and below -> 0.
# The guard keeps the cell idempotent: once Score holds only 0/1, running the
# two-step thresholding again would send every 1 to 0 (because 1 <= 3).
if not food_reviews_df['Score'].isin([0, 1]).all():
    food_reviews_df['Score'] = (food_reviews_df['Score'] >= 4).astype(int)

In [None]:
# Bar chart of the class balance after binarising Score, saved as score_boolean.png.
label_counts = food_reviews_df['Score'].value_counts()
ax = label_counts.plot.bar()
fig = ax.get_figure()
fig.savefig("score_boolean.png");

In [None]:
# Preview the first rows of the two-column frame.
food_reviews_df.head()

In [None]:
# Take up to the first 5000 reviews of each class, in file order.
is_positive = food_reviews_df['Score'] == 1
is_negative = food_reviews_df['Score'] == 0
train_pos_df = food_reviews_df[is_positive].head(5000)
train_neg_df = food_reviews_df[is_negative].head(5000)

In [None]:
# Stack the positive and negative subsets (positives first, original index kept).
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
train_df = pd.concat([train_pos_df, train_neg_df])
train_df.shape

## Pre-Processing
### Tokenization

In [None]:
# Create the English nlp object and pick the review with index label 101 to explore.
nlp = spacy.load('en_core_web_sm')
sample_review = food_reviews_df.loc[101, 'Text']
sample_review

In [None]:
# Run the loaded pipeline over the sample review; the result is inspected in the cells below.
parsed_review = nlp(sample_review)
parsed_review

## Part-of-Speech Tagging

In [None]:
# One record per token, assembled into a DataFrame in a single call rather than
# grown cell-by-cell with .loc. Every value is a plain scalar: a trailing comma
# after a value (e.g. `token.lemma_,`) would store a 1-tuple instead.
token_columns = ['text', 'lemma', 'pos', 'tag', 'dep', 'shape',
                 'is_alpha', 'is_stop', 'is_punctuation']

tokenized_text = pd.DataFrame(
    [
        {
            'text': token.text,
            'lemma': token.lemma_,
            'pos': token.pos_,
            'tag': token.tag_,
            'dep': token.dep_,
            'shape': token.shape_,
            'is_alpha': token.is_alpha,
            'is_stop': token.is_stop,
            'is_punctuation': token.is_punct,
        }
        for token in parsed_review
    ],
    # Passing columns keeps the layout stable even when the document is empty.
    columns=token_columns,
)

tokenized_text[:20]

## Named Entity Recognition (NER)

| Type        | Description                                          |
| :---------- | :--------------------------------------------------- |
| PERSON      | People, including fictional.                         |
| NORP        | Nationalities or religious or political groups.      |
| FAC         | Buildings, airports, highways, bridges, etc.         |
| ORG         | Companies, agencies, institutions, etc.              |
| GPE         | Countries, cities, states.                           |
| LOC         | Non-GPE locations, mountain ranges, bodies of water. |
| PRODUCT     | Objects, vehicles, foods, etc. (Not services.)       |
| EVENT       | Named hurricanes, battles, wars, sports events, etc. |
| WORK_OF_ART | Titles of books, songs, etc.                         |
| LAW         | Named documents made into laws.                      |
| LANGUAGE    | Any named language.                                  |
| DATE        | Absolute or relative dates or periods.               |
| TIME        | Times smaller than a day.                            |
| PERCENT     | Percentage, including "%".                           |
| MONEY       | Monetary values, including unit.                     |
| QUANTITY    | Measurements, as of weight or distance.              |
| ORDINAL     | "first", "second", etc.                              |
| CARDINAL    | Numerals that do not fall under another type         |


In [None]:
# displacy is already imported in the imports cell near the top; this repeat is harmless.
from spacy import displacy

# Render the sample review with its recognised entities highlighted.
displacy.render(parsed_review, style="ent")
# use spacy.explain('tag') if needed

## Dependency parsing

Identifies sentences and assigns a syntactic structure to each one (e.g. subject–object relations).

In [None]:
# Materialise the sentence spans of the parsed review so they can be displayed.
sentence_spans = [sentence for sentence in parsed_review.sents]
sentence_spans

In [None]:
# Display options for the dependency diagram.
options = dict(
    compact=True,
    bg='white',
    distance=80,
    color='green',
    font='Arial',
)
displacy.render(parsed_review, style='dep', jupyter=True, options=options)

## Processing noun chunks 

The dependency parser adds the `token.dep` and `token.head` attributes.
It is also responsible for detecting sentences and base noun phrases, known as **noun chunks**.

In [None]:
# One record per noun chunk, assembled into a DataFrame in a single call rather
# than grown cell-by-cell with .loc. Values are stored as-is: a trailing comma
# after `chunk.root` or `chunk.root.text` would wrap them in 1-tuples.
chunk_columns = ['text', 'root', 'root.text', 'root.dep_', 'root.head.text']

noun_chunks_df = pd.DataFrame(
    [
        {
            'text': chunk.text,
            'root': chunk.root,
            'root.text': chunk.root.text,
            'root.dep_': chunk.root.dep_,
            'root.head.text': chunk.root.head.text,
        }
        for chunk in parsed_review.noun_chunks
    ],
    # Passing columns keeps the layout stable even when there are no chunks.
    columns=chunk_columns,
)

noun_chunks_df[:20]

## Text Classification

By default, spaCy's text categorizer uses a CNN to assign position-sensitive vectors to each word in the document. First, prepare the data in the form spaCy expects: a list of tuples.

In [None]:
# Pair every review text with its binary score. Zipping the two columns builds
# the same (text, score) tuples as a row-wise apply(axis=1), without calling a
# Python lambda once per row.
train_df['tuples'] = list(zip(train_df['Text'], train_df['Score']))
train = train_df['tuples'].tolist()
train[:1]

In [None]:
# Adapted from the spaCy documentation.
def load_data(limit=0, split=0.8):
    """Shuffle the prepared examples and split them into train and dev parts.

    Reads the notebook-level ``train`` list of ``(text, score)`` tuples.

    Args:
        limit: Number of shuffled examples to keep (taken from the end of the
            shuffled list). ``0`` keeps all of them.
        split: Fraction of the kept examples that goes into the training part.

    Returns:
        ``((train_texts, train_cats), (dev_texts, dev_cats))``. The texts are
        tuples of review strings and the cats are lists of
        ``{'POSITIVE': bool}`` dicts aligned with them.

    Raises:
        ValueError: If there are no examples to split.
    """
    # Shuffle a copy: shuffling `train` itself would reorder the shared list
    # as a side effect of every call.
    examples = list(train)
    np.random.shuffle(examples)
    if limit:
        examples = examples[-limit:]
    if not examples:
        raise ValueError("load_data: no training examples available")
    texts, labels = zip(*examples)
    cats = [{'POSITIVE': bool(y)} for y in labels]
    cut = int(len(examples) * split)
    return (texts[:cut], cats[:cut]), (texts[cut:], cats[cut:])

def evaluate(tokenizer, textcat, texts, cats):
    """Score the text classifier against gold labels.

    Each text is passed through ``tokenizer`` and the results are streamed
    through ``textcat.pipe``. For every label in a document's ``cats`` that
    also appears in the matching gold dict, the predicted score and the gold
    value are both thresholded at 0.5 and tallied as a true/false
    positive/negative.

    Args:
        tokenizer: Callable applied to each text before classification.
        textcat: Object whose ``pipe`` yields documents with a ``cats`` mapping.
        texts: Iterable of inputs to score.
        cats: Indexable sequence of gold ``{label: value}`` dicts, aligned
            with ``texts``.

    Returns:
        Dict with ``'textcat_p'`` (precision), ``'textcat_r'`` (recall) and
        ``'textcat_f'`` (F-score).
    """
    # Every tally starts slightly above zero so the ratios below never divide by zero.
    tally = {'tp': 1e-8, 'fp': 1e-8, 'fn': 1e-8, 'tn': 1e-8}
    tokenized = (tokenizer(text) for text in texts)
    for idx, doc in enumerate(textcat.pipe(tokenized)):
        gold = cats[idx]
        for label, score in doc.cats.items():
            if label not in gold:
                continue
            truth = gold[label]
            predicted_pos = score >= 0.5
            predicted_neg = score < 0.5
            actual_pos = truth >= 0.5
            actual_neg = truth < 0.5
            if predicted_pos and actual_pos:
                tally['tp'] += 1.
            elif predicted_pos and actual_neg:
                tally['fp'] += 1.
            elif predicted_neg and actual_neg:
                tally['tn'] += 1
            elif predicted_neg and actual_pos:
                tally['fn'] += 1
    precision = tally['tp'] / (tally['tp'] + tally['fp'])
    recall = tally['tp'] / (tally['tp'] + tally['fn'])
    f_score = 2 * (precision * recall) / (precision + recall)
    return {'textcat_p': precision, 'textcat_r': recall, 'textcat_f': f_score}

# Number of texts to train from.
# You can increase this if you have more computational power.
n_texts = 3000

# Number of training iterations.
n_iter = 10

In [None]:
# Load en_core_web_sm again, rebinding `nlp` to a fresh pipeline object.
nlp = spacy.load('en_core_web_sm')

In [None]:
# Add the text classifier to the pipeline if it doesn't exist.
# nlp.create_pipe works for built-ins that are registered with spaCy.
# NOTE(review): passing a component object to nlp.add_pipe looks like the
# spaCy v2 calling style — confirm it matches the installed spaCy version.
if 'textcat' not in nlp.pipe_names:
    textcat = nlp.create_pipe('textcat')
    nlp.add_pipe(textcat, last=True)
# Otherwise, get it, so we can add labels to it.
else:
    textcat = nlp.get_pipe('textcat')

# Add label to text classifier.
textcat.add_label('POSITIVE')

# Load the dataset.
print("Loading food reviews data...")
(train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=n_texts)
# Report the number of examples actually returned; it is smaller than n_texts
# whenever fewer examples are available.
print("Using {} examples ({} training, {} evaluation)"
      .format(len(train_texts) + len(dev_texts),
              len(train_texts), len(dev_texts)))

# Pair each training text with its annotation dict, {'cats': {...}}.
train_data = list(zip(train_texts,
                      [{'cats': cats} for cats in train_cats]))


## Training the model

In [None]:
# The classifier, its 'POSITIVE' label and the train/dev split are all created
# by the setup cell directly above. Repeating that setup here would call
# load_data() a second time and reshuffle the examples into a different split,
# so this cell only checks that the prepared state is in place.
assert 'textcat' in nlp.pipe_names, "run the classifier setup cell first"
assert len(train_data) == len(train_texts), "train_data is out of sync with train_texts"

print("Using {} examples ({} training, {} evaluation)"
      .format(len(train_texts) + len(dev_texts),
              len(train_texts), len(dev_texts)))


## Train and test the model

In [None]:
# Train the text classifier for n_iter passes over train_data, printing the
# loss and the dev-set precision / recall / F-score after each pass.
# NOTE(review): begin_training(), disable_pipes() and update(texts, annotations)
# look like the spaCy v2 training API, while the download cell's output shows
# en_core_web_sm 3.0.0 — confirm which spaCy version this is meant to run on.

# get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
with nlp.disable_pipes(*other_pipes):  # only train textcat
    optimizer = nlp.begin_training()
    print("Training the model...")
    print('{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('LOSS', 'P', 'R', 'F'))
    for i in range(n_iter):
        # losses is reset every pass, so the printed loss covers this pass only
        losses = {}
        # batch up the examples using spaCy's minibatch
        batches = minibatch(train_data, size=compounding(4., 32., 1.001))
        for batch in batches:
            # split the (text, annotation) pairs into two parallel tuples
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, drop=0.2,
                       losses=losses)
        # evaluate with the optimizer's averaged parameters swapped in for the
        # duration of this block
        with textcat.model.use_params(optimizer.averages):
            # evaluate on the dev data split off in load_data()
            scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
        print('{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}'  # print a simple table
              .format(losses['textcat'], scores['textcat_p'],
                      scores['textcat_r'], scores['textcat_f']))

In [None]:
# test the trained model on one hand-written negative and one positive sentence
# NOTE(review): the negative example contains literal double quotes inside the
# string, unlike the positive one — confirm this is intended.
test_text_neg = '"we hated the service so much that we left without paying a tip"'
test_text_pos ="We found the atmosphere warm and the food was delicious."
doc_neg = nlp(test_text_neg)
# category scores assigned to the negative example
doc_neg.cats

In [None]:
# Classify the positive example and show it next to its category scores.
doc_pos = nlp(test_text_pos)
test_text_pos, doc_pos.cats

In [None]:
!mkdir review_model

In [None]:
# List the working directory; review_model should now be present.
!ls

## Saving and using the saved model

Write the trained pipeline to disk, then load it back and classify new text with it.

In [None]:
# Write the trained pipeline to review_model/model.
nlp.to_disk('review_model/model')

In [None]:
# Load from the directory the pipeline was written to: nlp.to_disk saved it
# under 'review_model/model', and 'review_model' is only the parent folder.
nlp2 = spacy.load('review_model/model')

In [None]:
# Classify a new sentence with the reloaded pipeline and print its category scores.
text = "we like coming here a lot"
doc = nlp2(text)
print(text, doc.cats)

In [None]:
# Same check with a second sentence.
text = "The environment was not great"
doc = nlp2(text)
print(text, doc.cats)