In [208]:
import pandas as pd
import spacy
from spacy.util import minibatch


# Load in the data from JSON file
data = pd.read_csv('yelp_ratings.csv')
data.head()

Unnamed: 0,text,stars,sentiment
0,Total bill for this horrible service? Over $8G...,1.0,0
1,I *adore* Travis at the Hard Rock's new Kelly ...,5.0,1
2,I have to say that this office really has it t...,5.0,1
3,Went in for a lunch. Steak sandwich was delici...,5.0,1
4,Today was my second out of three sessions I ha...,1.0,0


## Load data & create training / validation sets

In [209]:
def load_data(csv_file, split=0.9):
    data = pd.read_csv(csv_file)
    
    # Shuffle data
    file_data = data.sample(frac=1, random_state=7)
    #print(train_data)
    
    texts = file_data.text.values #verbatim text of emails
    
    labels = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} 
                for y in file_data.sentiment.values]
    
    split = int(len(file_data) * split) #num of samples to take for training
    
    train_labels = [{"cats": labels} for labels in labels[:split]] #take samples from end of list
    val_labels = [{"cats": labels} for labels in labels[split:]] #take samples from beginning of list
    #print(train_labels) - this contains the labels dictionary for each review with both True and False.
    
    return texts[:split], train_labels, texts[split:], val_labels

#Load data and split into text and labels
train_texts, train_labels, val_texts, val_labels = load_data('yelp_ratings.csv')

#Create training data for model by combining training text and labels
train_data = list(zip(train_texts, train_labels))

print(train_data[:3])

[("Some of the best sushi I've ever had....and I come from the East Coast.  Unreal toro, have some of it's available.", {'cats': {'POSITIVE': True, 'NEGATIVE': False}}), ("One of the best burgers I've ever had and very well priced. I got the tortilla burger and is was delicious especially with there tortilla soup!", {'cats': {'POSITIVE': True, 'NEGATIVE': False}}), ('Review by a vegetarian family with two young kids. \n\nSeveral reviews have lamented the small number of vegetarian options on the menu and, while it is true that there are far more options for meat eaters and there is unfortunately no vegetarian noodle soup option, once you get over these 2 facts this is an excellent place for vegetarians.', {'cats': {'POSITIVE': True, 'NEGATIVE': False}})]


## Create the model

In [210]:
# Create an empty model
nlp_model = spacy.blank('en')

# Create the TextCategorizer with exclusive classes and "bow" i.e "bag of words" architecture
textcat = nlp_model.create_pipe(
              "textcat",
              config={
                "exclusive_classes": True,
                "architecture": "bow"})

# Add the TextCategorizer to the empty model
nlp_model.add_pipe(textcat)

# Add labels to text categorizer
textcat.add_label("NEGATIVE")
textcat.add_label("POSITIVE")

1

## Define Training function of TextCategorizer model

In [211]:
import random

#Note: the training cannot begin without first calling begin_training()
def train_model(model, train_data, optimizer):
    model_loss = {}
    random.seed(1)
    random.shuffle(train_data)
    
    batches = minibatch(train_data, size=8)
    for batch in batches:
        # train_data is a list of tuples [(text0, label0), (text1, label1), ...]
        # Split batch into texts and labels
        texts, labels = zip(*batch)
        
        # Update model with texts and labels
        model.update(texts, labels, sgd=optimizer, losses=model_loss)
    
    return model_loss


## Test the training of model with training data

In [212]:
# Fix seed for reproducibility
spacy.util.fix_random_seed(1)
random.seed(1)

losses = {}
optimizer = nlp_model.begin_training()

# Send all training data thru the model to train it. The epoch controls the interations of training 
# each iteration returns the losses for that iteration. Over multiple iterations, hopefully the losses converge near 0

losses = train_model(nlp_model, train_data[:100], optimizer)
print(losses)
print(losses['textcat'])

{'textcat': 0.1062515708617866}
0.1062515708617866


## Test the prediction of trained model

In [213]:
#Test the prediction process on the trained model
text = "this is a bad restaurant it sucks"
doc = nlp_model(text)

#.cats attribute of a Doc maps a label to a score for categories applied to the document. 
#The label is a string and the score should be a float.
print(doc.cats) 

{'NEGATIVE': 0.5063446760177612, 'POSITIVE': 0.4936552941799164}


## Define Predict function

In [214]:
def predict(nlp, val_texts): 
    # Use the model's tokenizer to get a list of docs from the list of validation text
    docs = [nlp.tokenizer(text) for text in val_texts]
    #for doc in docs:
    #    for token in doc:
    #        print(f"{token.text} \t\t\t {token.lemma_} \t\t\t {token.is_stop}")
    
    # Use textcat to get the scores for each doc
    textcat = nlp.get_pipe('textcat')
    scores, tensors_not_used = textcat.predict(docs)
    
    #print("from predict->", scores) #ndarray - a numberic score for each category in the categorizer i.e Negative , Positive
    
    # From the scores, find the class with the highest score/probability
    predicted_class = scores.argmax(axis=1) #argmax with axis=1 gives the index of the highest value in each row
    #print([textcat.labels[label] for label in predicted_class])
    
    return predicted_class

#Invoke prediction
#print(val_texts[:2])
#predict(nlp_model, val_texts[:2])

#print(val_labels[:2])

In [215]:
texts = val_texts[:2]
predictions = predict(nlp_model, texts)

for p,t in zip(predictions, texts):
    print(f"{textcat.labels[p]} == {t}")

POSITIVE == This magic show was the best one I've ever seen, very funny and great magic!!! We sat in the third row center and couldn't figure out any of the tricks, how does he do it? My sons (8 & 10) absolutely loved the show. My 10 year old volunteered to help on stage, he was super excited and as an extra bonus got a magic kit as a thank you at the end. We will definetely come back to this show, it's perfect family entertainment, loved every second of it!!!
POSITIVE == There are a lot of good food places in Las Vegas, it's just a little bit harder to find them in the North side! It's located inside a gas station, but area seems to look just fine in my opinion! I may say a lot of people are nice in my reviews, but really, the owner is welcoming to his customers and if there are any adjustments that needs to be made he is more than willing to fix it or make up for it somehow. The price here is also great! I have only tried the tacos, but they're all really good with the sauce! You hav

## Define Evaluation function to evaluate model accuracy

In [216]:
def evaluate(model, texts, labels):
    """ Returns the accuracy of a TextCategorizer model. 
    
        Arguments
        ---------
        model: ScaPy model with a TextCategorizer
        texts: Text samples, from load_data function
        labels: True labels, from load_data function
    
    """
    # Get predictions from textcat model (using your predict method)
    predicted_class = predict(model, texts)
    #print("predicted_class", predicted_class)
    
    # From labels, get the true class as a list of integers (POSITIVE -> 1, NEGATIVE -> 0)
    true_class = [int(cat['cats']['POSITIVE']==True) for cat in labels ]
    #print("true class=", true_class)
    
    # A boolean or int array indicating correct predictions
    correct_predictions = predicted_class == true_class
    #print("correct_predictions", correct_predictions)
    
    # The accuracy, number of correct predictions divided by all predictions
    accuracy = np.sum(correct_predictions) / len(correct_predictions)
    
    return accuracy
    

#Test evaluation function
evaluate(nlp_model, val_texts[:5], val_labels[:5])


0.6

## Train and Evaluate model

In [234]:
epoch = 10
train_count = 100
val_count = 10

#First lets train the model on a larger training set and multiple iterations
for i in range(epoch):
    losses = train_model(nlp_model, train_data[:train_count], optimizer)
    print(f"{i}: losses = {losses}")
    
#Now, lets evaluate how the model does on the validation texts
accuracy = evaluate(nlp_model, val_texts[:val_count], val_labels[:val_count])
print(f"Accuracy = {accuracy}")

#nlp_model.to_disk("./")

0: losses = {'textcat': 3.4813461700089476e-06}
1: losses = {'textcat': 3.2784053198842145e-06}
2: losses = {'textcat': 3.0675511841771197e-06}
3: losses = {'textcat': 2.8848378301749023e-06}
4: losses = {'textcat': 2.7267506208517034e-06}
5: losses = {'textcat': 2.586872273724339e-06}
6: losses = {'textcat': 2.4607064732506956e-06}
7: losses = {'textcat': 2.3454110591458743e-06}
8: losses = {'textcat': 2.239060001407722e-06}
9: losses = {'textcat': 2.1403423178245617e-06}
Accuracy = 0.6


In [232]:
#Test the prediction process on the trained model
text = "The food was disgusting and lacked flavor"
doc = nlp_model(text)

#.cats attribute of a Doc maps a label to a score for categories applied to the document. 
#The label is a string and the score should be a float.
print(doc.cats) 

{'NEGATIVE': 0.40553024411201477, 'POSITIVE': 0.5944697856903076}
