In [9]:
import pandas as pd

spam = pd.read_csv('./data/spam.csv')
spam.head(10)

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [2]:
import spacy

nlp = spacy.blank('en')

textcat = nlp.add_pipe('textcat')

In [3]:
textcat.add_label('ham')  # real messages
textcat.add_label('spam') # spam messages

1

In [4]:
train_texts = spam['text'].values

train_labels = [
    {'cats': {
        'ham':  label == 'ham',
        'spam': label == 'spam',
        },
    } for label in spam['label']
]

In [5]:
train_data = list(zip(train_texts, train_labels))
train_data[:5]

[('Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
  {'cats': {'ham': True, 'spam': False}}),
 ('Ok lar... Joking wif u oni...', {'cats': {'ham': True, 'spam': False}}),
 ("Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
  {'cats': {'ham': False, 'spam': True}}),
 ('U dun say so early hor... U c already then say...',
  {'cats': {'ham': True, 'spam': False}}),
 ("Nah I don't think he goes to usf, he lives around here though",
  {'cats': {'ham': True, 'spam': False}})]

In [7]:
from spacy.training.example import Example
from spacy.util import minibatch

spacy.util.fix_random_seed(1)
optimizer = nlp.initialize()

batches = minibatch(train_data, size = 8)

num_elements_in_batches = 0
for batch in batches:
    num_elements_in_batches += 1
    for text, labels in batch:
        doc = nlp.make_doc(text)
        example = Example.from_dict(doc, labels)
        nlp.update([example], sgd = optimizer)
        
print(num_elements_in_batches)

697


In [10]:
import random

random.seed(1)
spacy.util.fix_random_seed(1)

optimizer = nlp.initialize()
losses = {}

for epoch in range(10):
    random.shuffle(train_data)
    
    batches = minibatch(train_data, size = 8)
    
    for batch in batches:
        for text, labels in batch:
            doc = nlp.make_doc(text)
            example = Example.from_dict(doc, labels)
            nlp.update([example], sgd = optimizer, losses = losses)
            
    print(losses)

{'textcat': 708.4780089379459}


In [12]:
texts = [
    "Are you ready for the tea party????? It\'s gonna be wild!!!",
    "URGENT Reply to this message for GUARANTEED FREE TEA",
]

docs = [nlp.tokenizer(text) for text in texts]

textcat = nlp.get_pipe('textcat')
scores = textcat.predict(docs)

print(scores)

[[9.9999452e-01 5.4865004e-06]
 [1.6431263e-04 9.9983561e-01]]


In [13]:
predicted_labels = scores.argmax(axis = 1)
print(
    [textcat.labels[label] for label in predicted_labels]
)

['ham', 'spam']


In [15]:
##################################

In [21]:
data = pd.read_csv('./data/yelp_ratings.csv')
data.head(10)

Unnamed: 0,text,stars,sentiment
0,Total bill for this horrible service? Over $8G...,1.0,0
1,I *adore* Travis at the Hard Rock's new Kelly ...,5.0,1
2,I have to say that this office really has it t...,5.0,1
3,Went in for a lunch. Steak sandwich was delici...,5.0,1
4,Today was my second out of three sessions I ha...,1.0,0
5,I'll be the first to admit that I was not exci...,4.0,1
6,This place has gone down hill. Clearly they h...,1.0,0
7,I was really looking forward to visiting after...,2.0,0
8,"Like walking back in time, every Saturday morn...",4.0,1
9,"Walked in around 4 on a Friday afternoon, we s...",1.0,0


In [29]:
def load_data(csv_file_path, split = 0.9):
    data = pd.read_csv(csv_file_path)
    
    train_data = data.sample(frac = 1, random_state = 1) # shuffle
    
    texts = train_data['text'].values
    labels = [
        {
            'POSITIVE': bool(y),
            'NEGATIVE': not bool(y),
        } for y in train_data['sentiment'].values
    ]
    split = int(len(train_data) * split)
    
    train_labels = [{'cats': labels} for labels in labels[:split]]
    valid_labels = [{'cats': labels} for labels in labels[split:]]
    
    return texts[:split], train_labels, texts[split:], valid_labels

train_texts, train_labels, valid_texts, valid_labels = load_data('./data/yelp_ratings.csv')

In [32]:
from pprint import pprint

pprint(list(
    zip(train_texts[:2], train_labels[:2])
))

[('I ordered nachos, soft serve ice cream, and a salad. Probably the easiest '
  "things to prepare. It took a half hour and, if I hadn't looked in the bag-I "
  "ordered everything to go-I wouldn't have realized that they forgot an item. "
  "TERRIBLE service and they didn't care on bit. I will NEVER go back!",
  {'cats': {'NEGATIVE': True, 'POSITIVE': False}}),
 ('The restaurant with the food truck theme was very good. The overall '
  'establishment was very clean and all the arcade games worked. The staff was '
  'attentive and polite. It was a great experience. \n'
  '\n'
  'One particular instance stands out. We were going to the go carts and we '
  'had one family with one adult and two kids. All three of them are going to '
  "sit it out because they didn't want to keep one kid out. However, one of "
  'the employees really stepped up and went with one of the kids. This was '
  'unexpected and really above and beyond.\n'
  '\n'
  'We will be back.',
  {'cats': {'NEGATIVE': False

In [33]:
nlp = spacy.blank('en')

textcat = nlp.add_pipe('textcat')

textcat.add_label('NEGATIVE')
textcat.add_label('POSITIVE')

1

In [36]:
def train(model, train_data, optimizer, batch_size = 8):
    losses = {}
    
    random.seed(1)
    random.shuffle(train_data)
    
    for batch in minibatch(train_data, size = batch_size):
        for text, labels in batch:
            doc = model.make_doc(text)
            example = Example.from_dict(doc, labels)
            model.update([example], sgd = optimizer, losses = losses)
            
    return losses

In [37]:
optimizer = nlp.initialize()
train_data = list(zip(train_texts, train_labels))
losses = train(nlp, train_data, optimizer)

In [38]:
losses

{'textcat': 6410.319330349245}

In [39]:
def predict(model, texts):
    docs = [model.tokenizer(text) for text in texts]
    scores = textcat.predict(docs)
    
    predicted_class = scores.argmax(axis = 1)
    
    return predicted_class

In [41]:
texts = valid_texts[34:38]
predictions = predict(nlp, texts)

for prediction, text in zip(predictions, texts):
    print(
        f"{textcat.labels[prediction]}: {text} \n"
    )

POSITIVE: We eat here on a regular basis.  It's like that little hole in the wall place that has tasty food for reasonable prices here!  I can't vouch for their full menu... we always get the chicken in spicy garlic sauce... that's it.  They have lunch specials that come with a spring roll and fried or white rice... its delicious... love it.  White meat chicken and lots of veggies.  By hole in the wall I don't mean it in a bad way... these are the places that have the best food a lot of times.  Place always looks decently clean... if decently is even a word... but I love my dish... I would suggest it to anyone. 

POSITIVE: OMG! 1st off...let me say that I ate at Guy's restaurant in New York. It was ok. nothing special. But his restaurant in Las Vegas just blew me away! Humongous portions, compatable with their prices. The drinks were great. I had the mac and cheese burger (my favorite thing on earth!). Big, juicy patties with creamy mac and cheese), served with 4 kinds of fries (each w

In [48]:
def evaluate(model, texts, labels):
    predicted_class = predict(model, texts)
    
    true_class = [
        int(label['cats']['POSITIVE'] == True) for label in labels
    ]
    
    correct_predictions = [
        predicted_class[i] == true_class[i] for i in range(len(predicted_class))
    ]
    
    accuracy = sum(correct_predictions) / len(correct_predictions)
    
    return accuracy

In [None]:
evaluate(nlp, valid_texts, valid_labels)