In [1]:
import spacy
nlp = spacy.load('en_core_web_sm')
import pandas as pd
spam = pd.read_csv('spam.csv')
spam.head(5)

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## Bag of Words - create a empty model and create TextCategorizer and add it to the empty model


In [2]:
import spacy
nlp = spacy.blank('en')

In [3]:
textcat = nlp.create_pipe('textcat', config={
        "exclusive_classes": True,
         "architecture": "bow"
                                            })

nlp.add_pipe(textcat)

In [4]:
textcat.add_label("ham")
textcat.add_label("spam")


1

In [5]:
train_texts = spam['text'].values
train_labels = [{'cats': {'ham': label == 'ham',
                          'spam': label == 'spam'}} 
                for label in spam['label']]
train_data = list(zip(train_texts, train_labels))

In [6]:
print(spam.head(3))
print(train_data[1])

  label                                               text
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
('Ok lar... Joking wif u oni...', {'cats': {'ham': True, 'spam': False}})


# ready to train the model.

In [7]:
import random
from  spacy.util import minibatch

In [8]:

random.seed(1)
spacy.util.fix_random_seed(1)
optimizer = nlp.begin_training()

losses = {}
for epoch in range(10):
    random.shuffle(train_data)
    # Create the batch generator with batch size = 8
    batches = minibatch(train_data, size=8)
    # Iterate through minibatches
    for batch in batches:
        # Each batch is a list of (text, label) but we need to
        # send separate lists for texts and labels to update().
        # This is a quick way to split a list of tuples into lists
        texts, labels = zip(*batch)
        nlp.update(texts, labels, sgd=optimizer, losses=losses)
    print(losses)

{'textcat': 1.3370912857571966}
{'textcat': 1.6728009708220952}
{'textcat': 1.8620019962684857}
{'textcat': 1.983643345719873}
{'textcat': 2.0654277985768807}
{'textcat': 2.1180278761161846}
{'textcat': 2.154979879577126}
{'textcat': 2.1797390881191423}
{'textcat': 2.1985677412913955}
{'textcat': 2.2125724875189343}


# Making Predictions

In [13]:
texts = ["URGENT Reply to this message for GUARANTEED FREE TEA","In 1980 IDBI Bank Rejected Loan 4 Ambani.","ShowLion.com And get FRee Rs500 +100% First Deposit Bonus+250G Gold Bar"]
docs = [nlp.tokenizer(text) for text in texts]

# Use textcat to get the scores for each doc
textcat = nlp.get_pipe('textcat')

scores, _ = textcat.predict(docs)


print(scores)

[[0.0192931  0.98070693]
 [0.86853534 0.13146468]
 [0.9930305  0.00696957]]


The scores are used to predict a single class or label by choosing the label with the highest probability. You get the index of the highest probability with scores.argmax, then use the index to get the label string from textcat.labels.

In [14]:
# From the scores, find the label with the highest score/probability
predicted_labels = scores.argmax(axis=1)
print([textcat.labels[label] for label in predicted_labels])

['spam', 'ham', 'ham']
