<a href="https://colab.research.google.com/github/alberwan/spark-nlp-workshop/blob/master/tutorials/streamlit_notebooks/Albert_EDG_spaCy_binary_classify.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

'''
Text Classification with SpaCy

Examples include spam detection, sentiment analysis, and tagging customer queries.
'''

In [None]:
import os

In [None]:
'''
1. load data using pandas
'''
import pandas as pd

# Loading the training data
# Yes is the label for non-spam messages
print("Please be patient, spaCy is training your model...")
spam = pd.read_csv('/content/spaCy_data/train_EDG.csv')
# spam = pd.read_csv(os.path.dirname(__file__) + '\\' + 'train_EDG.csv')
#print(spam.head(10))

In [8]:
!pip install spacy



In [None]:
'''
2. Building a Bag of Words [bow] model
'''
# Create an empty model
import spacy

nlp = spacy.blank("en")

# Create the TextCategorizer with exclusive classes and "bow" architecture
textcat = nlp.create_pipe(
    "textcat",
    config={
        "exclusive_classes": True,
        # Since the classes are either ham(yes) or spam(no), we set "exclusive_classes" to True
        "architecture": "bow"})

# Add the TextCategorizer to the empty model. TextCategorizer is a spaCy pipe
nlp.add_pipe(textcat)

# Add labels to text classifier
textcat.add_label("Yes")
textcat.add_label("No")


In [10]:
'''
3. Training a Text Categorizer Model
'''
train_texts = spam['text'].values
train_labels = [{'cats': {'Yes': label == 'Yes',
                          'No': label == 'No'}}
                for label in spam['label']]

train_data = list(zip(train_texts, train_labels))
print(train_data[:3])

'''
3.1 Train the model
    1. create an optimizer using nlp.begin_training(), spaCy uses this optimizer to update the model
    2. in general it's more efficient to train models in small batches. spaCy provides the minibatch() function
    3. the minibatches are split into texts and labels
    4. nlp.update() to update the model's parameters.
    Note: This is just one training loop (or epoch) through the data. The model will typically need multiple epochs.
'''
from spacy.util import minibatch

spacy.util.fix_random_seed(1)
optimizer = nlp.begin_training()

# Create the batch generator with batch size = 8
batches = minibatch(train_data, size=8)
# Iterate through minibatches
for batch in batches:
    # Each batch is a list of (text, label) but we need to
    # send separate lists for texts and labels to update().
    # This is a quick way to split a list of tuples into lists
    texts, labels = zip(*batch)
    nlp.update(texts, labels, sgd=optimizer)

'''
3.2 Use another loop for more epochs, and optionally re-shuffle the training data at the begining of each loop.
'''
import random

random.seed(1)
spacy.util.fix_random_seed(1)
optimizer = nlp.begin_training()

losses = {}
for epoch in range(10):
    random.shuffle(train_data)
    # Create the batch generator with batch size = 8
    batches = minibatch(train_data, size=8)
    # Iterate through minibatches
    for batch in batches:
        # Each batch is a list of (text, label) but we need to
        # send separate lists for texts and labels to update().
        # This is a quick way to split a list of tuples into lists
        texts, labels = zip(*batch)
        nlp.update(texts, labels, sgd=optimizer, losses=losses)
    print(losses)

[('Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...', {'cats': {'Yes': True, 'No': False}}), ('Ok lar... Joking wif u oni...', {'cats': {'Yes': True, 'No': False}}), ("Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's", {'cats': {'Yes': False, 'No': True}})]
{'textcat': 0.4318767588152923}
{'textcat': 0.6474659060055785}
{'textcat': 0.7841756387413232}
{'textcat': 0.8715840899517864}
{'textcat': 0.9279798743263192}
{'textcat': 0.96545068234607}
{'textcat': 0.9938352181410954}
{'textcat': 1.012665496608121}
{'textcat': 1.0274278897326723}
{'textcat': 1.0377139946552003}


In [11]:
'''
4. Making Predictions
    Now that you have a trained model, you can make predictions with the predict() method. 
    The input text needs to be tokenized with nlp.tokenizer. 
    Then you pass the tokens to the predict method which returns scores. 
    The scores are the probability the input text belongs to the classes.
'''
while True:
    texts = []
    text1 = input("Input your text to predict:")
    texts.append(text1)

    # texts = ["Are you ready for the tea party????? It's gonna be wild",
    #          "URGENT Reply to this message for GUARANTEED FREE TEA"]
    docs = [nlp.tokenizer(text) for text in texts]

    # Use textcat to get the scores for each doc
    textcat = nlp.get_pipe('textcat')
    scores, _ = textcat.predict(docs)

    # [[9.9994671e-01 5.3249827e-05]  --> probability belongs to "ham"is 9.9994671e-01, probability belongs to "spam"is 5.3249827e-05, so it is "ham"
    #  [1.1798984e-02 9.8820102e-01]]
    print(scores)

    # From the scores, find the label with the highest score/probability
    predicted_labels = scores.argmax(axis=1)  # get the index of the highest probability with scores.argmax
    print("The predicted text " +"[" + text1 + "]" + " is " +
          str([textcat.labels[label] for label in predicted_labels]))

    if input("Do you want to continue [y/n]") != 'y':
        break

Input your text to predict:Are you ready for the tea party????? It's gonna be wild
[[9.9994385e-01 5.6197885e-05]]
The predicted text [Are you ready for the tea party????? It's gonna be wild] is ['Yes']
Do you want to continue [y/n]y
Input your text to predict:URGENT Reply to this message for GUARANTEED FREE TEA
[[0.01154125 0.9884588 ]]
The predicted text [URGENT Reply to this message for GUARANTEED FREE TEA] is ['No']
Do you want to continue [y/n]n
