In [1]:
# default_exp autocoder

In [2]:
#hide
%reload_ext autoreload
%autoreload 2
%matplotlib inline

# Auto Coder

> Automatically codes text fields such as open-ended survey questions based on linguistic properties such as topic and sentiment.

In [3]:
#hide
from nbdev.showdoc import *

In [4]:
#export

import math
import warnings
import numpy as np
import pandas as pd

def list2chunks(a, n):
    """
    Lazily splits the sliceable sequence `a` into `n` contiguous chunks.

    Chunk sizes differ by at most one element; the first `len(a) % n`
    chunks receive the extra element. Returns a generator of slices of `a`.
    """
    base_size, num_larger = divmod(len(a), n)
    # boundaries[i] is where chunk i starts; boundaries[i + 1] is where it ends
    boundaries = [i * base_size + min(i, num_larger) for i in range(n + 1)]
    return (a[start:stop] for start, stop in zip(boundaries, boundaries[1:]))

In [10]:
#export

class ZeroShotClassifier():
    """
    Interface to Zero Shot Topic Classifier

    Wraps a `transformers` sequence-classification NLI model: each candidate
    label is turned into a hypothesis and scored against each document.
    """

    def __init__(self, model_name='facebook/bart-large-mnli', device=None):
        """
        ZeroShotClassifier constructor

        Args:
          model_name(str): name of a BART NLI model (must contain 'mnli' or 'xnli')
          device(str): device to use (e.g., 'cuda', 'cpu').
                       If None, 'cuda' is used when available, otherwise 'cpu'.

        Raises:
          ValueError: if `model_name` does not look like an MNLI/XNLI model
          ImportError: if PyTorch is not installed
        """
        if 'mnli' not in model_name and 'xnli' not in model_name:
            raise ValueError('ZeroShotClassifier requires an MNLI or XNLI model')
        try:
            import torch
        except ImportError as err:
            # ImportError is still an Exception, so existing `except Exception` callers keep working
            raise ImportError('ZeroShotClassifier requires PyTorch to be installed.') from err
        self.torch_device = device
        if self.torch_device is None: self.torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
        # imported lazily so that importing this module does not require transformers
        from transformers import AutoModelForSequenceClassification, AutoTokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name).to(self.torch_device)


    @staticmethod
    def _as_label_list(values):
        """
        Normalizes None, a single string, or any iterable of labels to a list.
        """
        if values is None: return []
        # a bare string would otherwise be iterated character by character
        if isinstance(values, str): return [values]
        return list(values)


    @staticmethod
    def _softmax(logits):
        """
        Softmax over the last axis, shifted by the row maximum to avoid overflow in exp.
        """
        exps = np.exp(logits - logits.max(-1, keepdims=True))
        return exps / exps.sum(-1, keepdims=True)


    def predict(self, docs, labels=[], include_labels=False, multilabel=True,
               max_length=512, batch_size=8, nli_template='This text is about {}.',  topic_strings=[]):
        """
        This method performs zero-shot text classification using Natural Language Inference (NLI).


        **Parameters**:
          - docs(list|str): text of document or list of texts
          - labels(list|str): a list of strings representing topics of your choice
                          (a single string is treated as a one-element list)
                          Example:
                           labels=['political science', 'sports', 'science']
          - include_labels(bool): If True, will return topic labels along with topic probabilities
          - multilabel(bool): If True, labels are considered independent and multiple labels can be predicted true for a document and be close to 1.
                            If False, scores are normalized such that probabilities sum to 1.
          - max_length(int): truncate long documents to this many tokens
          - batch_size(int): batch_size to use. default:8
                           Increase this value to speed up predictions - especially
                           if len(labels) is large.
          - nli_template(str): labels are inserted into this template for use as hypotheses in natural language inference
          - topic_strings(list|str): alias for labels parameter for backwards compatibility

        **Returns:**

          For a single string `docs`: a list with one score per label.
          For a list/array `docs`: a list of such lists, one per document.
          If `include_labels` is True, each score is replaced by a (label, score) tuple.

        **Raises:**

          ValueError: if `docs` is empty or contains non-strings, if no labels are given,
                      if both `labels` and `topic_strings` are given, or if `batch_size` < 1
        """

        # error checks
        is_str_input = False
        if not isinstance(docs, (list, np.ndarray)):
            docs = [docs]
            is_str_input = True
        if len(docs) == 0 or not all(isinstance(d, str) for d in docs):
            raise ValueError('docs must be string or a list of strings representing document(s)')
        # the [] defaults in the signature are never mutated: both values are rebound to fresh lists here
        labels = self._as_label_list(labels)
        topic_strings = self._as_label_list(topic_strings)
        if labels and topic_strings: raise ValueError('labels and topic_strings are mutually exclusive')
        if not labels and not topic_strings: raise ValueError('labels must be a list of strings')
        if topic_strings:
            labels = topic_strings
        if batch_size < 1: raise ValueError('batch_size must be a positive integer')


        # convert to sequences: one (premise, hypothesis) pair per document/label combination,
        # ordered document-major so the flat outputs can be reshaped to (docs, labels, classes)
        sequence_pairs = [[premise, nli_template.format(label)] for premise in docs for label in labels]
        if batch_size  > len(sequence_pairs): batch_size = len(sequence_pairs)
        if len(sequence_pairs) >= 100 and batch_size==8:
            warnings.warn('TIP: Try increasing batch_size to speedup ZeroShotClassifier predictions')
        num_chunks = math.ceil(len(sequence_pairs)/batch_size)
        sequence_chunks = list2chunks(sequence_pairs, n=num_chunks)

        # inference
        import torch
        with torch.no_grad():
            outputs = []
            for sequences in sequence_chunks:
                batch = self.tokenizer.batch_encode_plus(sequences, return_tensors='pt', max_length=max_length, truncation='only_first', padding=True).to(self.torch_device)
                logits = self.model(batch['input_ids'], attention_mask=batch['attention_mask'], return_dict=False)[0]
                outputs.extend(logits.cpu().detach().numpy())
        outputs = np.array(outputs)
        outputs = outputs.reshape((len(docs), len(labels), -1))

        # process outputs
        # NOTE(review): assumes class index 0 is "contradiction" and the last index is
        # "entailment", as the indexing below implies - confirm for models other than the default
        if multilabel:
            # softmax over the entailment vs. contradiction dim for each label independently
            entail_contr_logits = outputs[..., [0, -1]]
            scores = self._softmax(entail_contr_logits)[..., 1]
        else:
            # softmax the "entailment" logits over all candidate labels
            entail_logits = outputs[..., -1]
            scores = self._softmax(entail_logits)
        scores = scores.tolist()
        if include_labels:
            scores = [list(zip(labels, s)) for s in scores]
        if is_str_input: scores = scores[0]
        return scores



In [9]:
show_doc(ZeroShotClassifier.predict)

<h4 id="ZeroShotClassifier.predict" class="doc_header"><code>ZeroShotClassifier.predict</code><a href="__main__.py#L29" class="source_link" style="float:right">[source]</a></h4>

> <code>ZeroShotClassifier.predict</code>(**`docs`**, **`labels`**=*`[]`*, **`include_labels`**=*`False`*, **`multilabel`**=*`True`*, **`max_length`**=*`512`*, **`batch_size`**=*`8`*, **`nli_template`**=*`'This text is about {}.'`*, **`topic_strings`**=*`[]`*)

This method performs zero-shot text classification using Natural Language Inference (NLI).


**Parameters**:
  - docs(list|str): text of document or list of texts
  - labels(list): a list of strings representing topics of your choice
                  Example:
                   labels=['political science', 'sports', 'science']
  - include_labels(bool): If True, will return topic labels along with topic probabilities
  - multilabel(bool): If True, labels are considered independent and multiple labels can predicted true for document and be close to 1.
                    If False, scores are normalized such that probabilities sum to 1.
  - max_length(int): truncate long documents to this many tokens
  - batch_size(int): batch_size to use. default:8
                   Increase this value to speed up predictions - especially
                   if len(topic_strings) is large.
  - nli_template(str): labels are inserted into this template for use as hypotheses in natural language inference
  - topic_strings(list): alias for labels parameter for backwards compatibility
  
**Returns:**
  inferred probabilities or list of inferred probabilities if doc is list

In [6]:
# Uses the default model ('facebook/bart-large-mnli'); the constructor loads its tokenizer and weights.
zsl = ZeroShotClassifier()
labels=['politics', 'elections', 'sports', 'films', 'television']
doc = 'I am extremely dissatisfied with the President and will definitely vote in 2020.'
# include_labels=True pairs each score with its label; multilabel defaults to True,
# so each label is scored independently and the scores need not sum to 1.
preds = zsl.predict(doc, labels=labels, include_labels=True)

In [7]:
preds

[('politics', 0.979189932346344),
 ('elections', 0.9874580502510071),
 ('sports', 0.0005765454261563718),
 ('films', 0.002292441902682185),
 ('television', 0.001054605352692306)]

In [8]:
# Index the (label, score) pairs by label, then check that the relevant topics
# score high and the unrelated ones score low.
d = dict(preds)
for on_topic in ('politics', 'elections'):
    assert d[on_topic] > 0.9
for off_topic in ('sports', 'films', 'television'):
    assert d[off_topic] < 0.1

In [11]:
#export
class Autocoder:
    """
    Autocodes text fields

    Each `code_*` method scores every text in `docs` and returns a copy of `df`
    with one new column per label. `docs` must line up row-for-row with `df`,
    because the new columns are attached using `df`'s index.
    """
    def __init__(self, verbose=1, device=None):
        """
        Instantiates the Autocoder instance.

        Args:
          verbose(int): verbosity level (stored as `self.v`; not read by any method in this class)
          device(str): device passed through to ZeroShotClassifier (e.g., 'cuda', 'cpu');
                       None lets ZeroShotClassifier choose
        """
        self.v = verbose
        self.zsl = ZeroShotClassifier(device=device)


    @staticmethod
    def _as_list(values):
        """
        Wraps a single string in a list; converts any other iterable (array, Series, tuple) to a list.
        """
        if isinstance(values, str): return [values]
        return list(values)


    def _format_to_df(self, results, df):
        """
        Joins per-document results onto `df` as new columns.

        Each element of `results` is either a dict or a list of (column, value) pairs.
        A column missing from some element is left as NaN for that row.
        """
        records = [dict(e) for e in results]
        return df.join(pd.DataFrame(records, index=df.index))

    def _binarize_df(self, df, colnames, threshold=0.5):
        """
        Binarizes each column in `colnames` based on threshold.

        Values >= threshold become 1, all others 0. Modifies `df` in place and returns it.
        """
        for col in colnames:
            df[col] = (df[col] >= threshold).astype(int)
        return df

    def _code_with_zsl(self, docs, df, labels, batch_size, binarize, threshold, **predict_kwargs):
        """
        Shared pipeline for the zero-shot based coders: predict, join, optionally binarize.
        """
        labels = self._as_list(labels)
        results = self.zsl.predict(self._as_list(docs), labels=labels, include_labels=True,
                                   batch_size=batch_size, **predict_kwargs)
        df = self._format_to_df(results, df)
        if binarize: df = self._binarize_df(df, labels, threshold=threshold)
        return df


    def code_sentiment(self, docs, df, batch_size=8, binarize=False, threshold=0.5):
        """
        Autocodes text for positive or negative sentiment

        Adds `negative` and `positive` columns whose scores sum to 1 per row
        (or 0/1 flags when `binarize` is True).
        """
        return self._code_with_zsl(docs, df, ['negative', 'positive'], batch_size, binarize, threshold,
                                   multilabel=False,
                                   nli_template="The sentiment of this movie review is {}.")

    def code_emotion(self, docs, df, batch_size=8, binarize=False, threshold=0.5):
        """
        Autocodes text for emotion

        Adds `joy`, `anger`, `fear` and `sadness` columns whose scores sum to 1 per row
        (or 0/1 flags when `binarize` is True).
        """
        return self._code_with_zsl(docs, df, ['joy', 'anger', 'fear', 'sadness'], batch_size, binarize, threshold,
                                   multilabel=False,
                                   nli_template="The emotion of this text is {}.")

    def code_custom_topics(self, docs, df, labels, batch_size=8, binarize=False, threshold=0.5):
        """
        Autocodes text for user-specified topics.
        The `labels` field is the name of the topic as a string (or a list of them.)

        Topics are scored independently of one another, so a row's scores need not sum to 1.
        """
        # batch_size is forwarded (it was previously ignored in favour of a hard-coded 8)
        return self._code_with_zsl(docs, df, labels, batch_size, binarize, threshold)

    def code_callable(self, docs, df, fn, binarize=False, threshold=0.5):
        """
        Autocodes text for any user-specified function
        The `fn` parameter must be a Callable and return a dictionary for each
        text in `docs` where the keys are desired column names and values are scores
        or probabilities.

        If `binarize` is True, every column added by `fn` is thresholded to 0/1.
        """
        if not callable(fn): raise TypeError('fn must be callable')
        results = [fn(doc) for doc in self._as_list(docs)]
        coded = self._format_to_df(results, df)
        if binarize:
            new_cols = [col for col in coded.columns if col not in df.columns]
            coded = self._binarize_df(coded, new_cols, threshold=threshold)
        return coded




In [12]:
show_doc(Autocoder.code_sentiment)

<h4 id="Autocoder.code_sentiment" class="doc_header"><code>Autocoder.code_sentiment</code><a href="__main__.py#L36" class="source_link" style="float:right">[source]</a></h4>

> <code>Autocoder.code_sentiment</code>(**`docs`**, **`df`**, **`batch_size`**=*`8`*, **`binarize`**=*`False`*, **`threshold`**=*`0.5`*)

Autocodes text for positive or negative sentiment

In [13]:
show_doc(Autocoder.code_custom_topics)

<h4 id="Autocoder.code_custom_topics" class="doc_header"><code>Autocoder.code_custom_topics</code><a href="__main__.py#L60" class="source_link" style="float:right">[source]</a></h4>

> <code>Autocoder.code_custom_topics</code>(**`docs`**, **`df`**, **`labels`**, **`batch_size`**=*`8`*, **`binarize`**=*`False`*, **`threshold`**=*`0.5`*)

Autocodes text for user-specified topics.
The `label` field is the name of the topic as a string (or a list of them.)

In [14]:
show_doc(Autocoder.code_emotion)

<h4 id="Autocoder.code_emotion" class="doc_header"><code>Autocoder.code_emotion</code><a href="__main__.py#L48" class="source_link" style="float:right">[source]</a></h4>

> <code>Autocoder.code_emotion</code>(**`docs`**, **`df`**, **`batch_size`**=*`8`*, **`binarize`**=*`False`*, **`threshold`**=*`0.5`*)

Autocodes text for emotion

## Usage Examples

This is what the dataframe looks like before autocoding for sentiment:

In [10]:
# Build a tiny example frame: one free-text review per respondent.
ac = Autocoder()
reviews = ["I loved this doctor!", "This doctor was absolutely terrible."]
df = pd.DataFrame({'gender': ['female', 'male'], 'review': reviews})
df.head()

Unnamed: 0,gender,review
0,female,I loved this doctor!
1,male,This doctor was absolutely terrible.


After autocoding for sentiment, the dataframe now has extra columns:

In [11]:
# Scores each review for 'negative' and 'positive' sentiment and returns a new frame
# with one added column per label.
result_df = ac.code_sentiment(df['review'].values, df)
result_df.head()

Unnamed: 0,gender,review,negative,positive
0,female,I loved this doctor!,0.005034,0.994966
1,male,This doctor was absolutely terrible.,0.981789,0.018211


In [13]:
# Pull out the single row for each reviewer so every assertion is one lookup.
female_scores = result_df[result_df['gender'] == 'female'].iloc[0]
male_scores = result_df[result_df['gender'] == 'male'].iloc[0]
assert female_scores['negative'] < 0.1
assert female_scores['positive'] > 0.9
assert male_scores['negative'] > 0.9
assert male_scores['positive'] < 0.1

Here is what the dataframe looks like before autocoding for custom, user-specified topics:

In [14]:
# Example frame for custom-topic coding: one free-text comment per respondent.
comments = ["What is your favorite sitcom of all time?", 'I cannot wait to vote!']
df = pd.DataFrame({'over_18': ['yes', 'no'], 'comments': comments})
df.head()

Unnamed: 0,over_18,comments
0,yes,What is your favorite sitcom of all time?
1,no,I cannot wait to vote!


After autocoding, the dataframe has a new column for each custom topic:

In [15]:
# Adds one score column per custom topic. Topics are scored independently,
# so a row's scores need not sum to 1.
result_df = ac.code_custom_topics(df['comments'].values, df, labels=['television', 'film', 'politics'])
result_df.head()

Unnamed: 0,over_18,comments,television,film,politics
0,yes,What is your favorite sitcom of all time?,0.981327,0.01226,0.000157
1,no,I cannot wait to vote!,0.000518,0.004943,0.936988


In [16]:
# One row per respondent: the sitcom question should load on 'television' only,
# the voting comment on 'politics' only.
adult_scores = result_df[result_df['over_18'] == 'yes'].iloc[0]
minor_scores = result_df[result_df['over_18'] == 'no'].iloc[0]
assert adult_scores['television'] > 0.9
assert adult_scores['film'] < 0.1
assert adult_scores['politics'] < 0.1
assert minor_scores['television'] < 0.1
assert minor_scores['film'] < 0.1
assert minor_scores['politics'] > 0.9

In [17]:
#hide
from nbdev.export import notebook2script; notebook2script()

Converted 00_causalinference.ipynb.
Converted 01_autocoder.ipynb.
Converted examples.ipynb.
Converted index.ipynb.
