In [1]:
!pip install transformers



In [2]:
# Topic name -> integer class id. Ids are assigned by position in the tuple
# below, so the order of the names is significant.
labels = {
    topic: class_id
    for class_id, topic in enumerate((
        'Animals',
        'Compliment',
        'Education',
        'Health',
        'Heavy Emotion',
        'Joke',
        'Love',
        'Politics',
        'Religion',
        'Science',
        'Self',
    ))
}

In [3]:
import torch
from torch import nn
from transformers import BertModel

class BertClassifier(nn.Module):
    """Text classifier: a pretrained BERT encoder followed by a small head.

    The head applies dropout to the encoder's pooled output, projects it with
    a linear layer to ``len(labels)`` scores (one per entry of the
    module-level ``labels`` dict) and passes those scores through a ReLU.
    """

    def __init__(self, dropout=0.5):
        """Create the encoder and the classification head.

        Args:
            dropout: Drop probability handed to ``nn.Dropout``.
        """
        super().__init__()

        # NOTE(review): a pickled instance of this class is loaded with
        # torch.load elsewhere in this notebook, so the attribute names below
        # presumably have to stay as they are - confirm before renaming.
        self.bert = BertModel.from_pretrained('bert-base-cased')
        self.dropout = nn.Dropout(dropout)
        # 768 input features: assumed to be the encoder's hidden size.
        self.linear = nn.Linear(768, len(labels))
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):
        """Return the per-class scores for a batch of tokenized inputs.

        Args:
            input_id: Token ids, forwarded to BERT as ``input_ids``.
            mask: Attention mask, forwarded to BERT as ``attention_mask``.

        Returns:
            The ReLU-activated output of the linear layer.
        """
        # return_dict=False makes the encoder return a tuple; only its second
        # element (the pooled output) feeds the head.
        encoder_outputs = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
        _, pooled = encoder_outputs

        # NOTE(review): the ReLU clamps negative class scores to zero, which
        # is unusual for a classification head. It is kept because dropping it
        # would change the outputs of weights trained with it in place.
        scores = self.linear(self.dropout(pooled))
        return self.relu(scores)

In [4]:
# Load the fine-tuned classifier from Google Drive onto the CPU. Drive must
# already be mounted at /content/drive when this cell runs.
#
# - weights_only=False: the object is used as a callable model later in the
#   notebook, i.e. a whole pickled module rather than a state_dict. PyTorch 2.6+
#   defaults to weights_only=True, which refuses to unpickle such objects.
#   WARNING: full unpickling can execute arbitrary code - only load checkpoints
#   you trust.
# - .eval(): switches dropout layers to inference behaviour so repeated
#   predictions on the same text are deterministic (eval() returns the module).
model = torch.load(
    "/content/drive/MyDrive/Colab Notebooks/mrcooper_text_classification",
    map_location=torch.device('cpu'),
    weights_only=False,
).eval()

In [5]:
# Mount Google Drive at /content/drive (uses the google.colab helper, so this
# cell only works inside Colab).
# NOTE(review): the torch.load cell earlier in this notebook already reads
# from /content/drive, so on a fresh kernel this cell has to be run first.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
import pandas as pd
import numpy as np
# Seed NumPy's global random number generator. The DataFrame.sample calls
# visible in this notebook pass their own random_state, so they do not depend
# on this seed.
np.random.seed(25)

In [7]:
# Read the labelled dataset from Drive and preview the first ten rows.
context_csv_path = "/content/drive/MyDrive/Colab Notebooks/Context.csv"
df = pd.read_csv(context_csv_path)
df.head(10)

Unnamed: 0,Text,Context/Topic
0,The eternal mystique of Goldman Sachs,Politics
1,Either you don't care enough to actually tell ...,Love
2,I am such an IDIOT.,Heavy Emotion
3,While lifting weights on Friday and doing bent...,Health
4,Something's watching me,Animals
5,Plantação de palma na Indonésia é uma dura rea...,Animals
6,The Milky Way Project: Probing Star Formation ...,Science
7,"The pinnacle of American Financial Journalism,...",Joke
8,South African variant can 'break through' Pfiz...,Health
9,"New Study Finds National Nostalgia, More Than ...",Science


In [8]:
# Shuffle every row once with a fixed seed, then cut the shuffled frame by
# position into 80% train / 10% validation / 10% test.
# Plain iloc slices are used instead of np.split: calling np.split on a
# DataFrame goes through DataFrame.swapaxes, which pandas has deprecated.
shuffled = df.sample(frac=1, random_state=5)
train_end = int(.8 * len(df))
val_end = int(.9 * len(df))

df_train = shuffled.iloc[:train_end]
df_val = shuffled.iloc[train_end:val_end]
df_test = shuffled.iloc[val_end:]

print(len(df_train),len(df_val), len(df_test))

25108 3139 3139


In [9]:
from torch import nn
from transformers import BertTokenizer
import json

# Softmax over dim 1, used to turn the model's per-class scores into
# probabilities.
softmax = nn.Softmax(dim=1)
# Tokenizer loaded from the same 'bert-base-cased' name the classifier's
# encoder uses.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

# Class names ordered by their integer id, so key_list[i] is the name of
# class i regardless of the order in which `labels` was written.
key_list = sorted(labels, key=labels.get)

def inference(text, verbose=True):
    """Classify ``text`` and return ``(predicted_label, confidence_score)``.

    Uses the module-level ``tokenizer``, ``model``, ``softmax`` and
    ``key_list``.

    Args:
        text: Statement to classify; handed to the tokenizer unchanged.
        verbose: When true (the default), pretty-print the statement, label
            and confidence as JSON before returning.

    Returns:
        A tuple of the predicted label name (looked up in ``key_list``) and
        the softmax probability of that label expressed as a percentage.
    """
    # Pad/truncate to 512 tokens and get PyTorch tensors back.
    encoded = tokenizer(text, padding='max_length', max_length=512, truncation=True,
                        return_tensors="pt")

    # Run the forward pass in eval mode (dropout off, so the same text always
    # yields the same prediction) and without autograd bookkeeping. The
    # model's previous train/eval mode is restored afterwards.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            output = model(encoded['input_ids'], encoded['attention_mask'])
    finally:
        model.train(was_training)

    # Plain Python int/float rather than 0-dim tensors for indexing and JSON.
    max_prediction = int(output.argmax(dim=1)[0])
    predicted_label = key_list[max_prediction]
    confidence_score = softmax(output)[0, max_prediction].item() * 100

    if verbose:
        data = {"statement": text, "predicted_label": predicted_label, "confidence_score": confidence_score}
        print(json.dumps(data, indent=2))

    return predicted_label, confidence_score

In [10]:
# Smoke-test the classifier on one hand-written passage. inference() prints
# the result as JSON and its return value is shown as the cell output.
text = "Bob accepts the challenge, and the next week they're standing in Saint Peters square. 'This isnt gonna work, he's never going to see me here when theres this much people. You stay here, I'll go talk to him and you'll see me on the balcony, the guards know me too.' Half an hour later, Bob and the pope appear side by side on the balcony. Bobs boss gets a heart attack, and Bob goes to visit him in the hospital. "
inference(text)

{
  "statement": "Bob accepts the challenge, and the next week they're standing in Saint Peters square. 'This isnt gonna work, he's never going to see me here when theres this much people. You stay here, I'll go talk to him and you'll see me on the balcony, the guards know me too.' Half an hour later, Bob and the pope appear side by side on the balcony. Bobs boss gets a heart attack, and Bob goes to visit him in the hospital. ",
  "predicted_label": "Joke",
  "confidence_score": 94.98064517974854
}


('Joke', 94.98064517974854)

In [11]:
# Classify a small, reproducible sample (0.2%) of the test split and show the
# predictions next to the true "Context/Topic" column.
inference_df = df_test.sample(frac=0.002, random_state=5)

# inference() returns (label, confidence) per row. Building a two-column frame
# on the sample's index also works when the sample is empty, whereas
# unpacking zip(*...) into two names raises ValueError in that case.
predictions = pd.DataFrame(
    inference_df["Text"].apply(inference).tolist(),
    columns=["predicted_label", "confidence_score"],
    index=inference_df.index,
)
inference_df = inference_df.join(predictions)
inference_df

{
  "statement": "I'm an ex foreclosed home destroyer, .",
  "predicted_label": "Self",
  "confidence_score": 89.85374569892883
}
{
  "statement": "Bitter and broken to him",
  "predicted_label": "Love",
  "confidence_score": 57.71729350090027
}
{
  "statement": "Looking for feedback on a a new student-owned digital portfolio",
  "predicted_label": "Education",
  "confidence_score": 83.20160508155823
}
{
  "statement": "Importance of Data Analytics Certification - Imarticus Learning",
  "predicted_label": "Education",
  "confidence_score": 71.23622298240662
}
{
  "statement": "My main goal in life is to lessen the stigma that comes from having bipolar disorder and other mental illnesses. I struggle with panic attacks and PTSD as well. I work for a Grant funded organization where I am able to assist many students with disabilities in their struggles, the struggles I too, have faced. Please feel free to ask me anything even if it may seem slightly offensive. I will answer unless you are 

Unnamed: 0,Text,Context/Topic,predicted_label,confidence_score
9199,"I'm an ex foreclosed home destroyer, .",Self,Self,89.853746
23577,Bitter and broken to him,Love,Love,57.717294
1492,Looking for feedback on a a new student-owned ...,Education,Education,83.201605
13249,Importance of Data Analytics Certification - I...,Education,Education,71.236223
2107,My main goal in life is to lessen the stigma t...,Self,Self,70.05384
27511,"In U.S., 84% Accept Trump as Legitimate President",Politics,Politics,96.4212
