In [None]:
!pip install torch==1.10 transformers==4.15.0 pandas==1.1.5 numpy==1.19.5 tqdm==4.62.3
# Python 3.7.12

Collecting transformers
  Downloading transformers-4.15.0-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 5.2 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 52.0 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 5.6 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 50.1 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 42.7 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  A

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
np.random.seed(25)

In [None]:
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Context.csv")
df.head(10)

Unnamed: 0,Text,Context/Topic
0,The eternal mystique of Goldman Sachs,Politics
1,Either you don't care enough to actually tell ...,Love
2,I am such an IDIOT.,Heavy Emotion
3,While lifting weights on Friday and doing bent...,Health
4,Something's watching me,Animals
5,Plantação de palma na Indonésia é uma dura rea...,Animals
6,The Milky Way Project: Probing Star Formation ...,Science
7,"The pinnacle of American Financial Journalism,...",Joke
8,South African variant can 'break through' Pfiz...,Health
9,"New Study Finds National Nostalgia, More Than ...",Science


In [None]:
df["Context/Topic"].unique()

array(['Politics', 'Love', 'Heavy Emotion', 'Health', 'Animals',
       'Science', 'Joke', 'Compliment', 'Religion', 'Self', 'Education'],
      dtype=object)

In [None]:
df["Text"].isnull().unique()

array([False])

In [None]:
df["Context/Topic"].value_counts()

Heavy Emotion    3674
Religion         3466
Love             3229
Self             3105
Compliment       3061
Animals          2622
Health           2595
Education        2534
Joke             2476
Science          2428
Politics         2196
Name: Context/Topic, dtype: int64

In [None]:
max_length = max(df["Text"].apply(lambda x: len(x.split())).values.tolist())
max_length

1132

In [None]:
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()
label_encoder.fit(df['Context/Topic'])

LabelEncoder()

In [None]:
label_encoder.classes_
print(list(label_encoder.classes_))
print(list(label_encoder.transform(list(label_encoder.classes_))))

['Animals', 'Compliment', 'Education', 'Health', 'Heavy Emotion', 'Joke', 'Love', 'Politics', 'Religion', 'Science', 'Self']
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]


In [None]:
labels = {name: no for no, name in enumerate(label_encoder.classes_)}
labels

{'Animals': 0,
 'Compliment': 1,
 'Education': 2,
 'Health': 3,
 'Heavy Emotion': 4,
 'Joke': 5,
 'Love': 6,
 'Politics': 7,
 'Religion': 8,
 'Science': 9,
 'Self': 10}

In [None]:
import torch
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

class Dataset(torch.utils.data.Dataset):

    def __init__(self, df):

        self.labels = [labels[label] for label in df['Context/Topic']]
        self.texts = [tokenizer(text, 
                        padding='max_length', max_length = 512, truncation=True,
                        return_tensors="pt") for text in df['Text']]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        return self.texts[idx]

    def __getitem__(self, idx):

        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
df_train, df_val, df_test = np.split(df.sample(frac=1, random_state=5),
                                     [int(.8*len(df)), int(.9*len(df))])
print(len(df_train),len(df_val), len(df_test))

25108 3139 3139


In [None]:
from torch import nn
from transformers import BertModel

class BertClassifier(nn.Module):

    def __init__(self, dropout=0.5):

        super(BertClassifier, self).__init__()

        self.bert = BertModel.from_pretrained('bert-base-cased')
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, len(labels))
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):

        _, pooled_output = self.bert(input_ids= input_id, attention_mask=mask,return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        final_layer = self.relu(linear_output)

        return final_layer

In [None]:
from torch.optim import Adam
from tqdm import tqdm

def train(model, train_data, val_data, learning_rate, epochs):

    train, val = Dataset(train_data), Dataset(val_data)

    train_dataloader = torch.utils.data.DataLoader(train, batch_size=16, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val, batch_size=16)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr= learning_rate)

    if use_cuda:

            model = model.cuda()
            criterion = criterion.cuda()

    for epoch_num in range(epochs):

            total_acc_train = 0
            total_loss_train = 0

            for train_input, train_label in tqdm(train_dataloader):

                train_label = train_label.to(device)
                mask = train_input['attention_mask'].to(device)
                input_id = train_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)
                
                batch_loss = criterion(output, train_label)
                total_loss_train += batch_loss.item()
                
                acc = (output.argmax(dim=1) == train_label).sum().item()
                total_acc_train += acc

                model.zero_grad()
                batch_loss.backward()
                optimizer.step()
            
            total_acc_val = 0
            total_loss_val = 0

            with torch.no_grad():

                for val_input, val_label in val_dataloader:

                    val_label = val_label.to(device)
                    mask = val_input['attention_mask'].to(device)
                    input_id = val_input['input_ids'].squeeze(1).to(device)

                    output = model(input_id, mask)

                    batch_loss = criterion(output, val_label)
                    total_loss_val += batch_loss.item()
                    
                    acc = (output.argmax(dim=1) == val_label).sum().item()
                    total_acc_val += acc
            
            print(
                f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} \
                | Train Accuracy: {total_acc_train / len(train_data): .3f} \
                | Val Loss: {total_loss_val / len(val_data): .3f} \
                | Val Accuracy: {total_acc_val / len(val_data): .3f}')
                  
EPOCHS = 5
model = BertClassifier()
LR = 1e-6
              
train(model, df_train, df_val, LR, EPOCHS)

Downloading:   0%|          | 0.00/416M [00:00<?, ?B/s]

100%|██████████| 1570/1570 [38:28<00:00,  1.47s/it]


Epochs: 1 | Train Loss:  0.139                 | Train Accuracy:  0.248                 | Val Loss:  0.115                 | Val Accuracy:  0.469


100%|██████████| 1570/1570 [38:34<00:00,  1.47s/it]


Epochs: 2 | Train Loss:  0.092                 | Train Accuracy:  0.599                 | Val Loss:  0.078                 | Val Accuracy:  0.651


100%|██████████| 1570/1570 [38:35<00:00,  1.48s/it]


Epochs: 3 | Train Loss:  0.068                 | Train Accuracy:  0.695                 | Val Loss:  0.066                 | Val Accuracy:  0.686


100%|██████████| 1570/1570 [38:33<00:00,  1.47s/it]


Epochs: 4 | Train Loss:  0.057                 | Train Accuracy:  0.737                 | Val Loss:  0.061                 | Val Accuracy:  0.699


100%|██████████| 1570/1570 [38:35<00:00,  1.47s/it]


Epochs: 5 | Train Loss:  0.050                 | Train Accuracy:  0.766                 | Val Loss:  0.059                 | Val Accuracy:  0.710


In [None]:
def evaluate(model, test_data):

    test = Dataset(test_data)

    test_dataloader = torch.utils.data.DataLoader(test, batch_size=16)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:

        model = model.cuda()

    total_acc_test = 0
    with torch.no_grad():

        for test_input, test_label in test_dataloader:

              test_label = test_label.to(device)
              mask = test_input['attention_mask'].to(device)
              input_id = test_input['input_ids'].squeeze(1).to(device)

              output = model(input_id, mask)

              acc = (output.argmax(dim=1) == test_label).sum().item()
              total_acc_test += acc
    
    print(f'Test Accuracy: {total_acc_test / len(test_data): .3f}')

In [None]:
evaluate(model, df_test)

Test Accuracy:  0.726


In [None]:
torch.save(model, "/content/drive/MyDrive/Colab Notebooks/mrcooper_text_classification")

In [None]:
!python -V

Python 3.7.12
