In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import transformers
from transformers import AutoModel, BertTokenizerFast

# Seed the RNGs this notebook draws from so that classifier-head weight
# initialisation, dropout masks and training-batch shuffling are repeatable
# across "Restart Kernel -> Run All".
SEED = 42  # same value the train/validation/test split passes as random_state
np.random.seed(SEED)
torch.manual_seed(SEED)

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else 'cpu')

In [None]:
# Load the labelled text dataset from the working directory and preview the
# first rows (the columns used later are `label` and `text`).
DATA_FILE = "spamdata_v2.csv"
df = pd.read_csv(DATA_FILE)
df.head()

Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Relative frequency of each class in the label column (fractions, not counts).
df["label"].value_counts(normalize=True)

label
0    0.865937
1    0.134063
Name: proportion, dtype: float64

In [None]:
# Two-stage stratified split: 70 % of the rows go to training, then the
# held-out 30 % is halved into validation and test. Stratifying on the label
# keeps the class ratio the same in every subset.
all_texts, all_labels = df['text'], df['label']

X_train, X_temp, y_train, y_temp = train_test_split(
    all_texts,
    all_labels,
    test_size=0.3,
    stratify=all_labels,
    random_state=42,
)

X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    stratify=y_temp,
    random_state=42,
)

In [None]:
# Load the pretrained encoder and the fast tokenizer from the same checkpoint
# name so both refer to the same vocabulary.
PRETRAINED_NAME = 'bert-base-uncased'
bert = AutoModel.from_pretrained(PRETRAINED_NAME)
tokenizer = BertTokenizerFast.from_pretrained(PRETRAINED_NAME)

In [None]:
MAX_SEQ_LEN = 25  # every text is padded or truncated to exactly this many token ids


def encode_texts(texts):
    """Tokenize a pandas Series of strings into fixed-length model inputs.

    Args:
        texts: Series of raw strings (one of the X_train / X_val / X_test splits).

    Returns:
        The tokenizer's batch encoding; the notebook reads its ``'input_ids'``
        and ``'attention_mask'`` entries, each a list of ``MAX_SEQ_LEN``-long
        integer lists.
    """
    return tokenizer(
        texts.tolist(),
        max_length=MAX_SEQ_LEN,
        # `pad_to_max_length=True` is deprecated; this is its documented replacement.
        padding='max_length',
        truncation=True,
    )


# Same settings for every split, so the three encodings are directly comparable.
tokens_train = encode_texts(X_train)
tokens_val = encode_texts(X_val)
tokens_test = encode_texts(X_test)



In [None]:
def _as_tensors(encoded, split_labels):
    """Return (input_ids, attention_mask, labels) tensors for one data split."""
    return (
        torch.tensor(encoded['input_ids']),
        torch.tensor(encoded['attention_mask']),
        torch.tensor(split_labels.tolist()),
    )


train_seq, train_mask, train_y = _as_tensors(tokens_train, y_train)
# Quick visual check of the padded ids and the matching attention mask.
print(train_seq)
print(train_mask)

val_seq, val_mask, val_y = _as_tensors(tokens_val, y_val)

test_seq, test_mask, test_y = _as_tensors(tokens_test, y_test)

tensor([[  101,  3125,   999,  ...,  1037,  3413,   102],
        [  101,  1045,  2123,  ...,     0,     0,     0],
        [  101,  9779,  2232,  ...,     0,     0,     0],
        ...,
        [  101,  2469,  1010,  ...,  1998,  3227,   102],
        [  101,  2498,  2021,  ...,  2253, 11047,   102],
        [  101,  7087,  1012,  ...,  2061,  1045,   102]])
tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]])


In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

batch_size = 32

# Training split: bundle ids, masks and labels, and draw batches in random order.
train_data = TensorDataset(train_seq, train_mask, train_y)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(
    dataset=train_data,
    batch_size=batch_size,
    sampler=train_sampler,
)

# Validation split: same bundling, but iterated in its stored order.
val_data = TensorDataset(val_seq, val_mask, val_y)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(
    dataset=val_data,
    batch_size=batch_size,
    sampler=val_sampler,
)

In [None]:
# Freeze the pretrained encoder: with gradients switched off on every encoder
# weight, only layers added on top of it can be updated during training.
for param in bert.parameters():
    param.requires_grad_(False)

In [None]:
class BERT_Arch(nn.Module):
    """Small classification head stacked on a BERT-style encoder.

    The second element returned by the encoder is passed through
    Linear -> ReLU -> Dropout -> Linear -> LogSoftmax, so ``forward`` returns
    per-class log-probabilities (the input ``nn.NLLLoss`` expects).

    Args:
        bert: Encoder module. It is called as
            ``bert(sentence_id, attention_mask=mask, return_dict=False)`` and
            must return a 2-tuple whose second element has shape
            ``(batch, hidden_size)``.
        hidden_size: Width of that second encoder output. Defaults to 768,
            the value that was previously hard-coded.
        fc_size: Width of the intermediate fully connected layer (default 512).
        num_classes: Number of output classes (default 2).
        dropout: Dropout probability applied after the ReLU (default 0.1).
    """

    def __init__(self, bert, hidden_size=768, fc_size=512, num_classes=2, dropout=0.1):
        super().__init__()

        self.bert = bert
        self.dropout = nn.Dropout(dropout)
        self.relu = nn.ReLU()
        # linear1 is created before linear2, same as before, so seeded
        # initialisation yields the same weights for the default sizes.
        self.linear1 = nn.Linear(hidden_size, fc_size)
        self.linear2 = nn.Linear(fc_size, num_classes)
        # dim=1 normalises across the class axis of a (batch, num_classes) tensor.
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, sentence_id, mask):
        """Return log-probabilities of shape ``(batch, num_classes)``.

        Args:
            sentence_id: Token ids forwarded to the encoder as its first argument.
            mask: Attention mask forwarded to the encoder as ``attention_mask``.
        """
        # With return_dict=False the encoder returns a tuple; the first item is
        # ignored and the second one feeds the classifier.
        _, cls_hidden_state = self.bert(sentence_id, attention_mask=mask, return_dict=False)

        x = self.linear1(cls_hidden_state)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.linear2(x)
        x = self.softmax(x)

        return x

In [None]:
# Wrap the encoder with the classifier head and place the whole network on
# the selected device in one step.
model = BERT_Arch(bert).to(device)

In [None]:
# The AdamW class shipped with `transformers` is deprecated in favour of the
# PyTorch implementation (NOTE(review): newer transformers releases no longer
# export it at all -- confirm against the installed version).
# eps and weight_decay are pinned to the defaults of the transformers class
# (1e-6 and 0.0); torch's own defaults are 1e-8 and 0.01, which would
# otherwise silently change the update rule.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)



In [None]:
from sklearn.utils.class_weight import compute_class_weight

# 'balanced' weighting gives the rarer class a proportionally larger weight;
# the classes are taken from the labels present in the training split.
label_values = np.unique(y_train)
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=label_values,
    y=y_train,
)
print(class_weights)

Class Weights: [0.57743559 3.72848948]


In [None]:
# Class weights as a float tensor on the same device as the model.
weights = torch.tensor(class_weights, dtype=torch.float).to(device)

# The model's last layer is LogSoftmax, so negative log-likelihood is the
# matching loss; the per-class weights scale each sample's contribution.
cross_entropy = nn.NLLLoss(weight=weights)

# Number of passes over the training data (reassigned in the training-loop
# cell before it is used).
epochs = 10

In [None]:
def train():
    """Run one training epoch over ``train_dataloader``.

    Relies on the notebook-level ``model``, ``optimizer``, ``cross_entropy``,
    ``device`` and ``train_dataloader``.

    Returns:
        tuple: ``(avg_loss, total_preds)`` where ``avg_loss`` is the mean of
        the per-batch losses and ``total_preds`` is a NumPy array holding the
        model output for every sample seen in the epoch, shape
        ``(number of samples, number of classes)``, in the order the batches
        were drawn.
    """
    model.train()
    total_loss = 0

    # one array of model outputs per batch
    total_preds = []

    for step, batch in enumerate(train_dataloader):

        # progress line every 50 batches (skipping step 0)
        if step % 50 == 0 and not step == 0:
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(train_dataloader)))

        # move the batch to the training device and unpack it
        sentence_id, mask, labels = [r.to(device) for r in batch]

        # clear gradients left over from the previous step
        model.zero_grad()

        preds = model(sentence_id, mask)
        loss = cross_entropy(preds, labels)
        total_loss = total_loss + loss.item()

        loss.backward()

        # clip the gradient norm to 1.0 to guard against exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # update parameters
        optimizer.step()

        # Collect this batch's outputs inside the loop. Previously the append
        # ran once after the loop, so only the final batch was returned.
        total_preds.append(preds.detach().cpu().numpy())

    avg_loss = total_loss / len(train_dataloader)

    # list of (batch size, number of classes) arrays
    # -> one (number of samples, number of classes) array
    total_preds = np.concatenate(total_preds, axis=0)

    return avg_loss, total_preds

In [None]:
def evaluate():
    """Compute loss and predictions over ``val_dataloader`` without training.

    Relies on the notebook-level ``model``, ``cross_entropy``, ``device`` and
    ``val_dataloader``. Leaves the model in eval mode.

    Returns:
        tuple: ``(avg_loss, total_preds)`` where ``avg_loss`` is the mean of
        the per-batch losses and ``total_preds`` is a NumPy array of model
        outputs with shape ``(number of samples, number of classes)``.
    """
    print("\nEvaluating...")

    # eval mode deactivates the dropout layer
    model.eval()

    total_loss = 0
    total_preds = []

    for step, batch in enumerate(val_dataloader):

        # Progress line every 50 batches. The earlier version also computed an
        # elapsed time from `format_time`, `time` and `t0`, none of which are
        # defined in this notebook, so it raised NameError on reaching batch 50.
        if step % 50 == 0 and not step == 0:
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(val_dataloader)))

        sentence_id, mask, labels = [t.to(device) for t in batch]

        # no gradients are needed for evaluation
        with torch.no_grad():
            preds = model(sentence_id, mask)

            loss = cross_entropy(preds, labels)
            total_loss = total_loss + loss.item()
            total_preds.append(preds.detach().cpu().numpy())

    avg_loss = total_loss / len(val_dataloader)

    total_preds = np.concatenate(total_preds, axis=0)

    return avg_loss, total_preds

In [None]:
# Lowest validation loss seen so far; only the (currently disabled)
# checkpointing snippet inside the loop would read or update it.
best_valid_loss = float('inf')

# Overrides any earlier `epochs` value: this cell runs a single epoch.
epochs = 1

# per-epoch history of training and validation loss
train_losses, valid_losses = [], []

for epoch in range(epochs):

    print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))

    # Both helpers also return predictions; only the losses are kept here.
    train_loss, _ = train()
    valid_loss, _ = evaluate()

    # Checkpointing on improvement is switched off; re-enable with:
    # if valid_loss < best_valid_loss:
    #     best_valid_loss = valid_loss
    #     torch.save(model.state_dict(), 'saved_weights.pt')

    print(f'\nTraining Loss: {train_loss:.3f}')
    print(f'Validation Loss: {valid_loss:.3f}')

    train_losses.append(train_loss)
    valid_losses.append(valid_loss)


 Epoch 1 / 1
  Batch    50  of    122.
  Batch   100  of    122.

Evaluating...

Training Loss: 0.684
Validation Loss: 0.664


In [None]:
# Switch to inference mode explicitly so dropout is disabled for the test
# pass no matter which cell last touched the model (train() leaves it in
# training mode).
model.eval()

# Score the whole test split in one forward pass, without tracking gradients.
with torch.no_grad():
    test_log_probs = model(test_seq.to(device), test_mask.to(device))

# Predicted class = index of the largest log-probability in each row.
preds = np.argmax(test_log_probs.detach().cpu().numpy(), axis=1)
print(classification_report(test_y, preds))

              precision    recall  f1-score   support

           0       0.90      0.98      0.94       435
           1       0.68      0.28      0.40        67

    accuracy                           0.89       502
   macro avg       0.79      0.63      0.67       502
weighted avg       0.87      0.89      0.87       502

