## Imports

In [504]:
import os
import zipfile

import time
import datetime
import re
import contractions
import pandas as pd
import numpy as np
from typing import Union, Tuple

# natural language processing
import nltk
from nltk.corpus import words
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

from textblob import TextBlob  # imported to correct text

# machine learning
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import f1_score

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, random_split, RandomSampler, SequentialSampler

import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import AdamW
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.data import Dataset

# using version 4.20.1
from transformers import DebertaTokenizer
from transformers import DebertaForSequenceClassification
from transformers import get_linear_schedule_with_warmup

## Extract Data

In [4]:
# get zipped file name
file = "nlp-getting-started.zip"

# check if file for data exists and create if does not
os.makedirs("data", exist_ok=True)

# unzip file and save to 'data' folder
with zipfile.ZipFile(file, "r") as zip_ref:
    zip_ref.extractall("data")

## Analyze Data

In [None]:
# get tweets with disaster
train_disaster_tweets = train[train.target==1].text

# get tweets without disaster
train_no_disaster_tweets = train[train.target==0].text

# check if url is associated with disaster
print(f"Proportion of tweets associated with disaster with url: {100 * sum('http://' in t for t in train_disaster_tweets)/len(train_disaster_tweets)}%")
print(f"Proportion of tweets not associated with disaster with url: {100 * sum('http://' in t for t in train_no_disaster_tweets)/len(train_no_disaster_tweets)}%\n")

# check if mentions is associated with disaster
print(f"Proportion of tweets associated with disaster with @: {100 * sum('@' in t for t in train_disaster_tweets)/len(train_disaster_tweets)}%")
print(f"Proportion of tweets not associated with disaster with @: {100 * sum('@' in t for t in train_no_disaster_tweets)/len(train_no_disaster_tweets)}%")

## Preprocess and encode text

In [589]:
# preprocess function
def preprocess_text(doc):
    
    preprocessed_doc = []
    stopwords = nltk.corpus.stopwords.words("english")
    
    for text in doc:
        
        # make lowercase
        text = text.lower()
        
        # remove urls
        # text = tf.strings.regex_replace(text, "(?:https?)?:\/\/t.co\/\w*", " ")
        text = re.sub(r"(?:https?)?:\/\/t.co\/\w*", " ", text)
        
        # remove mentions
        # text = tf.strings.regex_replace(text, "@\w+", " ")
        text = re.sub(r"@\w+", " ", text)
        
        # remove emoji pattern
        emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
        
        text = emoji_pattern.sub(r'', text)
        
        # correct typos
        # text = TextBlob(text).correct().string
        
        # remove contractions
        word_list = [contractions.fix(word) for word in text.split()]
        
        # remove non-alphabetical characters
        word_list = [word for word in word_list if word.isalnum()]
        
        # remove stop words
        word_list = [word for word in word_list if word not in stopwords]
        
        # join to single string
        cleaned_tweet = " ".join(word_list)
        
        # reappend to preprocessed doc
        preprocessed_doc.append(cleaned_tweet)
    
    return preprocessed_doc


def encode_text(texts, tokenizer, max_len=512):
    
    # list of ids and attention masks
    encoded_list = []
    
    for text in texts:
                
        # use tokenizer to encode text
        encoded_text = tokenizer(
            text = text,
            max_length = max_len,
            padding = "max_length",
            truncation = True,
            return_attention_mask=True,
        )
        
        # extend id list
        encoded_list.append(encoded_text["input_ids"] + encoded_text["attention_mask"])
        
    return torch.tensor(encoded_list)

## Create Model

In [543]:
# create model
class Deberta_layer(nn.Module):  # inherits from torch.nn.Module
    def __init__(self):
        super(Deberta_layer, self).__init__()
        self.deberta = DebertaForSequenceClassification.from_pretrained(
            "microsoft/deberta-base",     # base model
            num_labels = 2,               # number of outputs
            output_attentions = False,    # returns attention weights of all layers
            output_hidden_states = False  # returns hidden states of all layers
        )
        
    def forward(
        self,
        input_ids: torch.Tensor,
        attention_masks: torch.Tensor,
        target: Union[torch.FloatTensor, None]
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        
        # if there is a target then return loss and prediction
        if target != None:
            output = self.deberta(
                input_ids=input_ids,
                token_type_ids=None,
                attention_mask=attention_masks,
                labels=target,
                return_dict=None
            )
            
            return output["loss"], output["logits"]
        
        else:
            output = self.deberta(
                intput_ids=input_ids,
                token_type_ids=None,
                attention_mask=attention_masks,
                labels=None,
                return_dict=None
            )
            
            return output["logits"]
    

## Train model

In [391]:
# model parameters
lr = 3e-4
betas = (0.9,0.98)
eps = 1e-8
n_splits = 5
test_size = 0.2
random_state = 42
n_epochs = 10
batch_size = 16
max_len = 512

# train model on training strata
def train_model_strata(model, optimizer, input_ids, attention_masks):
    
    return

# test model on testing strata
def test_model_strata(model, optimizer, input_ids, attention_masks):
    
    return 
    
# make a trained model
def make_trained_model(lr, test_size, n_splits, n_epochs, batch_size, max_len, betas, eps, random_state):
    
    # read in data
    train_data = pd.read_csv("./data/train.csv")
    test_data = pd.read_csv("./data/test.csv")
    
    # split training data into text and targets
    text = train_data.text.values.tolist()
    y = torch.tensor(train_data.target)
    
    # shrink data
    text = text[:100]
    y = y[:100]
    
    # preprocess text
#     preprocessed_text = preprocess_text(text)
    preprocessed_text = text
    
    # get deberta tokenizer
    tokenizer = DebertaTokenizer.from_pretrained("microsoft/deberta-base")
    
    # encode text
    X = encode_text(preprocessed_text, tokenizer, max_len)
    
    # check if cuda is available else use cpu
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # create an instance of deberta model
    model = Deberta_layer().to(device)

    # add adamW optimizer
    optimizer = torch.optim.AdamW(
        params=model.parameters(),
        lr=lr,
        betas=betas,
        eps=eps
    )
    
    # get stratified splitter
    stratified_split = StratifiedShuffleSplit(
        n_splits=n_splits,
        test_size=test_size,
        random_state=random_state
    )
    
    # set best averaged validation loss and f1 score
    best_averaged_val_accuracy = 0
    best_averaged_val_loss = 1e6
    best_averaged_val_f1_score = 0
    
    # iterate over strata
    for strata, (train_index, test_index) in enumerate(stratified_split.split(X, y)):
        
        # split data
        X_train = X[train_index,:]
        X_test = X[test_index,:]
        
        # split targets
        y_train = y[train_index]
        y_test = y[test_index]
        
        # reshape targets to make dataloader
        y_train_reshaped = y_train.reshape((y_train.shape[0],1))
        y_test_reshaped = y_test.reshape((y_test.shape[0], 1))
        
        # make training dataloader
        train_dataloader = DataLoader(
            list(zip(X_train, y_train_reshaped)),
            batch_size = batch_size,
            shuffle = True,
#             sampler=,
#             batch_sampler = ,
            num_workers = 0
        )
        
        # get linear learning rate scheduler
        scheduler = get_linear_schedule_with_warmup(
            optimizer = optimizer,
            num_warmup_steps = 0,
            num_training_steps = len(train_dataloader) * n_epochs
        )
        
        # indicate training
        model.train()
                
        # train model over strata
        for epoch in range(n_epochs):
            
            # time each epoch
            t0 = time.time()
            
            # track loss and f1
            total_train_loss = 0
            total_f1_score = 0
            
            # batch number
            batch = 0
            
            for X_batch, y_batch in train_dataloader:
                
                # print batch number
                print(f"Batch {batch+1}")
                batch += 1
                
                # zero gradient
                optimizer.zero_grad()
                
                # reshape data and targets for model
                tuple_ids = X_batch[:,:max_len]
                attention_masks = X_batch[:,max_len:]
                labels = y_batch.flatten().to(float)
                
                # add to device
                tuple_ids = tuple_ids.to(device)
                attention_masks = attention_masks.to(device)
                labels = labels.to(device)
                
                # get loss value and prediction
                loss, logits = model(tuple_ids, attention_masks, labels)
                
                # # print
                # print(f"length: {len(train_dataloader)}")
                # print(tuple_ids)
                # print(attention_masks)
                # print(labels)
                # print(logits)
                
                # add train loss
                total_train_loss += loss
                
                # get prediction
                y_batch_pred = (logits.flatten() < 0).to(float)
                
                # detach computational graph, copy to cpu, make numpy array
                y_batch_pred = y_batch_pred.detach().cpu().numpy()
                labels = labels.detach().cpu().numpy()                
                
                # calculate weighted f1 score of prediction
                total_f1_score += f1_score(labels, y_batch_pred, average="weighted")
                
                # accumulate gradient
                loss.backward()
                
                # update parameters
                optimizer.step()
                
                # update learning rate
                scheduler.step()
            
            # gather data
            average_train_loss = total_train_loss / len(train_dataloader)
            average_f1_score = total_f1_score / len(train_dataloader)
             
            # print results
            print(f"Epoch: {epoch + 1}/{n_epochs}")   
            print(f"Averaged train loss: {average_train_loss}")
            print(f"Averaged f1 score: {average_f1_score}")
            print(f"Training time: {datetime.timedelta(seconds = time.time()-t0)}\n")
            
            
            
            
            # indicate testing
            model.eval()
            
            # make testing dataloader
            test_dataloader = DataLoader(
                list(zip(X_test, y_test_reshaped)),
                batch_size = batch_size,
                shuffle = False,
    #             sampler=,
    #             batch_sampler =,
                num_workers = 0
            )
            
            # track validation loss and f1
            total_val_train_loss = 0
            total_val_f1_score = 0
            
            # batch number
            batch = 0
            
            # disable gradient computation and reduce memory consumption
            with torch.no_grad():
                
                for _, (X_val, y_val) in enumerate(test_dataloader):
                                        
                    # print batch number
                    print(f"Batch {batch + 1}")
                    batch += 1
                    
                    # print length
                    print(len(test_dataloader))
                    
                    # reshape data and targets for model
                    val_tuple_ids = X_val[:,:max_len]
                    val_attention_masks = X_val[:,max_len:]
                    val_labels = y_val.flatten().to(float)

                    # add to device
                    val_tuple_ids = val_tuple_ids.to(device)
                    val_attention_masks = val_attention_masks.to(device)
                    val_labels = val_labels.to(device)

                    # get loss value and prediction
                    val_loss, val_logits = model(val_tuple_ids, val_attention_masks, val_labels)
                    
                    # add train loss
                    total_val_train_loss += val_loss

                    # get prediction
                    y_val_batch_pred = (val_logits.flatten() < 0).to(float)
                
                
                    # detach computational graph, copy to cpu, make numpy array
                    y_val_batch_pred = y_val_batch_pred.detach().cpu().numpy()
                    val_labels = val_labels.detach().cpu().numpy()                

                    # calculate weighted f1 score of prediction
                    total_val_f1_score += f1_score(val_labels, y_val_batch_pred, average="weighted")
                    
                # gather validation data
                average_val_train_loss = total_val_train_loss / len(test_dataloader)
                average_val_f1_score = total_val_f1_score / len(test_dataloader)

                # print results
                print(f"Averaged validation loss: {average_val_train_loss}")
                print(f"Averaged validation f1 score: {average_val_f1_score}")
                
            
            
                # track best performance and save the model's state
                if average_val_f1_score > best_averaged_val_f1_score:

                    # update best scores
                    best_averaged_val_loss = average_val_train_loss
                    best_averaged_val_f1_score = average_val_f1_score

                    # check if file for data exists and create if does not
                    os.makedirs("model", exist_ok=True)

                    # save path
                    model_path = os.path.join("model", "model.pth")

                    # save model
                    torch.save(model.state_dict(), model_path)
            
            
            
            
            
                

In [212]:
make_trained_model(lr, test_size, n_splits, n_epochs, batch_size, max_len, betas, eps, random_state)

Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch 1
Batch 2
Batch 3
Batch 4
Batch 5
Epoch: 1/10
Averaged train loss: 0.7806434039366976
Averaged f1 score: 0.26499247129681913
Training time: 0:15:45.164669

Batch 1
2
Batch 2
2
Averaged validation loss: 0.3874912503958187
Averaged validation f1 score: 0.2525
Batch 1
Batch 2
Batch 3
Batch 4
Batch 5
Epoch: 2/10
Averaged train loss: 0.271328942024449
Averaged f1 score: 0.3248181818181818
Training time: 0:08:54.063920

Batch 1
2
Batch 2
2
Averaged validation loss: 0.24234015106559903
Averaged validation f1 score: 0.2525
Batch 1
Batch 2
Batch 3


KeyboardInterrupt: 

In [238]:
# from torch.utils.data import TensorDataset

tokenizer = DebertaTokenizer.from_pretrained("microsoft/deberta-base")
# train = pd.read_csv("./data/train.csv")
# etext = encode_text(train.text, tokenizer, 50)
# a = np.reshape(np.array(train.target), (train.target.size,-1))
# b = np.hstack((etext, a))



# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model = DebertaForSequenceClassification.from_pretrained(
#             "microsoft/deberta-base",     # base model
#             num_labels = 1,               # number of outputs
#             output_attentions = False,    # returns attention weights of all layers
#             output_hidden_states = False  # returns hidden states of all layers
#         )
# model.to(device)
t = tokenizer(train.text[0])
input_ids = torch.tensor(np.array(t["input_ids"]))
att_mask = torch.tensor(np.array(t["attention_mask"]))
labels=torch.FloatTensor(np.array(train.target[0]))

input_ids = input_ids.reshape(input_ids.shape[0],1)
att_mask = att_mask.reshape(att_mask.shape[0],1)
labels = labels.reshape(labels.shape,1)
# model(input_ids, token_type_ids=None, attention_mask=att_mask, labels=labels)





device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Deberta_layer().to(device)
model(input_ids, att_mask, labels)


Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['pooler.dense.bias', 'classifi

(tensor(8.1070e+25, grad_fn=<MseLossBackward0>),
 tensor([ 0.2575, -0.0540,  0.0064, -0.2324, -0.1263, -0.0420, -0.0859,  0.0361,
         -0.2909, -0.0551, -0.0938, -0.1085, -0.2714, -0.0300, -0.0789, -0.0170,
         -0.2067, -0.0134, -0.1068, -0.0769,  0.2575], grad_fn=<ViewBackward0>))

In [548]:
from torch.utils.data import TensorDataset

tokenizer = DebertaTokenizer.from_pretrained("microsoft/deberta-base")
train = pd.read_csv("./data/train.csv")
t = encode_text(train.text[:1], tokenizer, 50)
labels = train.target[:1]



input_ids = torch.tensor(t[:,:50])
att_mask = torch.tensor(t[:,50:])
labels = torch.tensor(labels).to(float).reshape(labels.shape, 1)
print(input_ids)
print(att_mask)
print(labels)




# input_ids = input_ids.reshape(input_ids.shape[0], 1)
# att_mask = att_mask.reshape(att_mask.shape[0], 1)
labels = labels.reshape(labels.shape[0], 1)





device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Deberta_layer().to(device)
print(model(input_ids, att_mask, labels))


# input_ids = []
# att_masks = []
# for text in train.text[:2]:
    

new_model = DebertaForSequenceClassification.from_pretrained(
            "microsoft/deberta-base",     # base model
            num_labels = 1,               # number of outputs
            output_attentions = False,    # returns attention weights of all layers
            output_hidden_states = False  # returns hidden states of all layers
        )

new_input_ids = torch.tensor(t[0,:50]).reshape(1,50)
new_masks = torch.tensor(t[0,50:]).reshape(1,50)
new_label = train.target[:1]
new_label = torch.tensor(new_label).to(float).reshape(new_label.shape, 1)

print(new_input_ids)
print(new_masks)
print(new_label)


print(new_model(new_input_ids, token_type_ids=None, attention_mask=new_masks, labels=new_label))
print("\n")

texts = train.text.values.tolist()

other_input_ids = []
other_masks = []
for text in texts:
    encoded_text = tokenizer.encode_plus(
        text=text,
        add_special_tokens=True,
        max_length=10,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        return_tensors="pt"
    )
    
    other_input_ids.append(encoded_text.get("input_ids"))
    other_masks.append(encoded_text.get("attention_mask"))

other_input_ids = torch.cat(other_input_ids, dim=0)
other_masks = torch.cat(other_masks, dim=0)

other_label = torch.tensor(train.target.values.tolist())

print(other_input_ids)
print(other_masks)
print(other_label)

dataset = TensorDataset(
    other_input_ids,
    other_masks,
    other_label
)

dataloader = DataLoader(dataset, batch_size=32, num_workers=0, shuffle=True)

for batch in dataloader:
    other_input_ids, other_masks, other_label = batch
    other_label = other_label.reshape((other_label.shape[0], 1))
    other_label = other_label.float()
    print(new_model(other_input_ids, token_type_ids=None, attention_mask=other_masks, labels=other_label))



# other_label = torch.tensor(train.target[:1])
# other_label = other_label.reshape((1, 1))
# other_label = other_label.float()
# print(other_input_ids)
# print(other_masks)
# print(other_label)
# new_model(other_input_ids, token_type_ids=None, attention_mask=other_masks, labels=other_label)

tensor([[    1,  2522,   926, 12080,    32,     5, 31613,     9,    42,   849,
         25581,  2253,  5113,   392, 12389, 15334,   286, 26650,   201,    70,
             2,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]],
       dtype=torch.int32)
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0]], dtype=torch.int32)
tensor([1.], dtype=torch.float64)


Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier

(tensor(1.0551, dtype=torch.float64, grad_fn=<MseLossBackward0>), tensor([-0.0272], dtype=torch.float64, grad_fn=<ToCopyBackward0>))


Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier

tensor([[    1,  2522,   926, 12080,    32,     5, 31613,     9,    42,   849,
         25581,  2253,  5113,   392, 12389, 15334,   286, 26650,   201,    70,
             2,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]],
       dtype=torch.int32)
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0]], dtype=torch.int32)
tensor([1.], dtype=torch.float64)
SequenceClassifierOutput(loss=tensor(0.9894, dtype=torch.float64, grad_fn=<MseLossBackward0>), logits=tensor([0.0053], dtype=torch.float64, grad_fn=<ToCopyBackward0>), hidden_states=None, attentions=None)


tensor([[    1,  2522,   926,  ...,     9,    42,     2],
        [    1, 42542,   668,  ..., 15531,     4,     2],
        [    1,  3684,

KeyboardInterrupt: 

In [458]:
a = torch.tensor([1, 2, 3])
(a <= 2).to(float)

tensor([1., 1., 0.], dtype=torch.float64)

In [476]:
!pip install --upgrade transformers

Collecting transformers
  Using cached transformers-4.40.1-py3-none-any.whl.metadata (137 kB)
Collecting tokenizers<0.20,>=0.19 (from transformers)
  Using cached tokenizers-0.19.1-cp310-none-win_amd64.whl.metadata (6.9 kB)
Using cached transformers-4.40.1-py3-none-any.whl (9.0 MB)
Using cached tokenizers-0.19.1-cp310-none-win_amd64.whl (2.2 MB)
Installing collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.12.1
    Uninstalling tokenizers-0.12.1:
      Successfully uninstalled tokenizers-0.12.1
  Attempting uninstall: transformers
    Found existing installation: transformers 4.20.1
    Uninstalling transformers-4.20.1:
      Successfully uninstalled transformers-4.20.1
Successfully installed tokenizers-0.19.1 transformers-4.40.1


  You can safely remove it manually.


In [None]:
# set parameters
class params():
    
    def __init__(
        self,
        device,
        lr = 2e-5,
        test_size = 0.2,
        n_splits = 5,
        n_epochs = 100,
        batch_size = 32,
        max_len = 100,
        betas = (0.9, 0.98),
        eps = 1e-8,
        random_state = 42
    ) -> None:
        self.device = device
        self.lr = lr
        self.test_size = test_size
        self.n_splits = n_splits
        self.n_epochs = n_epochs
        self.batch_size = batch_size
        self.max_len = max_len
        self.betas = betas
        self.eps = eps
        self.random_state = random_state


# store best statistics
class best_stats(make_model):
    def __init__(
        self,
        best_averaged_train_accuracy = 0,
        best_averaged_train_loss = 1e6,
        best_averaged_train_f1_score = 0,
        best_averaged_val_accuracy = 0,
        best_averaged_val_loss = 1e6,
        best_averaged_val_f1_score = 0
    ) -> None:
        self.best_average_train_accuracy = best_averaged_train_accuracy
        self.best_average_train_loss = best_averaged_train_loss
        self.best_average_train_f1_score = best_averaged_train_f1_score
        self.best_average_val_accuracy = best_averaged_val_accuracy
        self.best_average_val_loss = best_averaged_val_loss
        self.best_average_val_f1_score = best_averaged_val_f1_score
        
    def __str__(self):
        
        # get attributes
        attrs = vars(self)
        
        return "\n".join("%s: %s" % item for item in attrs.items())
        
    def record_best(
        self,
        model: make_model,
        average_train_accuracy: float,
        average_train_loss: float,
        average_train_f1_score: float,
        average_val_accuracy: float,
        average_val_loss: float,
        average_val_f1_score: float
    ) -> None:
        
        # track best performance and save the model's state
        if average_val_f1_score > self.best_average_val_f1_score:

            # update best model scores
            self.best_average_train_accuracy = average_train_accuracy
            self.best_average_train_loss = average_train_loss
            self.best_average_train_f1_score = average_train_f1_score
            self.best_average_val_accuracy = average_val_accuracy
            self.best_average_val_loss = average_val_loss
            self.best_average_val_f1_score = average_val_f1_score

            # check if file for data exists and create if does not
            os.makedirs("model", exist_ok=True)

            # save path
            model_path = os.path.join("model", "model.pth")

            # save model
            torch.save(model.model.state_dict(), model_path)
       
    
# create model
class make_model(params):
    
    def __init__(self):
        self.model = Deberta_layer().to(device)
        
    def train(
        self,
        dataloader: torch.utils.data.DataLoader,
        optimizer: torch.optim.AdamW,
        scheduler: get_linear_schedule_with_warmup
    ) -> Tuple[float, float, float]:
        
        # time each training epoch
        t0 = time.time()

        # track accuracy, loss, and f1
        total_train_accuracy = 0
        total_train_loss = 0
        total_train_f1_score = 0

        # batch number
        batch = 0
        
        # indicate training
        self.model.train()
        
        for X_batch, y_batch in dataloader:
            
            # print batch number
            print(f"Batch {batch+1} / {len(dataloader)}")
            batch += 1

            # zero gradient
            optimizer.zero_grad()

            # reshape data and targets for model
            tuple_ids = X_batch[:,:params.max_len]
            attention_masks = X_batch[:,params.max_len:]
            labels = y_batch.flatten().to(float)

            # add to device
            tuple_ids = tuple_ids.to(params.device)
            attention_masks = attention_masks.to(params.device)
            labels = labels.to(params.device)

            # get loss value and prediction
            loss, logits = self.model(tuple_ids, attention_masks, labels)

            # add train loss
            total_train_loss += loss.item()

            # get prediction
            y_batch_pred = (torch.argmax(logits, axis=1).flatten() > 0.5).to(float)

            # detach computational graph, copy to cpu, make numpy array
            y_batch_pred = y_batch_pred.detach().cpu().numpy()
            labels = labels.detach().cpu().numpy()
            
            # compute training accuracy
            total_train_accuracy += np.sum(y_batch_pred == labels) / len(labels)

            # calculate weighted f1 score of prediction
            total_train_f1_score += f1_score(labels, y_batch_pred, average="weighted")

            # accumulate gradient
            loss.backward()

            # update parameters
            optimizer.step()

            # update learning rate
            scheduler.step()

        # gather data
        average_train_accuracy = total_train_accuracy / len(dataloader)
        average_train_loss = total_train_loss / len(dataloader)
        average_train_f1_score = total_train_f1_score / len(dataloader)

        # print results
        print(f"Averaged train accuracy: {average_train_accuracy}")
        print(f"Averaged train loss: {average_train_loss}")
        print(f"Averaged f1 score: {average_train_f1_score}")
        print(f"Training time: {datetime.timedelta(seconds = time.time()-t0)}\n")

        return average_train_accuracy, average_train_loss, average_train_f1_score



    def test(self, dataloader: torch.utils.data.DataLoader) -> Tuple[float, float, float]:

        # batch number
        batch = 0
        
        # track validation accuracy, validation loss, and f1
        total_val_accuracy = 0
        total_val_loss = 0
        total_val_f1_score = 0
        
        # indicate testing
        self.model.eval()
        
        # disable gradient computation and reduce memory consumption
        with torch.inference_mode():

            for X_val, y_val in dataloader:

                # print batch number
                print(f"Batch {batch + 1} / {len(dataloader)}")
                batch += 1

                # reshape data and targets for model
                val_tuple_ids = X_val[:,:params.max_len]
                val_attention_masks = X_val[:,params.max_len:]
                val_labels = y_val.flatten().to(float)
                
                # add to device
                val_tuple_ids = val_tuple_ids.to(params.device)
                val_attention_masks = val_attention_masks.to(params.device)
                val_labels = val_labels.to(params.device)

                # get loss value and prediction
                val_loss, val_logits = self.model(val_tuple_ids, val_attention_masks, val_labels)

                # add train loss
                total_val_loss += val_loss.item()

                # get prediction
                y_val_batch_pred = (torch.argmax(val_logits, axis=1).flatten() > 0.5).to(float)

                # detach computational graph, copy to cpu, make numpy array
                y_val_batch_pred = y_val_batch_pred.detach().cpu().numpy()
                val_labels = val_labels.detach().cpu().numpy()
                
                # calculate accuracy
                total_val_accuracy += np.sum(y_val_batch_pred == val_labels) / len(val_labels)

                # calculate weighted f1 score of prediction
                total_val_f1_score += f1_score(val_labels, y_val_batch_pred, average="weighted")
                
        # gather validation data
        average_val_accuracy = total_val_accuracy / len(dataloader)
        average_val_loss = total_val_loss / len(dataloader)
        average_val_f1_score = total_val_f1_score / len(dataloader)
        
        # print results
        print(f"Validation accuracy: {average_val_accuracy}")
        print(f"Averaged validation loss: {average_val_loss}")
        print(f"Averaged validation f1 score: {average_val_f1_score}\n")
        
        return average_val_accuracy, average_val_loss, average_val_f1_score
    
    

        
                
                
                


# check if cuda is available else use cpu
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# set parameters
params = params(device = device)
        
# read in data
train_data = pd.read_csv("./data/train.csv")
test_data = pd.read_csv("./data/test.csv")

# split training data into text and targets
text = train_data.text.values.tolist()
y = torch.tensor(train_data.target)

# shrink data
text = text[:1000]
y = y[:1000]

# preprocess text
preprocessed_text = preprocess_text(text)

# get deberta tokenizer
tokenizer = DebertaTokenizer.from_pretrained("microsoft/deberta-base")

# encode text
X = encode_text(preprocessed_text, tokenizer, params.max_len)

# create an instance of deberta model
test_model = make_model()

# add adamW optimizer
optimizer = torch.optim.AdamW(
    params=test_model.model.parameters(),
    lr=params.lr,
    betas=params.betas,
    eps=params.eps
)

# get stratified splitter
# stratified_split = StratifiedShuffleSplit(
#     n_splits=params.n_splits,
#     test_size=params.test_size,
#     random_state=params.random_state
# )

# record best model stats
model_best_stats = best_stats()

# iterate over strata
# for strata, (train_index, test_index) in enumerate(stratified_split.split(X, y)):
    
    # # print strata
    # print(f"Strata: {strata + 1}\n")

#     # split data
#     X_train = X[train_index,:]
#     X_test = X[test_index,:]

#     # split targets
#     y_train = y[train_index]
#     y_test = y[test_index]





# create dataset
dataset = list(zip(X, y.reshape(y.shape[0],1)))

# set sizes
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size

# split data
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

# make training dataloader
train_dataloader = DataLoader(
    train_dataset,
    sampler = RandomSampler(train_dataset),
    batch_size = params.batch_size
)

# make testing dataloader
test_dataloder = DataLoader(
    test_dataset,
    sampler = SequentialSampler(test_dataset),
    batch_size = params.batch_size
)
    




    # # reshape targets to make dataloader
    # y_train_reshaped = y_train.reshape((y_train.shape[0],1))
    # y_test_reshaped = y_test.reshape((y_test.shape[0], 1))


#     # make training dataloader
#     train_dataloader = DataLoader(
#         list(zip(X_train, y_train_reshaped)),
#         batch_size = params.batch_size,
#         shuffle = True,
#         num_workers = 0
#     )

#     # make testing dataloader
#     test_dataloader = DataLoader(
#         list(zip(X, y_test_reshaped)),
#         batch_size = params.batch_size,
#         shuffle = False,
#         num_workers = 0
#     )

# get linear learning rate scheduler
scheduler = get_linear_schedule_with_warmup(
    optimizer = optimizer,
    num_warmup_steps = 0,
    num_training_steps = len(train_dataloader) * params.n_epochs
)
    
    # train model over strata
for epoch in range(n_epochs):

    # print epoch
    print(f"Epoch: {epoch + 1}/{n_epochs}")

    # train model
    average_train_accuracy, average_train_loss, average_train_f1_score = test_model.train(train_dataloader, optimizer, scheduler)

    # test model
    average_val_accuracy, average_val_loss, average_val_f1_score = test_model.test(test_dataloader)

    # record best model
    model_best_stats.record_best(
        test_model,
        average_train_accuracy,
        average_train_loss,
        average_train_f1_score,
        average_val_accuracy,
        average_val_loss,
        average_val_f1_score
    )

    print(model_best_stats)
    print("\n")





























# # read in data
# train_data = pd.read_csv("./data/train.csv")
# test_data = pd.read_csv("./data/test.csv")

# # split training data into text and targets
# text = train_data.text.values.tolist()
# y = torch.tensor(train_data.target)

# # shrink data
# text = text[:100]
# y = y[:100]

# # preprocess text
# preprocessed_text = text

# # get deberta tokenizer
# tokenizer = DebertaTokenizer.from_pretrained("microsoft/deberta-base")

# # encode text
# X = encode_text(preprocessed_text, tokenizer, max_len)

# # reshape targets to make dataloader
# y_test_reshaped = y.reshape((y.shape[0], 1))

# # indicate testing
# test_model.eval()
            
# # make testing dataloader
# test_dataloader = DataLoader(
# list(zip(X, y_test_reshaped)),
# batch_size = batch_size,
# shuffle = False,
# num_workers = 0
# )
        
# average_val_accuracy, average_val_train_loss, average_val_f1_score = test_model.test(test_dataloader)

# # print results
# print(f"Validation accuracy: {average_val_accuracy}")
# print(f"Averaged validation loss: {average_val_train_loss}")
# print(f"Averaged validation f1 score: {average_val_f1_score}")


# # set best averaged validation loss and f1 score
# best_averaged_val_accuracy = 0
# best_averaged_val_loss = 1e6
# best_averaged_val_f1_score = 0

# test_model.record_best(
#     best_averaged_val_accuracy,
#     best_averaged_val_loss,
#     best_averaged_val_f1_score,
#     average_val_accuracy,
#     average_val_train_loss,
#     average_val_f1_score
# )

    


Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch: 1/10
Batch 1 / 25
Batch 2 / 25
Batch 3 / 25
Batch 4 / 25
Batch 5 / 25
Batch 6 / 25
Batch 7 / 25
Batch 8 / 25
Batch 9 / 25
Batch 10 / 25
Batch 11 / 25
Batch 12 / 25
Batch 13 / 25
Batch 14 / 25
Batch 15 / 25
Batch 16 / 25
Batch 17 / 25
Batch 18 / 25
Batch 19 / 25
Batch 20 / 25
Batch 21 / 25
Batch 22 / 25
Batch 23 / 25
Batch 24 / 25
Batch 25 / 25
Averaged train accuracy: 0.69625
Averaged train loss: 0.5624345743656158
Averaged f1 score: 0.6153133735001934
Training time: 0:07:54.187415

Batch 1 / 2
Batch 2 / 2
Validation accuracy: 0.6138392857142857
Averaged validation loss: 0.8914835155010223
Averaged validation f1 score: 0.5677480082942268

best_average_train_accuracy: 0.69625
best_average_train_loss: 0.5624345743656158
best_average_train_f1_score: 0.6153133735001934
best_average_val_accuracy: 0.6138392857142857
best_average_val_loss: 0.8914835155010223
best_average_val_f1_score: 0.5677480082942268


Epoch: 2/10
Batch 1 / 25
Batch 2 / 25
Batch 3 / 25
Batch 4 / 25
Batch 5 / 25
Batc