In [None]:
# Install the `gdown` CLI, then use it to fetch the notebook's input files
# from Google Drive by file id.
# NOTE(review): the downloaded file names are not visible here; presumably they
# are the `df_sixM` pickle and the resume checkpoint read later in the notebook
# -- verify against the Drive files.
!conda install -y gdown

# Older downloads, kept disabled:
# !gdown --id 1JxsxrJsXCjfLhaSP4nJd4VX0NL1-BgAq
# !gdown --id 1zRn-IXvWuzCqb87F2lxSUoyGdscMUjUW
!gdown --id 1MrCOyR5NZxgn5Dfdkl3CJEoau1xsA9ab        #sixM dataset
!gdown --id 1Y3yzenxiOOdYesEm8D9NELfFCDIyulW8        #checkpoint

In [None]:
# imports
import os
import numpy as np
import pandas as pd
import csv
from tqdm.notebook import tqdm
import re
import time
import joblib

import torch
import transformers

In [None]:
# ---------------------------------------------------------------------------
# Configuration: every tunable value used by the cells below lives here.
# ---------------------------------------------------------------------------

# Tokenization / batching
MAX_LEN = 512                 # sequences are padded/truncated to this many tokens
TRAIN_BATCH_SIZE = 8
VALIDATION_BATCH_SIZE = 4

# Optimization schedule
EPOCHS = 1
ACCUMULATION = 2              # passed to train_fn as accumulation_steps

# Paths
BERT_PATH = "../input/bert-base-uncased/"    # pretrained weights + vocabulary
MODEL_PATH = "../input/bert-base-uncased/"   # not referenced elsewhere in the visible notebook
OUTPUT_PATH = "checkpoint3.pt"               # checkpoint written after each epoch
SAVED_MODEL_PATH = "checkpoint1(1).pt"       # checkpoint training resumes from
# TRAINING_FILE = "/content/dataset/train.csv"

# Shared tokenizer instance, loaded once and reused by every dataset object.
TOKENIZER = transformers.BertTokenizer.from_pretrained(BERT_PATH, do_lower_case=True)

In [None]:
#data_loader

#https://pytorch.org/tutorials/beginner/basics/data_tutorial.html
class BERTDataset:
    """Map-style dataset that tokenizes one sentence per item on demand."""

    def __init__(self, sentence, target):
        """
        sentence : list of strings (sentences)
        target   : list of ints (one label per sentence)
        """
        self.sentence, self.target = sentence, target
        # Tokenizer and sequence length come from the notebook's config cell.
        self.tokenizer, self.max_len = TOKENIZER, MAX_LEN

    def __len__(self):
        # One item per sentence.
        return len(self.sentence)

    def __getitem__(self, idx):
        # Force the entry to a string (it may be a number or other non-str
        # value) and collapse any run of whitespace into a single space.
        raw_text = str(self.sentence[idx])
        text = " ".join(raw_text.split())

        # Tokenize, padding/truncating to exactly `max_len` tokens.
        encoded = self.tokenizer.encode_plus(
            text=text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
        )

        def as_long(values):
            return torch.tensor(values, dtype=torch.long)

        return {
            "ids": as_long(encoded["input_ids"]),
            "mask": as_long(encoded["attention_mask"]),
            "targets": as_long(self.target[idx]),
        }

        


In [None]:
#model
# import transformers
import torch.nn as nn

class BERTBaseUncased(nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    ``forward`` returns raw, unnormalized logits of shape
    ``(batch_size, target_size)``; no softmax is applied, which is what
    ``nn.CrossEntropyLoss`` expects as input.
    """

    def __init__(self, target_size):
        """
        target_size : number of output classes (width of the linear head)
        """
        super().__init__()
        self.bert = transformers.BertModel.from_pretrained(BERT_PATH)
        self.bert_drop = nn.Dropout(p=0.3)
        # Take the head's input width from the loaded encoder instead of
        # hard-coding 768, so a different BERT size still lines up.
        self.out = nn.Linear(self.bert.config.hidden_size, target_size)
        # Not used by forward(); kept for callers that want probabilities.
        # `dim` is explicit because the implicit-dimension form is deprecated.
        self.soft = nn.Softmax(dim=1)

    def forward(self, ids, mask):
        """Return class logits for a batch of token ids and attention masks."""
        # With return_dict=False the encoder returns a tuple:
        #   [0] (batch_size, sequence_length, hidden_size) - last-layer hidden
        #       states for every token
        #   [1] (batch_size, hidden_size) - pooled output derived from the
        #       first ([CLS]) token, one vector per sample
        # https://huggingface.co/transformers/model_doc/bert.html#bertmodel
        _, pooled_output = self.bert(
            input_ids=ids,
            attention_mask=mask,
            return_dict=False,
            # token_type_ids is omitted: each input is a single sentence
        )

        dropped = self.bert_drop(pooled_output)
        return self.out(dropped)

In [None]:
#engine
# !pip install tqdm 
from tqdm.notebook import tqdm

def loss_fn(outputs, targets):
    """Mean cross-entropy between raw class logits and integer class targets."""
    criterion = nn.CrossEntropyLoss()
    loss = criterion(outputs, targets)
    return loss

def train_fn(data_loader , model , optimizer , device , accumulation_steps,schedular):
    """Train `model` for one pass over `data_loader` with gradient accumulation.

    Gradients from `accumulation_steps` consecutive batches are summed before
    each optimizer/scheduler step, so the effective batch size is
    ``batch_size * accumulation_steps``.

    Args:
        data_loader: iterable of dicts with "ids", "mask" and "targets" tensors.
        model: module called as ``model(ids=..., mask=...)`` returning logits.
        optimizer: optimizer over the model's parameters.
        device: device the batch tensors are moved to.
        accumulation_steps: number of batches to accumulate per optimizer step.
        schedular: learning-rate scheduler, stepped once per optimizer step.

    Returns:
        Average (unscaled) per-batch training loss over the pass.
    """
    model.train()
    final_loss = 0
    num_batches = len(data_loader)

    # Start from clean gradients. They are cleared again only AFTER each
    # optimizer step: clearing them on every batch would throw away all but
    # the last micro-batch of each accumulation window.
    optimizer.zero_grad()

    #loop through each batch
    for batch_index , data_batch in tqdm(enumerate(data_loader) , total = num_batches):
        ids = data_batch["ids"].to(device, dtype = torch.long)
        mask = data_batch["mask"].to(device, dtype = torch.long)
        targets = data_batch["targets"].to(device, dtype = torch.long)

        outputs = model(
            ids = ids,
            mask = mask
        )

        loss = loss_fn(outputs, targets)
        # Scale so the accumulated gradient is the mean over the window rather
        # than the sum; the reported loss below stays unscaled.
        (loss / accumulation_steps).backward()
        final_loss += loss.item()

        window_complete = (batch_index + 1) % accumulation_steps == 0
        last_batch = (batch_index + 1) == num_batches
        # Also step on the final batch so a trailing partial window is not lost.
        if window_complete or last_batch:
            optimizer.step()
            schedular.step()
            optimizer.zero_grad()

    return final_loss / len(data_loader)


def eval_fn(data_loader , model, device):
    """Evaluate `model` over `data_loader` without tracking gradients.

    Returns a 3-tuple ``(predictions, targets, mean_loss)``: the arg-max class
    index per sample, the true labels, and the loss averaged over batches.
    """
    model.eval()
    predictions, labels = [], []
    running_loss = 0

    # Gradients are not needed for evaluation.
    with torch.no_grad():
        progress = tqdm(enumerate(data_loader), total=len(data_loader))
        for _, batch in progress:
            ids = batch['ids'].to(device, dtype=torch.long)
            mask = batch['mask'].to(device, dtype=torch.long)
            targets = batch['targets'].to(device, dtype=torch.long)

            logits = model(ids=ids, mask=mask)
            running_loss += loss_fn(logits, targets).item()

            labels.extend(targets.cpu().detach().numpy().tolist())
            # Predicted class = index of the largest logit in each row.
            predicted = logits.cpu().detach().numpy().argmax(axis=1)
            predictions.extend(predicted.tolist())

    return predictions, labels, running_loss / len(data_loader)

In [None]:
#train
import pandas as pd
from sklearn import model_selection
from sklearn import metrics
from transformers import AdamW
from sklearn.preprocessing import LabelEncoder
from transformers import get_linear_schedule_with_warmup
import csv
import datetime

def format_time(elapsed):
    """Render a duration given in seconds as a string, rounded to whole seconds."""
    whole_seconds = int(round(elapsed))
    delta = datetime.timedelta(seconds=whole_seconds)
    return str(delta)

def run(df):
    """Fine-tune the BERT classifier on `df`, resuming from SAVED_MODEL_PATH.

    Args:
        df: DataFrame with a ``text`` column (input sentences) and a
            ``BROWSE_NODE_ID`` column (class labels). The labels go straight
            into the cross-entropy loss and the head has ``nunique()`` outputs,
            so they are assumed to already be encoded as 0..n_classes-1
            -- TODO confirm against the preprocessing that built `df`.
            The stratified split also needs at least two rows per class.

    Returns:
        A list with one dict per epoch holding the training loss, validation
        loss, validation accuracy and the time taken by each phase.

    Side effects:
        Reads the checkpoint at SAVED_MODEL_PATH, writes a checkpoint to
        OUTPUT_PATH after every epoch, and prints progress.
    """
    df_train, df_valid = model_selection.train_test_split(
        df,
        test_size = 0.1,
        random_state = 2000,
        stratify = df.BROWSE_NODE_ID.values
    )

    df_train = df_train.reset_index(drop = True)
    df_valid = df_valid.reset_index(drop = True)

    train_dataset = BERTDataset(
        sentence = df_train.text.values ,
        target = df_train.BROWSE_NODE_ID.values
    )

    valid_dataset = BERTDataset(
        sentence = df_valid.text.values ,
        target = df_valid.BROWSE_NODE_ID.values
    )

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset ,
        batch_size = TRAIN_BATCH_SIZE,
        num_workers = 1
    )

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset ,
        batch_size = VALIDATION_BATCH_SIZE,
        num_workers = 1
    )

    if torch.cuda.is_available():
        device = torch.device("cuda")
        print("There are %s GPU's." %torch.cuda.device_count())
        print("GPU Name: " , torch.cuda.get_device_name(0))
    else:
        print("No GPU's Available :(")
        # Must bind `device` (previously misspelled), otherwise CPU-only
        # machines fail with NameError at model.to(device).
        device = torch.device("cpu")

    model = BERTBaseUncased(df['BROWSE_NODE_ID'].nunique())
    model.to(device)

    # Apply weight decay to everything except biases and LayerNorm parameters.
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]

    optimizer_parameters = [
        {
            "params": [
                p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.001,
        },
        {
            "params": [
                p for n, p in param_optimizer if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]

    # The scheduler is stepped once per optimizer step, i.e. once per
    # ACCUMULATION batches, so size the schedule in optimizer steps (an int),
    # not in batches. Ceiling division covers a trailing partial window.
    steps_per_epoch = (len(train_data_loader) + ACCUMULATION - 1) // ACCUMULATION
    num_train_steps = steps_per_epoch * EPOCHS

    optimizer = AdamW(
        optimizer_parameters,
        lr = 2e-5
    )
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps
    )

    # map_location lets a checkpoint saved on GPU be restored on a CPU-only
    # machine (and vice versa).
    checkpoint = torch.load(SAVED_MODEL_PATH, map_location = device)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

    training_stats = []
    best_accuracy = 0
    for epoch in range(EPOCHS):
        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch + 1, EPOCHS))
        print('Training...')
        t0 = time.time()

        avg_train_loss = train_fn(train_data_loader , model , optimizer , device , ACCUMULATION , scheduler)

        training_time = format_time(time.time() - t0)
        print("")
        print("  Average training loss: {0:.2f}".format(avg_train_loss))
        print("  Training epcoh took: {:}".format(training_time))

        print("")
        print("Running Validation...")
        t0 = time.time()

        outputs , targets , avg_val_loss = eval_fn(valid_data_loader , model, device )
        accuracy = metrics.accuracy_score(targets , outputs)

        validation_time = format_time(time.time() - t0)
        print("  Validation Loss: {0:.2f}".format(avg_val_loss))
        print("  Validation took: {:}".format(validation_time))
        print("  Validation accuracy: {:}".format(accuracy))

        training_stats.append(
            {
                'epoch': epoch + 1,
                'Training Loss': avg_train_loss,
                'Valid. Loss': avg_val_loss,
                'Valid. Accur.': accuracy,
                'Training Time': training_time,
                'Validation Time': validation_time
            }
        )

        # Saved after every epoch, regardless of whether accuracy improved.
        # NOTE(review): 'epoch' is a hard-coded 4, presumably the cumulative
        # epoch count across resumed runs -- confirm, or derive it from the
        # loaded checkpoint.
        torch.save({
            'epoch': 4,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': avg_val_loss,
            'acc' : accuracy
            }, OUTPUT_PATH)

        if(accuracy > best_accuracy):
            print(f"Accuracy Score = {accuracy}")
            best_accuracy = accuracy

    return training_stats


In [None]:
# #load data
# df = pd.read_pickle('final_df')

# # df.drop(['TITLE', 'DESCRIPTION', 'BULLET_POINTS', 'BRAND'] ,axis = 1,  inplace = True)
# df.head()

In [None]:

# len(df)

In [None]:
# # def drop_sparse_classes(df):
# #     unique_labels, label_counts = np.unique(df.BROWSE_NODE_ID, return_counts=True)
# #     drop_labels = unique_labels[label_counts < 10]
# #     _df = df.apply(lambda x: x['BROWSE_NODE_ID'] in unique_labels[label_counts < 10], axis = 1)
# #     df_drop = df[_df]
# #     return df_drop

# def preprocess(df):
#     key = df['BROWSE_NODE_ID'].value_counts()
#     print("No. of labels having only one sample : ", key.value_counts()[1])
    
#     #So, we will have to remove those samples
#     for index, row in tqdm(df.iterrows(), total = df.shape[0]):
#         if(key[row['BROWSE_NODE_ID']] == 1):
#             df.drop(index, inplace = True)

In [None]:
# preprocess(df)
# # df = drop_sparse_classes(df)

In [None]:
# df.drop(['TITLE', 'DESCRIPTION', 'BULLET_POINTS', 'BRAND'] ,axis = 1,  inplace = True)
# print(len(df))
# print(len(df['BROWSE_NODE_ID'].value_counts()))

In [None]:
# _ , df_sixM = model_selection.train_test_split(
#         df,
#         test_size = 0.175,
#         random_state = 2000,
#         stratify = df.BROWSE_NODE_ID.values
#     )

In [None]:
# preprocess(df_sixM)

In [None]:
# print(len(df_sixM))
# print(len(df_sixM['BROWSE_NODE_ID'].value_counts()))
# df_sixM.head()

In [None]:
# # !pip install joblib
# import joblib

In [None]:
# def encoding(df):
#     le = LabelEncoder()
#     df['BROWSE_NODE_ID'] = le.fit_transform(df['BROWSE_NODE_ID'])
#     decoder = dict(zip( le.transform(le.classes_) , le.classes_))
#     return decoder

In [None]:
# decoder = encoding(df_sixM)

In [None]:
# joblib.dump(decoder , 'decoder.joblib')

In [None]:
# df_sixM.to_pickle('df_sixM')

In [None]:
# Load the prepared training frame from the local pickle file `df_sixM`
# (presumably the "sixM dataset" fetched by gdown in the first cell -- verify).
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
df_sixM = pd.read_pickle('df_sixM')
df_sixM.head()

In [None]:
# Train and validate on the loaded frame; returns one stats dict per epoch.
training_stats = run(df_sixM)

In [None]:
# Display the per-epoch statistics returned by run().
training_stats

In [None]:
# Upgrade IPython before importing IPython.display in the next cell.
# NOTE(review): unpinned upgrade run mid-notebook; a kernel restart may be
# needed for the new version to take effect -- consider pinning a version.
!pip install --upgrade IPython

In [None]:
from IPython import display

In [None]:
# Link to the checkpoint this notebook actually writes (OUTPUT_PATH from the
# config cell) rather than a hard-coded file name that run() never produces.
display.FileLink(OUTPUT_PATH)