In [None]:
from google.colab import drive
drive.mount('/gdrive')

In [None]:
!pip install transformers


Collecting transformers
  Downloading transformers-4.9.1-py3-none-any.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 6.3 MB/s 
Collecting huggingface-hub==0.0.12
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 54.3 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 73.8 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 53.8 MB/s 
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully uninsta

In [None]:
# imports
import os
import numpy as np
import pandas as pd
import csv
from tqdm.notebook import tqdm
import re
import time
import joblib

import torch
import transformers

In [None]:
#configurations
# Maximum token length passed to the tokenizer (sequences are padded/truncated to it).
MAX_LEN = 512
# Batch sizes used when building the training and validation/test DataLoaders.
TRAIN_BATCH_SIZE = 8
VALIDATION_BATCH_SIZE = 4
# Number of passes over the training split in run().
EPOCHS = 1
# Forwarded to train_fn by run(); presumably the gradient-accumulation step count — TODO confirm.
ACCUMULATION = 2
# Directory holding the pretrained BERT weights and vocabulary (used by both tokenizer and model).
BERT_PATH = "/gdrive/MyDrive/BERT-uncased"
# NOTE(review): same value as BERT_PATH and not referenced by any visible cell.
MODEL_PATH = "/gdrive/MyDrive/BERT-uncased"
# Earlier checkpoint location, kept for reference:
# OUTPUT_PATH = "/gdrive/MyDrive/amazon_ml_2021/checkpoint1(1).pt"
# Checkpoint file that test() loads and run() both loads and overwrites.
OUTPUT_PATH = "/gdrive/MyDrive/amazon_ml_2021/cleaned/checkpoint_clean1.pt"
# TRAINING_FILE = "/content/dataset/train.csv"
# Shared tokenizer instance; do_lower_case matches the uncased BERT vocabulary.
TOKENIZER = transformers.BertTokenizer.from_pretrained(
    BERT_PATH, 
    do_lower_case = True
)

In [None]:
#data_loader

#https://pytorch.org/tutorials/beginner/basics/data_tutorial.html
class BERTDataset:
    """Map-style dataset that tokenizes one sentence per item for BERT.

    Every item is a dict with fixed-length ``'ids'`` and ``'mask'`` tensors.
    ``'product_ids'`` is added when ``product_id`` was supplied (inference) and
    ``'targets'`` when ``target`` was supplied (training / validation), so the
    same class serves both the prediction and the training cells.
    """

    def __init__(self , sentence , product_id = None , target = None):
        """
            sentence   : sequence of strings (one text per sample)
            product_id : optional sequence of ints, returned per item as 'product_ids'
            target     : optional sequence of int class labels, returned per item as 'targets'
        """
        self.sentence = sentence
        self.target = target
        self.tokenizer = TOKENIZER
        self.max_len = MAX_LEN
        self.product_id = product_id

    def __len__(self):
        """Number of samples in the dataset."""
        return len(self.sentence)

    def __getitem__(self , idx):
        """Tokenize sample ``idx`` and return its tensors as a dict."""
        # Force to str (values may be numbers/NaN) and collapse runs of whitespace.
        sentence = str(self.sentence[idx])
        sentence = " ".join(sentence.split())

        # Pad/truncate to max_len so every item has the same length and can be batched.
        inputs = self.tokenizer.encode_plus(
            text = sentence,
            add_special_tokens = True,
            max_length = self.max_len,
            padding='max_length',
            truncation = True
        )

        item = {
            'ids' : torch.tensor(inputs["input_ids"] , dtype = torch.long),
            'mask' : torch.tensor(inputs["attention_mask"] , dtype = torch.long),
        }
        # Optional fields are only emitted when the corresponding sequence was given.
        if self.target is not None:
            item['targets'] = torch.tensor(self.target[idx] , dtype = torch.long)
        if self.product_id is not None:
            item['product_ids'] = torch.tensor(self.product_id[idx] , dtype = torch.long)
        return item

        


In [None]:
#model
# import transformers
import torch.nn as nn

class BERTBaseUncased(nn.Module):
    """BERT encoder followed by dropout and one linear classification layer.

    ``forward`` returns raw (un-normalised) class scores of shape
    (batch_size, target_size); they are fed to ``nn.CrossEntropyLoss`` /
    ``argmax`` elsewhere, so no softmax is applied here.
    """

    def __init__(self, target_size):
        """
            target_size : number of output classes
        """
        super().__init__()
        self.bert = transformers.BertModel.from_pretrained(BERT_PATH)
        self.bert_drop = nn.Dropout(p = 0.3)
        # Read the width from the loaded config instead of hard-coding 768, so a
        # different BERT size under BERT_PATH still produces a matching head.
        self.out = nn.Linear(self.bert.config.hidden_size , target_size)
        # Not used in forward(); kept for callers that want probabilities.
        # dim is given explicitly because the implicit-dim form is deprecated.
        self.soft = nn.Softmax(dim = 1)

    def forward(self, ids , mask):
        """Return class logits for a batch of token ids and attention masks."""
        # With return_dict=False the model returns a tuple:
        #   [0] (batch_size, sequence_length, hidden_size) - last-layer hidden state of every token
        #   [1] (batch_size, hidden_size) - pooled output for the first ([CLS]) token
        # https://huggingface.co/transformers/model_doc/bert.html#bertmodel
        # token_type_ids are not passed; every input here is a single segment.
        _ , pooled = self.bert(
            input_ids = ids,
            attention_mask = mask,
            return_dict=False
        )

        return self.out(self.bert_drop(pooled))

In [None]:
#engine
# !pip install tqdm 
from tqdm.notebook import tqdm

def loss_fn(outputs , targets):
    """Mean cross-entropy between raw class logits and integer class targets."""
    return nn.functional.cross_entropy(outputs , targets)


def test_fn(data_loader , model, device):
    """Run inference over ``data_loader`` and collect predictions.

    Each batch must provide 'ids', 'mask' and 'product_ids'. Returns two
    parallel lists: the predicted class index (argmax over the model output)
    and the product id of every sample, in loader order.
    """
    model.eval()
    predictions = []
    seen_product_ids = []

    # Inference only: skip gradient bookkeeping.
    with torch.no_grad():
        for batch in tqdm(data_loader , total = len(data_loader)):
            input_ids = batch['ids'].to(device, dtype = torch.long)
            attention_mask = batch['mask'].to(device, dtype = torch.long)
            batch_product_ids = batch['product_ids'].to(device, dtype = torch.long)

            logits = model(ids = input_ids, mask = attention_mask)

            seen_product_ids += batch_product_ids.cpu().detach().numpy().tolist()
            # One predicted class per row; change this if the model ever has multiple outputs.
            predictions += logits.cpu().detach().numpy().argmax(axis = 1).tolist()

    return predictions, seen_product_ids

In [None]:
#train
import pandas as pd
from sklearn import model_selection
from sklearn import metrics
from transformers import AdamW
from sklearn.preprocessing import LabelEncoder
from transformers import get_linear_schedule_with_warmup
import csv
import datetime

def format_time(elapsed):
    """Round a duration in seconds to whole seconds and render it via ``timedelta`` (e.g. '1:14:56')."""
    whole_seconds = int(round(elapsed))
    delta = datetime.timedelta(seconds = whole_seconds)
    return str(delta)

# train_set = pd.read_pickle('/gdrive/MyDrive/amazon_ml_2021/df_sixM')

# test_set =  pd.read_pickle('/gdrive/MyDrive/amazon_ml_2021/test_data')
# decoder = joblib.load('/gdrive/MyDrive/amazon_ml_2021/decoder(5).joblib')

# Cleaned test split (columns PRODUCT_ID and text) and the decoder that maps a
# predicted class index back to its original BROWSE_NODE_ID.
# NOTE(review): read_pickle / joblib.load execute pickled code; only load files you produced yourself.
test_set =  pd.read_pickle('/gdrive/MyDrive/amazon_ml_2021/cleaned/test_clean')
decoder = joblib.load('/gdrive/MyDrive/amazon_ml_2021/cleaned/decoder_cleaned.joblib')


In [None]:
# train_set['ID'] = train_set.index

In [None]:
test_set.head()

Unnamed: 0,PRODUCT_ID,text
0,1,command m small kitchen hook white decorate da...
1,2,oneal jump hardware jag unisexadult glove blac...
2,3,nfl detroit lion portable party fridge quart ...
3,4,panasonic single line kxtsmx corded phone whit...
4,5,zero baby girl cotton innerwear bloomer drawe...


In [None]:
# #run on valid set
# _ , valid_set = model_selection.train_test_split(
#         train_set,
#         test_size = 0.05,
#         random_state = 899,
#         stratify = train_set.BROWSE_NODE_ID.values
#     )

In [None]:
def test(df_valid , num_class):
    """Load the saved checkpoint and predict a class for every row of ``df_valid``.

        df_valid  : DataFrame with a ``text`` column and a ``PRODUCT_ID`` column
        num_class : size of the classification head; must match the checkpoint

    Returns (outputs, final_product_ids): parallel lists with the predicted
    class index and the product id of each row.
    """
    df_valid = df_valid.reset_index(drop = True)

    valid_dataset = BERTDataset(
        sentence = df_valid.text.values ,
        product_id = df_valid.PRODUCT_ID.values
    )

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset ,
        batch_size = VALIDATION_BATCH_SIZE,
        num_workers = 1
    )

    if torch.cuda.is_available():
        device = torch.device("cuda")
        print("There are %s GPU's." %torch.cuda.device_count())
        print("GPU Name: " , torch.cuda.get_device_name(0))

    else:
        print("No GPU's Available :(")
        # Was misspelled `decive`, which left `device` undefined on CPU-only runtimes.
        device = torch.device("cpu")

    # Use the argument rather than the notebook-global `num_classes`.
    model = BERTBaseUncased(num_class)
    model.to(device)

    print("Loading Model...")
    # map_location lets a checkpoint saved on GPU be restored on a CPU-only runtime.
    checkpoint = torch.load(OUTPUT_PATH , map_location = device)
    model.load_state_dict(checkpoint['model_state_dict'])
    print("Model Loaded...")

    print("")
    print("Running Test...")
    t0 = time.time()

    outputs , final_product_ids = test_fn(valid_data_loader , model, device )

    validation_time = format_time(time.time() - t0)
    print("  Validation took: {:}".format(validation_time))

    return outputs , final_product_ids


In [None]:
# The decoder has one entry per class, so its length is the size of the classification head.
num_classes = len(decoder)
# Predict a class index for every test row; product_ids is returned in the same order.
outputs , product_ids =  test(test_set , num_classes)

There are 1 GPU's.
GPU Name:  Tesla T4


Some weights of the model checkpoint at /gdrive/MyDrive/BERT-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Loading Model...
Model Loaded...

Running Test...


HBox(children=(FloatProgress(value=0.0, max=27694.0), HTML(value='')))


  Validation took: 1:14:56


In [None]:
final_df = pd.DataFrame({'PRODUCT_ID' : product_ids , 'output' : outputs} , columns = ['PRODUCT_ID' , 'output'])
final_df.head()

Unnamed: 0,PRODUCT_ID,output
0,1,328
1,2,534
2,3,250
3,4,180
4,5,778


In [None]:
decoded_outputs = [decoder[output] for output in outputs]

In [None]:
print(decoded_outputs[:5])
print(outputs[:5])

[1140, 2321, 840, 604, 7644]
[328, 534, 250, 180, 778]


In [None]:
final_df['BROWSE_NODE_ID'] = decoded_outputs
final_df.head()

Unnamed: 0,PRODUCT_ID,output,BROWSE_NODE_ID
0,1,328,1140
1,2,534,2321
2,3,250,840
3,4,180,604
4,5,778,7644


In [None]:
# Keep only the submission columns. Rebinding (instead of inplace=True) with
# errors='ignore' makes the cell safe to re-run after 'output' is already gone.
final_df = final_df.drop(columns = ['output'] , errors = 'ignore')
final_df.head()

Unnamed: 0,PRODUCT_ID,BROWSE_NODE_ID
0,1,1140
1,2,2321
2,3,840
3,4,604
4,5,7644


In [None]:
final_df.to_csv('/content/predictions4.csv' , index= False)

In [None]:
# The cell held only a dangling `def`, which is a SyntaxError and stops Run All.
# NOTE(review): the recorded output (/content) suggests it printed the working directory — confirm.
print(os.getcwd())

/content


In [None]:
#train
import pandas as pd
from sklearn import model_selection
from sklearn import metrics
from transformers import AdamW
from sklearn.preprocessing import LabelEncoder
from transformers import get_linear_schedule_with_warmup
import csv
import datetime

def format_time(elapsed):
    """Round a duration in seconds to whole seconds and render it via ``timedelta`` (e.g. '1:14:56')."""
    whole_seconds = int(round(elapsed))
    delta = datetime.timedelta(seconds = whole_seconds)
    return str(delta)

def run(df):
    """Fine-tune the BERT classifier on ``df`` and return per-epoch statistics.

        df : DataFrame with a ``text`` column and an integer-encoded
             ``BROWSE_NODE_ID`` column (one class index per row)

    The frame is split 90/10 (stratified on the label) into train/validation.
    If a checkpoint already exists at OUTPUT_PATH, model and optimizer state
    are restored from it before training; otherwise training starts from the
    pretrained weights. A checkpoint is written to OUTPUT_PATH after every epoch.

    Returns a list with one dict per epoch (losses, accuracy, timings).
    """
    import os  # local import: only needed for the checkpoint-existence check

    df_train, df_valid = model_selection.train_test_split(
        df,
        test_size = 0.1,
        random_state = 2000,
        stratify = df.BROWSE_NODE_ID.values
    )

    df_train = df_train.reset_index(drop = True)
    df_valid = df_valid.reset_index(drop = True)

    # NOTE(review): BERTDataset must accept a `target` keyword for these calls —
    # confirm against the class definition that is live in the kernel.
    train_dataset = BERTDataset(
        sentence = df_train.text.values ,
        target = df_train.BROWSE_NODE_ID.values
    )

    valid_dataset = BERTDataset(
        sentence = df_valid.text.values ,
        target = df_valid.BROWSE_NODE_ID.values
    )

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset ,
        batch_size = TRAIN_BATCH_SIZE,
        num_workers = 1
    )

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset ,
        batch_size = VALIDATION_BATCH_SIZE,
        num_workers = 1
    )

    if torch.cuda.is_available():
        device = torch.device("cuda")
        print("There are %s GPU's." %torch.cuda.device_count())
        print("GPU Name: " , torch.cuda.get_device_name(0))

    else:
        print("No GPU's Available :(")
        # Was misspelled `decive`, which left `device` undefined on CPU-only runtimes.
        device = torch.device("cpu")

    model = BERTBaseUncased(df['BROWSE_NODE_ID'].nunique())
    model.to(device)

    # Two parameter groups: weight decay for everything except biases and LayerNorm parameters.
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]

    optimizer_parameters = [
        {
            "params": [
                p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.001,
        },
        {
            "params": [
                p for n, p in param_optimizer if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]

    # NOTE(review): counts one scheduler step per batch and ignores ACCUMULATION —
    # confirm this matches how train_fn steps the scheduler.
    num_train_steps = len(df_train)/TRAIN_BATCH_SIZE * EPOCHS
    optimizer = AdamW(
        optimizer_parameters,
        lr = 2e-5
    )
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps
    )

    # Resume only when a checkpoint is actually present; the unconditional load
    # crashed on a first run. map_location allows restoring a GPU checkpoint on CPU.
    if os.path.exists(OUTPUT_PATH):
        checkpoint = torch.load(OUTPUT_PATH , map_location = device)
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

    training_stats = []
    best_accuracy = 0
    for epoch in range(EPOCHS):
        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch + 1, EPOCHS))
        print('Training...')
        t0 = time.time()

        # train_fn / eval_fn are not defined in the visible cells; they must already exist in the kernel.
        avg_train_loss = train_fn(train_data_loader , model , optimizer , device , ACCUMULATION , scheduler)

        training_time = format_time(time.time() - t0)
        print("")
        print("  Average training loss: {0:.2f}".format(avg_train_loss))
        print("  Training epcoh took: {:}".format(training_time))

        print("")
        print("Running Validation...")
        t0 = time.time()

        outputs , targets , avg_val_loss = eval_fn(valid_data_loader , model, device )
        accuracy = metrics.accuracy_score(targets , outputs)

        validation_time = format_time(time.time() - t0)
        print("  Validation Loss: {0:.2f}".format(avg_val_loss))
        print("  Validation took: {:}".format(validation_time))
        print("  Validation accuracy: {:}".format(accuracy))

        training_stats.append(
            {
                'epoch': epoch + 1,
                'Training Loss': avg_train_loss,
                'Valid. Loss': avg_val_loss,
                'Valid. Accur.': accuracy,
                'Training Time': training_time,
                'Validation Time': validation_time
            }
        )

        # Saved after every epoch, regardless of whether accuracy improved.
        torch.save({
            'epoch': epoch + 1,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': avg_val_loss,
            'acc' : accuracy
            }, OUTPUT_PATH)

        # Best accuracy is only tracked and reported; it does not gate the save above.
        if(accuracy > best_accuracy):
            print(f"Accuracy Score = {accuracy}")
            best_accuracy = accuracy

    return training_stats


In [None]:
#load data
df = pd.read_pickle('final_df')

# df.drop(['TITLE', 'DESCRIPTION', 'BULLET_POINTS', 'BRAND'] ,axis = 1,  inplace = True)
df.head()

Unnamed: 0,TITLE,DESCRIPTION,BULLET_POINTS,BRAND,text,BROWSE_NODE_ID
0,"Pete The Cat Bedtime Blues Doll, 14.5 Inch","Pete the Cat is the coolest, most popular cat ...","[Pete the Cat Bedtime Blues plush doll,Based o...",MerryMakers,"Pete The Cat Bedtime Blues Doll, 14.5 Inch Pet...",0
1,"The New Yorker NYHM014 Refrigerator Magnet, 2 ...",The New Yorker Handsome Cello Wrapped Hard Mag...,[Cat In A Tea Cup by New Yorker cover artist G...,The New Yorker,"The New Yorker NYHM014 Refrigerator Magnet, 2 ...",1
2,The Ultimate Self-Sufficiency Handbook: A Comp...,,Skyhorse Publishing,imusti,The Ultimate Self-Sufficiency Handbook: A Comp...,2
3,Amway Nutrilite Kids Chewable Iron Tablets (100),,"[Nutrilite Kids,Chewable Iron Tablets,Quantity...",Amway,Amway Nutrilite Kids Chewable Iron Tablets (10...,3
4,Teacher Planner Company A4 6 Lesson Academic T...,,,,Teacher Planner Company A4 6 Lesson Academic T...,4


In [None]:

len(df)

2903024

In [None]:
# def drop_sparse_classes(df):
#     unique_labels, label_counts = np.unique(df.BROWSE_NODE_ID, return_counts=True)
#     drop_labels = unique_labels[label_counts < 10]
#     _df = df.apply(lambda x: x['BROWSE_NODE_ID'] in unique_labels[label_counts < 10], axis = 1)
#     df_drop = df[_df]
#     return df_drop

def preprocess(df):
    """Drop, in place, every row whose BROWSE_NODE_ID occurs exactly once in ``df``.

    Prints how many labels have a single sample, then removes those rows from
    ``df`` itself (nothing is returned). The removal is one vectorised drop
    instead of a per-row ``iterrows`` + ``drop`` loop, and a frame without any
    singleton label is handled (the old ``value_counts()[1]`` lookup raised
    KeyError in that case).

    NOTE(review): presumably needed because the stratified splits used later
    require at least two samples per class — confirm.
    """
    label_counts = df['BROWSE_NODE_ID'].value_counts()
    singleton_labels = label_counts.index[label_counts == 1]
    print("No. of labels having only one sample : ", len(singleton_labels))

    #So, we will have to remove those samples
    singleton_rows = df.index[df['BROWSE_NODE_ID'].isin(singleton_labels)]
    df.drop(index = singleton_rows, inplace = True)

In [None]:
preprocess(df)
# df = drop_sparse_classes(df)

No. of labels having only one sample :  247


  0%|          | 0/2903024 [00:00<?, ?it/s]

In [None]:
# Only `text` and `BROWSE_NODE_ID` are needed from here on. Rebinding with
# errors='ignore' keeps the cell re-runnable once the raw columns are gone.
df = df.drop(columns = ['TITLE', 'DESCRIPTION', 'BULLET_POINTS', 'BRAND'] , errors = 'ignore')
print(len(df))
print(df['BROWSE_NODE_ID'].nunique())

2902777
9672


In [None]:
# Draw a class-stratified 17.5% subsample of the data: train_test_split is used
# only as a sampler, so the larger "train" part is discarded into `_` and the
# "test" part is kept as df_sixM. The fixed random_state makes the sample repeatable.
_ , df_sixM = model_selection.train_test_split(
        df,
        test_size = 0.175,
        random_state = 2000,
        stratify = df.BROWSE_NODE_ID.values
    )

In [None]:
preprocess(df_sixM)

No. of labels having only one sample :  2109


  0%|          | 0/507986 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [None]:
print(len(df_sixM))
print(len(df_sixM['BROWSE_NODE_ID'].value_counts()))
df_sixM.head()

505877
7026


Unnamed: 0,text,BROWSE_NODE_ID
1819235,CAPPL Artificial Vertical Garden Green Leaf Ar...,5254
118123,"Super Chunky Yarn, Soft Handspun 100% Merino B...",16946
2732176,Sharp Icon Plain Color Glitter Bling Designer ...,1045
496557,Silicone Pastry Clay Bakeware Baking Mat Tray ...,4753
2017026,Case Creation Love Birds Colorful 3D Diamond G...,1045


In [None]:
# !pip install joblib
import joblib

In [None]:
def encoding(df):
    """Label-encode ``df['BROWSE_NODE_ID']`` in place and return the reverse mapping.

    The column is overwritten with the encoded class indices; the returned
    dict maps each class index back to its original BROWSE_NODE_ID.
    """
    label_encoder = LabelEncoder()
    encoded_labels = label_encoder.fit_transform(df['BROWSE_NODE_ID'])
    df['BROWSE_NODE_ID'] = encoded_labels

    original_ids = label_encoder.classes_
    class_indices = label_encoder.transform(original_ids)
    return {index: original for index, original in zip(class_indices, original_ids)}

In [None]:
decoder = encoding(df_sixM)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
joblib.dump(decoder , 'decoder.joblib')

['decoder.joblib']

In [None]:
df_sixM.to_pickle('df_sixM')

In [None]:
# # preprocess(df_fiveM)
# _ , df_fiveM = model_selection.train_test_split(
#         df_fiveM,
#         test_size = 0.07,
#         random_state = 2000,
#         stratify = df_fiveM.BROWSE_NODE_ID.values
#     )
# preprocess(df_fiveM)
# decoder = encoding(df_fiveM)

In [None]:
training_stats = run(df_sixM)

There are 1 GPU's.
GPU Name:  Tesla P100-PCIE-16GB


Some weights of the model checkpoint at ../input/bert-base-uncased/ were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).



Training...


  0%|          | 0/56912 [00:00<?, ?it/s]


  Average training loss: 3.58
  Training epcoh took: 7:09:16

Running Validation...


  0%|          | 0/12647 [00:00<?, ?it/s]

  Validation Loss: 2.49
  Validation took: 0:16:59
  Validation accuracy: 0.6192575314303788
Accuracy Score = 0.6192575314303788

Training...


  0%|          | 0/56912 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
training_stats

In [None]:
import IPython

In [None]:
!pip install --upgrade IPython

Collecting IPython
  Downloading ipython-7.25.0-py3-none-any.whl (786 kB)
[K     |████████████████████████████████| 786 kB 604 kB/s eta 0:00:01
Installing collected packages: IPython
  Attempting uninstall: IPython
    Found existing installation: ipython 7.24.1
    Uninstalling ipython-7.24.1:
      Successfully uninstalled ipython-7.24.1
Successfully installed IPython-7.25.0


In [None]:
from IPython import display

In [None]:
display.FileLink(r'checkpoint.pt')