In [None]:
from datasets import load_dataset

# Column names handed to the CSV loader for the two columns of each file.
column_names = ["labels", "text"]

# Raw string literal: the Windows-style relative path contains backslash
# sequences (\d, \i, \h) that are invalid escapes in a normal string literal.
# The resulting path value is unchanged.
train_dataset = load_dataset(
    "csv",
    data_files=r"..\datasets\iitp-movie-reviews\hi\hi-train.csv",
    split="train",
    column_names=column_names,
    delimiter=",",
)

In [None]:
# Validation split. Raw string literal: the backslashes in the Windows-style
# path (\d, \i, \h) are invalid escapes in a normal string literal; the path
# value itself is unchanged.
val_dataset = load_dataset(
    "csv",
    data_files=r"..\datasets\iitp-movie-reviews\hi\hi-valid.csv",
    split="train",
    column_names=["labels", "text"],
    delimiter=",",
)

In [None]:
# Test split. Raw string literal: the backslashes in the Windows-style path
# (\d, \i, \h) are invalid escapes in a normal string literal; the path value
# itself is unchanged.
test_dataset = load_dataset(
    "csv",
    data_files=r"..\datasets\iitp-movie-reviews\hi\hi-test.csv",
    split="train",
    column_names=["labels", "text"],
    delimiter=",",
)

In [None]:
from datasets import Dataset, DatasetDict

# Bundle the three loaded splits into a single DatasetDict keyed by split name.
review_datasets = DatasetDict(
    {
        "train": train_dataset,
        "validation": val_dataset,
        "test": test_dataset,
    }
)

In [None]:
# Display the DatasetDict that holds the train / validation / test splits.
review_datasets

In [None]:
# Switch train_dataset to pandas output for the inspection cells that follow.
# review_datasets["train"] is this same object, so it is affected as well
# until reset_format() is called further down.
train_dataset.set_format("pandas")

In [None]:
# Count the training examples per label; the number of distinct labels is
# later passed to the model config as num_labels.
label_counts = train_dataset["labels"].value_counts()
num_labels = len(label_counts)

In [None]:
# Display the per-label example counts of the training split.
label_counts

In [None]:
# Length of the longest training review. Despite the variable name, .str.len()
# measures characters, not tokens. The vectorised Series.max() skips missing
# values, whereas the built-in max() over a Series gives order-dependent
# results if any length is NaN.
max_token_length = int(train_dataset["text"].str.len().max())
max_token_length

In [None]:
# Undo the earlier set_format("pandas") call on train_dataset before the
# dataset is tokenized.
train_dataset.reset_format()

In [None]:
from transformers import set_seed

# Fix the random seed for the run (30 was tried previously).
set_seed(42)

In [None]:
from transformers import BertForSequenceClassification, BertConfig, CharacterBertModel, CharacterBertTokenizer

#### LOADING BERT FOR CLASSIFICATION ####

# Start from the bert-base-uncased configuration, with one output per distinct
# label found in the training split (num_labels computed above).
config = BertConfig.from_pretrained('bert-base-uncased', num_labels=num_labels)  # one output per sentiment label
# The classifier is instantiated from the config only (no from_pretrained call here).
model = BertForSequenceClassification(config=config)

In [None]:
# Inspect the embedding layer before the encoder is swapped out.
model.bert.embeddings.word_embeddings  # wordpiece embeddings

In [None]:
#### REPLACING BERT WITH CHARACTER_BERT ####

# Load the CharacterBERT encoder from a local checkpoint directory and use it
# as the encoder of the classification model.
# Raw string literal: the Windows path contains backslash sequences
# (\D, \C, \H, \c) that are invalid escapes in a normal string literal; the
# path value itself is unchanged.
# NOTE(review): this is a machine-specific absolute path; consider moving it
# to a configurable constant.
character_bert_model = CharacterBertModel.from_pretrained(
    r"E:\Documents\Character Bert\Hate Speech\character-bert-hindi"
)
model.bert = character_bert_model

In [None]:
# Inspect the embedding layer again after the encoder swap.
model.bert.embeddings.word_embeddings  # wordpieces are replaced with a CharacterCNN

In [None]:
# Tokenizer paired with the CharacterBERT encoder.
# NOTE(review): strip_accents=None / do_lower_case=None presumably leave the
# Hindi text unnormalised; confirm against the tokenizer's defaults.
tokenizer = CharacterBertTokenizer(strip_accents=None, do_lower_case=None)

In [None]:
# num_added_tokens = tokenizer.add_tokens(["5","7","8","9"])

In [None]:
# Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e., the length of the tokenizer.
# model.resize_token_embeddings(len(tokenizer))

In [None]:
def tokenize_function(example):
    """Tokenize the ``text`` field of ``example`` with the notebook-level tokenizer.

    Truncation is enabled with ``max_length=128``. The tokenizer's output is
    returned unchanged.
    """
    texts = example["text"]
    encoded = tokenizer(texts, truncation=True, max_length=128)
    return encoded

In [None]:
from transformers import DataCollatorWithPadding

# Collator that pads the examples of a batch using the tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Tokenize every split in batches and drop the raw "text" column afterwards.
tokenized_datasets = review_datasets.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"],
)

In [None]:
# Display the tokenized splits.
tokenized_datasets

In [None]:
# Keep only the examples whose input_ids contain the value 0. The predicate
# returns a plain bool instead of the example dict / None.
# NOTE(review): if input_ids holds one list of character ids per token, the
# membership test compares 0 against those inner lists and never matches;
# confirm this check does what was intended.
temp = tokenized_datasets.filter(lambda example: 0 in example["input_ids"])

In [None]:
# Display the filtered subset.
temp

In [None]:
# String sentiment label -> integer class id.
LABEL_TO_ID = {"neutral": 0, "positive": 1, "negative": 2}


def assign_label(example):
    """Replace the string label in ``example["labels"]`` with its integer id.

    Args:
        example: A mapping with a ``"labels"`` entry holding one of
            ``"neutral"``, ``"positive"`` or ``"negative"``.

    Returns:
        The same ``example`` object, updated in place.

    Raises:
        KeyError: If the label is not one of the known labels. The message
            names the offending value and the accepted labels.
    """
    label = example["labels"]
    try:
        example["labels"] = LABEL_TO_ID[label]
    except KeyError:
        raise KeyError(
            f"Unexpected label {label!r}; expected one of {sorted(LABEL_TO_ID)}"
        ) from None
    return example

In [None]:
# Convert the string labels to integer ids first, then switch the output
# format to torch tensors; the last line displays the column names per split.
tokenized_datasets = tokenized_datasets.map(assign_label)
tokenized_datasets.set_format("torch")
tokenized_datasets.column_names

In [None]:
# Sanity check: collate the first five tokenized training examples into one
# padded batch and print the decoded text of each row.
samples = [tokenized_datasets["train"][idx] for idx in range(5)]
padded_batch = data_collator(samples)

for chunk in padded_batch["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")

In [None]:
# Show the first five raw (untokenized) training rows for comparison.
review_datasets["train"][:5]

In [None]:
from torch.utils.data import DataLoader
# Number of examples per batch for both loaders.
batch_size = 32

# The training loader shuffles; the validation loader passes no shuffle
# argument. Both pad each batch through data_collator.
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    batch_size=batch_size,
    shuffle=True,
    collate_fn=data_collator,
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"],
    batch_size=batch_size,
    collate_fn=data_collator,
)

In [None]:
# Pull a single batch from the training loader and show the shape of each
# tensor in it. next(iter(...)) fails immediately on an empty loader instead
# of silently leaving `batch` undefined.
batch = next(iter(train_dataloader))
{name: tensor.shape for name, tensor in batch.items()}

In [None]:
import torch
# Smoke test: run the sampled batch through the model without tracking
# gradients and print the loss and the shape of the logits.
with torch.no_grad():
    outputs = model(**batch)
    print(outputs.loss, outputs.logits.shape)

In [None]:
import numpy as np
import evaluate

# Accuracy metric object used by compute_metrics.
metric_fun = evaluate.load("accuracy")

def compute_metrics(eval_preds):
    """Compute accuracy for a ``(logits, labels)`` pair.

    The predicted class of each example is the argmax over the last axis of
    the logits. The score comes from the notebook-level ``metric_fun``.

    Returns:
        A dict with a single ``"accuracy"`` entry.
    """
    logits, labels = eval_preds
    predicted_classes = np.argmax(logits, axis=-1)
    scores = metric_fun.compute(predictions=predicted_classes, references=labels)
    return {"accuracy": scores["accuracy"]}

In [None]:
# Disable Weights & Biases logging by setting the WANDB_DISABLED environment
# variable for this process.
import os
os.environ["WANDB_DISABLED"] = "true"

In [None]:
from transformers import TrainingArguments

# Per-device batch size for both training and evaluation.
batch_size = 32

# Log the training loss roughly once per epoch: one log every
# (number of training examples // batch size) optimizer steps. Clamped to at
# least 1 so a training set smaller than one batch still gives a valid interval.
logging_steps = max(1, len(tokenized_datasets["train"]) // batch_size)

training_args = TrainingArguments(
    output_dir="models/bert-unigram-hindi-classifier",
    # The string "none" turns off all reporting integrations; passing None
    # falls back to the library default instead of disabling them.
    report_to="none",
    save_strategy="no",  # no checkpoints are written during training
    evaluation_strategy="epoch",  # evaluate at the end of every epoch
    logging_steps=logging_steps,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    # NOTE(review): mixed precision presumably requires a CUDA device; confirm
    # before running on a CPU-only machine.
    fp16=True,
)

In [None]:
# from datasets import concatenate_datasets

# entire_train = concatenate_datasets([tokenized_datasets["train"], tokenized_datasets["validation"]]) 

In [None]:
from transformers import Trainer

# Wire the model, training arguments, tokenized train/validation splits,
# padding collator, tokenizer and accuracy callback into one Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    # alternative: entire_train (train + validation), see the commented-out cell above
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Detach the TensorBoard callback from the trainer.
import transformers
trainer.remove_callback(transformers.integrations.TensorBoardCallback)

In [None]:
# Baseline: evaluate on the trainer's eval_dataset (the validation split)
# before any fine-tuning.
trainer.evaluate()

In [None]:
# Fine-tune the model with the settings in training_args.
trainer.train()

In [None]:
# Evaluate the fine-tuned model on the held-out test split.
trainer.evaluate(tokenized_datasets["test"])

In [None]:
# trainer.save_model()

In [None]:
# model = AutoModelForSequenceClassification.from_pretrained("models/bert-unigram-hindi-classifier")
# model.to("cuda")

In [None]:
# Evaluate the current model on the validation split.
trainer.evaluate(tokenized_datasets["validation"])