In [None]:
from datasets import load_dataset

# Load the review CSV as a single "train" split.
# The path uses forward slashes: the previous backslash-separated literal
# relied on the invalid escape sequences "\d" and "\H" and was not portable.
review_dataset = load_dataset(
    "csv",
    data_files="../datasets/Hindi Product Review.csv",
    split="train",
)

In [None]:
# Display the loaded dataset object.
review_dataset

In [None]:
# Switch the dataset's output format to pandas (in place); the following cells
# call pandas methods (`value_counts`, `.str.len`) on its columns.
review_dataset.set_format("pandas")

In [None]:
# Count how many rows carry each distinct value of the "Label" column.
# NOTE(review): this runs before the "Label"/"neutral" rows are filtered out
# further down, so more than two distinct values may be reported here.
label_counts = review_dataset["Label"].value_counts()
# Number of distinct label values that were counted.
num_labels = len(label_counts.index)

In [None]:
# Show the per-label row counts.
label_counts

In [None]:
# Length of the longest review, taken from the string lengths of the "Text" column.
# NOTE(review): despite the variable name, `.str.len()` measures characters,
# not tokenizer tokens.
text_lengths = review_dataset['Text'].str.len()
max_token_length = max(text_lengths)
max_token_length

In [None]:
# Undo the earlier set_format("pandas") call so the dataset uses its default output format again.
review_dataset.reset_format()

In [None]:
from transformers import set_seed

# Seed the random number generators through transformers' helper.
set_seed(30)

In [None]:
from transformers import PreTrainedTokenizerFast, AutoModelForSequenceClassification

# Directory (relative to this notebook) from which the tokenizer is loaded.
pretrained_dir = "../Hindi Pretraining/models/unigram/bert-base-pretrained-hindi"
tokenizer = PreTrainedTokenizerFast.from_pretrained(pretrained_dir)

In [None]:
# Load the checkpoint with a sequence-classification head sized for two labels.
pretrained_dir = "../Hindi Pretraining/models/unigram/bert-base-pretrained-hindi"
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_dir,
    num_labels=2,
)

In [None]:
# num_added_tokens = tokenizer.add_tokens(["5","7","8","9"])

In [None]:
# Notice: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e., the length of the tokenizer.
# model.resize_token_embeddings(len(tokenizer))

In [None]:
def tokenize_function(example):
    """Tokenize the "Text" field of ``example`` with the notebook-level tokenizer.

    Truncation is enabled and no padding argument is passed. The tokenizer's
    return value is handed back unchanged.
    """
    texts = example["Text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

In [None]:
from transformers import DataCollatorWithPadding

# Collator built from the same tokenizer; it is used later to batch examples.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Tokenize all reviews in batched mode and drop the raw "Text" column.
tokenized_dataset = review_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["Text"],
)

In [None]:
# Display the tokenized dataset object.
tokenized_dataset

In [None]:
# Keep only rows whose label is neither the literal string "Label"
# (presumably a repeated header row — confirm against the CSV) nor "neutral".
# A single pass with a boolean predicate replaces the two earlier passes, the
# first of which was fully covered by the second.
tokenized_dataset = tokenized_dataset.filter(
    lambda row: row["Label"] not in ("Label", "neutral")
)

In [None]:
# Display the dataset again after the label filter.
tokenized_dataset

In [None]:
# Collect the examples whose token ids contain the id 0, for inspection.
temp = tokenized_dataset.filter(lambda row: 0 in row["input_ids"])

In [None]:
# Display the subset selected in the previous cell.
temp

In [None]:
def assign_label(example):
    """Replace the string stored under ``example['Label']`` with its class id.

    "positive" maps to 0 and "negative" maps to 1; any other value raises
    ``KeyError``. The example is modified in place and the same object is
    returned.
    """
    class_ids = dict(positive=0, negative=1)
    label_name = example['Label']
    example['Label'] = class_ids[label_name]
    return example

In [None]:
# Turn the string labels into integer ids and rename the column to "labels".
tokenized_dataset = (
    tokenized_dataset
    .map(assign_label)
    .rename_column("Label", "labels")
)
# Switch the output format to torch (in place), then list the columns.
tokenized_dataset.set_format("torch")
tokenized_dataset.column_names

In [None]:
# Take the first 20 tokenized examples, batch them with the collator, then
# decode and print every row of the resulting "input_ids".
samples = [tokenized_dataset[idx] for idx in range(20)]

collated = data_collator(samples)
for token_ids in collated["input_ids"]:
    decoded = tokenizer.decode(token_ids)
    print(f"\n'>>> {decoded}'")

In [None]:
# Show the first 20 rows of the original (unfiltered) dataset.
# NOTE(review): rows were filtered out of tokenized_dataset earlier, so these
# may not line up one-to-one with the 20 decoded samples — confirm.
review_dataset[:20]

In [None]:
stratify_column_name = "labels"

# Encode "labels" as a class-label column and hold out 20% of the rows for
# evaluation. The split is stratified on that column so both parts keep the
# class ratio; previously the column was encoded for stratification but the
# `stratify_by_column` argument was disabled, leaving a plain random split.
downsampled_dataset = tokenized_dataset.class_encode_column(
    stratify_column_name
).train_test_split(
    test_size=0.2,
    seed=42,
    stratify_by_column=stratify_column_name,
)

In [None]:
# Display the train/test split.
downsampled_dataset

In [None]:
from torch.utils.data import DataLoader
batch_size = 16

# Options shared by both loaders: same batch size, batching done by the collator.
loader_options = dict(batch_size=batch_size, collate_fn=data_collator)

# Only the training loader shuffles its examples.
train_dataloader = DataLoader(downsampled_dataset["train"], shuffle=True, **loader_options)
eval_dataloader = DataLoader(downsampled_dataset["test"], **loader_options)

In [None]:
# Fetch the first batch from the training loader and show each tensor's shape.
batch = next(iter(train_dataloader))
{name: tensor.shape for name, tensor in batch.items()}

In [None]:
import torch
# Run one forward pass on the sampled batch without tracking gradients, then
# print the loss and the shape of the logits.
with torch.no_grad():
    outputs = model(**batch)
loss, logits = outputs.loss, outputs.logits
print(loss, logits.shape)

In [None]:
# `transformers.AdamW` is deprecated and no longer shipped in recent
# transformers releases; the PyTorch implementation is used instead.
from torch.optim import AdamW

# eps and weight_decay are set explicitly to the defaults of the transformers
# class this replaces (torch's own defaults differ).
# NOTE(review): the Trainer constructed later in this notebook is not given
# this optimizer, so it appears to be used only for manual experiments.
optimizer = AdamW(model.parameters(), lr=3e-5, eps=1e-6, weight_decay=0.0)

In [None]:
from transformers import get_scheduler

num_epochs = 3
# Total number of steps: batches per epoch times the number of epochs.
num_training_steps = len(train_dataloader) * num_epochs

# Linear schedule over all steps, with no warm-up steps.
scheduler_options = dict(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
lr_scheduler = get_scheduler("linear", **scheduler_options)
print(num_training_steps)

In [None]:
import numpy as np
import evaluate

# F1 metric object shared by every call to compute_metrics.
metric_fun = evaluate.load("f1")


def compute_metrics(eval_preds):
    """Compute the macro-averaged F1 score for one evaluation pass.

    ``eval_preds`` unpacks into ``(logits, labels)``. The predicted class of
    each row is the argmax over the last axis of the logits.

    Returns a dict with the single key ``"f1"``.
    """
    logits, labels = eval_preds
    predicted_ids = np.argmax(logits, axis=-1)
    result = metric_fun.compute(
        predictions=predicted_ids,
        references=labels,
        average='macro',
    )
    return dict(f1=result["f1"])

In [None]:
# Turn off Weights & Biases logging through its environment variable.
import os

os.environ.update({"WANDB_DISABLED": "true"})

In [None]:
from transformers import TrainingArguments

import torch

batch_size = 16
# Log the training loss about once per epoch. Clamped to at least 1 so a train
# split smaller than one batch does not produce logging_steps=0.
logging_steps = max(1, len(downsampled_dataset["train"]) // batch_size)


training_args = TrainingArguments(
    # NOTE(review): the transformers docs use the string "none" to switch off
    # all reporting integrations; confirm that passing None does what is intended.
    report_to=None,
    # NOTE(review): the directory name says "bengali" while the data file is
    # the Hindi product-review CSV — confirm the intended output location.
    output_dir="models/bert-unigram-bengali-classifier",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    save_strategy="no",
    learning_rate=3e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=4,
    # Mixed precision needs a CUDA device; enable it only when one is present
    # so the notebook also runs on the CPU fallback it supports further down.
    fp16=torch.cuda.is_available(),
    logging_steps=logging_steps,
)

In [None]:
from transformers import Trainer

# Everything the Trainer needs: the model, its arguments, the train/test
# parts of the split, the collator, the tokenizer and the metric callback.
trainer_options = dict(
    model=model,
    args=training_args,
    train_dataset=downsampled_dataset["train"],
    eval_dataset=downsampled_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_options)

In [None]:
# Fine-tune the model with the settings in training_args.
trainer.train()

In [None]:
# Evaluate on the Trainer's eval_dataset and keep the returned metrics.
metric = trainer.evaluate()

In [None]:
# trainer.save_model()

In [None]:
# Show the F1 value from the evaluation metrics.
metric['eval_f1']

In [None]:
# Run prediction on the held-out split. The fields are read by name instead of
# tuple-unpacking the third element into `_`, which would rebind IPython's
# last-output variable for the rest of the session.
prediction_output = trainer.predict(downsampled_dataset["test"])
y_preds = prediction_output.predictions
y_true = prediction_output.label_ids

In [None]:
# Reduce the per-class scores to predicted class ids. The rank check makes the
# cell safe to re-run: once y_preds is 1-D, argmax is not applied a second time.
if y_preds.ndim > 1:
    y_preds = np.argmax(y_preds, axis=-1)

In [None]:
# y_preds

In [None]:
from sklearn.metrics import classification_report
# Display names for class ids 0 and 1, in the order assign_label maps them.
target_names = ["positive", "negative"]
report = classification_report(y_true, y_preds, target_names=target_names)
print(report)

In [None]:
# Select the original reviews labelled "negative", for inspection.
temp = review_dataset.filter(lambda row: row["Label"] == 'negative')

In [None]:
# Show the first 10 rows of the negative-review subset.
temp[:10]

In [None]:
import torch
from transformers import TrainingArguments, Trainer

# Use the GPU when CUDA is available, otherwise the CPU, and move the model there.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
model.to(device)
device

In [None]:
import numpy as np
import torch
from tqdm.auto import tqdm
from sklearn.model_selection import StratifiedKFold

from datasets import load_dataset, DatasetDict

# F1 of each fold's final evaluation, in fold order.
scores = list()

# Five folds, stratified on the labels.
folds = StratifiedKFold(n_splits=5)

tokenized_dataset = tokenized_dataset.shuffle(seed=42)

# The split is driven by the labels; np.zeros() is only a placeholder feature
# array with the right number of rows, because split() just needs indices.
splits = folds.split(np.zeros(tokenized_dataset.num_rows), tokenized_dataset["labels"])

# Same batch size for every fold.
batch_size = 16

for train_idxs, val_idxs in splits:
    # Train/validation views of the shuffled dataset for this fold.
    fold_dataset = DatasetDict({
        "train": tokenized_dataset.select(train_idxs),
        "validation": tokenized_dataset.select(val_idxs),
    })

    # Reload the pretrained checkpoint for every fold so folds do not share
    # fine-tuned weights.
    model = AutoModelForSequenceClassification.from_pretrained(
        "../Hindi Pretraining/models/unigram/bert-base-pretrained-hindi",
        num_labels=2,
    )
    model.to(device)

    # Log the training loss about once per epoch; clamped to at least 1 so a
    # fold smaller than one batch does not produce logging_steps=0.
    logging_steps = max(1, len(fold_dataset["train"]) // batch_size)

    training_args = TrainingArguments(
        # NOTE(review): the transformers docs use the string "none" to switch
        # off reporting integrations; confirm that None does what is intended.
        report_to=None,
        output_dir="models/cross-validation",
        overwrite_output_dir=True,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=3e-5,
        weight_decay=0.01,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=4,
        # Mixed precision needs a CUDA device; enable it only when one is
        # present so the loop also runs on the CPU fallback.
        fp16=torch.cuda.is_available(),
        logging_steps=logging_steps,
        metric_for_best_model='f1',
        load_best_model_at_end=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=fold_dataset["train"],
        eval_dataset=fold_dataset["validation"],
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    # Evaluate on this fold's validation part and record its F1.
    metric = trainer.evaluate()
    scores.append(metric['eval_f1'])
    print(metric['eval_f1'])

In [None]:
# Show the per-fold F1 scores.
scores

In [None]:
# Arithmetic mean of the per-fold F1 scores.
fold_count = len(scores)
sum(scores) / fold_count