In [1]:
from datasets import load_dataset

column_names=["labels","text"]
train_dataset = load_dataset("csv", data_files="..\datasets\iitp-product-reviews\hi\hi-train.csv", split="train", column_names=column_names, delimiter=',')

Found cached dataset csv (C:/Users/arifa/.cache/huggingface/datasets/csv/default-a55165b542bdbb4e/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


In [2]:
val_dataset = load_dataset("csv", data_files="..\datasets\iitp-product-reviews\hi\hi-valid.csv", split="train", column_names=["labels","text"], delimiter=',')

Found cached dataset csv (C:/Users/arifa/.cache/huggingface/datasets/csv/default-cce6e8c355045cbb/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


In [3]:
test_dataset = load_dataset("csv", data_files="..\datasets\iitp-product-reviews\hi\hi-test.csv", split="train", column_names=["labels","text"], delimiter=',')

Found cached dataset csv (C:/Users/arifa/.cache/huggingface/datasets/csv/default-810710c2a6679d6b/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


In [4]:
from datasets import Dataset, DatasetDict

review_datasets = DatasetDict()
review_datasets['train'] = train_dataset
review_datasets['validation'] = val_dataset
review_datasets['test'] = test_dataset

In [5]:
review_datasets

DatasetDict({
    train: Dataset({
        features: ['labels', 'text'],
        num_rows: 4182
    })
    validation: Dataset({
        features: ['labels', 'text'],
        num_rows: 523
    })
    test: Dataset({
        features: ['labels', 'text'],
        num_rows: 523
    })
})

In [6]:
train_dataset.set_format("pandas")

In [7]:
# get label counts for both classes
label_counts = train_dataset["labels"].value_counts()
num_labels = (len(label_counts.keys()))

In [8]:
label_counts

labels
positive    1826
neutral     1789
negative     567
Name: count, dtype: int64

In [9]:
max_token_length = max(train_dataset['text'].str.len())
max_token_length

354

In [10]:
train_dataset.reset_format()

In [11]:
from transformers import set_seed

#set_seed(30)
set_seed(42)

In [12]:
from transformers import BertForSequenceClassification, BertConfig, CharacterBertModel, CharacterBertTokenizer

#### LOADING BERT FOR CLASSIFICATION ####

config = BertConfig.from_pretrained('bert-base-uncased', num_labels=num_labels)  # binary classification
model = BertForSequenceClassification(config=config)

In [13]:
model.bert.embeddings.word_embeddings  # wordpiece embeddings

Embedding(30522, 768, padding_idx=0)

In [14]:
#### REPLACING BERT WITH CHARACTER_BERT ####

character_bert_model = CharacterBertModel.from_pretrained(
    "E:\Documents\Character Bert\Hate Speech\character-bert-hindi")
model.bert = character_bert_model

Some weights of the model checkpoint at E:\Documents\Character Bert\Hate Speech\character-bert-hindi were not used when initializing CharacterBertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing CharacterBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CharacterBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [15]:
model.bert.embeddings.word_embeddings  # wordpieces are replaced with a CharacterCNN

CharacterCnn(
  (char_conv_0): Conv1d(16, 32, kernel_size=(1,), stride=(1,))
  (char_conv_1): Conv1d(16, 32, kernel_size=(2,), stride=(1,))
  (char_conv_2): Conv1d(16, 64, kernel_size=(3,), stride=(1,))
  (char_conv_3): Conv1d(16, 128, kernel_size=(4,), stride=(1,))
  (char_conv_4): Conv1d(16, 256, kernel_size=(5,), stride=(1,))
  (char_conv_5): Conv1d(16, 512, kernel_size=(6,), stride=(1,))
  (char_conv_6): Conv1d(16, 1024, kernel_size=(7,), stride=(1,))
  (_highways): Highway(
    (_layers): ModuleList(
      (0-1): 2 x Linear(in_features=2048, out_features=4096, bias=True)
    )
  )
  (_projection): Linear(in_features=2048, out_features=768, bias=True)
)

In [16]:
tokenizer = CharacterBertTokenizer(strip_accents=None, do_lower_case=None)

In [17]:
# num_added_tokens = tokenizer.add_tokens(["5","7","8","9"])

In [18]:
# Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e., the length of the tokenizer.
# model.resize_token_embeddings(len(tokenizer))

In [19]:
def tokenize_function(example):
    return tokenizer(example["text"], truncation=True)
    #return tokenizer(example["text"], truncation=True, max_length=128)

In [20]:
from transformers import DataCollatorWithPadding

tokenized_datasets = review_datasets.map(tokenize_function, batched=True, remove_columns=["text"])
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Loading cached processed dataset at C:\Users\arifa\.cache\huggingface\datasets\csv\default-a55165b542bdbb4e\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-c205e3a2333a16a3.arrow
Loading cached processed dataset at C:\Users\arifa\.cache\huggingface\datasets\csv\default-cce6e8c355045cbb\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-38febcb544c5b30f.arrow
Loading cached processed dataset at C:\Users\arifa\.cache\huggingface\datasets\csv\default-810710c2a6679d6b\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-b45b299d2127d356.arrow


In [21]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 4182
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 523
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 523
    })
})

In [22]:
temp = tokenized_datasets.filter(lambda x:x if 0 in x["input_ids"] else None)

Loading cached processed dataset at C:\Users\arifa\.cache\huggingface\datasets\csv\default-a55165b542bdbb4e\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-13c4ee372134cb06.arrow
Loading cached processed dataset at C:\Users\arifa\.cache\huggingface\datasets\csv\default-cce6e8c355045cbb\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-b8d44b70d97003e5.arrow
Loading cached processed dataset at C:\Users\arifa\.cache\huggingface\datasets\csv\default-810710c2a6679d6b\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-a57ae5c74bbd5968.arrow


In [23]:
temp

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 0
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 0
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 0
    })
})

In [24]:
for sample in temp["train"]:
    print(tokenizer.decode(sample["input_ids"]))

In [25]:
for sample in review_datasets["train"]:
    if "HD" in sample["text"]:
        print(sample["text"])

माइक्रोसॉफ्ट सरफेस 3 में 10.8 इंच की क्लियरटाइप फुल HD प्लस डिस्प्ले स्क्रीन है ।
टैबलेट के दाहिनी उपरी तरफ वाल्यूम रॉकर के बगल में पॉवर ऑन ऑफ बटन है जबकि पोंर्ट्रेट मोड में पकड़ने परा नीचे मिनी HDMI एवं मिनी USB पोर्ट तथा चार्जर की जगह है ।
इसके अलावा , कनेक्टिविटी के लिए यूएसबी 3.0 , यूएसबी 2.0 और HDMI पोर्ट दिए गए हैं ।
ये टीवी एंड्रॉइड ऑपरेटिंग सिस्टम पर भी काम करेगी , जिसका मॉडल नंबर MI TV 2 40-Inch Full - HD है ।
इसी जगह माइक्रोमैक्स के 4K टीवी HD से 8 गुना बेहतर क्वालिटी वाले 42 इंच स्क्रीन वाले टीवी की कीमत 39990 रुपए रखी गई है ।
माइक्रोमैक्स के इस टेलीविजन का मॉडल 50B5000FHD LED स्पोर्ट्स के नाम से है ।
टैबलेट के बाई तरफ 3.5 मिमी जैक , एचडीएमआई HDMI , मिनी यूएसवी , माइक्रो एसडी तथा चार्जिंग साकेट के सभी पोर्टस लगे है ।
माइक्रोमैक्स फनबुक की एक और विशेषता है , इसमें लगा डूअल कोर 400 MHz का ग्राफिक प्रोसेसर जिससे इस डिवाइस पर फुल HD 1080p का वीडियो प्लेबैक की सुविधा भी है , पर एक बार फिर कम डिसप्ले रिजॉल्यूशन हाई डेफिनिशन कॉन्टेंट के मजे को खराब कर देता है जिससे यूजर मिनी HDMI प

In [26]:
def assign_label(example):
    mapping = {"neutral":0, "positive":1, "negative":2}
    example['labels'] = mapping[example['labels']]
    return example

In [27]:
tokenized_datasets = tokenized_datasets.map(assign_label)
tokenized_datasets.set_format("torch")
tokenized_datasets.column_names

Loading cached processed dataset at C:\Users\arifa\.cache\huggingface\datasets\csv\default-a55165b542bdbb4e\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-15a48d2301b08239.arrow
Loading cached processed dataset at C:\Users\arifa\.cache\huggingface\datasets\csv\default-cce6e8c355045cbb\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-c38229a2ea2bdfdd.arrow
Loading cached processed dataset at C:\Users\arifa\.cache\huggingface\datasets\csv\default-810710c2a6679d6b\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-844bdf4ddd3bf1ab.arrow


{'train': ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
 'validation': ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
 'test': ['labels', 'input_ids', 'token_type_ids', 'attention_mask']}

In [28]:
samples = [tokenized_datasets["train"][i] for i in range(5)]
samples

for chunk in data_collator(samples)["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")


'>>> [CLS] एंडराॅयड के मामले में यह थोड़ा पीछे है । [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

'>>> [CLS] यह एस्पेक्ट रेशो का ईश्यू है और हम आशा करते हैं कि यह आने वाले अपडेट में फिक्स कर दिया जाएगा । [SEP]'

'>>> [CLS] लेकिन इस तरह के एक मॉडल के एक घर कंप्यूटर के लिए एक शानदार विकल्प हो सकता है । [SEP] [PAD] [PAD] [PAD]'

'>>> [CLS] गिर वन राष्ट्रीय उद्यान बाघ संरक्षित क्षेत्र है जो एशियाई बब्बर शेर के लिए विश्व प्रसिद्ध है । [SEP] [PAD] [PAD] [PAD] [PAD]'

'>>> [CLS] और हां, इस फिल्म में हर किरदार भारद्वाज को भरद्वाज क्यों बुलाता है? [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'


In [29]:
review_datasets["train"][:5]

{'labels': ['negative', 'neutral', 'positive', 'positive', 'neutral'],
 'text': ['एंडराॅयड के मामले में यह थोड़ा पीछे है ।',
  'यह एस्पेक्ट रेशो का ईश्यू है और हम आशा करते हैं कि यह आने वाले अपडेट में फिक्स कर दिया जाएगा ।',
  'लेकिन इस तरह के एक मॉडल के एक घर कंप्यूटर के लिए एक शानदार विकल्प हो सकता है ।',
  'गिर वन राष्ट्रीय उद्यान बाघ संरक्षित क्षेत्र है जो एशियाई बब्बर शेर के लिए विश्व प्रसिद्ध है ।',
  'और हां , इस फिल्म में हर किरदार भारद्वाज को भरद्वाज क्यों बुलाता है ?']}

In [30]:
from torch.utils.data import DataLoader
# batch_size = 16
batch_size = 32

train_dataloader = DataLoader(
    tokenized_datasets["train"], shuffle=True, batch_size=batch_size, collate_fn=data_collator
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], batch_size=batch_size, collate_fn=data_collator
)

In [31]:
for batch in train_dataloader:
    break
{k: v.shape for k, v in batch.items()}

{'labels': torch.Size([32]),
 'input_ids': torch.Size([32, 35, 50]),
 'token_type_ids': torch.Size([32, 35]),
 'attention_mask': torch.Size([32, 35])}

In [32]:
import torch
with torch.no_grad():
    outputs = model(**batch)
    print(outputs.loss, outputs.logits.shape)

tensor(1.1339) torch.Size([32, 3])


In [33]:
import numpy as np
import evaluate

metric_fun = evaluate.load("accuracy")

def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    metric_result = metric_fun.compute(references=labels, predictions=predictions)
    return {
        "accuracy": metric_result["accuracy"],
    }

In [34]:
#  disable weights and biases logging
import os
os.environ["WANDB_DISABLED"] = "true"

In [35]:
from transformers import TrainingArguments

batch_size = 16
# batch_size = 32
# Show the training loss with every epoch
logging_steps = len(tokenized_datasets["train"]) // batch_size


training_args = TrainingArguments(
    output_dir="models/bert-unigram-hindi-classifier",
    report_to = None,
    save_strategy="no",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    #learning_rate=3e-5,
    weight_decay=0.01,
    #weight_decay=0.02,
    #warmup_ratio = 0.1,
    #warmup_ratio = 0.05,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=4,
    #num_train_epochs=5,
    #push_to_hub=True,
    fp16=True,
)

Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [36]:
# from datasets import concatenate_datasets

# entire_train = concatenate_datasets([tokenized_datasets["train"], tokenized_datasets["validation"]]) 

In [37]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    #train_dataset=entire_train,
    eval_dataset=tokenized_datasets["validation"],
    #eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Using amp half precision backend


In [38]:
import transformers
trainer.remove_callback(transformers.integrations.TensorBoardCallback)

In [39]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 523
  Batch size = 16


{'eval_loss': 1.126199722290039,
 'eval_accuracy': 0.30975143403441685,
 'eval_runtime': 3.8778,
 'eval_samples_per_second': 134.871,
 'eval_steps_per_second': 8.51}

In [40]:
trainer.train()

***** Running training *****
  Num examples = 4182
  Num Epochs = 4
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1048


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.652592,0.715105
2,0.656500,0.622668,0.743786
3,0.656500,0.691729,0.74761
4,0.308400,0.778569,0.749522


***** Running Evaluation *****
  Num examples = 523
  Batch size = 16
***** Running Evaluation *****
  Num examples = 523
  Batch size = 16
***** Running Evaluation *****
  Num examples = 523
  Batch size = 16
***** Running Evaluation *****
  Num examples = 523
  Batch size = 16


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=1048, training_loss=0.4679371405193824, metrics={'train_runtime': 124.7303, 'train_samples_per_second': 134.113, 'train_steps_per_second': 8.402, 'total_flos': 2.100212315721e+16, 'train_loss': 0.4679371405193824, 'epoch': 4.0})

In [41]:
trainer.evaluate(tokenized_datasets["test"])

***** Running Evaluation *****
  Num examples = 523
  Batch size = 16


{'eval_loss': 0.7257173657417297,
 'eval_accuracy': 0.7609942638623327,
 'eval_runtime': 1.3886,
 'eval_samples_per_second': 376.628,
 'eval_steps_per_second': 23.764,
 'epoch': 4.0}

In [152]:
# trainer.save_model()

In [79]:
# model = AutoModelForSequenceClassification.from_pretrained("models/bert-unigram-hindi-classifier")
# model.to("cuda")

In [42]:
trainer.evaluate(tokenized_datasets["validation"])

***** Running Evaluation *****
  Num examples = 523
  Batch size = 16


{'eval_loss': 0.7785689234733582,
 'eval_accuracy': 0.7495219885277247,
 'eval_runtime': 1.4621,
 'eval_samples_per_second': 357.713,
 'eval_steps_per_second': 22.571,
 'epoch': 4.0}