In [1]:
from datasets import load_dataset

train_dataset = load_dataset("json", data_files="..\datasets\copa-translated\\hi\\train.jsonl", \
                             split="train")

Found cached dataset json (C:/Users/arifa/.cache/huggingface/datasets/json/default-4aa02a47edd70562/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


In [2]:
val_dataset = load_dataset("json", data_files="..\datasets\copa-translated\\hi\\val.jsonl", \
                            split="train")

Found cached dataset json (C:/Users/arifa/.cache/huggingface/datasets/json/default-bfbd5b7cdea343df/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


In [3]:
test_dataset = load_dataset("json", data_files="..\datasets\copa-translated\\hi\\test.jsonl", \
                             split="train")

Found cached dataset json (C:/Users/arifa/.cache/huggingface/datasets/json/default-0e1a32d75e6ec46c/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


In [4]:
from transformers import BertForSequenceClassification, BertConfig, CharacterBertModel, CharacterBertTokenizer

tokenizer = CharacterBertTokenizer(strip_accents=None, do_lower_case=None)

In [5]:
import pandas as pd

train_df = pd.DataFrame(columns=['premise', 'cause', 'label'])
val_df = pd.DataFrame(columns=['premise', 'cause','label'])
test_df = pd.DataFrame(columns=['premise', 'cause','label'])

In [6]:
choice_names = ['choice1', 'choice2']

In [7]:
translation = {
    'cause': ' कारण क्या है? ',
    'effect':' परिणाम क्या है? '
}

In [8]:
temp = "क्योंक|"
temp[:-1]

'क्योंक'

In [9]:
def preprocess_function(examples, df):
    idx = 0
    for sample in examples:
        df.loc[idx] = pd.Series({'premise':sample["premise"]+ translation[sample["question"]] +sample['choice1'], \
                                 'cause':sample["premise"]+ translation[sample["question"]]  +sample['choice2'],\
                                     'label': sample["label"]})
        idx = idx + 1
    
    return df

In [10]:
train_df = preprocess_function(train_dataset, train_df)

In [11]:
val_df = preprocess_function(val_dataset, val_df)

In [12]:
test_df = preprocess_function(test_dataset, test_df)

In [13]:
len(train_df)

362

In [14]:
from datasets import Dataset, DatasetDict

train_ds = Dataset.from_pandas(train_df)
val_ds = Dataset.from_pandas(val_df)
test_ds = Dataset.from_pandas(test_df)

In [15]:
datasets = DatasetDict()
datasets['train'] = train_ds
datasets['validation'] = val_ds
datasets['test'] = test_ds

In [16]:
datasets['train'][0]

{'premise': 'मेरे शरीर ने घास पर छाया डाली। कारण क्या है? सूरज उग रहा था।',
 'cause': 'मेरे शरीर ने घास पर छाया डाली। कारण क्या है? घास काटी गई।',
 'label': 0,
 '__index_level_0__': 0}

In [17]:
train_dataset[:1]

{'premise': ['मेरे शरीर ने घास पर छाया डाली।'],
 'choice1': ['सूरज उग रहा था।'],
 'choice2': ['घास काटी गई।'],
 'question': ['cause'],
 'idx': [0],
 'label': [0]}

In [18]:
def tokenize_function(example):
    return tokenizer(example["premise"],example["cause"], truncation=True)

In [19]:
datasets

DatasetDict({
    train: Dataset({
        features: ['premise', 'cause', 'label', '__index_level_0__'],
        num_rows: 362
    })
    validation: Dataset({
        features: ['premise', 'cause', 'label', '__index_level_0__'],
        num_rows: 88
    })
    test: Dataset({
        features: ['premise', 'cause', 'label', '__index_level_0__'],
        num_rows: 449
    })
})

In [20]:
from transformers import DataCollatorWithPadding

tokenized_datasets = datasets.map(tokenize_function, batched=True, remove_columns=['premise','cause','__index_level_0__'])
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True)

Map:   0%|          | 0/362 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/88 [00:00<?, ? examples/s]

Map:   0%|          | 0/449 [00:00<?, ? examples/s]

In [21]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 362
    })
    validation: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 88
    })
    test: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 449
    })
})

In [22]:
# def assign_label(example):
#     mapping = {' परिणाम क्या है? 0':0, ' परिणाम क्या है? 1':1,' कारण क्या है? 0':2, ' कारण क्या है? 1':3}
#     example['labels'] = mapping[example['label']]
#     return example

In [23]:
# tokenized_datasets = tokenized_datasets.map(assign_label).remove_columns('label')

In [24]:
len(tokenized_datasets["train"]['input_ids'][0])

36

In [25]:
for index, sample in enumerate(tokenized_datasets["train"][:5]):
    print(tokenizer.decode(tokenized_datasets["train"]['input_ids'][index]))
    print(tokenized_datasets["train"]['label'][index])

[CLS] मेरे शरीर ने घास पर छाया डाली । कारण क्या है? सूरज उग रहा था । [SEP] मेरे शरीर ने घास पर छाया डाली । कारण क्या है? घास काटी गई । [SEP]
0
[CLS] महिला ने अपने दोस्त के कठिन व्यवहार को सहन किया । कारण क्या है? महिला को पता था कि उसका दोस्त कठिन समय से गुजर रहा है । [SEP] महिला ने अपने दोस्त के कठिन व्यवहार को सहन किया । कारण क्या है? महिला को लगा कि उसके दोस्त ने उसकी दया का फायदा उठाया । [SEP]
0
[CLS] महिलाएं कॉफी के लिए मिलीं । कारण क्या है? एक नए स्थान में कैफे फिर से खुल गया । [SEP] महिलाएं कॉफी के लिए मिलीं । कारण क्या है? वे एक - दूसरे को पकड़ना चाहते थे । [SEP]
1
[CLS] धावक ने शॉर्ट्स पहनी थी । कारण क्या है? पूर्वानुमान में उच्च तापमान की भविष्यवाणी की गई थी । [SEP] धावक ने शॉर्ट्स पहनी थी । कारण क्या है? उसने समुद्र तट के साथ दौड़ने की योजना बनाई । [SEP]
0


In [26]:
for index, sample in enumerate(tokenized_datasets["test"][:5]):
    print(tokenizer.decode(tokenized_datasets["test"]['input_ids'][index]))
    print(tokenized_datasets["test"]['label'][index])

[CLS] आइटम को बबल रैप में पैक किया गया था । कारण क्या है? यह नाजुक था । [SEP] आइटम को बबल रैप में पैक किया गया था । कारण क्या है? छोटा था । [SEP]
0
[CLS] मैंने अपनी जेबें खाली कर दीं । परिणाम क्या है? मैंने एक टिकट स्टब को पुनः प्राप्त किया । [SEP] मैंने अपनी जेबें खाली कर दीं । परिणाम क्या है? मुझे एक हथियार मिला । [SEP]
0
[CLS] दीमक ने घर पर आक्रमण कर दिया । परिणाम क्या है? दीमक घर से गायब हो गए । [SEP] दीमक ने घर पर आक्रमण कर दिया । परिणाम क्या है? दीमक घर में लकड़ी के माध्यम से खाया । [SEP]
1
[CLS] यात्री सीमा पर पहुंच गए । परिणाम क्या है? गश्ती एजेंट ने उनके पासपोर्ट की जाँच की । [SEP] यात्री सीमा पर पहुंच गए । परिणाम क्या है? गश्त एजेंट ने उन पर तस्करी का आरोप लगाया । [SEP]
0


In [27]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 362
    })
    validation: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 88
    })
    test: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 449
    })
})

In [28]:
import evaluate

accuracy = evaluate.load("accuracy")

In [29]:
import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [40]:
from transformers import set_seed
# set_seed(42)
set_seed(1)

In [41]:
from transformers import BertForSequenceClassification, BertConfig, CharacterBertModel, CharacterBertTokenizer,\
     TrainingArguments, Trainer

#### LOADING BERT FOR CLASSIFICATION ####

config = BertConfig.from_pretrained('bert-base-uncased', num_labels=2, classifier_dropout=0.1)  # binary classification
model = BertForSequenceClassification(config=config)

loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at C:\Users\arifa/.cache\huggingface\transformers\3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.16.0.dev0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



In [42]:
model.bert.embeddings.word_embeddings  # wordpiece embeddings

Embedding(30522, 768, padding_idx=0)

In [43]:
#### REPLACING BERT WITH CHARACTER_BERT ####

character_bert_model = CharacterBertModel.from_pretrained(
    "E:\Documents\Character Bert\Hate Speech\character-bert-hindi")
model.bert = character_bert_model

loading configuration file E:\Documents\Character Bert\Hate Speech\character-bert-hindi\config.json
Model config CharacterBertConfig {
  "_name_or_path": "helboukkouri/character-bert",
  "architectures": [
    "CharacterBertForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "auto_map": {
    "AutoConfig": "configuration_character_bert.CharacterBertConfig",
    "AutoModel": "modeling_character_bert.CharacterBertForPreTraining",
    "AutoModelForMaskedLM": "modeling_character_bert.CharacterBertForMaskedLM"
  },
  "character_embeddings_dim": 16,
  "cnn_activation": "relu",
  "cnn_filters": [
    [
      1,
      32
    ],
    [
      2,
      32
    ],
    [
      3,
      64
    ],
    [
      4,
      128
    ],
    [
      5,
      256
    ],
    [
      6,
      512
    ],
    [
      7,
      1024
    ]
  ],
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max

In [44]:
model.bert.embeddings.word_embeddings  # wordpieces are replaced with a CharacterCNN

CharacterCnn(
  (char_conv_0): Conv1d(16, 32, kernel_size=(1,), stride=(1,))
  (char_conv_1): Conv1d(16, 32, kernel_size=(2,), stride=(1,))
  (char_conv_2): Conv1d(16, 64, kernel_size=(3,), stride=(1,))
  (char_conv_3): Conv1d(16, 128, kernel_size=(4,), stride=(1,))
  (char_conv_4): Conv1d(16, 256, kernel_size=(5,), stride=(1,))
  (char_conv_5): Conv1d(16, 512, kernel_size=(6,), stride=(1,))
  (char_conv_6): Conv1d(16, 1024, kernel_size=(7,), stride=(1,))
  (_highways): Highway(
    (_layers): ModuleList(
      (0-1): 2 x Linear(in_features=2048, out_features=4096, bias=True)
    )
  )
  (_projection): Linear(in_features=2048, out_features=768, bias=True)
)

In [45]:
#  disable weights and biases logging
import os
os.environ["WANDB_DISABLED"] = "true"

In [46]:
batch_size=32

training_args = TrainingArguments(
    output_dir="character_bert_model",
    evaluation_strategy="epoch",
    save_strategy="no",
    learning_rate=2e-5,
    #learning_rate=4e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    #num_train_epochs=2,
    warmup_ratio=0.1,
    weight_decay=0.01,
    #weight_decay=0.04,
    #fp16=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    #eval_dataset=tokenized_datasets["validation"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [47]:
import transformers
trainer.remove_callback(transformers.integrations.TensorBoardCallback)

In [48]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 449
  Batch size = 32


{'eval_loss': 0.701155960559845,
 'eval_accuracy': 0.512249443207127,
 'eval_runtime': 1.8708,
 'eval_samples_per_second': 240.001,
 'eval_steps_per_second': 8.018}

In [49]:
trainer.train()

***** Running training *****
  Num examples = 362
  Num Epochs = 3
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 36


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.703843,0.47216
2,No log,0.702336,0.507795
3,No log,0.70711,0.496659


***** Running Evaluation *****
  Num examples = 449
  Batch size = 32
***** Running Evaluation *****
  Num examples = 449
  Batch size = 32
***** Running Evaluation *****
  Num examples = 449
  Batch size = 32


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=36, training_loss=0.6701971160040961, metrics={'train_runtime': 16.6725, 'train_samples_per_second': 65.137, 'train_steps_per_second': 2.159, 'total_flos': 1873413827551200.0, 'train_loss': 0.6701971160040961, 'epoch': 3.0})

In [162]:
# trainer.save_model()

In [138]:
trainer.evaluate(tokenized_datasets["validation"])

***** Running Evaluation *****
  Num examples = 88
  Batch size = 32


{'eval_loss': 0.6950161457061768,
 'eval_accuracy': 0.48863636363636365,
 'eval_runtime': 0.3773,
 'eval_samples_per_second': 233.206,
 'eval_steps_per_second': 7.95,
 'epoch': 2.0}