# Setup and Preparation



In [None]:
# Mount Google Drive into the Colab runtime at /content/drive so that later
# cells can read the CSV files stored there.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# %cd drive/MyDrive/data_nlp_proj
%cd drive/MyDrive/

In [None]:
%pip install datasets
%pip install evaluate
%pip install transformers[torch]
%pip install -U accelerate
! pip install optuna
! pip install ray[tune]

In [None]:
def load_data(train, test):
    """Load the train and test CSV files as a single dataset object.

    Args:
        train: Path of the CSV file to expose as the "train" split.
        test: Path of the CSV file to expose as the "test" split.

    Returns:
        The result of ``datasets.load_dataset("csv", ...)`` for both files,
        indexable by the split names "train" and "test".
    """
    # Imported here because no earlier cell brings `datasets` into scope.
    import datasets

    return datasets.load_dataset(
        "csv", data_files={"train": train, "test": test}
    )


# NOTE: despite its name, `train_ds` holds both the "train" and "test" splits.
train_ds = load_data('eclipse_train.csv', 'eclipse_test.csv')

In [None]:
# Show the record at index 1 of the training split as a quick sanity check.
sample_row = train_ds["train"][1]
print(sample_row)

In [None]:
from transformers import AutoTokenizer

# Tokenizer for the "bert-base-cased" checkpoint that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


def preprocess_function(examples):
    """Tokenize the "text" column of a batch of examples.

    Sequences are truncated to at most 128 tokens. Padding is deliberately
    not applied here: the training cells pad each batch with
    ``DataCollatorWithPadding``, so padding inside a batched ``map`` would
    only store extra pad tokens. ``return_tensors`` is likewise omitted
    because the mapped dataset stores the values as plain columns.

    Args:
        examples: A batch mapping column names to lists of values; only the
            "text" column is read.

    Returns:
        The tokenizer's encoding of the batch.
    """
    # NOTE(review): an earlier draft copied examples['Status'] into a 'label'
    # column here - confirm the CSV already provides the label column.
    return tokenizer(examples["text"], max_length=128, truncation=True)


# c2l = ClassLabel(num_classes=2, names=['nondup', 'duplicate'])

# Run preprocess_function over the loaded dataset in batches.
# NOTE: `train` holds every split of `train_ds` ("train" and "test"), not only
# the training one.
train = train_ds.map(
    preprocess_function, batched=True, desc="Running tokenizer on dataset"
)


In [None]:
# Print the tokenized training split.
tokenized_train_split = train["train"]
print(tokenized_train_split)

In [None]:
import evaluate
import numpy as np
from transformers import AutoModelForSequenceClassification
from transformers import BertForSequenceClassification
from transformers import DataCollatorWithPadding
from transformers import Trainer
from transformers import TrainingArguments

# "bert-base-cased" checkpoint configured for 2 labels.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-cased", num_labels=2
)

# Evaluation strategy is "epoch"; outputs go to the 'android' directory.
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version.
training_args = TrainingArguments(
    output_dir='android',
    evaluation_strategy="epoch",
)

# metric1..metric4 are accuracy, F1, precision and recall, in that order.
metric1, metric2, metric3, metric4 = (
    evaluate.load(metric_name)
    for metric_name in ("accuracy", "f1", "precision", "recall")
)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def compute_metrics(eval_pred):
    """Compute precision, recall, F1 and accuracy for one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class of each
            example is the argmax of its logits along the last axis.

    Returns:
        A flat dict with the keys "precision", "recall", "f1" and "accuracy",
        each mapped to that metric's value.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # Each metric's compute() returns a dict keyed by the metric name (e.g.
    # {"f1": 0.9}); unwrap it so the result holds plain values instead of
    # nested dicts such as {"f1": {"f1": 0.9}}.
    scorers = {
        "precision": metric3,
        "recall": metric4,
        "f1": metric2,
        "accuracy": metric1,
    }
    return {
        name: scorer.compute(predictions=predictions, references=labels)[name]
        for name, scorer in scorers.items()
    }


# Fine-tune on the tokenized "train" split and evaluate on the "test" split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train["train"],
    eval_dataset=train["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer.train()


# BERT (Default parameters)

In [None]:
# Evaluate the trainer built in the previous cell. Left as a bare expression
# so the notebook displays the returned value.
trainer.evaluate()

# BERT (Hyperparameter-Tuned)

In [None]:
# Added temporarily
from transformers import DataCollatorWithPadding

def model_init():
    """Return a newly loaded 2-label "bert-base-cased" classifier.

    Handed to ``Trainer(model_init=...)`` below so the trainer builds the
    model itself instead of receiving an already constructed one.
    """
    checkpoint = "bert-base-cased"
    fresh_model = AutoModelForSequenceClassification.from_pretrained(
        checkpoint, num_labels=2
    )
    return fresh_model

# Added temporarily
# Reload the metrics: metric1..metric4 are accuracy, F1, precision and recall,
# in that order.
metric1, metric2, metric3, metric4 = (
    evaluate.load(metric_name)
    for metric_name in ("accuracy", "f1", "precision", "recall")
)

# NOTE: the Trainer built later in this cell is not passed this collator.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def compute_metrics_tune(eval_pred):
    """Score one evaluation pass with the F1 metric only.

    Used as ``compute_metrics`` by the hyperparameter-search trainer.

    Args:
        eval_pred: A ``(raw predictions, labels)`` pair. The predicted class
            of each example is the argmax of its raw predictions along
            axis 1.

    Returns:
        Whatever ``metric2.compute`` returns (``metric2`` is loaded as "f1").
    """
    raw_scores, labels = eval_pred
    predicted_classes = np.argmax(raw_scores, axis=1)
    return metric2.compute(predictions=predicted_classes, references=labels)

# Added temporarily
training_args = TrainingArguments(
    output_dir='android',
    evaluation_strategy="epoch",
)

# Train each trial on shard 1 of 10 of the training split (presumably to keep
# trials short); evaluation still uses the whole "test" split.
search_subset = train["train"].shard(index=1, num_shards=10)

trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=search_subset,
    eval_dataset=train["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics_tune,
)

# Run 5 trials, searching in the "maximize" direction.
best_run = trainer.hyperparameter_search(n_trials=5, direction="maximize")

In [None]:
# Display the result of the hyperparameter search run in the previous cell.
best_run

# ELECTRA (Fine-Tuned)

In [None]:
from transformers import DataCollatorWithPadding
from transformers import BertForSequenceClassification

from transformers import AutoTokenizer

# Use the tokenizer that belongs to the ELECTRA checkpoint loaded later in
# this cell, so the token ids match that model's vocabulary.
tokenizer = AutoTokenizer.from_pretrained("google/electra-base-discriminator")

def preprocess_function(examples):
    """Tokenize the "text" column of a batch of examples.

    Sequences are truncated to at most 128 tokens. Padding is deliberately
    not applied here: this cell pads each training batch with
    ``DataCollatorWithPadding``, so padding inside a batched ``map`` would
    only store extra pad tokens. ``return_tensors`` is likewise omitted
    because the mapped dataset stores the values as plain columns.

    Args:
        examples: A batch mapping column names to lists of values; only the
            "text" column is read.

    Returns:
        The tokenizer's encoding of the batch.
    """
    # NOTE(review): an earlier draft copied examples['Status'] into a 'label'
    # column here - confirm the CSV already provides the label column.
    return tokenizer(examples["text"], max_length=128, truncation=True)


# c2l = ClassLabel(num_classes=2, names=['nondup', 'duplicate'])

# Re-tokenize the loaded dataset in batches with this cell's tokenizer.
# NOTE: `train` holds every split of `train_ds` ("train" and "test"), not only
# the training one.
train = train_ds.map(
    preprocess_function, batched=True, desc="Running tokenizer on dataset"
)

# "google/electra-base-discriminator" checkpoint configured for 2 labels.
model = AutoModelForSequenceClassification.from_pretrained(
    "google/electra-base-discriminator", num_labels=2
)

# Evaluation strategy is "epoch"; outputs go to "test_trainer/android".
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version.
training_args = TrainingArguments(
    output_dir="test_trainer/android",
    evaluation_strategy="epoch",
)

# metric1..metric4 are accuracy, F1, precision and recall, in that order.
metric1, metric2, metric3, metric4 = (
    evaluate.load(metric_name)
    for metric_name in ("accuracy", "f1", "precision", "recall")
)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def compute_metrics(eval_pred):
    """Compute precision, recall, F1 and accuracy for one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class of each
            example is the argmax of its logits along the last axis.

    Returns:
        A flat dict with the keys "precision", "recall", "f1" and "accuracy",
        each mapped to that metric's value.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # Each metric's compute() returns a dict keyed by the metric name (e.g.
    # {"f1": 0.9}); unwrap it so the result holds plain values instead of
    # nested dicts such as {"f1": {"f1": 0.9}}.
    scorers = {
        "precision": metric3,
        "recall": metric4,
        "f1": metric2,
        "accuracy": metric1,
    }
    return {
        name: scorer.compute(predictions=predictions, references=labels)[name]
        for name, scorer in scorers.items()
    }


# Fine-tune on the tokenized "train" split and evaluate on the "test" split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train["train"],
    eval_dataset=train["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer.train()