### Mount Google drive

*  Check the GPU and RAM of the runtime, then mount Google Drive at the directory '/content/drive'

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

In [None]:
from psutil import virtual_memory

# Total memory of the runtime in decimal gigabytes (1e9 bytes).
# Note: the message below says "available", but the value read is `.total`.
ram_gb = virtual_memory().total / 1e9
print(f'Your runtime has {ram_gb:.1f} gigabytes of available RAM\n')

# 20 GB is the threshold this notebook uses to call a runtime "high-RAM".
print('You are using a high-RAM runtime!' if ram_gb >= 20 else 'Not using a high-RAM runtime')

In [3]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset, log and output paths used
# later in this notebook all live under /content/drive/MyDrive.
drive.mount('/content/drive')

Mounted at /content/drive


### Install packages

In [4]:
!pip install transformers[torch]
!pip install accelerate -U
!pip install sklearn
!pip install evaluate
!pip install datasets
!pip install emoji
!pip install tensorboardX
!pip install crc32c
!pip install soundfile
!pip install sentencepiece

Collecting transformers[torch]
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers[torch])
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m27.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers[torch])
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m42.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers[torch])
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m4

In [5]:
import os

# Set CUDA_LAUNCH_BLOCKING for this kernel process (and anything it spawns).
# The previous `!set CUDA_LAUNCH_BLOCKING=1` line was dropped: a `!` command runs
# in a separate shell, so it could never change the kernel's environment.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

### Huggingface

In [16]:
%%shell
# Store git credentials on disk so the Hub token entered below is reused by git.
git config --global credential.helper store
# Interactive login: prompts for a Hugging Face access token. The training cell
# sets push_to_hub=True, so a token with write permission is needed.
huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|
    
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) y
Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful




### Imports

In [6]:
import os
import csv
import torch
import sklearn
import datasets
import evaluate
import numpy as np
import pandas as pd
from torch import nn
import tensorflow as tf
from datasets import Dataset
from collections import Counter
import torch.nn.functional as F
from transformers import Trainer
from datasets import load_metric
from transformers import AutoTokenizer, AutoModel
from datasets import ClassLabel, Value
from transformers import create_optimizer
from transformers import DataCollatorWithPadding
from sklearn.metrics import classification_report
from transformers import TextClassificationPipeline
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import precision_recall_fscore_support
from transformers.keras_callbacks import KerasMetricCallback
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

### Loading Dataset

In [7]:
# Read the train/validation CSVs from Drive (latin1-encoded, comma-separated)
# and force the "label" column to integers in the same expression.
df = pd.read_csv(
    "/content/drive/MyDrive/dataset/4/train.csv",
    delimiter=',',
    index_col=False,
    encoding='latin1',
).astype({"label": int})
validation_df = pd.read_csv(
    "/content/drive/MyDrive/dataset/4/validate.csv",
    delimiter=',',
    index_col=False,
    encoding='latin1',
).astype({"label": int})

# Wrap both frames as Hugging Face datasets.
dataset = Dataset.from_pandas(df)
validation_dataset = Dataset.from_pandas(validation_df)

# Copies of the feature schemas (not used further in the visible cells).
new_features = dataset.features.copy()
val_new_features = validation_dataset.features.copy()

# Show the inferred schema of each split.
print(dataset.features)
print(validation_dataset.features)

{'text': Value(dtype='string', id=None), 'label': Value(dtype='int64', id=None)}
{'text': Value(dtype='string', id=None), 'label': Value(dtype='int64', id=None)}


### Loading Model

In [None]:
# File name for a training log (not referenced by the other visible cells).
output_log_file = "log.csv"
# Hub identifier of the pretrained checkpoint that is fine-tuned below. It
# contains a "/", which carries over into every path built from it.
model_checkpoint = "vinai/bertweet-large"
# Load the tokenizer that matches the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [12]:
# Set the tokenizer's maximum sequence length to 512. The tokenizer calls below
# pass truncation=True without an explicit max_length, so presumably this is the
# limit they truncate to -- TODO confirm against the tokenizer documentation.
tokenizer.model_max_length = 512

In [None]:
def preprocess_data(examples):
    """Tokenize the "text" field of a batch of examples with truncation enabled."""
    texts = examples["text"]
    return tokenizer(texts, truncation=True)

# Collator that pads each batch with the same tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Tokenize both splits in batched mode.
encoded_dataset = dataset.map(preprocess_data, batched=True)
encoded_val_dataset = validation_dataset.map(preprocess_data, batched=True)

# Columns added by tokenization = columns after mapping minus the original ones.
pre_tokenizer_columns = set(dataset.features)
tokenizer_columns = list(set(encoded_dataset.features) - pre_tokenizer_columns)

### Training

In [14]:
# Hyperparameters for the fine-tuning run.
num_epochs = 6
batch_size = 16
init_lr = 2e-5
num_warmup_steps = 0
weight_decay = 0.005
fold = 0

# Run identifier built from the checkpoint and hyperparameters. It is used as the
# Trainer output directory and inside the Drive paths. Because model_checkpoint
# contains a "/", paths built from `name` gain an extra directory level.
name = f"baseline_{model_checkpoint}_epoch{num_epochs}_batch{batch_size}_lr{init_lr}_w{weight_decay}"

In [None]:
def compute_metric(eval_pred):
    """Compute accuracy, F1, precision and recall for one evaluation pass.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. The predictions are reduced
            with ``argmax`` over the last axis to obtain class ids.

    Returns:
        dict with keys ``accuracy``, ``f1``, ``precision`` and ``recall``. The
        last three use ``average='binary'`` and report 0 when undefined.
    """
    scores, labels = eval_pred
    predicted = np.argmax(scores, axis=-1)
    prf = precision_recall_fscore_support(
        labels.flatten(),
        predicted.flatten(),
        average='binary',
        zero_division=0,
    )
    precision, recall, f1 = prf[0], prf[1], prf[2]
    accuracy = (predicted == labels).mean()
    return dict(accuracy=accuracy, f1=f1, precision=precision, recall=recall)

def py_sigmoid_focal_loss(pred, target, gamma=2.0, alpha=0.25, reduction='mean', avg_factor=None):
    """Sigmoid focal loss over one-hot encoded targets.

    Each element's binary cross-entropy ``bce`` (computed from logits) is
    re-weighted as ``alpha * (1 - exp(-bce)) ** gamma * bce``. ``alpha`` is
    applied as one uniform scale factor to every element.

    Args:
        pred: Logits whose last dimension is the number of classes.
        target: Integer class indices; one-hot encoded to the width of ``pred``.
        gamma: Focusing exponent applied to ``1 - pt``.
        alpha: Uniform scale factor applied to every element.
        reduction: ``'mean'``, ``'sum'`` or ``'none'`` (element-wise loss).
        avg_factor: Accepted for signature compatibility; not used.

    Returns:
        A scalar tensor for ``'mean'``/``'sum'``, otherwise a tensor shaped like
        ``pred``.

    Raises:
        ValueError: If ``reduction`` is not one of the supported values.
    """
    # Take the class count from the logits instead of hard-coding 2, so the
    # one-hot target always matches the shape of `pred`.
    num_classes = pred.size(-1)
    target_one_hot = F.one_hot(target, num_classes=num_classes).to(pred.dtype)
    # `reduction='none'` replaces the deprecated `reduce=False` argument.
    bce = F.binary_cross_entropy_with_logits(pred, target_one_hot, reduction='none')
    pt = torch.exp(-bce)
    focal = alpha * (1 - pt) ** gamma * bce
    if reduction == 'mean':
        return focal.mean()
    if reduction == 'sum':
        return focal.sum()
    if reduction == 'none':
        return focal
    raise ValueError(f"Unsupported reduction: {reduction!r}")

class FocalTrainer(Trainer):
    """Trainer that replaces the model's own loss with the sigmoid focal loss."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the focal loss for one batch.

        Args:
            model: The model being trained or evaluated.
            inputs: Batch dict; must contain ``"labels"``.
            return_outputs: When true, return ``(loss, outputs)``.
            num_items_in_batch: Accepted so the method stays callable by Trainer
                versions that pass this keyword; not used here.

        Returns:
            The loss tensor, or ``(loss, outputs)`` when ``return_outputs`` is set.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # Flatten to (N, num_labels) logits and (N,) labels before computing the loss.
        loss = py_sigmoid_focal_loss(logits.view(-1, self.model.config.num_labels), labels.view(-1), 2.0, 0.25, 'mean', None)
        return (loss, outputs) if return_outputs else loss

class BalancedTrainer(Trainer):
    """Trainer that uses a class-weighted cross-entropy loss."""

    # Per-class loss weights (class 0, class 1). Override on a subclass or on an
    # instance to rebalance differently.
    class_weights = (1.0, 2.0)

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the weighted cross-entropy loss for one batch.

        Args:
            model: The model being trained or evaluated.
            inputs: Batch dict; must contain ``"labels"``.
            return_outputs: When true, return ``(loss, outputs)``.
            num_items_in_batch: Accepted so the method stays callable by Trainer
                versions that pass this keyword; not used here.

        Returns:
            The loss tensor, or ``(loss, outputs)`` when ``return_outputs`` is set.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # Build the weight tensor on the logits' device, so it does not depend on
        # the (possibly wrapped) model object exposing a `.device` attribute.
        weight = torch.tensor(list(self.class_weights), device=logits.device)
        loss_fct = nn.CrossEntropyLoss(weight=weight)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

print("Fine Tuninig Model")

# Aliases for the tokenized splits handed to the trainer.
encoded_training_dataset, encoded_validation_dataset = encoded_dataset, encoded_val_dataset

training_args = TrainingArguments(
    output_dir=name,
    # what to run
    do_train=True,
    do_eval=True,
    # evaluate and checkpoint once per epoch
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # optimisation settings
    learning_rate=init_lr,
    weight_decay=weight_decay,
    warmup_steps=num_warmup_steps,
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # logging and publishing
    logging_dir="/content/drive/MyDrive/dataset/logs4/" + name + '/',
    push_to_hub=True,
)

# Two-label sequence classifier initialised from the pretrained checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=2,
    ignore_mismatched_sizes=True,
)

# BalancedTrainer supplies the class-weighted cross-entropy loss.
trainer = BalancedTrainer(
    model=model,
    args=training_args,
    train_dataset=encoded_training_dataset,
    eval_dataset=encoded_validation_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metric,
)

trainer.train()
# Last expression of the cell: the evaluation metrics are displayed as its output.
trainer.evaluate()

#### Saving Models

In [None]:
# Upload the trainer's model to the Hugging Face Hub (uses the login done above).
trainer.push_to_hub()

In [21]:
# Drive directory for this run's model; `name` contains a "/" (from the
# checkpoint id), so this resolves to a nested directory.
modelsave = f"/content/drive/MyDrive/dataset/experiment4/{name}/model"
# Save model weights/config and tokenizer files side by side.
model.save_pretrained(modelsave)
tokenizer.save_pretrained(modelsave)

('/content/drive/MyDrive/dataset/experiment4/baseline_vinai/bertweet-base_epoch1_batch4_lr2e-05_w0.005/model/tokenizer_config.json',
 '/content/drive/MyDrive/dataset/experiment4/baseline_vinai/bertweet-base_epoch1_batch4_lr2e-05_w0.005/model/special_tokens_map.json',
 '/content/drive/MyDrive/dataset/experiment4/baseline_vinai/bertweet-base_epoch1_batch4_lr2e-05_w0.005/model/vocab.txt',
 '/content/drive/MyDrive/dataset/experiment4/baseline_vinai/bertweet-base_epoch1_batch4_lr2e-05_w0.005/model/bpe.codes',
 '/content/drive/MyDrive/dataset/experiment4/baseline_vinai/bertweet-base_epoch1_batch4_lr2e-05_w0.005/model/added_tokens.json')

### Inference

In [None]:
# Move the fine-tuned model to the CPU before wrapping it in a pipeline.
model.to('cpu')
# return_all_scores=True makes the pipeline return a score for every label; the
# cells below index the result as output[0][0] / output[0][1] and rely on that
# nested layout. NOTE(review): newer transformers releases reportedly deprecate
# this flag in favour of `top_k`, which changes the output layout -- confirm
# before switching.
pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, return_all_scores=True)

In [None]:
# NOTE: despite its name, `validation_df` is re-bound here to the *test* split
# (test.csv); the validation frame loaded earlier is no longer reachable by this name.
validation_df = pd.read_csv("/content/drive/MyDrive/dataset/4/test.csv", delimiter=',', index_col=False, encoding='latin1')

In [None]:
# Drop rows without a label and renumber the index from 0 so the loop below can
# address rows by position.
validation_df = validation_df.dropna(subset=['label']).reset_index(drop=True)

In [None]:
# Ground-truth labels as plain Python ints; both names are bound to the same list.
y_true = y_true_int = [int(label) for label in validation_df.label.values]

In [None]:
from sklearn.metrics import classification_report

# Classify every text one at a time and keep the higher-scoring label.
y_pred = []
for i, text in validation_df['text'].items():
    # Progress indicator every 100 rows (skipping row 0).
    if i != 0 and i % 100 == 0:
        print(i)
    output = pipe(text, truncation=True)
    scores = output[0]  # per-label score dicts for this text
    # Ties go to label 1, since label 0 needs a strictly greater score.
    label = 0 if scores[0]["score"] > scores[1]["score"] else 1
    y_pred.append(label)

In [None]:
# Human-readable per-class precision/recall/F1 table for labels 0 and 1.
print(classification_report(y_true, y_pred, labels=[0,1]))

In [None]:
# Same report as a nested dict, so individual numbers can be read out.
report = classification_report(y_true, y_pred, labels=[0,1], output_dict=True)

label_1_f1 = report['1']['f1-score']
accuracy = report['accuracy']
# NOTE: despite the variable name, this holds the *macro*-averaged F1.
weighted_f1 = report['macro avg']['f1-score']

for caption, value in (
    ("Label 1 F1:", label_1_f1),
    ("Accuracy:", accuracy),
    ("Macro F1:", weighted_f1),
):
    print(caption, value)

In [None]:
# Despite the name, this is a *directory* path: later code appends '/<file name>'
# to it. model_checkpoint contains a "/", so the path is nested one level deeper.
output_file = '/content/drive/MyDrive/dataset/experiment4/baseline/'+ model_checkpoint

In [None]:
# One row per class/average, one column per metric.
df4 = pd.DataFrame(report).transpose()
# Nothing else in this notebook creates the `output_file` directory, and to_csv
# does not create missing parent directories, so make sure it exists first.
os.makedirs(output_file, exist_ok=True)
results_log = output_file + '/' + 'test_performance.csv'
df4.to_csv(results_log, index=True)

#### Get the Results

In [None]:
# Load the test split again, this time under its own name.
test_df = pd.read_csv("/content/drive/MyDrive/dataset/4/test.csv", delimiter=',', index_col=False, encoding='latin1')

In [None]:
# Keep only rows whose label is 0 or 1. The column is coerced to numbers first,
# so the filter works whether pandas parsed the labels as strings or as numbers
# (comparing a numeric column against the strings '0'/'1' would match nothing).
# Missing or non-numeric labels become NaN and are dropped by the mask.
label_numeric = pd.to_numeric(test_df['label'], errors='coerce')
keep = label_numeric.isin([0, 1])
test_df = (
    test_df.loc[keep]
    .assign(label=label_numeric[keep].astype(int))  # assign on a new frame, not a slice
    .reset_index(drop=True)
)
y_true = y_true_int = [int(label) for label in test_df.label.values]

In [None]:
# Collect (text, raw pipeline output) pairs for every test row.
l = []
for i, text in test_df['text'].items():
    # Progress indicator every 100 rows (skipping row 0).
    if i != 0 and i % 100 == 0:
        print(i)
    l.append((text, pipe(text, truncation=True)))

In [None]:
results_file = f"/content/drive/MyDrive/dataset/experiment4/{name}/test_result.csv"
with open(results_file, 'w', newline='') as out_file:
    writer = csv.writer(out_file)
    writer.writerow(['id', 'label'])  # write the header row
    for i in range(len(test_df)):
        scores = l[i][1][0]  # per-label score dicts for row i
        # Label 0 only when its score is strictly greater; otherwise label 1.
        label = 0 if scores[0]["score"] > scores[1]["score"] else 1
        # The first column of test_df is written as the row's id.
        writer.writerow([test_df.iloc[i, 0], label])

In [None]:
def sort_submission_csv(submission_file):
    """Sort a submission CSV by its integer first column and save it as TSV.

    The first row is treated as the header and kept on top; the remaining rows
    are ordered by ``int(row[0])``. The result is written next to the input file
    with the extension replaced by ``.tsv`` (``.../test_result.csv`` becomes
    ``.../test_result.tsv``), so the output location follows the argument rather
    than a notebook-level variable.

    Args:
        submission_file: Path of the CSV file to sort.

    Raises:
        ValueError: If a data row's first column is not an integer.
    """
    with open(submission_file, 'r', newline='') as file:
        # Skip completely blank lines so they cannot break the integer sort key.
        rows = [row for row in csv.reader(file) if row]

    # Slicing keeps this safe for an empty file (no header, no rows).
    header, body = rows[:1], rows[1:]
    sorted_rows = header + sorted(body, key=lambda row: int(row[0]))

    sorted_tsv_file = os.path.splitext(submission_file)[0] + '.tsv'
    with open(sorted_tsv_file, 'w', newline='') as file:
        writer = csv.writer(file, delimiter='\t')
        writer.writerows(sorted_rows)
    print(f'Sorted TSV file saved as: {sorted_tsv_file}')

# Sort the predictions CSV written above and emit the tab-separated version.
csv_file_path = results_file
sort_submission_csv(csv_file_path)