In [None]:
%pip uninstall --y tensorflow

Found existing installation: tensorflow 2.15.0
Uninstalling tensorflow-2.15.0:
  Successfully uninstalled tensorflow-2.15.0


In [None]:
%%capture
# Install the Hugging Face libraries imported by the training cell below.
# NOTE(review): no versions are pinned, so a rerun may resolve different releases.
%pip install transformers[torch]
%pip install datasets
%pip install evaluate

Collecting accelerate>=0.21.0 (from transformers[torch])
  Downloading accelerate-0.31.0-py3-none-any.whl (309 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->transformers[torch])
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch->transformers[torch])
  Using cached nvidia_cublas_cu

Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: evaluate
Successfully installed evaluate-0.4.2


In [None]:
# Hub client library; presumably the source of the `huggingface-cli` command used in the next cell.
%pip install huggingface_hub



In [None]:
# Interactive login: prompts for a Hub access token and saves it locally.
# Presumably required for the push_to_hub calls made later in this notebook.
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset, load_dataset, DatasetDict
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding
import requests
import pandas as pd
import numpy as np
import evaluate


# label2id & id2label
# Both directions are defined together, with label2id derived from idTolabel,
# so the model config and the dataframe label encoding cannot drift apart.
idTolabel = {0: "fake", 1: "real"}
label2id = {label: idx for idx, label in idTolabel.items()}

# Load tokenizer and model
model_name = "airesearch/wangchanberta-base-att-spm-uncased"
# model_name = "yartyjung/model"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Pass both mappings. With only id2label supplied, the config's label2id is
# expected to stay at its generated defaults.
# NOTE(review): confirm against the saved config.json.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    id2label=idTolabel,
    label2id=label2id,
)

# Dataset urls variable
train_url = "https://raw.githubusercontent.com/Yartyjung/AI-Builders-2024/main/dataset/train.csv"
validation_url = "https://raw.githubusercontent.com/Yartyjung/AI-Builders-2024/main/dataset/validation.csv"
test_url = "https://raw.githubusercontent.com/Yartyjung/AI-Builders-2024/main/dataset/test.csv"

# Load dataset (one CSV per split, fetched over HTTP)
train_df = pd.read_csv(train_url,sep=",")
eval_df = pd.read_csv(validation_url,sep=",")
test_df = pd.read_csv(test_url,sep=",")

#label to id func
def labelToid(dataframe1,dataframe2,dataframe3):
  """Replace the ``label`` column of each dataframe with integer ids, in place.

  Every value in ``label`` is looked up in the module-level ``label2id``
  dictionary via ``Series.map``. The frames are modified directly and
  nothing is returned.
  """
  for frame in (dataframe1, dataframe2, dataframe3):
    frame['label'] = frame['label'].map(label2id)

# Encode the label column of all three splits through label2id (mutates the frames).
labelToid(train_df,eval_df,test_df)

# Tokenize datasets
def tokenize_function(dataset_name, max_length=416):
    """Tokenize the ``text`` field of one example (or one batch) for the model.

    Args:
        dataset_name: Mapping with a ``text`` entry, as handed over by
            ``Dataset.map``.
        max_length: Maximum number of tokens kept per sequence. The default
            of 416 is assumed to be the sequence length WangchanBERTa was
            pretrained with -- TODO confirm against the model card.

    Returns:
        The tokenizer output for the given text.
    """
    # truncation=True needs an explicit max_length here: this tokenizer reports
    # no predefined maximum, so without it the library falls back to
    # "no truncation" (see the warning emitted during Dataset.map).
    # Padding is intentionally omitted; DataCollatorWithPadding pads each
    # batch, and padding a single example on its own has no effect.
    return tokenizer(dataset_name['text'], truncation=True, max_length=max_length)

# Wrap each pandas split in a datasets.Dataset, keyed by split name.
dataset_dict = DatasetDict({
    split: Dataset.from_pandas(frame)
    for split, frame in (("train", train_df), ("eval", eval_df), ("test", test_df))
})
# Tokenize the splits one after another, in train / eval / test order.
train_dataset, eval_dataset, test_dataset = (
    dataset_dict[split].map(tokenize_function)
    for split in ("train", "eval", "test")
)

# Prepare data loaders
# The collator is also passed to the Trainer below, which uses it to assemble batches.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# NOTE(review): these three loaders are not referenced again in this notebook;
# the Trainer below receives the datasets and the collator directly.
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=16, collate_fn=data_collator)
eval_dataloader = DataLoader(eval_dataset, batch_size=16, collate_fn=data_collator)
test_dataloader = DataLoader(test_dataset, batch_size=16, collate_fn=data_collator)

#metric
def compute_metrics(eval_pred):
    """Compute accuracy, F1, precision and recall for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)`` as supplied by the Trainer.

    Returns:
        The result of the combined ``evaluate`` metrics for the argmax
        predictions against ``labels``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # Build the combined metric once and keep it on the function object, so
    # later evaluation passes do not reload the four metric modules.
    clf_metrics = getattr(compute_metrics, "_clf_metrics", None)
    if clf_metrics is None:
        clf_metrics = evaluate.combine(["accuracy", "f1", "precision","recall"])
        compute_metrics._clf_metrics = clf_metrics
    # Keyword arguments are the documented calling convention for compute().
    return clf_metrics.compute(predictions=predictions, references=labels)
# Set up training arguments: evaluate and checkpoint once per epoch, then
# reload the best checkpoint (ranked by "accuracy") when training ends.
training_args = TrainingArguments(
    # Output locations and logging cadence
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=100,
    # Optimisation schedule
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    # Per-epoch evaluation and checkpointing
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=30,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

# Initialize Trainer: training split for optimisation, validation split for
# the per-epoch evaluation that feeds compute_metrics.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

# Train
trainer.train()

# Save model and tokenizer into the same directory.
# load_best_model_at_end=True is set in training_args, so `model` is expected
# to hold the best checkpoint's weights here (NOTE(review): confirm).
model.save_pretrained('./trained_model')
tokenizer.save_pretrained('./trained_model')

# Upload both artifacts to the Hub repository named "model".
model.push_to_hub("model")
tokenizer.push_to_hub("model")

tokenizer_config.json:   0%|          | 0.00/282 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/546 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/905k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/423M [00:00<?, ?B/s]

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at airesearch/wangchanberta-base-att-spm-uncased and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3975 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/1325 [00:00<?, ? examples/s]

Map:   0%|          | 0/1326 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.4761,0.509958,0.732075,0.732479,0.618321,0.898336
2,0.4457,0.498773,0.742642,0.698497,0.669492,0.730129
3,0.5626,0.546174,0.704151,0.709199,0.592317,0.883549
4,0.491,0.684633,0.731321,0.745351,0.607935,0.963031
5,0.6423,0.515666,0.681509,0.715633,0.563097,0.981516
6,0.4163,0.505949,0.723774,0.738944,0.601626,0.957486
7,0.4863,0.469388,0.777358,0.769351,0.666667,0.909427
8,0.4514,0.475231,0.773585,0.772382,0.655084,0.94085
9,0.4292,0.468,0.799245,0.759494,0.743363,0.77634
10,0.3873,0.494769,0.798491,0.758808,0.742049,0.77634


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.55k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

KeyboardInterrupt: 

# Push the epoch 9 checkpoint to the Hugging Face Hub model repository

In [None]:
# Checkpoint directory to publish. The path is relative to the working
# directory, matching output_dir='./results' used for training, instead of a
# hard-coded absolute Colab path.
modelname = "./results/checkpoint-2241"
tokenizer = AutoTokenizer.from_pretrained(modelname)
# Supply both label mappings so the uploaded config carries matching
# id2label and label2id entries (label2id comes from the training cell).
model = AutoModelForSequenceClassification.from_pretrained(
    modelname,
    num_labels=2,
    id2label=idTolabel,
    label2id=label2id,
)

# Upload the reloaded checkpoint and its tokenizer to the Hub repository "model".
model.push_to_hub("model")
tokenizer.push_to_hub("model")


README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/421M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/yartyjung/model/commit/44650d53545cd9bc855733191a8613db09cf70d2', commit_message='Upload tokenizer', commit_description='', oid='44650d53545cd9bc855733191a8613db09cf70d2', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset, load_dataset, DatasetDict
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding
import requests
import pandas as pd
import numpy as np
import evaluate


# label2id & id2label
# Both directions are defined together, with label2id derived from idTolabel,
# so the model config and the dataframe label encoding cannot drift apart.
idTolabel = {0: "fake", 1: "real"}
label2id = {label: idx for idx, label in idTolabel.items()}

# Load tokenizer and model (the fine-tuned weights published on the Hub)
# model_name = "airesearch/wangchanberta-base-att-spm-uncased"
model_name = "yartyjung/model"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Pass both mappings so the loaded config has matching id2label and label2id.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    id2label=idTolabel,
    label2id=label2id,
)

# Dataset urls variable
train_url = "https://raw.githubusercontent.com/Yartyjung/AI-Builders-2024/main/dataset/train.csv"
validation_url = "https://raw.githubusercontent.com/Yartyjung/AI-Builders-2024/main/dataset/validation.csv"
test_url = "https://raw.githubusercontent.com/Yartyjung/AI-Builders-2024/main/dataset/test.csv"

# Load dataset (one CSV per split, fetched over HTTP)
train_df = pd.read_csv(train_url,sep=",")
eval_df = pd.read_csv(validation_url,sep=",")
test_df = pd.read_csv(test_url,sep=",")

#label to id func
def labelToid(dataframe1,dataframe2,dataframe3):
  """Replace the ``label`` column of each dataframe with integer ids, in place.

  Every value in ``label`` is looked up in the module-level ``label2id``
  dictionary via ``Series.map``. The frames are modified directly and
  nothing is returned.
  """
  for frame in (dataframe1, dataframe2, dataframe3):
    frame['label'] = frame['label'].map(label2id)

# Encode the label column of all three splits through label2id (mutates the frames).
labelToid(train_df,eval_df,test_df)

# Tokenize datasets
def tokenize_function(dataset_name, max_length=416):
    """Tokenize the ``text`` field of one example (or one batch) for the model.

    Args:
        dataset_name: Mapping with a ``text`` entry, as handed over by
            ``Dataset.map``.
        max_length: Maximum number of tokens kept per sequence. The default
            of 416 is assumed to be the sequence length WangchanBERTa was
            pretrained with -- TODO confirm against the model card.

    Returns:
        The tokenizer output for the given text.
    """
    # truncation=True needs an explicit max_length here: this tokenizer reports
    # no predefined maximum, so without it the library falls back to
    # "no truncation" (see the warning emitted during Dataset.map).
    # Padding is intentionally omitted; DataCollatorWithPadding pads each
    # batch, and padding a single example on its own has no effect.
    return tokenizer(dataset_name['text'], truncation=True, max_length=max_length)

# Wrap each pandas split in a datasets.Dataset, keyed by split name.
dataset_dict = DatasetDict({
    split: Dataset.from_pandas(frame)
    for split, frame in (("train", train_df), ("eval", eval_df), ("test", test_df))
})
# Tokenize the splits one after another, in train / eval / test order.
train_dataset, eval_dataset, test_dataset = (
    dataset_dict[split].map(tokenize_function)
    for split in ("train", "eval", "test")
)

# Prepare data loaders
# The collator is also passed to the Trainer below, which uses it to assemble batches.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# NOTE(review): these three loaders are not referenced again in this notebook;
# the Trainer below receives the datasets and the collator directly.
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=16, collate_fn=data_collator)
eval_dataloader = DataLoader(eval_dataset, batch_size=16, collate_fn=data_collator)
test_dataloader = DataLoader(test_dataset, batch_size=16, collate_fn=data_collator)

#metric
def compute_metrics(eval_pred):
    """Compute accuracy, F1, precision and recall for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)`` as supplied by the Trainer.

    Returns:
        The result of the combined ``evaluate`` metrics for the argmax
        predictions against ``labels``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # Build the combined metric once and keep it on the function object, so
    # later evaluation passes do not reload the four metric modules.
    clf_metrics = getattr(compute_metrics, "_clf_metrics", None)
    if clf_metrics is None:
        clf_metrics = evaluate.combine(["accuracy", "f1", "precision","recall"])
        compute_metrics._clf_metrics = clf_metrics
    # Keyword arguments are the documented calling convention for compute().
    return clf_metrics.compute(predictions=predictions, references=labels)
# Set up training arguments; this cell only calls trainer.evaluate, never trainer.train.
training_args = TrainingArguments(
    output_dir="./results",
    logging_dir="./logs",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    eval_strategy="epoch",
)

# Initialize Trainer with the test split as its evaluation dataset.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# Score the test split and print the resulting metric dictionary.
results = trainer.evaluate(eval_dataset=test_dataset)
print(results)

Map:   0%|          | 0/3975 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/1325 [00:00<?, ? examples/s]

Map:   0%|          | 0/1326 [00:00<?, ? examples/s]