In [None]:
from google.colab import drive

# Mount Google Drive so the dataset and model folders used below are reachable.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
!pip install -q transformers datasets evaluate accelerate

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import pandas as pd

# Folder on the mounted Drive that holds the pre-cleaned CSV splits.
# Example of a full path: /content/drive/MyDrive/social-media-sentiment-trend/data/processed/train_cleaned.csv
DATA_DIR = "/content/drive/MyDrive/processed"

# Columns the rest of the notebook relies on: the text to tokenize and the class id.
REQUIRED_COLUMNS = ("clean_text", "label")

train_path = f"{DATA_DIR}/train_cleaned.csv"
test_path = f"{DATA_DIR}/test_cleaned.csv"

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

# Fail fast with a readable message instead of a KeyError several cells later.
for _split_name, _split_df in (("train", train_df), ("test", test_df)):
    _missing = [col for col in REQUIRED_COLUMNS if col not in _split_df.columns]
    if _missing:
        raise ValueError(f"{_split_name} CSV is missing required columns: {_missing}")

# Confirm the data arrived (rich display of the first rows).
train_df.head()

                                          clean_text  label  \
0                                         would cost      1   
1  used get emails prepurchase snack time check g...      1   
2  flight cancelled flightlations one due weather...      0   
3    frustrated idea great crew thanks happycustomer      2   
4               narrowly made standbylots snags trip      0   

               tweet_created  
0  2015-02-23 06:16:49 -0800  
1  2015-02-20 13:49:28 -0800  
2  2015-02-17 12:12:46 -0800  
3  2015-02-20 17:38:13 -0800  
4  2015-02-24 10:42:33 -0800  


In [None]:
from transformers import AutoTokenizer
from datasets import Dataset

# Use the standard BERT checkpoint together with its matching tokenizer.
model_checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)


def _drop_unusable_text(df, text_col="clean_text"):
    """Return a new frame that keeps only rows with usable text in ``text_col``.

    Steps, applied in order:
      1. drop rows where ``text_col`` is NaN,
      2. cast ``text_col`` to ``str``,
      3. drop rows whose text is empty or whitespace-only.

    The input frame is not modified; ``.assign`` builds a new frame, which
    avoids assigning a column on the result of a row filter.
    The original index labels are kept (no reset here).
    """
    non_null = df.dropna(subset=[text_col])
    as_text = non_null.assign(**{text_col: non_null[text_col].astype(str)})
    return as_text[as_text[text_col].str.strip() != ""]


# Apply the identical cleaning to both splits.
train_df = _drop_unusable_text(train_df)
test_df = _drop_unusable_text(test_df)

# Convert the pandas frames to Hugging Face Datasets; the index is reset so the
# gaps left by the dropped rows are not carried over.
train_dataset = Dataset.from_pandas(train_df.reset_index(drop=True))
test_dataset = Dataset.from_pandas(test_df.reset_index(drop=True))
def tokenize_function(examples):
    """Tokenize the ``clean_text`` entries of a batch with the notebook's tokenizer.

    Every sequence is truncated to at most 128 tokens and padded up to exactly
    128 tokens, so all encoded rows share the same length.

    Args:
        examples: mapping with a ``"clean_text"`` entry holding the texts.

    Returns:
        Whatever the tokenizer call returns for those texts.
    """
    texts = examples["clean_text"]
    encoded = tokenizer(
        texts,
        truncation=True,       # cut texts that are too long
        padding="max_length",  # fill texts that are too short
        max_length=128,
    )
    return encoded

# Tokenize both splits in batches, train first and then test.
tokenized_train, tokenized_test = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, test_dataset)
)

Map:   0%|          | 0/11697 [00:00<?, ? examples/s]

Map:   0%|          | 0/2921 [00:00<?, ? examples/s]

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
import evaluate
from transformers import EarlyStoppingCallback
# Load the checkpoint with a classification head sized for the three sentiment
# classes (negative, neutral, positive).
NUM_LABELS = 3
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=NUM_LABELS,
)

# Accuracy is the metric used to report model quality.
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score one evaluation pass with the notebook's ``metric`` object.

    Args:
        eval_pred: a pair ``(logits, labels)``.

    Returns:
        The result of ``metric.compute`` for the arg-max class of each row of
        logits (taken over the last axis) against the reference labels.
    """
    scores, gold_labels = eval_pred
    return metric.compute(
        predictions=np.argmax(scores, axis=-1),
        references=gold_labels,
    )

# Training hyperparameters.
# Weight decay was raised from 0.01 to 0.1; early stopping is attached when the
# Trainer is built.
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/social-media-sentiment-trend/models/checkpoints",
    eval_strategy="epoch",      # current name of the former 'evaluation_strategy' argument
    save_strategy="epoch",      # checkpoint on the same schedule as evaluation
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.1,
    load_best_model_at_end=True,
    # Spell out the criterion used to pick the "best" checkpoint (and watched by
    # early stopping) instead of relying on the implicit default: lowest eval loss.
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    logging_steps=100,
)

# Early stopping with a patience of 2 evaluations.
# NOTE(review): training_args uses num_train_epochs=3 with per-epoch evaluation,
# so a patience of 2 appears unable to stop the run before the last epoch —
# confirm the intended patience / epoch budget.
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)

# NOTE(review): the test split is used as the evaluation set here, so it also
# drives checkpoint selection; consider a separate validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Run fine-tuning; the returned training summary is shown as the cell output.
train_result = trainer.train()
train_result

Epoch,Training Loss,Validation Loss,Accuracy
1,0.5396,0.513727,0.790483
2,0.4462,0.506104,0.803834
3,0.3781,0.517406,0.800753


TrainOutput(global_step=2196, training_loss=0.47692362814870254, metrics={'train_runtime': 825.0851, 'train_samples_per_second': 42.53, 'train_steps_per_second': 2.662, 'total_flos': 2308228235373312.0, 'train_loss': 0.47692362814870254, 'epoch': 3.0})

In [None]:
# Persist the model and its tokenizer side by side in one folder so both can be
# reloaded from the same path.
model_save_path = "/content/drive/MyDrive/social-media-sentiment-trend/models/final_bert_model"
trainer.save_model(output_dir=model_save_path)
tokenizer.save_pretrained(save_directory=model_save_path)


('/content/drive/MyDrive/social-media-sentiment-trend/models/final_bert_model/tokenizer_config.json',
 '/content/drive/MyDrive/social-media-sentiment-trend/models/final_bert_model/special_tokens_map.json',
 '/content/drive/MyDrive/social-media-sentiment-trend/models/final_bert_model/vocab.txt',
 '/content/drive/MyDrive/social-media-sentiment-trend/models/final_bert_model/added_tokens.json',
 '/content/drive/MyDrive/social-media-sentiment-trend/models/final_bert_model/tokenizer.json')