In [5]:
# import libraries
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

In [6]:
# read data
df_train = pd.read_csv('/content/drive/MyDrive/Gemini_Thi_AI/demo twitter/twitter_training.csv')
df_val = pd.read_csv('/content/drive/MyDrive/Gemini_Thi_AI/demo twitter/twitter_validation.csv')
# concatenate data
df = pd.concat([df_train, df_val], ignore_index=False)


# Remove unnecessary columns
columns_to_drop = ['2401', '3364', 'Facebook', 'Irrelevant',
                   'I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom’s great auntie as ‘Hayley can’t get out of bed’ and told to his grandma, who now thinks I’m a lazy, terrible person 🤣']
df = df.drop(columns_to_drop, axis=1)

# Rename columns
df = df.rename(columns={'im getting on borderlands and i will murder you all ,': 'Tweet', 'Positive': 'Sentiment', 'Borderlands':"Branch"})


# remove missing values
df.dropna(inplace=True)
# check missing values
df.isnull().sum()


# remove duplicate values
remove_duplicates = df.drop_duplicates()
df = remove_duplicates
# check duplicate values
df.duplicated().sum()

0

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from tqdm import tqdm


# Chuẩn bị dữ liệu
tweets = df['Tweet'].values
labels = df['Sentiment'].values

# Ánh xạ nhãn thành số
label_dict = {'Positive': 0, 'Negative': 1, 'Neutral': 2, 'Irrelevant': 3}
labels = np.array([label_dict[label] for label in labels])

# Chia tập dữ liệu
train_texts, val_texts, train_labels, val_labels = train_test_split(tweets, labels, test_size=0.2, random_state=42)

# Khởi tạo tokenizer và mô hình
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=4)

# Tokenize và chuẩn bị dữ liệu
def tokenize_and_encode(texts, max_length=128):
    return tokenizer(texts.tolist(), padding=True, truncation=True, max_length=max_length, return_tensors='pt')

train_encodings = tokenize_and_encode(train_texts)
val_encodings = tokenize_and_encode(val_texts)

# Tạo DataLoader
train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], torch.tensor(train_labels))
val_dataset = TensorDataset(val_encodings['input_ids'], val_encodings['attention_mask'], torch.tensor(val_labels))

batch_size = 32
train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
val_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=batch_size)

# Chuẩn bị optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Huấn luyện mô hình
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

num_epochs = 3

for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    print('-' * 40)

    # Training
    model.train()
    train_loss = 0
    train_steps = 0
    train_preds = []
    train_true = []

    for batch in tqdm(train_dataloader, desc="Training"):
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}

        optimizer.zero_grad()
        outputs = model(**inputs)
        loss = outputs.loss
        logits = outputs.logits

        train_loss += loss.item()
        loss.backward()
        optimizer.step()

        train_steps += 1
        preds = torch.argmax(logits, dim=1).detach().cpu().numpy()
        train_preds.extend(preds)
        train_true.extend(inputs['labels'].detach().cpu().numpy())

    avg_train_loss = train_loss / train_steps
    train_accuracy = (np.array(train_preds) == np.array(train_true)).mean()

    print(f"Average training loss: {avg_train_loss:.4f}")
    print(f"Training accuracy: {train_accuracy:.4f}")

    # Validation
    model.eval()
    val_loss = 0
    val_steps = 0
    val_preds = []
    val_true = []

    for batch in tqdm(val_dataloader, desc="Validation"):
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}

        with torch.no_grad():
            outputs = model(**inputs)

        loss = outputs.loss
        logits = outputs.logits

        val_loss += loss.item()
        val_steps += 1
        preds = torch.argmax(logits, dim=1).detach().cpu().numpy()
        val_preds.extend(preds)
        val_true.extend(inputs['labels'].detach().cpu().numpy())

    avg_val_loss = val_loss / val_steps
    val_accuracy = (np.array(val_preds) == np.array(val_true)).mean()

    print(f"Average validation loss: {avg_val_loss:.4f}")
    print(f"Validation accuracy: {val_accuracy:.4f}")

    print("\nClassification Report (Validation):")
    print(classification_report(val_true, val_preds, target_names=list(label_dict.keys())))
    print("\n")




# Sau khi huấn luyện xong
import os

# Tạo thư mục trong Google Drive để lưu mô hình
save_path = '/content/drive/MyDrive/Gemini_Thi_AI/demo twitter/save_model'
os.makedirs(save_path, exist_ok=True)

# Lưu mô hình
print("Saving model...")
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)
print(f"Model saved to {save_path}")

Epoch 1/3
----------------------------------------


Training: 100%|██████████| 1774/1774 [19:10<00:00,  1.54it/s]


Average training loss: 0.8539
Training accuracy: 0.6613


Validation: 100%|██████████| 444/444 [01:41<00:00,  4.40it/s]


Average validation loss: 0.5595
Validation accuracy: 0.7912

Classification Report (Validation):
              precision    recall  f1-score   support

    Positive       0.84      0.75      0.79      3887
    Negative       0.81      0.88      0.85      4296
     Neutral       0.76      0.76      0.76      3498
  Irrelevant       0.73      0.73      0.73      2511

    accuracy                           0.79     14192
   macro avg       0.79      0.78      0.78     14192
weighted avg       0.79      0.79      0.79     14192



Epoch 2/3
----------------------------------------


Training: 100%|██████████| 1774/1774 [19:18<00:00,  1.53it/s]


Average training loss: 0.3571
Training accuracy: 0.8721


Validation: 100%|██████████| 444/444 [01:40<00:00,  4.43it/s]


Average validation loss: 0.2960
Validation accuracy: 0.8952

Classification Report (Validation):
              precision    recall  f1-score   support

    Positive       0.91      0.88      0.90      3887
    Negative       0.90      0.91      0.90      4296
     Neutral       0.90      0.89      0.89      3498
  Irrelevant       0.87      0.89      0.88      2511

    accuracy                           0.90     14192
   macro avg       0.89      0.89      0.89     14192
weighted avg       0.90      0.90      0.90     14192



Epoch 3/3
----------------------------------------


Training: 100%|██████████| 1774/1774 [19:17<00:00,  1.53it/s]


Average training loss: 0.1557
Training accuracy: 0.9436


Validation: 100%|██████████| 444/444 [01:39<00:00,  4.44it/s]


Average validation loss: 0.2903
Validation accuracy: 0.9068

Classification Report (Validation):
              precision    recall  f1-score   support

    Positive       0.89      0.93      0.91      3887
    Negative       0.88      0.95      0.91      4296
     Neutral       0.95      0.86      0.91      3498
  Irrelevant       0.93      0.87      0.90      2511

    accuracy                           0.91     14192
   macro avg       0.91      0.90      0.91     14192
weighted avg       0.91      0.91      0.91     14192



Saving model...
Model saved to /content/drive/MyDrive/Gemini_Thi_AI/demo twitter/save_model


In [1]:
from transformers import BertForSequenceClassification, BertTokenizer
import torch

# Đường dẫn đến mô hình đã lưu
model_path = '/content/drive/MyDrive/Gemini_Thi_AI/demo twitter/save_model'  # Điều chỉnh đường dẫn nếu cần

# Tải lại mô hình và tokenizer
loaded_model = BertForSequenceClassification.from_pretrained(model_path)
loaded_tokenizer = BertTokenizer.from_pretrained(model_path)

# Chuyển mô hình sang GPU nếu có
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
loaded_model.to(device)

print("Model loaded successfully")

Model loaded successfully


In [3]:
# Định nghĩa hàm dự đoán

def predict_sentiment(text):
    # Tokenize input
    inputs = loaded_tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Dự đoán
    with torch.no_grad():
        outputs = loaded_model(**inputs)
        logits = outputs.logits
        predicted_class = torch.argmax(logits, dim=1).item()

    # Ánh xạ ngược lại từ số thành nhãn
    label_dict = {0: 'Positive', 1: 'Negative', 2: 'Neutral', 3: 'Irrelevant'}
    return label_dict[predicted_class]

In [4]:
# Thử nghiệm mô hình

# Các câu mẫu để thử nghiệm
sample_texts = [
    "I love this product! It's amazing!",
    "This is the worst experience I've ever had.",
    "The weather is nice today.",
    "I don't have any strong feelings about this.",
    "Breaking news: Major event happened in the city center."
]

# Thử nghiệm mô hình
for text in sample_texts:
    sentiment = predict_sentiment(text)
    print(f"Text: {text}")
    print(f"Predicted sentiment: {sentiment}")
    print("-" * 50)

Text: I love this product! It's amazing!
Predicted sentiment: Positive
--------------------------------------------------
Text: This is the worst experience I've ever had.
Predicted sentiment: Negative
--------------------------------------------------
Text: The weather is nice today.
Predicted sentiment: Neutral
--------------------------------------------------
Text: I don't have any strong feelings about this.
Predicted sentiment: Negative
--------------------------------------------------
Text: Breaking news: Major event happened in the city center.
Predicted sentiment: Irrelevant
--------------------------------------------------
