<a href="https://colab.research.google.com/github/adnan855570/LLMs_Training/blob/main/roberta_balanced.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q datasets
!pip install -q transformers[torch]
!pip install -q accelerate -U
!pip install -q imblearn

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf-cu12 24.4.1 requires pyarrow<15.0.0a0,>=14.0.1, but you have pyarrow 16.1.0 

In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
from datasets import Dataset, DatasetDict
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Step 1: Load the dataset
dataset_path = '/content/preprocessed_combined_file (1).xlsx'
data = pd.read_excel(dataset_path)

# Step 2: Convert text data into numerical form using TF-IDF
# (SMOTE interpolates between numeric feature vectors, so the tweets are vectorised first.)
vectorizer = TfidfVectorizer(max_features=10000)
X = vectorizer.fit_transform(data['Tweet']).toarray()
y = data['Tag']

# Step 3: Apply SMOTE to balance the dataset
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)

# Step 4: Rebuild a text/label frame from the resampled matrix.
# Only the 'Tweet' and 'Tag' columns are created: copying the dense resampled matrix
# into a 10,000-column DataFrame just to attach two columns to it doubles memory use
# for data that nothing downstream reads.
# NOTE(review): `inverse_transform` yields the vocabulary terms present in each row, not
# the original sentence, so every "tweet" below is a bag of tokens without the original
# word order — confirm this is acceptable input for a sequence model.
# NOTE(review): balancing happens before the train/test split in the next cell, so
# synthetic rows interpolated from a held-out tweet can land in the training split.
balanced_data = pd.DataFrame({
    'Tweet': [' '.join(tokens) for tokens in vectorizer.inverse_transform(X_res)],
    'Tag': pd.Series(y_res).to_numpy(),
})

# Verify the new balanced data has an equal number of classes
print("Balanced data label distribution:")
print(balanced_data['Tag'].value_counts())

Using device: cuda
Balanced data label distribution:
Tag
1    8430
0    8430
Name: count, dtype: int64


In [None]:
# Step 5: Split the balanced data
train_texts, test_texts, train_labels, test_labels = train_test_split(
    balanced_data['Tweet'].tolist(), balanced_data['Tag'].tolist(), test_size=0.2, random_state=42)

# Step 6: Tokenize the text
# This tokenizer ships without a predefined maximum length, so `truncation=True` on its
# own is ignored (transformers logs "Default to no truncation"). Passing `max_length`
# explicitly makes truncation effective, so over-long inputs are cut instead of being
# sent to the model at full length.
# NOTE(review): 512 assumes the usual RoBERTa position limit — confirm against the
# checkpoint's config.json.
MAX_LENGTH = 512
tokenizer = AutoTokenizer.from_pretrained("urduhack/roberta-urdu-small")
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=MAX_LENGTH)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=MAX_LENGTH)


# Step 7: Create Dataset objects
def _encodings_to_dataset(encodings, labels):
    """Bundle tokenizer output and integer labels into a `datasets.Dataset`.

    Args:
        encodings: Tokenizer output exposing 'input_ids' and 'attention_mask'.
        labels: Sequence of class ids aligned with the encoded texts.

    Returns:
        Dataset with 'input_ids', 'attention_mask' and 'labels' columns.
    """
    return Dataset.from_dict({
        'input_ids': encodings['input_ids'],
        'attention_mask': encodings['attention_mask'],
        'labels': list(labels),  # Ensure labels are in list format
    })


train_dataset = _encodings_to_dataset(train_encodings, train_labels)
test_dataset = _encodings_to_dataset(test_encodings, test_labels)

# Combine into a DatasetDict
dataset = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/62.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/516 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.61M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.15M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [None]:
# Step 8: Load the pretrained checkpoint with a 2-label classification head and
# place it on the selected device in one expression.
model = AutoModelForSequenceClassification.from_pretrained(
    "urduhack/roberta-urdu-small",
    num_labels=2,
).to(device)

# Step 9: Define the compute metrics function
def compute_metrics(p):
    """Compute binary classification metrics for a `Trainer` evaluation pass.

    Args:
        p: Evaluation output exposing `predictions` (logits, or a tuple whose first
            element is the logits) and `label_ids` (true class ids).

    Returns:
        dict with 'accuracy', 'f1', 'precision' and 'recall'.
    """
    logits = p.predictions
    # Some model configurations return extra outputs alongside the logits.
    if isinstance(logits, tuple):
        logits = logits[0]
    preds = logits.argmax(-1)
    labels = p.label_ids
    # zero_division=0 reports 0.0 (without a warning) when no positive is predicted.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='binary', zero_division=0)
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Step 10: Set training arguments
# `load_best_model_at_end` ranks checkpoints by evaluation loss unless told otherwise.
# In the recorded run that restored the epoch-1 checkpoint (lowest loss, but the lowest
# F1 of the early epochs), so the checkpoint is now selected on the F1 score reported
# by `compute_metrics`.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy`; adjust the keyword if the installed version rejects it.
# NOTE(review): the selection metric is computed on the same split that is later
# reported as the final evaluation — consider a separate validation split.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=15,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",   # matches the 'f1' key returned by compute_metrics
    greater_is_better=True,
    save_total_limit=2,           # avoid keeping one full checkpoint per epoch on disk
)

pytorch_model.bin:   0%|          | 0.00/507M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at urduhack/roberta-urdu-small and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Step 11: Wire the model, arguments, data splits and metric callback into a Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    compute_metrics=compute_metrics,
)

# Step 12: Run fine-tuning
trainer.train()

# Step 13: Score the resulting model on the evaluation split
eval_results = trainer.evaluate()

# Report every metric as "name: value", one per line
print("Evaluation results:")
for metric_name, metric_value in eval_results.items():
    print(metric_name, metric_value, sep=": ")


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.4827,0.590908,0.765421,0.740911,0.853585,0.654514
2,0.262,0.599442,0.826512,0.833096,0.821609,0.844907
3,0.3299,0.663105,0.823547,0.838007,0.79126,0.890625
4,0.3939,0.761806,0.793891,0.820274,0.741468,0.917824
5,0.2257,0.691974,0.833333,0.84169,0.819978,0.864583


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.4827,0.590908,0.765421,0.740911,0.853585,0.654514
2,0.262,0.599442,0.826512,0.833096,0.821609,0.844907
3,0.3299,0.663105,0.823547,0.838007,0.79126,0.890625
4,0.3939,0.761806,0.793891,0.820274,0.741468,0.917824
5,0.2257,0.691974,0.833333,0.84169,0.819978,0.864583
6,0.1725,0.682362,0.832147,0.836982,0.833142,0.840856
7,0.3912,0.676845,0.826512,0.82235,0.865176,0.783565
8,0.122,0.676161,0.832444,0.842399,0.813139,0.873843
9,0.2484,0.686255,0.79745,0.801511,0.80502,0.798032
10,0.5577,0.629919,0.627224,0.729736,0.580568,0.98206


Evaluation results:
eval_loss: 0.5909075140953064
eval_accuracy: 0.7654211150652431
eval_f1: 0.7409105797576154
eval_precision: 0.8535849056603774
eval_recall: 0.6545138888888888
eval_runtime: 18.6287
eval_samples_per_second: 181.011
eval_steps_per_second: 22.653
epoch: 15.0


In [None]:
# Function to get random samples
def get_random_samples(data, n=5, random_state=None):
    """Return up to `n` randomly chosen tweets from `data`.

    Args:
        data: DataFrame with a 'Tweet' column.
        n: Number of tweets requested. When the frame has fewer rows, every row is
            returned (in random order) instead of raising.
        random_state: Optional seed forwarded to pandas for reproducible draws.

    Returns:
        list of the sampled 'Tweet' values.
    """
    # Sampling without replacement cannot return more rows than exist.
    n = min(n, len(data))
    return data['Tweet'].sample(n, random_state=random_state).tolist()
# Function to predict on unseen examples
def predict_unseen_examples(unseen_texts, model, tokenizer, device, batch_size=32, max_length=512):
    """Predict a class id for each text.

    Args:
        unseen_texts: A single string or a sequence of strings to classify.
        model: Sequence-classification model whose output exposes `.logits`.
        tokenizer: Tokenizer callable matching `model`.
        device: torch device the input tensors are moved to.
        batch_size: Number of texts per forward pass, so long lists do not have to
            fit into memory as one batch.
        max_length: Token limit passed to the tokenizer so that `truncation=True`
            actually truncates.

    Returns:
        1-D numpy array with one predicted class id per input text (empty for empty input).
    """
    texts = [unseen_texts] if isinstance(unseen_texts, str) else list(unseen_texts)
    if not texts:
        # Nothing to tokenize; return an empty int64 array.
        return torch.empty(0, dtype=torch.long).numpy()

    model.eval()  # Set model to evaluation mode
    batch_predictions = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            inputs = tokenizer(batch, return_tensors="pt", truncation=True,
                               padding=True, max_length=max_length)
            inputs = {key: value.to(device) for key, value in inputs.items()}
            logits = model(**inputs).logits
            batch_predictions.append(torch.argmax(logits, dim=-1).cpu())

    return torch.cat(batch_predictions).numpy()

# Draw a handful of tweets from the loaded frame to spot-check the classifier.
# NOTE(review): these come from `data`, the same frame the training set was derived
# from, so they are not guaranteed to be unseen by the model.
unseen_texts = get_random_samples(data)

# Classify the sampled tweets
unseen_predictions = predict_unseen_examples(unseen_texts, model, tokenizer, device)

# Show each tweet next to its predicted label
text_prediction_pairs = list(zip(unseen_texts, unseen_predictions))
for text, prediction in text_prediction_pairs:
    print(f"Text: {text}\nPrediction: {prediction}\n")

Text: جب تک ان کے گھر کی بیٹیوں اور بیویوں کی ویڈیوز بازار میں فروخت نہیں ہو گی یہ بے شرم باز نہیں آہیں گے
Prediction: 0

Text: کامران شاہد کی پہلی پہچاناسے دنیا نیوز میں ملازمت بھی بابرہ شریف کی سفارش پہ ملی ہےمریم کے ملازم لفافے اپنا گٹر جیسا منہ سوشل میڈیا ایکٹوسٹس سے دور رکھو
Prediction: 0

Text: جب عورت اپنی اننگز کا اغاز گیراج سے کرے تو اس کی زبان طوائفوں والی ہی ہوگی اس بے شرم اور بازاری عورت کو کبھی بھی شرم نہیں آئی گی
Prediction: 1

Text: چاچا افتخار تو چھا گیا ہے
Prediction: 0

Text: کل ایک چور بھی یھی کہ رھا تھا مین تو جھاد کرتا تھا اور مال غنیمت لوٹتا تھا لوگ کھتے ھیں میں ڈاکہ ڈالتا تھا
Prediction: 1



In [None]:
# Function to predict on unseen examples
# NOTE(review): this repeats the definition from the previous cell and shadows it;
# keep the two in sync or drop one of them.
def predict_unseen_examples(unseen_texts, model, tokenizer, device, batch_size=32, max_length=512):
    """Predict a class id for each text.

    Args:
        unseen_texts: A single string or a sequence of strings to classify.
        model: Sequence-classification model whose output exposes `.logits`.
        tokenizer: Tokenizer callable matching `model`.
        device: torch device the input tensors are moved to.
        batch_size: Number of texts per forward pass, so long lists do not have to
            fit into memory as one batch.
        max_length: Token limit passed to the tokenizer so that `truncation=True`
            actually truncates.

    Returns:
        1-D numpy array with one predicted class id per input text (empty for empty input).
    """
    texts = [unseen_texts] if isinstance(unseen_texts, str) else list(unseen_texts)
    if not texts:
        # Nothing to tokenize; return an empty int64 array.
        return torch.empty(0, dtype=torch.long).numpy()

    model.eval()  # Set model to evaluation mode
    batch_predictions = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            inputs = tokenizer(batch, return_tensors="pt", truncation=True,
                               padding=True, max_length=max_length)
            inputs = {key: value.to(device) for key, value in inputs.items()}
            logits = model(**inputs).logits
            batch_predictions.append(torch.argmax(logits, dim=-1).cpu())

    return torch.cat(batch_predictions).numpy()

# Hand-written example to classify (the literal keeps its trailing newline)
unseen_texts = [
  '''تحریک لبیک پاکستان PS 89 ضلع ملیر کے ذمہ داران نے UC 8 کا دورا کیا UC 8 کے ذمہ داران وکارکنان سے ملاقات کی اور جلد سے جلد کمیٹی مکمل کرنے کی ہدایت جاری کی
'''
]

# Make sure the model sits on the target device before running inference
model.to(device)

# Classify the example(s)
unseen_predictions = predict_unseen_examples(unseen_texts, model, tokenizer, device)

# Show each text next to its predicted label
text_prediction_pairs = list(zip(unseen_texts, unseen_predictions))
for text, prediction in text_prediction_pairs:
    print(f"Text: {text}\nPrediction: {prediction}\n")

Text: تحریک لبیک پاکستان PS 89 ضلع ملیر کے ذمہ داران نے UC 8 کا دورا کیا UC 8 کے ذمہ داران وکارکنان سے ملاقات کی اور جلد سے جلد کمیٹی مکمل کرنے کی ہدایت جاری کی

Prediction: 0



In [None]:
# prompt: mount drive and save this model to drive

from google.colab import drive
drive.mount('/content/drive')

# Save the raw weights to Google Drive (same file as before, kept for compatibility).
# A bare state_dict can only be restored into a model that has already been constructed
# with the matching architecture and number of labels.
model_path = "/content/drive/MyDrive/Roberta-model.pt"
torch.save(model.state_dict(), model_path)

# Also export the model (weights + config) and the tokenizer in Hugging Face format so
# the classifier can be reloaded directly with `from_pretrained(export_dir)`.
export_dir = "/content/drive/MyDrive/Roberta-model"
model.save_pretrained(export_dir)
tokenizer.save_pretrained(export_dir)



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
