In [2]:
import json
import random
import os
import pandas as pd
from IPython import get_ipython
from IPython.display import display
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, BertTokenizerFast, BertModel
from datasets import load, load_dataset
from sklearn.model_selection import train_test_split
import evaluate
from transformers import pipeline
import glob
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import re
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer
from datasets import Dataset

In [None]:
# Authenticate this notebook session with the Hugging Face Hub.
# A later cell trains with push_to_hub=True and calls trainer.push_to_hub(),
# so this login has to run before that cell.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
# Load IMDB and keep a reproducible 10,000-review subset of each split.
#
# The Hub copy of IMDB stores each split grouped by label (negative reviews
# first), so taking the first 10,000 rows as-is yields an (almost) single-class
# sample and a model that never sees positive reviews. Shuffle with a fixed
# seed *before* selecting so both classes are represented and the subset is
# identical on every Restart & Run All.
dataset = load_dataset('imdb')
dataset['train'] = dataset['train'].shuffle(seed=42).select(range(10000))
dataset['test'] = dataset['test'].shuffle(seed=42).select(range(10000))

In [None]:
# Convert the dataset into pandas DataFrames.
# Dataset.to_pandas() converts the underlying Arrow table directly instead of
# iterating over the dataset one Python dict at a time.
X_train = dataset['train'].to_pandas()
X_test = dataset['test'].to_pandas()

# Split the training dataset into training and validation sets.
# Stratifying on the label keeps the class balance of the validation set equal
# to that of the training set; random_state makes the split reproducible.
train_df, val_df = train_test_split(
    X_train,
    test_size=0.2,
    random_state=42,
    stratify=X_train['label'],
)

# Display the sizes of the splits
print(f"Training set size: {len(train_df)}")
print(f"Validation set size: {len(val_df)}")
print(f"Test set size: {len(X_test)}")

In [None]:


# Text normalisation helper
def clean_text(text):
    """Return ``text`` with each whitespace run collapsed to one space.

    Any sequence of whitespace characters (spaces, tabs, newlines, ...) is
    replaced by a single space, and leading/trailing whitespace is removed.
    """
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()

# Apply cleaning.
# train_df / val_df come out of train_test_split as selections of X_train, so
# assigning columns on them in place can raise SettingWithCopyWarning and makes
# the cell mutate its own inputs. .assign() returns a new frame instead, which
# keeps the cell safe to re-run.
train_df = train_df.assign(text=train_df['text'].apply(clean_text))
val_df = val_df.assign(text=val_df['text'].apply(clean_text))

# Encode labels.
# The encoder is fitted on the training labels only and then reused for the
# validation labels so both frames share one label -> integer mapping.
label_encoder = LabelEncoder()
train_df = train_df.assign(label=label_encoder.fit_transform(train_df['label']))
val_df = val_df.assign(label=label_encoder.transform(val_df['label']))

# Tokenize text: load the tokenizer that matches the bert-base-uncased checkpoint
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(examples):
    """Tokenize the ``'text'`` field of ``examples`` with the notebook's tokenizer.

    Sequences are truncated and padded to the tokenizer's maximum length.
    Returns whatever the tokenizer call returns for that input.
    """
    texts = examples['text']
    return tokenizer(texts, padding='max_length', truncation=True)

# Convert to Hugging Face Dataset.
# After train_test_split the frames carry a shuffled, non-default index; without
# preserve_index=False that index is stored as an extra '__index_level_0__'
# column in the dataset.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
val_dataset = Dataset.from_pandas(val_df, preserve_index=False)

# Tokenize datasets (batched=True passes batches of rows to tokenize_function)
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

In [None]:
# Name of the repository used for this run
repo_name = "Sentiment"

# Local directory for the model and results; created up front if it is missing
output_dir = "my_model_results"
os.makedirs(output_dir, exist_ok=True)

# Metric callback
def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall/F1 for one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class of each
            row is the argmax of ``logits`` over its last axis.

    Returns:
        dict with the keys ``'accuracy'``, ``'f1'``, ``'precision'`` and
        ``'recall'``.
    """
    logits, labels = eval_pred
    predicted_ids = logits.argmax(axis=-1)

    # Index 3 of the returned tuple is the support, which is not reported.
    scores = precision_recall_fscore_support(labels, predicted_ids, average='weighted')
    precision, recall, f1 = scores[0], scores[1], scores[2]

    metrics = {'accuracy': accuracy_score(labels, predicted_ids)}
    metrics['f1'] = f1
    metrics['precision'] = precision
    metrics['recall'] = recall
    return metrics

# Training configuration.
# output_dir must be passed exactly once: repeating a keyword argument is a
# SyntaxError that prevents the whole cell from compiling. Checkpoints and the
# final model go to the local `output_dir` created above, and the Hub
# repository name is given explicitly through hub_model_id instead of being
# derived from the output directory name.
training_args = TrainingArguments(
    output_dir=output_dir,
    evaluation_strategy="epoch",  # run evaluation at the end of every epoch
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_dir='./logs',
    logging_steps=10,
    push_to_hub=True,
    hub_model_id=repo_name,
)

# Load the pretrained BERT checkpoint with a 2-class classification head
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

# trainer.evaluate() is not the last expression of the cell, so its return
# value would be silently discarded; keep it and print it explicitly.
eval_metrics = trainer.evaluate()
print(eval_metrics)

trainer.push_to_hub()

# Save the model to the same local directory as the tokenizer below, so the
# two artifacts cannot end up in different folders.
trainer.save_model(output_dir)

# Save the tokenizer (optional, but recommended)
tokenizer.save_pretrained(output_dir)