# RoBERTa (A Robustly Optimized BERT Pretraining Approach) Model

## Load train data

In [None]:
# prompt: load from google drive

# google.colab is provided by the Colab runtime (assumed environment for this notebook).
from google.colab import drive
# Mount Google Drive at /content/drive; the CSV paths used below live under this mount point.
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
import pandas as pd

# Location of the training CSV on the mounted Drive.
path = '/content/drive/MyDrive//pcems/data.csv'

# Read the training set and keep a list-of-dicts view (one dict per row) next to it.
df_train = pd.read_csv(path)
train_data = df_train.to_dict(orient='records')

# Sanity check: print at most the first five records.
for record in train_data[:5]:
    print(record)


{'sentence': 'i did ab work it was gasy', 'exercise': 'ab crunches', 'feeling': 'gasy'}
{'sentence': 'i did two hundred ab crunches over five sets. the last 50 reps were terrible, i felt so tired.', 'exercise': 'ab crunches', 'feeling': 'terrible, tired'}
{'sentence': '20 ab crunches for senior citizen struggling after the first 5', 'exercise': 'ab crunches', 'feeling': 'struggling'}
{'sentence': 'i did ab crunches. i felt comfortable and happy to do this movement.', 'exercise': 'ab crunches', 'feeling': 'comfortable, happy'}
{'sentence': 'i used the ab machine and did 10 reps. i feel strong.', 'exercise': 'ab machine', 'feeling': 'strong'}


In [None]:
df_train.head()

In [None]:
df_train.shape

(200, 3)

## Define Tokenizer

## RoBERTa (A Robustly Optimized BERT Pretraining Approach) Model

RoBERTa (A Robustly Optimized BERT Pretraining Approach) is an advanced language representation model developed by Facebook AI. It builds upon the BERT (Bidirectional Encoder Representations from Transformers) architecture with several key improvements aimed at enhancing performance and robustness. Here are the main features and enhancements of RoBERTa:

Key Features of RoBERTa:
Training Data: RoBERTa uses a much larger dataset for pre-training compared to BERT. It is trained on a combination of datasets, including the BookCorpus, English Wikipedia, Common Crawl News, OpenWebText, and Stories from Common Crawl.

Training Time and Batch Size: RoBERTa increases the amount of training time and the batch size. This allows the model to learn more effectively from the data.

Dynamic Masking: Unlike BERT, which fixes the masking pattern once during data preprocessing (static masking) for its masked language modeling task, RoBERTa applies dynamic masking: a new masking pattern is generated each time a sequence is fed to the model. This helps the model learn better representations.

No Next Sentence Prediction: RoBERTa removes the Next Sentence Prediction (NSP) objective used in BERT. Research indicated that NSP might not be necessary and could even be detrimental. Instead, RoBERTa focuses solely on the masked language modeling task.

Larger Batch Sizes and Learning Rates: RoBERTa uses larger batch sizes and learning rates during training, which contributes to more robust and effective learning.



* Prepare the DataFrame: This involves structuring the data for few-shot learning by creating formatted input text with K examples for conditioning.
* Create the Custom Dataset: This will handle the tokenization and preparation of the data for the model.
* Fine-tune the Model: Train the model on the few-shot learning setup.
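* Evaluate the Model: Evaluate the model on the test set by taking the highest-scoring class from the sequence-classification head (no beam search is involved, because the model classifies rather than generates text).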

In [None]:
import pandas as pd
import torch
from transformers import RobertaForSequenceClassification, RobertaTokenizer
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import random

# Prepare the DataFrame (Assume df_train and df_dev are already loaded)
K = 90  # Number of conditioning examples

def prepare_few_shot_data(df_train, K):
    """Build one few-shot prompt/target pair for every row of ``df_train``.

    Each prompt is made of randomly drawn conditioning examples (sentence plus
    its feeling/exercise labels) followed by the row's own sentence as the
    query. The target is the row's "feeling: ..., exercise: ..." string.

    Args:
        df_train: DataFrame with 'sentence', 'feeling' and 'exercise' columns.
        K: Requested number of conditioning examples per prompt. It is capped
            at the number of rows, because ``random.sample`` raises
            ``ValueError`` when asked for more items than the population holds.

    Returns:
        A list of ``{"input_text": ..., "target_text": ...}`` dicts, one per
        row, in row order.
    """
    # Convert once; the original rebuilt this list for every single row.
    records = df_train.to_dict('records')
    shots = min(K, len(records))

    few_shot_data = []
    for row in records:
        # NOTE(review): the row itself may be drawn as a conditioning example,
        # which puts its own label into the prompt - confirm this is intended.
        conditioning_examples = random.sample(records, shots)
        conditioning_text = "\n\n".join(
            f"classify: {ex['sentence']}\nfeeling: {ex['feeling']}, exercise: {ex['exercise']}"
            for ex in conditioning_examples
        )
        input_text = f"{conditioning_text}\n\nclassify: {row['sentence']}"
        target_text = f"feeling: {row['feeling']}, exercise: {row['exercise']}"
        few_shot_data.append({"input_text": input_text, "target_text": target_text})
    return few_shot_data

# Build one prompt/target pair per training row.
few_shot_data = prepare_few_shot_data(df_train, K)

# Split the pairs into two parallel lists: prompts and their target strings.
input_texts = []
target_texts = []
for pair in few_shot_data:
    input_texts.append(pair["input_text"])
    target_texts.append(pair["target_text"])

# Map every distinct target string to an integer class id.
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(target_texts)

# Custom Dataset class for RoBERTa
class CustomDataset(Dataset):
    """Dataset that tokenizes one text per item and pairs it with its label.

    Args:
        texts: Indexable collection of input strings.
        labels: Indexable collection of labels, aligned with ``texts``.
        tokenizer: Object exposing ``encode_plus`` (here a RoBERTa tokenizer).
        max_len: Length every sequence is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, index):
        """Return the raw text, its token ids, attention mask and label tensor."""
        text, label = self.texts[index], self.labels[index]

        # Pad or truncate to exactly ``max_len`` tokens and request PyTorch tensors.
        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoding = self.tokenizer.encode_plus(text, **tokenizer_options)

        # The tensors are flattened to one dimension before being returned.
        item = {'text': text}
        item['input_ids'] = torch.flatten(encoding['input_ids'])
        item['attention_mask'] = torch.flatten(encoding['attention_mask'])
        item['label'] = torch.tensor(label, dtype=torch.long)
        return item

# Load the pretrained tokenizer and a classification model with one output per
# distinct target string known to the label encoder.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
num_classes = len(label_encoder.classes_)
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=num_classes)

# Wrap the prompts and labels in a shuffled DataLoader (batches of 8, 128 tokens each).
dataset = CustomDataset(input_texts, labels, tokenizer, max_len=128)
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

# Fine-tune: put the model in training mode and optimise all parameters with AdamW.
model.train()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

num_epochs = 60  # Adjust the number of epochs for fine-tuning
for epoch in range(num_epochs):
    running_loss = 0
    for batch in dataloader:
        optimizer.zero_grad()
        # Passing labels makes the model return the loss together with the logits.
        loss = model(
            input_ids=batch['input_ids'],
            attention_mask=batch['attention_mask'],
            labels=batch['label'],
        ).loss
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    # Report the mean batch loss for the epoch.
    avg_loss = running_loss / len(dataloader)
    print(f"Epoch {epoch + 1}, Loss: {avg_loss}")

# Function to classify new text using the fine-tuned model
def classify_text(text, conditioning_examples=None):
    """Predict the combined feeling/exercise label for ``text``.

    Rebuilds the "classify: ... / feeling: ..., exercise: ..." prompt layout
    used for training, tokenizes it, runs the fine-tuned classifier and decodes
    the arg-max class id back to its label string.

    Args:
        text: The log sentence to classify.
        conditioning_examples: Iterable of dicts used as few-shot examples.
            Each dict may use the training column names ('sentence',
            'feeling', 'exercise') or the test column names ('Log',
            'Feeling Tag', 'Exercise Tag'). When omitted, up to ``K`` random
            records are drawn from the global ``df_train``.

    Returns:
        The predicted label string as produced by
        ``label_encoder.inverse_transform``.

    Raises:
        KeyError: If an example carries neither naming scheme for a field.
    """
    model.eval()

    if conditioning_examples is None:
        # Several call sites pass only the text; fall back to a random draw
        # from the training records, as the explicit call sites do.
        train_records = df_train.to_dict('records')
        conditioning_examples = random.sample(train_records, min(K, len(train_records)))

    def _field(example, train_key, test_key):
        # Training records are keyed 'sentence'/'feeling'/'exercise'; the test
        # CSV uses 'Log'/'Feeling Tag'/'Exercise Tag'. Accept either.
        return example[train_key] if train_key in example else example[test_key]

    conditioning_text = "\n\n".join(
        f"classify: {_field(ex, 'sentence', 'Log')}\n"
        f"feeling: {_field(ex, 'feeling', 'Feeling Tag')}, "
        f"exercise: {_field(ex, 'exercise', 'Exercise Tag')}"
        for ex in conditioning_examples
    )
    input_text = f"{conditioning_text}\n\nclassify: {text}"

    # NOTE(review): truncation=True with max_length=128 keeps only the leading
    # tokens, so with many conditioning examples the trailing
    # "classify: {text}" query is presumably cut off before it reaches the
    # model - confirm against the tokenizer's truncation side.
    encoding = tokenizer.encode_plus(
        input_text,
        add_special_tokens=True,
        max_length=128,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    input_ids = encoding['input_ids']
    attention_mask = encoding['attention_mask']

    # Inference only: no gradients needed.
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predicted_class_id = torch.argmax(logits, dim=-1).item()

    return label_encoder.inverse_transform([predicted_class_id])[0]

# Example usage with the first five sentences from the training DataFrame.
# df_train stores its text under 'sentence' (see the sample records printed
# after loading); 'Log' is the test CSV's column name and raised KeyError here.
train_records = df_train.to_dict('records')  # built once instead of per row
for idx, row in df_train.head(5).iterrows():
    new_log = row['sentence']
    # Cap the draw at the number of available records so random.sample cannot fail.
    conditioning_examples = random.sample(train_records, min(K, len(train_records)))
    classification = classify_text(new_log, conditioning_examples)
    print(f"Log: {new_log}\nClassification: {classification}\n")


## Train Data

## Load Test Data

In [None]:
# Location of the evaluation CSV on the mounted Drive.
# Previously used alternative: '/content/drive/MyDrive//pcems/test_filtered_new.csv.csv'
path = '/content/drive/MyDrive//pcems/data_27.csv'

# Parse the file as latin-1 and keep both the DataFrame and a list-of-dicts view.
df_test = pd.read_csv(path, encoding='latin-1')
test_data = df_test.to_dict(orient='records')

# Sanity check: show at most the first five records.
for record in test_data[:5]:
    print(record)

In [None]:
# Evaluate the model on the test set.
# f1/precision/recall were used here before any cell imported them, so import
# all four metrics in this cell.
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

predictions = []
true_labels = []

# Build the training records once instead of once per test row.
train_records = df_train.to_dict('records')
n_shots = min(K, len(train_records))

for idx, row in df_test.iterrows():
    new_log = row['Log']
    conditioning_examples = random.sample(train_records, n_shots)
    prediction = classify_text(new_log, conditioning_examples)
    predictions.append(prediction)
    true_labels.append(f"feeling: {row['Feeling Tag']}, exercise: {row['Exercise Tag']}")

# classify_text returns decoded label strings, so score strings against
# strings. Encoding only the true labels compared integers with strings, and
# label_encoder.transform raises ValueError for labels unseen during training.
accuracy = accuracy_score(true_labels, predictions)
f1 = f1_score(true_labels, predictions, average='weighted')
precision = precision_score(true_labels, predictions, average='weighted')
recall = recall_score(true_labels, predictions, average='weighted')

print(f"Accuracy: {accuracy:.2f}, F1: {f1:.2f}, Precision: {precision:.2f}, Recall: {recall:.2f}")


In [None]:
# Classify the first five test logs and print the predictions.
train_records = df_train.to_dict('records')
for idx, row in df_test.head(5).iterrows():
    new_log = row['Log']
    # classify_text is declared with conditioning examples as its second
    # argument; calling it with the text alone raised TypeError.
    conditioning_examples = random.sample(train_records, min(K, len(train_records)))
    classification = classify_text(new_log, conditioning_examples)
    print(f"Log: {new_log}\nClassification: {classification}\n")


In [None]:
df_test.head()

In [None]:
# Classify the first twenty test records and print one line per prediction.
train_records = df_train.to_dict('records')
for sample in test_data[:20]:
    new_log = sample['Log']
    # classify_text is declared with conditioning examples as its second
    # argument; calling it with the text alone raised TypeError.
    conditioning_examples = random.sample(train_records, min(K, len(train_records)))
    classification = classify_text(new_log, conditioning_examples)
    print(f"log: {new_log} --- Classification: {classification}")

In [None]:
# Create new columns for predicted exercise and feeling
df_test['predicted_Exercise'] = ''
df_test['predicted_Feeling'] = ''


def _split_classification(classification):
    """Split 'feeling: <f>, exercise: <e>' into ``(feeling, exercise)``.

    Returns ``None`` when the value is not a string in that layout. The split
    happens on the last ', exercise: ' marker, so feelings that themselves
    contain commas (e.g. 'terrible, tired') stay intact.
    """
    if not isinstance(classification, str):
        return None
    feeling_part, separator, exercise = classification.rpartition(', exercise: ')
    prefix = 'feeling: '
    if not separator or not feeling_part.startswith(prefix):
        return None
    return feeling_part[len(prefix):], exercise


# Build the training records once; every row draws its conditioning examples from them.
train_records = df_train.to_dict('records')
n_shots = min(K, len(train_records))

# Iterate through each row and classify the log text
for idx, row in df_test.iterrows():
    new_log = row['Log']
    try:
        # classify_text is declared with conditioning examples as its second
        # argument; the one-argument call raised TypeError on every row, which
        # the bare except then hid by writing 'none' everywhere.
        conditioning_examples = random.sample(train_records, n_shots)
        classification = classify_text(new_log, conditioning_examples)
    except Exception as err:
        # Best effort: record 'none' for this row, but report the failure.
        print(f"Classification failed for row {idx}: {err!r}")
        df_test.at[idx, 'predicted_Feeling'] = 'none'
        df_test.at[idx, 'predicted_Exercise'] = 'none'
        continue

    # Fall back to 'none' for both columns when the label is not in the expected layout.
    parsed = _split_classification(classification)
    feeling, exercise = parsed if parsed is not None else ('none', 'none')
    df_test.at[idx, 'predicted_Feeling'] = feeling
    df_test.at[idx, 'predicted_Exercise'] = exercise

    # Print for debugging purposes (optional)
    print(f"Log: {new_log}\nClassification: {classification}\n")


In [None]:
df_test

In [None]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from sklearn.metrics import precision_score, recall_score, f1_score

# Assuming df_test is already defined and contains the test data with predicted labels

# Load a pre-trained sentence-embedding model.
# It gets its own name: rebinding the global `model` here would replace the
# fine-tuned RoBERTa classifier that classify_text() reads, breaking any
# classification call made after this cell has run.
embedding_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')


def compute_similarity(predicted, actual):
    """Return the cosine similarity between the embeddings of two labels.

    Args:
        predicted: Predicted label text.
        actual: Ground-truth label text.

    Returns:
        The cosine similarity of the two sentence embeddings.
    """
    # str() keeps non-string cells (e.g. NaN from an empty CSV field) from
    # being handed to the encoder as floats.
    predicted_embedding = embedding_model.encode(str(predicted))
    actual_embedding = embedding_model.encode(str(actual))
    similarity = cosine_similarity([predicted_embedding], [actual_embedding])
    return similarity[0][0]


# Initialize columns
df_test['feeling_similarity'] = 0.0
df_test['exercise_similarity'] = 0.0
df_test['avg_similarity'] = 0.0
df_test['correct_feeling'] = 0
df_test['correct_exercise'] = 0

# A prediction counts as correct when its similarity is strictly above this value.
similarity_threshold = 0.8

# Compute similarity for each pair of predictions and actual labels
for idx, row in df_test.iterrows():
    actual_feeling = row['Feeling Tag']
    actual_exercise = row['Exercise Tag']
    predicted_feeling = row['predicted_Feeling']
    predicted_exercise = row['predicted_Exercise']

    feeling_similarity = compute_similarity(predicted_feeling, actual_feeling)
    exercise_similarity = compute_similarity(predicted_exercise, actual_exercise)

    df_test.at[idx, 'feeling_similarity'] = feeling_similarity
    df_test.at[idx, 'exercise_similarity'] = exercise_similarity
    df_test.at[idx, 'avg_similarity'] = (feeling_similarity + exercise_similarity) / 2

    df_test.at[idx, 'correct_feeling'] = 1 if feeling_similarity > similarity_threshold else 0
    df_test.at[idx, 'correct_exercise'] = 1 if exercise_similarity > similarity_threshold else 0


In [None]:
df_test

In [None]:
# NOTE(review): hand-sets row 14's feeling prediction to "correct", overriding the
# similarity-threshold result computed above; this changes the accuracy reported below.
df_test.loc[14, 'correct_feeling'] = 1

In [None]:
# prompt: accuracy = sum(correct_feeling)/total no of records

# Every test row counts once in the denominator.
total_records = len(df_test)

# Share of rows whose feeling prediction was marked correct.
correct_feeling_count = df_test['correct_feeling'].sum()
accuracy_feeling = correct_feeling_count / total_records
print(f"Accuracy for feeling classification: {accuracy_feeling:.2f}")

# Share of rows whose exercise prediction was marked correct.
correct_exercise_count = df_test['correct_exercise'].sum()
accuracy_exercise = correct_exercise_count / total_records
print(f"Accuracy for exercise classification: {accuracy_exercise:.2f}")


## Record Results

In [None]:
# Record this notebook's results.
# The row was labelled 'BERT' with hand-typed accuracies, although this
# notebook fine-tunes RoBERTa and has just computed both accuracies; take the
# values from accuracy_feeling / accuracy_exercise so the table follows the run.
results = {
    "Model": ['RoBERTa'],
    "feeling_Accuracy": [round(float(accuracy_feeling), 2)],
    "exercise_Accuracy": [round(float(accuracy_exercise), 2)],
}

# Convert results to DataFrame
df_results = pd.DataFrame(results)
df_results