# BERT (Bidirectional Encoder Representations from Transformers) Model

In [None]:
pip install datasets

In [None]:
pip install langchain_community

In [None]:
!pip install sentence_transformers

## Load train data

In [None]:
# Mount Google Drive at /content/drive so the CSV files under that path can be read.

from google.colab import drive
drive.mount('/content/drive')


In [None]:
import pandas as pd

# Training CSV on the mounted Drive (earlier candidate kept for reference)
#path = '/content/drive/MyDrive//pcems/trainMajorityVoteData1.csv'
path = '/content/drive/MyDrive//pcems/data.csv'
df_train = pd.read_csv(path)

# One dict per row, keyed by column name
train_data = df_train.to_dict(orient='records')

# Sanity check: print the first five records
preview_count = 5
for record in train_data[:preview_count]:
    print(record)


In [None]:
# Preview the first rows of the training frame.
df_train.head()

In [None]:
# (rows, columns) of the training frame.
df_train.shape

## Define Tokenizer

## BERT (Bidirectional Encoder Representations from Transformers) Model

In [None]:
from transformers import BertForSequenceClassification, BertTokenizer

# Checkpoint shared by the tokenizer and the sequence classifier
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name)


In [None]:
import pandas as pd
import torch
from transformers import BertForSequenceClassification, BertTokenizer
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

# Build one record per training row (df_train must already be loaded):
# the prompt given to the model and the combined feeling/exercise target string.
formatted_data = []
for sentence, feeling, exercise in zip(
    df_train['sentence'], df_train['feeling'], df_train['exercise']
):
    formatted_data.append({
        "input_text": f"classify: {sentence}",
        "target_text": f"feeling: {feeling}, exercise: {exercise}",
    })

# Split the records into parallel lists of inputs and targets
input_texts = [record["input_text"] for record in formatted_data]
target_texts = [record["target_text"] for record in formatted_data]

# Map each distinct target string to an integer class id
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(target_texts)

# Custom Dataset class for BERT
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for BERT fine-tuning.

    Args:
        texts: Sequence of input strings.
        labels: Sequence of integer class ids, aligned index-by-index with ``texts``.
        tokenizer: Callable Hugging Face tokenizer (e.g. ``BertTokenizer``).
        max_len: Each item is padded or truncated to exactly this many tokens.

    Raises:
        ValueError: If ``texts`` and ``labels`` have different lengths.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        # Fail early: a length mismatch would otherwise surface as an IndexError
        # mid-training, or silently ignore surplus labels.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(texts)} and {len(labels)})"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, index):
        """Return the tokenized example at ``index``.

        Returns:
            dict with keys ``'text'`` (the raw string), ``'input_ids'`` and
            ``'attention_mask'`` (flattened tensors from the tokenizer), and
            ``'label'`` (a ``torch.long`` scalar tensor).
        """
        text = self.texts[index]
        label = self.labels[index]

        # Call the tokenizer directly; `encode_plus` is marked deprecated in the
        # Transformers documentation in favour of `__call__`, which accepts the
        # same keyword arguments.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'text': text,
            # The tokenizer returns a leading batch axis; flatten it away so the
            # DataLoader can stack items into a batch itself.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

# Classifier head sized to the number of distinct target strings, plus its tokenizer
num_classes = len(label_encoder.classes_)
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_classes)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Wrap the training examples and serve them in shuffled mini-batches
dataset = CustomDataset(texts=input_texts, labels=labels, tokenizer=tokenizer, max_len=128)
dataloader = DataLoader(dataset, shuffle=True, batch_size=8)

# Mean training loss of every epoch, in order
loss_values = []

# Optimiser over all model parameters; the model is switched to training mode
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
model.train()

for epoch in range(70):  # Adjust the number of epochs for fine-tuning
    batch_losses = []
    for batch in dataloader:
        optimizer.zero_grad()
        # Passing `labels` makes the model return the loss alongside the logits
        loss = model(
            input_ids=batch['input_ids'],
            attention_mask=batch['attention_mask'],
            labels=batch['label'],
        ).loss
        loss.backward()
        optimizer.step()
        batch_losses.append(loss.item())

    # Average over the number of batches in the epoch
    avg_loss = sum(batch_losses) / len(dataloader)
    loss_values.append(avg_loss)
    print(f"Epoch {epoch + 1}, Loss: {avg_loss}")

# Plot the loss curve
# The x-axis and title are derived from the recorded losses, so the figure stays
# correct (no x/y length mismatch) when the number of training epochs changes.
epochs_run = len(loss_values)
plt.figure(figsize=(10, 6))
plt.plot(range(1, epochs_run + 1), loss_values, label='Training Loss', color='blue', marker='o')
plt.title(f'Loss Curve Over {epochs_run} Epochs')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.grid(True)
plt.legend()
plt.show()
# Function to classify new text using the fine-tuned model

def classify_text(text, max_len=128):
    """Predict the combined feeling/exercise label for one sentence.

    Reads the module-level ``model``, ``tokenizer`` and ``label_encoder``.
    The text is wrapped in the same ``"classify: "`` prompt used for the
    training inputs.

    Args:
        text: Sentence to classify.
        max_len: Token length the input is padded/truncated to (default 128,
            the value used when building the training dataset).

    Returns:
        The label string that ``label_encoder`` maps the predicted class id to.
    """
    model.eval()
    # Run on whatever device the model currently lives on, so the function keeps
    # working if the model is moved to a GPU.
    device = next(model.parameters()).device

    # Call the tokenizer directly; `encode_plus` is marked deprecated in the
    # Transformers documentation in favour of `__call__`.
    encoding = tokenizer(
        f"classify: {text}",
        add_special_tokens=True,
        max_length=max_len,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    # Inference only: no gradients needed
    with torch.no_grad():
        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
    predicted_class_id = torch.argmax(logits, dim=-1).item()

    return label_encoder.inverse_transform([predicted_class_id])[0]

# Smoke-test the classifier on the first five training sentences
for new_log in df_train['sentence'].head(5):
    classification = classify_text(new_log)
    print(f"Log: {new_log}\nClassification: {classification}\n")


## Train Data

## Load Test Data

In [None]:
# Test CSV on the mounted Drive (earlier candidates kept for reference)
#path = '/content/drive/MyDrive//pcems/testMajorityVoteData1.csv'
#path = '/content/drive/MyDrive//pcems/strat_test_set.csv'
path = '/content/drive/MyDrive//pcems/data_27.csv'

# The file is decoded as latin-1
df_test = pd.read_csv(path, encoding='latin-1')

# One dict per row, keyed by column name
test_data = df_test.to_dict(orient='records')

# Sanity check: print the first five records
preview_count = 5
for record in test_data[:preview_count]:
    print(record)

In [None]:
# Classify the first five test sentences as a quick check
for new_log in df_test['sentence'].head(5):
    classification = classify_text(new_log)
    print(f"sentence: {new_log}\nClassification: {classification}\n")


In [None]:
# Preview the first rows of the test frame.
df_test.head()

In [None]:
# Classify the first twenty test records, one output line per record
for new_log in (sample['sentence'] for sample in test_data[:20]):
  classification = classify_text(new_log)
  print(f"sentence: {new_log} --- Classification: {classification}")

In [None]:
# Create new columns for predicted exercise and feeling
df_test['predicted_Exercise'] = ''
df_test['predicted_Feeling'] = ''

# Classify every test sentence and split the predicted label, expected in the
# form 'feeling: <feeling>, exercise: <exercise>', into the two columns above.
# Rows that cannot be classified or parsed get 'none' in both columns.
for idx, row in df_test.iterrows():
    new_log = row['sentence']
    try:
        classification = classify_text(new_log)

        # Check if classify_text returns a string and extract relevant information
        if isinstance(classification, str) and ',' in classification:
            parts = classification.split(',')
            for part in parts:
                # Split on the first ': ' only, so a value that itself contains
                # ': ' does not break the two-way unpacking.
                key, value = part.strip().split(': ', 1)
                if key == 'feeling':
                    df_test.at[idx, 'predicted_Feeling'] = value
                elif key == 'exercise':
                    df_test.at[idx, 'predicted_Exercise'] = value
        else:
            # Set 'none' for both columns if classification format is not as expected
            df_test.at[idx, 'predicted_Feeling'] = 'none'
            df_test.at[idx, 'predicted_Exercise'] = 'none'
    except Exception as exc:
        # Catch Exception rather than using a bare `except:` so that
        # KeyboardInterrupt/SystemExit still stop this long loop, and report
        # the cause instead of hiding it.
        df_test.at[idx, 'predicted_Feeling'] = 'none'
        df_test.at[idx, 'predicted_Exercise'] = 'none'
        print(f"Error: {exc} for sentence: {new_log}")
        continue

    # Print for debugging purposes (optional)
    print(f"sentence: {new_log}\nClassification: {classification}\n")


In [None]:
# Start from empty prediction columns
df_test['predicted_Exercise'] = ''
df_test['predicted_Feeling'] = ''

# Label key -> column that receives its value
column_for_key = {'feeling': 'predicted_Feeling', 'exercise': 'predicted_Exercise'}

# Classify each test sentence and unpack the predicted label into the two columns
for row_label, record in df_test.iterrows():
    new_log = record['sentence']
    try:
        classification = classify_text(new_log)
        looks_valid = isinstance(classification, str) and ',' in classification

        if not looks_valid:
            # Unexpected format: mark both predictions as 'none'
            df_test.at[row_label, 'predicted_Feeling'] = 'none'
            df_test.at[row_label, 'predicted_Exercise'] = 'none'
        else:
            for raw_part in classification.split(','):
                # Drop surrounding whitespace and any trailing period
                part = raw_part.strip().rstrip('.')
                if ': ' not in part:
                    continue
                key, value = (piece.strip() for piece in part.split(': ', 1))
                target_column = column_for_key.get(key)
                if target_column is not None:
                    df_test.at[row_label, target_column] = value

    except Exception as e:
        # Any failure: mark both predictions as 'none', report it, move on
        df_test.at[row_label, 'predicted_Feeling'] = 'none'
        df_test.at[row_label, 'predicted_Exercise'] = 'none'
        print(f"Error: {e} for sentence: {new_log}")
        continue

    # Per-row trace of what was predicted
    print(f"sentence: {new_log}\nClassification: {classification}\n")


In [None]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from sklearn.metrics import precision_score, recall_score, f1_score

# Assuming df_test is already defined and contains the test data with predicted labels

# Load a pre-trained model for embeddings.
# It gets its own name on purpose: binding it to `model` would replace the
# fine-tuned BERT classifier that classify_text() reads from the global `model`,
# so any classify_text() call made after this cell would use the wrong network.
similarity_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Define a function to compute similarity
def compute_similarity(predicted, actual):
    """Return the cosine similarity between the embeddings of two labels.

    Args:
        predicted: Predicted label text.
        actual: Ground-truth label text.

    Returns:
        The single similarity value taken from the 1x1 matrix produced by
        ``cosine_similarity``.
    """
    # Labels read from a CSV may be NaN or non-string; normalise both to text
    # (missing -> '') before handing them to the encoder.
    predicted_text, actual_text = (
        '' if pd.isna(label) else str(label) for label in (predicted, actual)
    )
    predicted_embedding = similarity_model.encode(predicted_text)
    actual_embedding = similarity_model.encode(actual_text)
    similarity = cosine_similarity([predicted_embedding], [actual_embedding])
    return similarity[0][0]

# Initialize columns
df_test['feeling_similarity'] = 0.0
df_test['exercise_similarity'] = 0.0
df_test['avg_similarity'] = 0.0
df_test['correct_feeling'] = 0
df_test['correct_exercise'] = 0

# A prediction counts as correct when its similarity to the true label exceeds this
similarity_threshold = 0.8

# Compute similarity for each pair of predictions and actual labels
for idx, row in df_test.iterrows():
    actual_feeling = row['feeling']
    actual_exercise = row['exercise']
    predicted_feeling = row['predicted_Feeling']
    predicted_exercise = row['predicted_Exercise']

    feeling_similarity = compute_similarity(predicted_feeling, actual_feeling)
    exercise_similarity = compute_similarity(predicted_exercise, actual_exercise)

    df_test.at[idx, 'feeling_similarity'] = feeling_similarity
    df_test.at[idx, 'exercise_similarity'] = exercise_similarity
    df_test.at[idx, 'avg_similarity'] = (feeling_similarity + exercise_similarity) / 2

    df_test.at[idx, 'correct_feeling'] = 1 if feeling_similarity > similarity_threshold else 0
    df_test.at[idx, 'correct_exercise'] = 1 if exercise_similarity > similarity_threshold else 0

In [None]:
# Display the test frame, now including the prediction and similarity columns.
df_test

In [None]:
# Accuracy = number of rows flagged correct / total number of test rows
n_records = len(df_test)

accuracy_feeling = df_test['correct_feeling'].sum() / n_records
print(f"Accuracy for feeling classification: {accuracy_feeling:.2f}")

accuracy_exercise = df_test['correct_exercise'].sum() / n_records
print(f"Accuracy for exercise classification: {accuracy_exercise:.2f}")


Accuracy for feeling classification: 0.69
Accuracy for exercise classification: 0.61
