In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch
import re

# Step 1: Load and preprocess the data
file_path = "/content/Avi_Anxietyhelp_with_labels.csv"  # Update with actual Colab path

# Rows without a comment or a gold label cannot be prompted or scored, so they
# are dropped before the label column is cast to int.
# NOTE(review): 'Author Reply' is not part of the dropna subset, so a missing
# reply would be formatted into the prompt as the text "nan" -- confirm whether
# such rows exist in the CSV.
df = (
    pd.read_csv(file_path)
    .dropna(subset=['User Comment', 'Label'])
    .astype({'Label': int})
)

# Stratified 80/20 split: train and test keep the same 0/1 label proportions as
# the full frame (the class ratio is preserved; the counts are not forced equal).
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['Label'])

# Load the instruction-tuned Mistral 7B model and its tokenizer.
# The previous line was a malformed paste (a nested assignment inside the string
# literal) and did not parse. The repo id below carries the version suffix used
# by the published Mistral instruct checkpoints.
# NOTE(review): the checkpoint may be gated -- authenticate with the Hub before
# running this cell if the download is refused.
model_name = "mistralai/Mistral-7B-Instruct-v0.2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# NOTE(review): the recorded run of this cell printed "Device set to use cpu"
# and was interrupted; a GPU runtime is presumably needed for a 7B model in
# float16 -- confirm the Colab runtime type.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
)

# Define the inference pipeline
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

def generate_prompt(user_comment, author_reply):
    """Build the classification prompt for one comment/reply pair.

    The prompt is a single instruction followed by the two texts and a
    trailing "Label: " cue for the model to complete. It contains no worked
    examples.

    Args:
        user_comment: Text of the user's comment; formatted with an f-string.
        author_reply: Text of the author's reply; formatted with an f-string.

    Returns:
        str: The four-part prompt, ending in "Label: " with no final newline.
    """
    segments = (
        "Determine if the following user comment helped the author based on the author's reply.\n",
        f"User Comment: {user_comment}\n",
        f"Author Reply: {author_reply}\n",
        "Label: ",
    )
    return "".join(segments)

def get_model_prediction(user_comment, author_reply):
    """Ask the text-generation pipeline to label one comment/reply pair.

    Args:
        user_comment: Text of the user's comment.
        author_reply: Text of the author's reply.

    Returns:
        int: The first standalone ``0`` or ``1`` found in the model's
        continuation, or ``0`` when the continuation contains neither.
    """
    prompt = generate_prompt(user_comment, author_reply)

    # return_full_text=False: by default the pipeline returns prompt + continuation,
    # so the regex below could match a stray "0"/"1" inside the comment or reply
    # instead of the model's answer.
    # do_sample=False: greedy decoding keeps the predicted labels, and therefore
    # the reported metrics, reproducible across runs.
    output = pipe(
        prompt,
        max_new_tokens=50,
        truncation=True,
        do_sample=False,
        return_full_text=False,
    )
    generated_text = output[0]['generated_text']

    # Defensive: if the prompt is still echoed, drop it before parsing.
    if generated_text.startswith(prompt):
        generated_text = generated_text[len(prompt):]
    generated_text = generated_text.strip()

    # Extract the first standalone 0 or 1 from the continuation.
    match = re.search(r'\b[01]\b', generated_text)

    # NOTE: an unparseable answer is counted as label 0, which is
    # indistinguishable from a genuine 0 in the saved predictions.
    return int(match.group()) if match else 0

def _predict_labels(frame):
    """Return one predicted label per row of `frame`, aligned on its index."""
    return frame.apply(
        lambda row: get_model_prediction(row["User Comment"], row["Author Reply"]),
        axis=1,
    )


# Predict labels for the training dataset.
# .assign builds a new frame instead of writing a column into the object
# returned by train_test_split, which avoids pandas' SettingWithCopyWarning.
train_df = train_df.assign(**{"Predicted Label": _predict_labels(train_df)})

# Predict labels for the test dataset
test_df = test_df.assign(**{"Predicted Label": _predict_labels(test_df)})

# Save predictions (all original columns plus "Predicted Label", no index column)
train_df.to_csv("train_predictions_with_labels.csv", index=False)
test_df.to_csv("test_predictions_with_labels.csv", index=False)

# Score the test-set predictions against the gold labels
y_true = test_df['Label']
y_pred = test_df['Predicted Label']

accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
report = classification_report(y_true, y_pred)

# Summary metrics, one per line, followed by the per-class report
print(
    f"Accuracy: {accuracy * 100:.2f}%",
    f"Precision: {precision:.2f}",
    f"Recall: {recall:.2f}",
    f"F1 Score: {f1:.2f}",
    "Classification Report:",
    report,
    sep="\n",
)

# Row counts of the two frames that were written to the prediction CSVs
print(f"Total rows in training file: {len(train_df)}")
print(f"Total rows in test file: {len(test_df)}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

Device set to use cpu
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


KeyboardInterrupt: 

In [4]:


import getpass
import os

from huggingface_hub import login

# SECURITY: an access token used to be written literally in this cell. Treat
# that token as compromised and revoke it at https://huggingface.co/settings/tokens.
# The token is now read from the HF_TOKEN environment variable; when that is
# unset or empty, it is requested interactively without being echoed or stored
# in the notebook.
login(token=os.environ.get("HF_TOKEN") or getpass.getpass("Hugging Face token: "))