<a href="https://colab.research.google.com/github/Yusunkim4448/570finalProject/blob/main/projectF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Install the necessary libraries
import os
os.environ["WANDB_MODE"] = "disabled"

!pip install transformers torch sentence-transformers datasets

import torch
from sentence_transformers import SentenceTransformer, SentencesDataset, InputExample, losses
from datasets import Dataset  # Import Dataset from Hugging Face's datasets library
import random
import numpy as np

# Define sentences for training
sports_sentences = [
    "The striker scored a stunning goal in the last minute.",
    "The quarterback threw an interception in the final quarter.",
    "A thrilling comeback saw the team winning by two points.",
    "The team displayed great defense and controlled the pace of the game.",
]

# Define pairs of sentences for contrastive learning
examples = [InputExample(texts=[sports_sentences[i], sports_sentences[i+1]]) for i in range(0, len(sports_sentences)-1)]

# Initialize the model for sentence embeddings
simcse_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Create SentencesDataset and DataLoader for training
train_dataset = SentencesDataset(examples, simcse_model)
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=2, shuffle=True)

# Define the contrastive learning loss
train_loss = losses.MultipleNegativesRankingLoss(simcse_model)

# Train the model
simcse_model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, warmup_steps=100)






Step,Training Loss


In [3]:
# Custom function to create slight paraphrases for DiffCSE
def augment_sentence(sentence):
    return sentence.replace("the", "a")  # Simple augmentation example

diffcse_examples = [
    InputExample(texts=[sentence, augment_sentence(sentence)]) for sentence in sports_sentences
]

# Train DiffCSE using augmented sentences
diffcse_dataloader = torch.utils.data.DataLoader(diffcse_examples, batch_size=2, shuffle=True)
diffcse_model = SentenceTransformer('bert-base-uncased')
diffcse_loss = losses.MultipleNegativesRankingLoss(diffcse_model)

# Training the DiffCSE model
diffcse_model.fit(train_objectives=[(diffcse_dataloader, diffcse_loss)], epochs=1, warmup_steps=100)




Step,Training Loss


In [4]:
# AI feedback integration: Adjust sentence pairs based on AI model suggestions
# Dummy function for AI feedback to simulate similarity check
def ai_feedback(sentence1, sentence2):
    # Simple example: if sentence similarity is high, keep; if low, adjust (in practice, use feedback system)
    return random.choice([True, False])

# Filter and augment sentence pairs based on AI feedback
feedback_examples = [
    InputExample(texts=[sentence1, sentence2]) for sentence1, sentence2 in zip(sports_sentences[:-1], sports_sentences[1:])
    if ai_feedback(sentence1, sentence2)
]

# Fine-tune SimCSE with AI feedback on sports domain
feedback_dataloader = torch.utils.data.DataLoader(feedback_examples, batch_size=2, shuffle=True)
feedback_loss = losses.MultipleNegativesRankingLoss(simcse_model)  # Reusing SimCSE model

simcse_model.fit(train_objectives=[(feedback_dataloader, feedback_loss)], epochs=1, warmup_steps=100)


Step,Training Loss


In [5]:
# Evaluation function to measure embedding quality using cosine similarity
def evaluate_embeddings(model, sentence_pairs):
    embeddings = model.encode([pair.texts for pair in sentence_pairs], convert_to_tensor=True)
    similarities = []
    for i in range(len(embeddings)-1):
        similarity = torch.nn.functional.cosine_similarity(embeddings[i], embeddings[i+1], dim=0)
        similarities.append(similarity.item())
    return similarities

# Prepare sample evaluation data
evaluation_data = [
    InputExample(texts=["The striker scored", "The goalkeeper made a save"]),
    InputExample(texts=["The game went to overtime", "It was a close game"]),
]

# Evaluate the baseline SimCSE, DiffCSE, and fine-tuned models
baseline_similarities = evaluate_embeddings(simcse_model, evaluation_data)
diffcse_similarities = evaluate_embeddings(diffcse_model, evaluation_data)

# Print results for model comparisons
print("Baseline SimCSE Similarities:", baseline_similarities)
print("DiffCSE Similarities:", diffcse_similarities)


Baseline SimCSE Similarities: [0.29396378993988037]
DiffCSE Similarities: [0.5759817361831665]


In [11]:
import os
import torch
from sentence_transformers import SentenceTransformer, InputExample, losses
from datasets import Dataset
from transformers import pipeline

# Disable WANDB (Weights and Biases)
os.environ["WANDB_MODE"] = "disabled"

# Install necessary libraries (run this once if required)
# !pip install transformers torch sentence-transformers datasets

# Initialize the sentence transformer model
simcse_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
feedback_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Define the sports sentences
sports_sentences = [
    "The striker scored a stunning goal in the last minute.",
    "The quarterback threw an interception in the final quarter.",
    "A thrilling comeback saw the team winning by two points.",
    "The team displayed great defense and controlled the pace of the game.",
]

# Function to compute cosine similarity between two sentence embeddings
def cosine_similarity(embedding1, embedding2):
    return torch.nn.functional.cosine_similarity(embedding1, embedding2, dim=0).item()

# Function for AI feedback based on similarity threshold (using SimCSE model)
def ai_feedback_refined(sentence1, sentence2, threshold=0.5):  # Lower threshold to 0.5
    # Encode the sentences to get embeddings
    embeddings1 = feedback_model.encode([sentence1], convert_to_tensor=True)[0]  # Get tensor and flatten
    embeddings2 = feedback_model.encode([sentence2], convert_to_tensor=True)[0]  # Get tensor and flatten

    # Compute cosine similarity
    similarity = cosine_similarity(embeddings1, embeddings2)

    # Print similarity to inspect what's happening
    print(f"Refined Similarity between '{sentence1}' and '{sentence2}': {similarity}")

    # Return True if similarity is above the threshold, else False
    return similarity >= threshold

# Load a pre-trained NLI model (BART-large for zero-shot classification)
nli_model = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Function for AI feedback using NLI-based model
def ai_feedback_nli(sentence1, sentence2, threshold=0.5):
    result = nli_model(sentence1, candidate_labels=[sentence2])
    # Use the "score" field to determine similarity
    similarity = result['scores'][0]  # Similarity score between sentence1 and sentence2

    # Print similarity score to inspect what's happening
    print(f"NLI Similarity between '{sentence1}' and '{sentence2}': {similarity}")

    return similarity >= threshold

# Create sentence pairs for contrastive learning (using both feedback methods)
examples = []

# Generate feedback examples based on AI feedback system with both refined and NLI-based feedback
for sentence1, sentence2 in zip(sports_sentences[:-1], sports_sentences[1:]):
    if ai_feedback_refined(sentence1, sentence2):  # Use AI feedback with refined similarity threshold
        examples.append(InputExample(texts=[sentence1, sentence2]))
    elif ai_feedback_nli(sentence1, sentence2):  # Use AI feedback with NLI model
        examples.append(InputExample(texts=[sentence1, sentence2]))

# Check how many valid pairs passed AI feedback
print(f"Number of valid pairs after AI feedback: {len(examples)}")

# If no valid pairs are found, suggest augmenting the dataset or adjusting the threshold
if len(examples) == 0:
    print("No valid pairs after AI feedback. Consider adjusting the threshold or augmenting the dataset.")
else:
    # Initialize the training dataloader and loss
    train_dataloader = torch.utils.data.DataLoader(examples, batch_size=2, shuffle=True)
    train_loss = losses.MultipleNegativesRankingLoss(simcse_model)

    # Train the SimCSE model with AI feedback
    simcse_model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, warmup_steps=100)

    # Evaluation function to measure embedding quality using cosine similarity
    def evaluate_embeddings(model, sentence_pairs):
        embeddings = model.encode([pair.texts for pair in sentence_pairs], convert_to_tensor=True)
        similarities = []
        for i in range(len(embeddings)-1):
            similarity = torch.nn.functional.cosine_similarity(embeddings[i], embeddings[i+1], dim=0)
            similarities.append(similarity.item())
        return similarities

    # Prepare sample evaluation data
    evaluation_data = [
        InputExample(texts=["The striker scored", "The goalkeeper made a save"]),
        InputExample(texts=["The game went to overtime", "It was a close game"]),
    ]

    # Evaluate the fine-tuned model (with AI feedback)
    simcse_similarities = evaluate_embeddings(simcse_model, evaluation_data)

    # Print results
    print("SimCSE Similarities (after AI feedback):", simcse_similarities)


Refined Similarity between 'The striker scored a stunning goal in the last minute.' and 'The quarterback threw an interception in the final quarter.': 0.2924024760723114
NLI Similarity between 'The striker scored a stunning goal in the last minute.' and 'The quarterback threw an interception in the final quarter.': 2.7014577426598407e-05
Refined Similarity between 'The quarterback threw an interception in the final quarter.' and 'A thrilling comeback saw the team winning by two points.': 0.293432354927063
NLI Similarity between 'The quarterback threw an interception in the final quarter.' and 'A thrilling comeback saw the team winning by two points.': 0.0037838383577764034
Refined Similarity between 'A thrilling comeback saw the team winning by two points.' and 'The team displayed great defense and controlled the pace of the game.': 0.478251576423645
NLI Similarity between 'A thrilling comeback saw the team winning by two points.' and 'The team displayed great defense and controlled th

In [12]:
# Assuming you have already collected a larger set of domain-specific sentences and feedback pairs
sports_sentences_expanded = [
    "The striker scored a stunning goal in the last minute.",
    "The quarterback threw an interception in the final quarter.",
    "A thrilling comeback saw the team winning by two points.",
    "The team displayed great defense and controlled the pace of the game.",
    "The goalkeeper made a brilliant save in the final seconds.",
    "The coach's strategy led to a major win.",
    "The running back rushed for over 100 yards in the game.",
    "The defensive line shut down the opposing team's offense."
    # Add more sentences relevant to your sports domain
]

# Use the feedback functions to generate valid training pairs
examples = []

for sentence1, sentence2 in zip(sports_sentences_expanded[:-1], sports_sentences_expanded[1:]):
    if ai_feedback_refined(sentence1, sentence2):  # Use AI feedback with refined similarity threshold
        examples.append(InputExample(texts=[sentence1, sentence2]))
    elif ai_feedback_nli(sentence1, sentence2):  # Use AI feedback with NLI model
        examples.append(InputExample(texts=[sentence1, sentence2]))

# Number of valid pairs after AI feedback
print(f"Number of valid pairs after AI feedback: {len(examples)}")

# If there are valid pairs, proceed with training
if len(examples) > 0:
    train_dataloader = torch.utils.data.DataLoader(examples, batch_size=2, shuffle=True)
    train_loss = losses.MultipleNegativesRankingLoss(simcse_model)

    # Train the SimCSE model with domain-specific data
    simcse_model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, warmup_steps=100)

    print("Training complete!")
else:
    print("No valid pairs for training. Consider adjusting the threshold or dataset.")


Refined Similarity between 'The striker scored a stunning goal in the last minute.' and 'The quarterback threw an interception in the final quarter.': 0.2924024760723114
NLI Similarity between 'The striker scored a stunning goal in the last minute.' and 'The quarterback threw an interception in the final quarter.': 2.7014577426598407e-05
Refined Similarity between 'The quarterback threw an interception in the final quarter.' and 'A thrilling comeback saw the team winning by two points.': 0.293432354927063
NLI Similarity between 'The quarterback threw an interception in the final quarter.' and 'A thrilling comeback saw the team winning by two points.': 0.0037838383577764034
Refined Similarity between 'A thrilling comeback saw the team winning by two points.' and 'The team displayed great defense and controlled the pace of the game.': 0.478251576423645
NLI Similarity between 'A thrilling comeback saw the team winning by two points.' and 'The team displayed great defense and controlled th