In [None]:
import pandas as pd
import numpy as np
import os
import openai
from openai import OpenAI
import torch
import csv
import time
import re
import json
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from tqdm.notebook import tqdm
from typing import List, Dict, Tuple, Optional, Union
from datetime import datetime
import warnings
warnings.filterwarnings("ignore")

In [None]:
"""def set_openai_api_key(api_key: str) -> None:
   def get_openai_api_key() -> str:

    api_key = ""
    config.openai_api_key = api_key
    openai.api_key = api_key
    print("API key set successfully.")

"""

def set_openai_api_key(api_key: str) -> None:
    """Record ``api_key`` as the key to use for subsequent OpenAI calls.

    The key is stored on the shared ``config`` object (which
    ``TransferLearningQA.initialize_openai`` reads when no key is passed
    explicitly) and on the ``openai`` module.

    Args:
        api_key: The OpenAI API key. Surrounding whitespace is stripped;
            ``None`` is treated as an empty key.
    """
    key = (api_key or "").strip()
    # `config` is created in a later cell; the name is only resolved when this
    # function is actually called, by which time that cell has run.
    config.openai_api_key = key
    openai.api_key = key
    print("Client = API key set successfully.")


def get_openai_api_key(api_key: str) -> None:
    """Backward-compatible alias of :func:`set_openai_api_key`.

    Earlier revisions ignored ``api_key`` and built a throwaway client from a
    hard-coded placeholder string; the supplied key is now actually used.

    Args:
        api_key: The OpenAI API key to register.
    """
    set_openai_api_key(api_key)


In [41]:
class Config:
    """Mutable settings shared by every component of the QA pipeline."""

    def __init__(self):
        # Timestamp taken once at construction; it names this session's log file.
        created_at = datetime.now().strftime('%Y%m%d_%H%M%S')

        # --- OpenAI chat model ---
        self.openai_api_key = ""  # left empty here; filled in at runtime
        self.model_name = "gpt-3.5-turbo"

        # --- Sentence-embedding model ---
        self.embedding_model = "all-MiniLM-L6-v2"

        # --- Generation parameters ---
        self.temperature = 0.2
        self.max_tokens = 150

        # --- Retrieval parameters ---
        self.top_k = 5  # upper bound on retrieved entries per query
        self.similarity_threshold = 0.6

        # --- Dataset location and column names ---
        self.csv_path = "data.csv"
        self.question_col = "question"
        self.answer_col = "answer"

        # --- Logging / evaluation ---
        self.log_file = f"qa_logs_{created_at}.txt"
        self.test_size = 0.2
        self.few_shot_examples = 3  # how many examples go into the prompt


config = Config()

In [None]:
def load_csv_data(file_path: str = None) -> pd.DataFrame:
    """Read the Q&A CSV file into a DataFrame.

    Args:
        file_path: Path of the CSV file to read. When omitted or empty,
            ``config.csv_path`` is used instead.

    Returns:
        The DataFrame produced by ``pd.read_csv`` for that path.
    """
    # Honour the caller's path. The previous version passed the dataset path
    # to input() as the *prompt text* and overwrote the argument with whatever
    # was typed, so the value given by the caller was never used.
    if not file_path:
        file_path = config.csv_path

    df = pd.read_csv(file_path)
    print(f"Loaded data with {len(df)} rows and {len(df.columns)} columns.")
    print(f"Columns: {df.columns.tolist()}")
    return df

def preprocess_data(df: pd.DataFrame, question_col: str = None, answer_col: str = None) -> pd.DataFrame:
    """Reduce the raw table to a clean two-column question/answer frame.

    Any column name that is not supplied is chosen interactively by index,
    and the choice is written back to ``config``. The returned frame keeps
    only the two selected columns, without rows containing NaN, with both
    columns cast to ``str`` and with repeated questions removed (first
    occurrence wins).
    """
    must_ask = question_col is None or answer_col is None
    if must_ask:
        # List the columns once, before either prompt.
        print("Available columns:")
        for position, column in enumerate(df.columns):
            print(f"{position}: {column}")

    if question_col is None:
        question_col = df.columns[int(input("Enter the index of the question column: "))]
        config.question_col = question_col

    if answer_col is None:
        answer_col = df.columns[int(input("Enter the index of the answer column: "))]
        config.answer_col = answer_col

    cleaned = (
        df[[question_col, answer_col]]
        .dropna()                                # discard rows missing either field
        .astype(str)                             # both remaining columns become strings
        .drop_duplicates(subset=[question_col])  # one row per distinct question
    )

    print(f"After preprocessing: {len(cleaned)} rows")
    return cleaned

In [None]:
class EmbeddingService:
    """Sentence-embedding helper with an in-memory cache and similarity search."""

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """Initialize the embedding service with a pretrained model.

        Args:
            model_name: Name passed to ``SentenceTransformer``.
        """
        self.model = SentenceTransformer(model_name)
        # Maps text -> embedding so each distinct text is encoded only once.
        self.cache = {}

    def get_embedding(self, text: str) -> np.ndarray:
        """Return the embedding for ``text``, encoding it on a cache miss."""
        if text in self.cache:
            return self.cache[text]

        embedding = self.model.encode(text)
        self.cache[text] = embedding
        return embedding

    def batch_embed(self, texts: List[str], batch_size: int = 32) -> List[np.ndarray]:
        """Generate embeddings for a list of texts in batches.

        Args:
            texts: Texts to embed; repeated texts are encoded only once.
            batch_size: Number of texts handed to the model per step.

        Returns:
            One embedding per input text, in input order.
        """
        embeddings = []
        for start in tqdm(range(0, len(texts), batch_size), desc="Generating embeddings"):
            batch = texts[start:start + batch_size]

            # Encode only texts that are not cached yet. dict.fromkeys drops
            # repeats inside the batch while preserving first-seen order, so a
            # text occurring twice is not encoded twice.
            missing = list(dict.fromkeys(text for text in batch if text not in self.cache))
            if missing:
                for text, vector in zip(missing, self.model.encode(missing)):
                    self.cache[text] = vector

            # Every text in the batch is now present in the cache.
            embeddings.extend(self.cache[text] for text in batch)

        return embeddings

    def find_similar(
        self,
        query: str,
        corpus: List[str],
        corpus_embeddings: List[np.ndarray],
        top_k: Optional[int] = None,
        threshold: Optional[float] = None,
    ) -> List[Tuple[int, float]]:
        """Find the corpus items most similar to the query.

        Args:
            query: Text to search for.
            corpus: Corpus texts. Not read here; kept for interface
                compatibility with existing callers.
            corpus_embeddings: One embedding per corpus item.
            top_k: Maximum number of results; defaults to ``config.top_k``.
            threshold: Minimum cosine similarity a result must reach;
                defaults to ``config.similarity_threshold``.

        Returns:
            ``(index, similarity)`` pairs sorted by descending similarity,
            where ``index`` is the position inside ``corpus_embeddings``.
            Ties keep their original corpus order.
        """
        if len(corpus_embeddings) == 0:
            return []

        top_k = config.top_k if top_k is None else top_k
        threshold = config.similarity_threshold if threshold is None else threshold

        # One (1, dim) x (n, dim) call instead of n separate 1x1 computations.
        query_matrix = np.asarray(self.get_embedding(query)).reshape(1, -1)
        corpus_matrix = np.vstack(corpus_embeddings)
        scores = cosine_similarity(query_matrix, corpus_matrix)[0]

        # Stable sort on the negated scores gives descending order while
        # leaving equal scores in corpus order.
        ranked = np.argsort(-scores, kind="stable")
        hits = [(int(i), float(scores[i])) for i in ranked if scores[i] >= threshold]
        return hits[:top_k]

In [None]:
class OpenAIService:
    """Thin wrapper around the OpenAI chat-completions client."""

    def __init__(self, api_key: str, model_name: str = "gpt-3.5-turbo"):
        """Initialize the OpenAI service with API key and model name.

        Args:
            api_key: Key used to authenticate the client.
            model_name: Chat model requested on every call.
        """
        self.model = model_name
        # Single client built from the supplied key. The previous version also
        # built and discarded a second client from a hard-coded placeholder.
        self.client = OpenAI(api_key=api_key)

    def call_api(self, messages: List[Dict], temperature: float = 0.2, max_tokens: int = 150) -> str:
        """Call the chat-completions endpoint, retrying on failure.

        Args:
            messages: Chat messages in the OpenAI ``role``/``content`` format.
            temperature: Sampling temperature forwarded to the API.
            max_tokens: Completion length limit forwarded to the API.

        Returns:
            The stripped text of the first choice, or a fixed apology string
            when all three attempts fail. Exceptions are reported with
            ``print`` rather than raised.
        """
        max_retries = 3

        for attempt in range(max_retries):
            try:
                response = self.client.chat.completions.create(
                    model=self.model,
                    messages=messages,
                    temperature=temperature,
                    max_tokens=max_tokens
                )
                # `content` is optional in the API response; treat a missing
                # value as empty text instead of failing on None.strip() and
                # burning the remaining retries on the same response shape.
                content = response.choices[0].message.content
                return (content or "").strip()
            except openai.RateLimitError:
                # Exponential backoff: 1s, 2s, 4s.
                wait_time = 2 ** attempt
                print(f"Rate limit exceeded. Waiting {wait_time} seconds...")
                time.sleep(wait_time)
            except Exception as e:
                print(f"Error calling OpenAI API: {e}")
                time.sleep(1)

        return "I'm sorry, but I couldn't generate a response due to API errors."

    def generate_answer(self, question: str, context: List[Dict[str, str]], few_shot_examples: List[Dict[str, str]] = None) -> str:
        """Generate an answer based on the question and retrieved context.

        Args:
            question: The user's question.
            context: Retrieved entries; each must provide ``question`` and
                ``answer`` keys.
            few_shot_examples: Optional example entries with the same keys,
                placed ahead of the context in the prompt.

        Returns:
            The text returned by :meth:`call_api`.
        """
        context_str = "\n\n".join(
            f"Question: {item['question']}\nAnswer: {item['answer']}" for item in context
        )

        few_shot_str = ""
        if few_shot_examples:
            examples = "\n\n".join(
                f"Question: {ex['question']}\nAnswer: {ex['answer']}" for ex in few_shot_examples
            )
            few_shot_str = "Here are some examples of how to answer questions:\n\n" + examples + "\n\n"

        messages = [
            {"role": "system", "content": "You are an assistant that answers questions based on the provided context. Use the context information to formulate accurate, concise answers. If the context doesn't contain relevant information, say 'I don't have enough information to answer this question.'"},
            {"role": "user", "content": f"{few_shot_str}Context information:\n{context_str}\n\nQuestion: {question}\n\nAnswer:"}
        ]

        # Forward both generation settings from config; max_tokens was
        # previously left at call_api's default, so changing
        # config.max_tokens had no effect.
        return self.call_api(messages, temperature=config.temperature, max_tokens=config.max_tokens)

In [59]:
class TransferLearningQA:
    """Retrieval-augmented question answering over a question/answer table.

    Stored questions are embedded once at construction. Each query retrieves
    the most similar stored questions and passes them, optionally with a few
    random training examples, to the OpenAI chat model.

    ``evaluate`` and ``show_statistics`` are defined inside the class body.
    They previously sat in a separate notebook cell at method indentation,
    which cannot be executed on its own and left the class without them.
    """

    def __init__(self, df: pd.DataFrame = None):
        """Initialize the QA system with data and services.

        Args:
            df: Preprocessed frame containing the columns named by
                ``config.question_col`` and ``config.answer_col``. When
                omitted, the CSV at ``config.csv_path`` is loaded and
                preprocessed.
        """
        if df is None:
            df = load_csv_data(config.csv_path)
            df = preprocess_data(df, config.question_col, config.answer_col)

        self.df = df
        self.question_col = config.question_col
        self.answer_col = config.answer_col

        self.embedding_service = EmbeddingService(config.embedding_model)
        self.openai_service = None  # created lazily once an API key is available

        print("Generating embeddings for all questions...")
        self.questions = df[self.question_col].tolist()
        self.answers = df[self.answer_col].tolist()
        self.question_embeddings = self.embedding_service.batch_embed(self.questions)

        self._prepare_train_test()

        # One entry per answered query; read by show_statistics().
        self.query_log = []

    def _prepare_train_test(self):
        """Split row positions into train and test sets for evaluation."""
        indices = list(range(len(self.df)))
        train_indices, test_indices = train_test_split(indices, test_size=config.test_size, random_state=42)

        self.train_indices = train_indices
        self.test_indices = test_indices

        print(f"Data split: {len(train_indices)} training examples, {len(test_indices)} test examples")

    def initialize_openai(self, api_key: str = None):
        """Initialize the OpenAI service with the provided API key.

        Args:
            api_key: Key to use. When omitted, ``config.openai_api_key`` is
                used; if that is empty too, the key is requested
                interactively and stored on ``config``.
        """
        if api_key is None:
            api_key = config.openai_api_key
            if not api_key:
                # getpass keeps the typed secret out of the saved cell output,
                # which input() would echo.
                import getpass
                api_key = getpass.getpass("Please enter your OpenAI API key: ")
                config.openai_api_key = api_key

        self.openai_service = OpenAIService(api_key, config.model_name)
        print(f"OpenAI service initialized with model: {config.model_name}")

    def get_few_shot_examples(self) -> List[Dict[str, str]]:
        """Return random training rows to use as few-shot examples.

        Returns:
            Up to ``config.few_shot_examples`` dicts with ``question`` and
            ``answer`` keys, sampled without replacement from the training
            rows; an empty list when nothing can be sampled.
        """
        count = min(config.few_shot_examples, len(self.train_indices))
        if count <= 0:
            return []

        sample_indices = np.random.choice(self.train_indices, count, replace=False)
        return [
            {"question": self.questions[idx], "answer": self.answers[idx]}
            for idx in sample_indices
        ]

    def _retrieve_context(self, question: str, corpus_indices: Optional[List[int]] = None) -> List[Dict]:
        """Return the stored Q/A entries most similar to ``question``.

        Args:
            question: Query text.
            corpus_indices: Row positions retrieval may draw from; ``None``
                searches every stored question.
        """
        if corpus_indices is None:
            row_ids = None
            corpus = self.questions
            embeddings = self.question_embeddings
        else:
            row_ids = [int(i) for i in corpus_indices]
            corpus = [self.questions[i] for i in row_ids]
            embeddings = [self.question_embeddings[i] for i in row_ids]

        hits = self.embedding_service.find_similar(question, corpus, embeddings)

        context = []
        for position, similarity in hits:
            # find_similar reports positions inside the list it was given;
            # translate them back to row positions when a subset was searched.
            idx = position if row_ids is None else row_ids[position]
            context.append({
                "question": self.questions[idx],
                "answer": self.answers[idx],
                "similarity": similarity
            })
        return context

    def answer_question(self, question: str, use_few_shot: bool = True,
                        corpus_indices: Optional[List[int]] = None) -> Dict:
        """Answer a question using the QA pipeline.

        Args:
            question: The question to answer.
            use_few_shot: Whether to add random training examples to the prompt.
            corpus_indices: Optional row positions to restrict retrieval to;
                by default every stored question is searched.

        Returns:
            Dict with ``question``, ``answer``, ``context`` (retrieved
            entries) and ``time`` (elapsed seconds). The query is also
            appended to ``self.query_log`` and to ``config.log_file``.
        """
        if self.openai_service is None:
            self.initialize_openai()

        start_time = time.time()

        context = self._retrieve_context(question, corpus_indices)
        few_shot_examples = self.get_few_shot_examples() if use_few_shot else None
        answer = self.openai_service.generate_answer(question, context, few_shot_examples)

        elapsed_time = time.time() - start_time

        self.query_log.append({
            "question": question,
            "answer": answer,
            "context": context,
            "time": elapsed_time,
            "timestamp": datetime.now().isoformat()
        })

        # Explicit UTF-8: questions and answers may contain non-ASCII text,
        # which a platform-default encoding can fail to write.
        with open(config.log_file, 'a', encoding='utf-8') as f:
            f.write(f"Q: {question}\n")
            f.write(f"A: {answer}\n")
            f.write(f"Time: {elapsed_time:.2f}s\n")
            f.write("-" * 50 + "\n")

        return {
            "question": question,
            "answer": answer,
            "context": context,
            "time": elapsed_time
        }

    def evaluate(self, num_samples: int = None) -> Dict:
        """Evaluate the QA system on test data.

        Args:
            num_samples: Number of test questions to run. Defaults to at most
                20 and is clamped to the size of the test set.

        Returns:
            Dict with ``num_samples`` (the number actually evaluated),
            ``average_time`` (seconds) and ``details`` (one result per
            question, each extended with ``ground_truth``).
        """
        if self.openai_service is None:
            self.initialize_openai()

        available = len(self.test_indices)
        if num_samples is None:
            num_samples = min(available, 20)
        # Sampling without replacement cannot draw more rows than exist.
        num_samples = max(0, min(num_samples, available))
        if num_samples == 0:
            # Nothing to run; also avoids dividing by zero below.
            return {"num_samples": 0, "average_time": 0.0, "details": []}

        sample_indices = np.random.choice(self.test_indices, num_samples, replace=False)

        results = []
        for idx in tqdm(sample_indices, desc="Evaluating"):
            # Retrieve from training rows only. Searching the full corpus
            # would hand the model the evaluated question's own answer.
            result = self.answer_question(self.questions[idx], corpus_indices=self.train_indices)
            result["ground_truth"] = self.answers[idx]
            results.append(result)

        avg_time = sum(r["time"] for r in results) / len(results)

        return {
            "num_samples": num_samples,
            "average_time": avg_time,
            "details": results
        }

    def show_statistics(self):
        """Print timing and context-size statistics and plot the time histogram."""
        query_count = len(self.query_log)

        if query_count == 0:
            print("No queries have been processed yet.")
            return

        times = [log["time"] for log in self.query_log]
        avg_time = sum(times) / len(times)
        max_time = max(times)
        min_time = min(times)

        context_sizes = [len(log["context"]) for log in self.query_log]
        avg_context_size = sum(context_sizes) / len(context_sizes)

        print(f"Total queries: {query_count}")
        print(f"Average processing time: {avg_time:.2f}s")
        print(f"Min processing time: {min_time:.2f}s")
        print(f"Max processing time: {max_time:.2f}s")
        print(f"Average context size: {avg_context_size:.1f} entries")

        plt.figure(figsize=(10, 6))
        sns.histplot(times, bins=10)
        plt.title("Query Processing Time Distribution")
        plt.xlabel("Time (seconds)")
        plt.ylabel("Frequency")
        plt.show()

In [None]:
def run_qa_system():
    """Run the transfer learning QA system with an existing CSV file.

    Asks for the OpenAI API key (unless ``OPENAI_API_KEY`` is set in the
    environment) and for the CSV path, builds the QA system, then enters an
    interactive loop. Typing ``quit`` exits, ``stats`` prints statistics,
    ``eval`` runs an evaluation, and any other non-empty text is answered as
    a question.
    """
    # Local import keeps this cell self-contained without touching the
    # notebook's import cell.
    import getpass

    print("Transfer Learning QA System Initialization")
    print("------------------------------------------")

    # Read the key without echoing it; input() would store the secret in the
    # saved cell output. An environment variable takes precedence.
    api_key = (os.environ.get("OPENAI_API_KEY") or getpass.getpass("Write Your OpenAI Key -> Here")).strip()
    # Store the key on the shared config. The previous call to
    # set_openai_api_key() referred to a function that only existed inside a
    # commented-out string.
    config.openai_api_key = api_key

    # The dataset path used to be passed to input() as the prompt text; it is
    # now the default that is used when the reply is empty.
    default_csv = "/content/Firat-Uni-Fen-Bilimler-Enstitu-Soru-ve-Cevap-512-Rows.csv"
    file_path = input(f"Path to the Q&A CSV file [{default_csv}]: ").strip() or default_csv
    df = load_csv_data(file_path)
    df = preprocess_data(df)

    print("\nInitializing QA system...")
    qa_system = TransferLearningQA(df)
    qa_system.initialize_openai(api_key)

    print("\nQA System ready! Type 'quit' to exit, 'stats' to see statistics, or 'eval' to run evaluation.")

    while True:
        user_input = input("\nEnter your question: ").strip()
        if not user_input:
            # Do not spend an API call on an empty question.
            continue

        command = user_input.lower()
        if command == 'quit':
            break
        elif command == 'stats':
            qa_system.show_statistics()
        elif command == 'eval':
            reply = input("How many samples to evaluate? (default: 10) ").strip()
            try:
                num_samples = int(reply) if reply else 10
            except ValueError:
                # A typo should not terminate the whole session.
                print(f"'{reply}' is not a whole number; using the default of 10.")
                num_samples = 10
            results = qa_system.evaluate(num_samples)
            print(f"Evaluation complete. Average processing time: {results['average_time']:.2f}s")
        else:
            result = qa_system.answer_question(user_input)
            print(f"\nAnswer: {result['answer']}")
            print(f"\nProcessing time: {result['time']:.2f}s")

            show_context = input("Show retrieval context? (y/n): ")
            if show_context.strip().lower() == 'y':
                print("\nRetrieved context:")
                for i, ctx in enumerate(result['context']):
                    print(f"{i+1}. Q: {ctx['question']}")
                    print(f"   A: {ctx['answer']}")
                    print(f"   Similarity: {ctx['similarity']:.4f}")

    print("Thank you for using the Transfer Learning QA System!")

# ======================================================
# 10. MAIN EXECUTION
# ======================================================
if __name__ == "__main__":
    run_qa_system()