#### Date 30th july 2025 Project Rag for AMLGO LABS

In [3]:
import sys
import os

# Add parent directory to sys.path so `src` can be imported
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path.append(project_root)


In [4]:
import sys
sys.path.append('../src')
import json
import pandas as pd
from src.pipeline import RAGPipeline
import time

print(" Setting up comprehensive testing and evaluation...")

# Load pipeline
pipeline = RAGPipeline(
    index_path="../vectordb/document_index.faiss",
    metadata_path="../vectordb/metadata.json"
)

# Define comprehensive test queries
test_queries = [
    "What are eBay's return policies for buyers?",
    "How much fees does eBay charge sellers?", 
    "Can I cancel my order after purchasing?",
    "What happens if I violate eBay's user agreement?",
    "How does eBay handle disputes between buyers and sellers?",
    "What information does eBay collect from users?",
    "Are there any restrictions on what I can sell?",
    "What is eBay's policy on intellectual property?",
    "How does eBay's arbitration process work?",
    "What are the payment terms for sellers?"
]

print(f" Prepared {len(test_queries)} test queries")

 Setting up comprehensive testing and evaluation...
Initializing RAG pipeline...
RAG pipeline initialized!
 Prepared 10 test queries


In [5]:
# Test all queries and collect results
test_results = []

print(" Running comprehensive tests...")
print("=" * 60)

for i, query in enumerate(test_queries, 1):
    print(f"\\nTest {i}/{len(test_queries)}: {query}")
    print("-" * 40)
    
    start_time = time.time()
    
    try:
        response, sources = pipeline.query(query, top_k=3)
        end_time = time.time()
        
        # Evaluate response
        result = {
            'query_id': i,
            'query': query,
            'response': response,
            'sources': sources,
            'response_time': end_time - start_time,
            'response_length': len(response),
            'response_words': len(response.split()),
            'num_sources': len(sources),
            'success': True,
            'grounded': len(sources) > 0,
            'avg_source_length': sum(len(s) for s in sources) / len(sources) if sources else 0
        }
        
        print(f"Response ({result['response_time']:.2f}s): {response[:150]}...")
        print(f"Stats: {result['response_words']} words, {result['num_sources']} sources")
        
    except Exception as e:
        result = {
            'query_id': i,
            'query': query,
            'response': f"ERROR: {str(e)}",
            'sources': [],
            'response_time': 0,
            'response_length': 0,
            'response_words': 0,
            'num_sources': 0,
            'success': False,
            'grounded': False,
            'avg_source_length': 0
        }
        print(f" Error: {str(e)}")
    
    test_results.append(result)

print("\\n All tests completed!")

  return forward_call(*args, **kwargs)
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


 Running comprehensive tests...
\nTest 1/10: What are eBay's return policies for buyers?
----------------------------------------
Response (0.06s): Error generating response: PreTrainedTokenizerFast._batch_encode_plus() got an unexpected keyword argument 'truncate'...
Stats: 10 words, 3 sources
\nTest 2/10: How much fees does eBay charge sellers?
----------------------------------------
Response (0.04s): Error generating response: PreTrainedTokenizerFast._batch_encode_plus() got an unexpected keyword argument 'truncate'...
Stats: 10 words, 3 sources
\nTest 3/10: Can I cancel my order after purchasing?
----------------------------------------
Response (0.03s): Error generating response: PreTrainedTokenizerFast._batch_encode_plus() got an unexpected keyword argument 'truncate'...
Stats: 10 words, 3 sources
\nTest 4/10: What happens if I violate eBay's user agreement?
----------------------------------------
Response (0.01s): Error generating response: PreTrainedTokenizerFast._batch_encod

In [6]:
# Create comprehensive analysis
df_results = pd.DataFrame(test_results)

# Success statistics
success_rate = df_results['success'].mean() * 100
avg_response_time = df_results[df_results['success']]['response_time'].mean()
avg_response_length = df_results[df_results['success']]['response_words'].mean()

print(" COMPREHENSIVE TEST RESULTS")
print("=" * 50)

print(f"\\nOverall Performance:")
print(f"Success Rate: {success_rate:.1f}%")
print(f"Average Response Time: {avg_response_time:.2f} seconds")
print(f"Average Response Length: {avg_response_length:.1f} words")

print(f"\\nSource Retrieval:")
print(f"Average Sources per Query: {df_results['num_sources'].mean():.1f}")
print(f"Grounded Responses: {df_results['grounded'].sum()}/{len(df_results)}")

print(f"\\nResponse Statistics:")
successful_responses = df_results[df_results['success']]
if len(successful_responses) > 0:
    print(f"Response Length Range: {successful_responses['response_words'].min()}-{successful_responses['response_words'].max()} words")
    print(f"Response Time Range: {successful_responses['response_time'].min():.2f}-{successful_responses['response_time'].max():.2f} seconds")

# Display detailed results
print(f"\\nDETAILED RESULTS:")
print("=" * 50)

for i, result in enumerate(test_results, 1):
    status = "SUCCESS" if result['success'] else "FAILED"
    print(f"\\n{i}. {status}")
    print(f"Query: {result['query']}")
    print(f"Response: {result['response'][:200]}{'...' if len(result['response']) > 200 else ''}")
    if result['success']:
        print(f"Time: {result['response_time']:.2f}s | Words: {result['response_words']} | Sources: {result['num_sources']}")

 COMPREHENSIVE TEST RESULTS
\nOverall Performance:
Success Rate: 100.0%
Average Response Time: 0.02 seconds
Average Response Length: 10.0 words
\nSource Retrieval:
Average Sources per Query: 3.0
Grounded Responses: 10/10
\nResponse Statistics:
Response Length Range: 10-10 words
Response Time Range: 0.01-0.06 seconds
\nDETAILED RESULTS:
\n1. SUCCESS
Query: What are eBay's return policies for buyers?
Response: Error generating response: PreTrainedTokenizerFast._batch_encode_plus() got an unexpected keyword argument 'truncate'
Time: 0.06s | Words: 10 | Sources: 3
\n2. SUCCESS
Query: How much fees does eBay charge sellers?
Response: Error generating response: PreTrainedTokenizerFast._batch_encode_plus() got an unexpected keyword argument 'truncate'
Time: 0.04s | Words: 10 | Sources: 3
\n3. SUCCESS
Query: Can I cancel my order after purchasing?
Response: Error generating response: PreTrainedTokenizerFast._batch_encode_plus() got an unexpected keyword argument 'truncate'
Time: 0.03s | Words:

In [7]:
# Analyze success and failure patterns
successful_queries = df_results[df_results['success'] == True]
failed_queries = df_results[df_results['success'] == False]

print("SUCCESS CASES ANALYSIS")
print("=" * 40)

if len(successful_queries) > 0:
    # Best performing queries
    best_queries = successful_queries.nlargest(3, 'response_words')
    print("\\nMost Comprehensive Responses:")
    for _, row in best_queries.iterrows():
        print(f"\\n• Query: {row['query']}")
        print(f"  Response ({row['response_words']} words): {row['response'][:150]}...")
        print(f"  Sources: {row['num_sources']}")

print("\\nFAILURE CASES ANALYSIS")
print("=" * 40)

if len(failed_queries) > 0:
    print("\\nFailed Queries:")
    for _, row in failed_queries.iterrows():
        print(f"\\n• Query: {row['query']}")
        print(f"  Error: {row['response']}")
else:
    print("\\nNo failures detected!")

# Look for queries with poor responses (very short or generic)
short_responses = successful_queries[successful_queries['response_words'] < 10]
if len(short_responses) > 0:
    print("\nShort/Generic Responses (Potential Issues):")
    for _, row in short_responses.iterrows():
        print(f"\n• Query: {row['query']}")
        print(f"  Response ({row['response_words']} words): {row['response']}")
        print(f"  Sources: {row['num_sources']}")

SUCCESS CASES ANALYSIS
\nMost Comprehensive Responses:
\n• Query: What are eBay's return policies for buyers?
  Response (10 words): Error generating response: PreTrainedTokenizerFast._batch_encode_plus() got an unexpected keyword argument 'truncate'...
  Sources: 3
\n• Query: How much fees does eBay charge sellers?
  Response (10 words): Error generating response: PreTrainedTokenizerFast._batch_encode_plus() got an unexpected keyword argument 'truncate'...
  Sources: 3
\n• Query: Can I cancel my order after purchasing?
  Response (10 words): Error generating response: PreTrainedTokenizerFast._batch_encode_plus() got an unexpected keyword argument 'truncate'...
  Sources: 3
\nFAILURE CASES ANALYSIS
\nNo failures detected!


In [8]:
# Save detailed test results
results_to_save = {
    'test_metadata': {
        'total_queries': len(test_queries),
        'success_rate': success_rate,
        'avg_response_time': avg_response_time,
        'avg_response_length': avg_response_length,
        'test_date': str(pd.Timestamp.now())
    },
    'detailed_results': test_results
}

with open('../test_results.json', 'w', encoding='utf-8') as f:
    json.dump(results_to_save, f, indent=2, ensure_ascii=False)

print("Test results saved to ../test_results.json")

# Generate summary for report
print("\nREPORT SUMMARY")
print("=" * 40)

print(f"""
## Testing Results Summary

### Performance Metrics
- **Total Test Queries**: {len(test_queries)}
- **Success Rate**: {success_rate:.1f}%
- **Average Response Time**: {avg_response_time:.2f} seconds
- **Average Response Length**: {avg_response_length:.1f} words
- **Average Sources Retrieved**: {df_results['num_sources'].mean():.1f}

### Example Successful Queries:
""")

# Show top 3 successful examples for report
top_examples = successful_queries.nlargest(3, 'response_words')
for i, (_, row) in enumerate(top_examples.iterrows(), 1):
    print(f"\n**Example {i}:**")
    print(f"- **Query**: {row['query']}")
    print(f"- **Response**: {row['response'][:200]}...")
    print(f"- **Sources Used**: {row['num_sources']}")
    print(f"- **Response Time**: {row['response_time']:.2f}s")

if len(failed_queries) > 0:
    print(f"\n### Failure Cases:")
    for _, row in failed_queries.iterrows():
        print(f"- **Query**: {row['query']}")
        print(f"- **Issue**: {row['response']}")

print("\nTesting and evaluation complete!")

Test results saved to ../test_results.json

REPORT SUMMARY

## Testing Results Summary

### Performance Metrics
- **Total Test Queries**: 10
- **Success Rate**: 100.0%
- **Average Response Time**: 0.02 seconds
- **Average Response Length**: 10.0 words
- **Average Sources Retrieved**: 3.0

### Example Successful Queries:


**Example 1:**
- **Query**: What are eBay's return policies for buyers?
- **Response**: Error generating response: PreTrainedTokenizerFast._batch_encode_plus() got an unexpected keyword argument 'truncate'...
- **Sources Used**: 3
- **Response Time**: 0.06s

**Example 2:**
- **Query**: How much fees does eBay charge sellers?
- **Response**: Error generating response: PreTrainedTokenizerFast._batch_encode_plus() got an unexpected keyword argument 'truncate'...
- **Sources Used**: 3
- **Response Time**: 0.04s

**Example 3:**
- **Query**: Can I cancel my order after purchasing?
- **Response**: Error generating response: PreTrainedTokenizerFast._batch_encode_plus() got an