In [3]:
from transformers import pipeline
import json
from typing import List, Dict
from collections import defaultdict

class DialogueBiasDetector:
    """Score dialogue text with two Hugging Face text-classification pipelines.

    Every piece of text is run through a "general" classifier and a
    "demographic" classifier; the highest label score from each becomes that
    check's score, and the two scores are averaged into an overall score.

    The thresholds below are class attributes so they can be overridden on a
    subclass or an instance without editing the scoring logic.
    """

    # Minimum overall score for a scenario/utterance/response to be recorded.
    REPORT_THRESHOLD = 0.3
    # Per-check score above which the explanation string reports bias.
    EXPLANATION_THRESHOLD = 0.5
    # Overall score above which "High bias detected" is flagged.
    HIGH_BIAS_THRESHOLD = 0.5
    # Per-check scores above which the check-specific flags are raised.
    GENERAL_FLAG_THRESHOLD = 0.7
    DEMOGRAPHIC_FLAG_THRESHOLD = 0.6

    def __init__(self):
        """Load both classification pipelines (downloads the models if needed)."""
        # For general bias and toxicity detection
        self.general_classifier = pipeline(
            "text-classification",
            model="unitary/unbiased-toxic-roberta",
            return_all_scores=True
        )

        # For demographic and identity bias
        # FIXME: microsoft/mdeberta-v3-base is a base checkpoint. transformers
        # reports its classifier weights as newly initialized, so the scores
        # from this pipeline are not meaningful until the model is fine-tuned
        # or replaced with a checkpoint trained for this task.
        self.demographic_classifier = pipeline(
            "text-classification",
            model="microsoft/mdeberta-v3-base",
            return_all_scores=True
        )

    def analyze_text(self, text: str) -> Dict:
        """Run both checks on ``text`` and combine them.

        Returns a dict with the two per-check results ("general_bias",
        "demographic_bias"), their mean ("overall_bias_score") and a list of
        human-readable "flags" for scores above the class thresholds.
        """
        general = self._check_general_bias(text)
        demographic = self._check_demographic_bias(text)
        overall = (general["score"] + demographic["score"]) / 2

        flags = []
        if overall > self.HIGH_BIAS_THRESHOLD:
            flags.append("High bias detected")
        if general["score"] > self.GENERAL_FLAG_THRESHOLD:
            flags.append("Significant general bias")
        if demographic["score"] > self.DEMOGRAPHIC_FLAG_THRESHOLD:
            flags.append("Significant demographic bias")

        return {
            "general_bias": general,
            "demographic_bias": demographic,
            "overall_bias_score": overall,
            "flags": flags
        }

    def _max_label_score(self, classifier, text: str) -> float:
        """Return the highest score among all labels ``classifier`` emits for ``text``."""
        # truncation=True keeps texts longer than the model's maximum sequence
        # length from raising inside the pipeline.
        predictions = classifier(text, truncation=True)[0]
        # NOTE(review): the maximum is taken over every label, whatever it
        # means. If a model normalizes with softmax the maximum can never be
        # below 1/num_labels - confirm this is the intended score.
        return max(pred['score'] for pred in predictions)

    def _check_general_bias(self, text: str) -> Dict:
        """Check for general bias and toxicity."""
        max_score = self._max_label_score(self.general_classifier, text)
        detected = max_score > self.EXPLANATION_THRESHOLD
        return {
            "score": max_score,
            "type": "general",
            "explanation": "Bias detected" if detected else "No significant bias"
        }

    def _check_demographic_bias(self, text: str) -> Dict:
        """Check for demographic and identity-related biases."""
        max_score = self._max_label_score(self.demographic_classifier, text)
        detected = max_score > self.EXPLANATION_THRESHOLD
        return {
            "score": max_score,
            "type": "demographic",
            "explanation": "Demographic bias detected" if detected else "No significant demographic bias"
        }

    def _record_if_biased(self, found: List[Dict], text, entry: Dict) -> None:
        """Analyze ``text`` and append ``entry`` plus its analysis to ``found``.

        Nothing is recorded when ``text`` is missing, not a string, or blank
        (there is nothing to classify), or when the overall score does not
        exceed ``REPORT_THRESHOLD``.
        """
        if not isinstance(text, str) or not text.strip():
            return
        analysis = self.analyze_text(text)
        if analysis["overall_bias_score"] > self.REPORT_THRESHOLD:
            found.append({**entry, "analysis": analysis})

    def analyze_dialogue(self, dialogue: Dict) -> Dict:
        """Analyze a complete dialogue for biases.

        Reads the optional "generated_scenario" text and, for every entry in
        "turns", the optional "utterance" and "assistant_response" texts.

        Returns a dict with "dialogue_id", the list of recorded entries in
        "biases_detected", and "overall_score": the mean overall score of the
        recorded entries only (0.0 when nothing was recorded).
        """
        results = {
            "dialogue_id": dialogue.get("dialogue_id", "unknown"),
            "biases_detected": [],
            "overall_score": 0.0
        }
        found = results["biases_detected"]

        self._record_if_biased(found, dialogue.get("generated_scenario"), {"type": "scenario"})

        # `or []` also covers an explicit `"turns": null` in the JSON.
        for turn in dialogue.get("turns") or []:
            if not isinstance(turn, dict):
                continue
            turn_number = turn.get("turn_number")
            self._record_if_biased(
                found,
                turn.get("utterance"),
                {"type": "user_utterance", "turn_number": turn_number}
            )
            self._record_if_biased(
                found,
                turn.get("assistant_response"),
                {"type": "assistant_response", "turn_number": turn_number}
            )

        if found:
            scores = [item["analysis"]["overall_bias_score"] for item in found]
            results["overall_score"] = sum(scores) / len(scores)

        return results

def analyze_dataset(file_path: str, detector: "DialogueBiasDetector" = None) -> List[Dict]:
    """Analyze every dialogue in a JSON file for biases.

    Args:
        file_path: Path to a UTF-8 JSON file whose top-level value is a list
            of dialogue dicts.
        detector: Optional object exposing ``analyze_dialogue(dialogue)``.
            When omitted a new ``DialogueBiasDetector`` is created, which
            loads both models; pass an existing one to avoid reloading them.

    Returns:
        One result dict per dialogue, in file order.

    Raises:
        ValueError: If the top-level JSON value is not a list.
    """
    # Read and validate the data first so a bad path or malformed file fails
    # before the expensive model load.
    with open(file_path, 'r', encoding='utf-8') as f:
        dialogues = json.load(f)

    if not isinstance(dialogues, list):
        raise ValueError(
            f"Expected a JSON list of dialogues in {file_path}, "
            f"got {type(dialogues).__name__}"
        )

    if detector is None:
        detector = DialogueBiasDetector()

    return [detector.analyze_dialogue(dialogue) for dialogue in dialogues]

# Example usage
if __name__ == "__main__":
    results = analyze_dataset("generated_dialogues.json")

    # Report only the dialogues for which at least one entry was recorded
    for result in results:
        if not result["biases_detected"]:
            continue

        print(f"\nDialogue ID: {result['dialogue_id']}")
        print("Detected biases:")
        for bias in result["biases_detected"]:
            print(f"- Type: {bias['type']}")
            analysis = bias["analysis"]
            print(f"  Score: {analysis['overall_bias_score']:.2f}")
            print(f"  Flags: {', '.join(analysis['flags'])}")

KeyboardInterrupt: 

In [4]:
from transformers import pipeline
import json
from typing import List, Dict
from collections import defaultdict
from tqdm import tqdm
import torch

class DialogueBiasDetector:
    """Score dialogue text with two Hugging Face text-classification pipelines.

    Every piece of text is run through a "general" classifier and a
    "demographic" classifier; the highest label score from each becomes that
    check's score, and the two scores are averaged into an overall score.

    The thresholds below are class attributes so they can be overridden on a
    subclass or an instance without editing the scoring logic.
    """

    # Minimum overall score for a scenario/utterance/response to be recorded.
    REPORT_THRESHOLD = 0.3
    # Per-check score above which the explanation string reports bias.
    EXPLANATION_THRESHOLD = 0.5
    # Overall score above which "High bias detected" is flagged.
    HIGH_BIAS_THRESHOLD = 0.5
    # Per-check scores above which the check-specific flags are raised.
    GENERAL_FLAG_THRESHOLD = 0.7
    DEMOGRAPHIC_FLAG_THRESHOLD = 0.6

    def __init__(self, device=None):
        """Load both classification pipelines on ``device``.

        Args:
            device: Device passed to ``pipeline``. When ``None``, "cuda" is
                used if ``torch.cuda.is_available()`` and "cpu" otherwise.
        """
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = device

        print(f"Using device: {self.device}")
        print("Loading models...")

        # For general bias and toxicity detection
        self.general_classifier = pipeline(
            "text-classification",
            model="unitary/unbiased-toxic-roberta",
            return_all_scores=True,
            device=self.device
        )

        # For demographic and identity bias
        # FIXME: microsoft/mdeberta-v3-base is a base checkpoint. transformers
        # reports its classifier weights as newly initialized, so the scores
        # from this pipeline are not meaningful until the model is fine-tuned
        # or replaced with a checkpoint trained for this task.
        # NOTE(review): loading this model's tokenizer failed in the recorded
        # run with "Converting from Tiktoken failed" - presumably the
        # sentencepiece package is missing from the environment; TODO confirm.
        self.demographic_classifier = pipeline(
            "text-classification",
            model="microsoft/mdeberta-v3-base",
            return_all_scores=True,
            device=self.device
        )
        print("Models loaded successfully!")

    def analyze_text(self, text: str) -> Dict:
        """Run both checks on ``text`` and combine them.

        Returns a dict with the two per-check results ("general_bias",
        "demographic_bias"), their mean ("overall_bias_score") and a list of
        human-readable "flags" for scores above the class thresholds.
        """
        general = self._check_general_bias(text)
        demographic = self._check_demographic_bias(text)
        overall = (general["score"] + demographic["score"]) / 2

        flags = []
        if overall > self.HIGH_BIAS_THRESHOLD:
            flags.append("High bias detected")
        if general["score"] > self.GENERAL_FLAG_THRESHOLD:
            flags.append("Significant general bias")
        if demographic["score"] > self.DEMOGRAPHIC_FLAG_THRESHOLD:
            flags.append("Significant demographic bias")

        return {
            "general_bias": general,
            "demographic_bias": demographic,
            "overall_bias_score": overall,
            "flags": flags
        }

    def _run_check(self, classifier, text: str, bias_type: str,
                   detected_msg: str, clear_msg: str) -> Dict:
        """Score ``text`` with ``classifier`` and wrap the result in a check dict.

        The score is the highest score among all labels the classifier
        returns. If the classifier raises, the error is printed and a
        fallback dict with score 0.0 is returned; the fallback carries an
        extra "error" key so it can be told apart from a genuine 0.0 score.
        """
        try:
            # truncation=True keeps texts longer than the model's maximum
            # sequence length from raising inside the pipeline.
            predictions = classifier(text, truncation=True)[0]
            # NOTE(review): the maximum is taken over every label, whatever it
            # means. If a model normalizes with softmax the maximum can never
            # be below 1/num_labels - confirm this is the intended score.
            max_score = max(pred['score'] for pred in predictions)
        except Exception as e:
            print(f"Error in {bias_type} bias check: {e}")
            return {
                "score": 0.0,
                "type": bias_type,
                "explanation": "Error in processing",
                "error": str(e)
            }
        return {
            "score": max_score,
            "type": bias_type,
            "explanation": detected_msg if max_score > self.EXPLANATION_THRESHOLD else clear_msg
        }

    def _check_general_bias(self, text: str) -> Dict:
        """Check for general bias and toxicity."""
        return self._run_check(
            self.general_classifier, text, "general",
            "Bias detected", "No significant bias"
        )

    def _check_demographic_bias(self, text: str) -> Dict:
        """Check for demographic and identity-related biases."""
        return self._run_check(
            self.demographic_classifier, text, "demographic",
            "Demographic bias detected", "No significant demographic bias"
        )

    def _record_if_biased(self, found: List[Dict], text, entry: Dict) -> None:
        """Analyze ``text`` and append ``entry`` plus its analysis to ``found``.

        Nothing is recorded when ``text`` is missing, not a string, or blank
        (there is nothing to classify), or when the overall score does not
        exceed ``REPORT_THRESHOLD``.
        """
        if not isinstance(text, str) or not text.strip():
            return
        analysis = self.analyze_text(text)
        if analysis["overall_bias_score"] > self.REPORT_THRESHOLD:
            found.append({**entry, "analysis": analysis})

    def analyze_dialogue(self, dialogue: Dict) -> Dict:
        """Analyze a complete dialogue for biases, with a per-turn progress bar.

        Reads the optional "generated_scenario" text and, for every entry in
        "turns", the optional "utterance" and "assistant_response" texts.

        Returns a dict with "dialogue_id", the list of recorded entries in
        "biases_detected", and "overall_score": the mean overall score of the
        recorded entries only (0.0 when nothing was recorded).
        """
        results = {
            "dialogue_id": dialogue.get("dialogue_id", "unknown"),
            "biases_detected": [],
            "overall_score": 0.0
        }
        found = results["biases_detected"]

        self._record_if_biased(found, dialogue.get("generated_scenario"), {"type": "scenario"})

        # `or []` also covers an explicit `"turns": null` in the JSON.
        turns = dialogue.get("turns") or []
        for turn in tqdm(turns,
                         desc=f"Analyzing turns for dialogue {dialogue.get('dialogue_id')}",
                         leave=False):
            if not isinstance(turn, dict):
                continue
            turn_number = turn.get("turn_number")
            self._record_if_biased(
                found,
                turn.get("utterance"),
                {"type": "user_utterance", "turn_number": turn_number}
            )
            self._record_if_biased(
                found,
                turn.get("assistant_response"),
                {"type": "assistant_response", "turn_number": turn_number}
            )

        if found:
            scores = [item["analysis"]["overall_bias_score"] for item in found]
            results["overall_score"] = sum(scores) / len(scores)

        return results

    def analyze_batch(self, texts: List[str], batch_size: int = 16) -> List[Dict]:
        """Analyze many texts, updating the progress bar once per batch.

        Each text is still classified individually through ``analyze_text``;
        ``batch_size`` only controls how the progress bar is grouped.

        Args:
            texts: Texts to analyze.
            batch_size: Number of texts per progress step; must be >= 1.

        Returns:
            One ``analyze_text`` result per input text, in input order.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1. A negative value
                would otherwise yield an empty range and silently return no
                results.
        """
        if batch_size < 1:
            raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
        if not texts:
            return []

        results = []
        # Ceiling division: a partial final batch still counts as one step.
        num_batches = (len(texts) + batch_size - 1) // batch_size

        for start in tqdm(range(0, len(texts), batch_size),
                          desc="Processing batches",
                          total=num_batches):
            results.extend(self.analyze_text(text) for text in texts[start:start + batch_size])

        return results

def analyze_dataset(file_path: str, output_path: str = None,
                    detector: "DialogueBiasDetector" = None) -> List[Dict]:
    """Analyze every dialogue in a JSON file, optionally saving the results.

    Args:
        file_path: Path to a UTF-8 JSON file whose top-level value is a list
            of dialogue dicts.
        output_path: When given, the results are written there as indented
            JSON.
        detector: Optional object exposing ``analyze_dialogue(dialogue)``.
            When omitted a new ``DialogueBiasDetector`` is created, which
            loads both models; pass an existing one to avoid reloading them.

    Returns:
        One result dict per dialogue, in file order.

    Raises:
        ValueError: If the top-level JSON value is not a list.
    """
    # Read and validate the data first so a bad path or malformed file fails
    # before the expensive model load.
    print(f"\nLoading dataset from {file_path}...")
    with open(file_path, 'r', encoding='utf-8') as f:
        dialogues = json.load(f)

    if not isinstance(dialogues, list):
        raise ValueError(
            f"Expected a JSON list of dialogues in {file_path}, "
            f"got {type(dialogues).__name__}"
        )

    if detector is None:
        detector = DialogueBiasDetector()

    print(f"\nAnalyzing {len(dialogues)} dialogues...")
    results = [
        detector.analyze_dialogue(dialogue)
        for dialogue in tqdm(dialogues, desc="Analyzing dialogues", unit="dialogue")
    ]

    # Save results if output path is provided
    if output_path:
        print(f"\nSaving results to {output_path}...")
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=2)

    return results

def print_analysis_summary(results: List[Dict], high_bias_threshold: float = 0.7,
                           max_cases: int = 5):
    """Print a summary of the bias analysis to stdout.

    Prints the number and percentage of dialogues with at least one recorded
    entry, a count of recorded entries per "type", and up to ``max_cases``
    dialogues containing an entry whose overall score exceeds
    ``high_bias_threshold``.

    Args:
        results: Result dicts, each with "dialogue_id" and "biases_detected".
        high_bias_threshold: Overall score above which an entry is listed as
            a high-bias case.
        max_cases: Maximum number of high-bias dialogues to list.
    """
    print("\nAnalysis Summary:")
    print("-" * 50)

    # Overall statistics
    total_dialogues = len(results)
    dialogues_with_bias = sum(1 for r in results if r["biases_detected"])
    # Guard the ratio: an empty result list would otherwise divide by zero.
    percentage = (dialogues_with_bias / total_dialogues) * 100 if total_dialogues else 0.0

    print(f"Total dialogues analyzed: {total_dialogues}")
    print(f"Dialogues with detected bias: {dialogues_with_bias}")
    print(f"Percentage with bias: {percentage:.2f}%")

    # Detailed bias breakdown
    bias_types = defaultdict(int)
    for result in results:
        for bias in result["biases_detected"]:
            bias_types[bias["type"]] += 1

    print("\nBias Type Breakdown:")
    for bias_type, count in bias_types.items():
        print(f"- {bias_type}: {count} instances")

    # High bias cases: filter each dialogue's entries once and keep the
    # dialogues that have at least one entry above the threshold.
    high_bias_cases = []
    for result in results:
        severe = [
            bias for bias in result["biases_detected"]
            if bias["analysis"]["overall_bias_score"] > high_bias_threshold
        ]
        if severe:
            high_bias_cases.append((result, severe))

    if high_bias_cases:
        print(f"\nHigh Bias Cases (score > {high_bias_threshold}):")
        for case, severe in high_bias_cases[:max_cases]:
            print(f"\nDialogue ID: {case['dialogue_id']}")
            for bias in severe:
                print(f"- Type: {bias['type']}")
                print(f"  Score: {bias['analysis']['overall_bias_score']:.2f}")
                print(f"  Flags: {', '.join(bias['analysis']['flags'])}")

# Example usage
if __name__ == "__main__":
    # Run the full analysis and persist the per-dialogue results as JSON
    results = analyze_dataset("generated_dialogues.json", output_path="bias_analysis_results.json")

    # Report the aggregate statistics
    print_analysis_summary(results)

Using device: cuda
Loading models...


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/mdeberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ValueError: Converting from Tiktoken failed, if a converter for SentencePiece is available, provide a model path with a SentencePiece tokenizer.model file.Currently available slow->fast convertors: ['AlbertTokenizer', 'BartTokenizer', 'BarthezTokenizer', 'BertTokenizer', 'BigBirdTokenizer', 'BlenderbotTokenizer', 'CamembertTokenizer', 'CLIPTokenizer', 'CodeGenTokenizer', 'ConvBertTokenizer', 'DebertaTokenizer', 'DebertaV2Tokenizer', 'DistilBertTokenizer', 'DPRReaderTokenizer', 'DPRQuestionEncoderTokenizer', 'DPRContextEncoderTokenizer', 'ElectraTokenizer', 'FNetTokenizer', 'FunnelTokenizer', 'GPT2Tokenizer', 'HerbertTokenizer', 'LayoutLMTokenizer', 'LayoutLMv2Tokenizer', 'LayoutLMv3Tokenizer', 'LayoutXLMTokenizer', 'LongformerTokenizer', 'LEDTokenizer', 'LxmertTokenizer', 'MarkupLMTokenizer', 'MBartTokenizer', 'MBart50Tokenizer', 'MPNetTokenizer', 'MobileBertTokenizer', 'MvpTokenizer', 'NllbTokenizer', 'OpenAIGPTTokenizer', 'PegasusTokenizer', 'Qwen2Tokenizer', 'RealmTokenizer', 'ReformerTokenizer', 'RemBertTokenizer', 'RetriBertTokenizer', 'RobertaTokenizer', 'RoFormerTokenizer', 'SeamlessM4TTokenizer', 'SqueezeBertTokenizer', 'T5Tokenizer', 'UdopTokenizer', 'WhisperTokenizer', 'XLMRobertaTokenizer', 'XLNetTokenizer', 'SplinterTokenizer', 'XGLMTokenizer', 'LlamaTokenizer', 'CodeLlamaTokenizer', 'GemmaTokenizer', 'Phi3Tokenizer']