<a href="https://colab.research.google.com/github/ananya1331/SER-notebooks/blob/main/Call_Score_Demo_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook scores interview calls using only speaker diarization data (no transcripts).

It infers roles, computes conversation metrics, and produces a 0-100 call quality score.

In [19]:
# Install dependencies if needed (usually pre-installed in Colab)
# !pip install numpy

import json
import numpy as np
from collections import defaultdict
from typing import Dict, List, Tuple, Any

print("‚úì Libraries imported successfully")

‚úì Libraries imported successfully


In [20]:
def load_diarization(filepath: str) -> List[Dict[str, Any]]:
    """
    Load diarization segments from a JSON or whitespace-separated text file.

    Supported inputs:
    - JSON: a list of objects, or an object wrapping that list under
      "segments" or "results". The speaker is read from 'speaker',
      'speaker_id' or 'label' (default 'UNKNOWN'); the start from 'start',
      'start_time' or 'begin' (default 0); the end from 'end', 'end_time'
      or 'finish' (default: the start, which makes the segment invalid).
    - RTTM: lines whose first field starts with "SPEAKER". Field 4 is the
      start, field 5 the duration, field 8 the speaker label (the first
      field is used as the label when field 8 is absent).
    - Plain text: "speaker start end" per line.

    JSON is detected from the first non-whitespace character, so leading
    blank lines or a byte-order mark do not break detection. In text files,
    blank lines and lines starting with '#' are ignored; lines that are too
    short or carry non-numeric times are skipped and reported in a single
    summary message instead of aborting the whole load.

    Args:
        filepath: Path of the diarization file.

    Returns:
        A list of {'speaker', 'start', 'end'} dicts with end > start, sorted
        by start time. Returns [] (after printing "Load error: ...") when
        the file cannot be read or its JSON cannot be interpreted.
    """
    segments = []

    try:
        with open(filepath, 'r') as f:
            content = f.read()

        # Drop a leading byte-order mark so it cannot leak into the first field
        text = content.lstrip('\ufeff')
        stripped = text.lstrip()

        if stripped[:1] in ('{', '['):
            data = json.loads(stripped)

            # Unwrap {"segments": [...]} / {"results": [...]}
            if isinstance(data, dict):
                data = data.get('segments', data.get('results', []))

            for item in data:
                start = float(item.get('start', item.get('start_time', item.get('begin', 0))))
                segments.append({
                    'speaker': item.get('speaker', item.get('speaker_id', item.get('label', 'UNKNOWN'))),
                    'start': start,
                    'end': float(item.get('end', item.get('end_time', item.get('finish', start)))),
                })
        else:
            skipped = 0
            # Text mode already normalised line endings to '\n'
            for line in text.split('\n'):
                line = line.strip()
                if not line or line.startswith('#'):
                    continue

                parts = line.split()
                try:
                    if parts[0].startswith('SPEAKER'):
                        # RTTM: SPEAKER file chan start duration <NA> <NA> speaker_id ...
                        if len(parts) < 5:
                            skipped += 1
                            continue
                        start = float(parts[3])
                        end = start + float(parts[4])
                        speaker = parts[7] if len(parts) > 7 else parts[0]
                    else:
                        # Plain format: speaker start end
                        if len(parts) < 3:
                            skipped += 1
                            continue
                        speaker = parts[0]
                        start = float(parts[1])
                        end = float(parts[2])
                except ValueError:
                    # Non-numeric time field: drop this line only
                    skipped += 1
                    continue

                segments.append({'speaker': speaker, 'start': start, 'end': end})

            if skipped:
                print(f"Skipped {skipped} malformed line(s)")

    except Exception as e:
        # Best-effort loader: report the problem and let the caller see "no data"
        print(f"Load error: {e}")
        return []

    # Keep only segments with positive duration, in chronological order
    segments = [s for s in segments if s['end'] > s['start']]
    segments.sort(key=lambda x: x['start'])

    return segments

# Test the loader
print("‚úì Data loading function defined")

‚úì Data loading function defined


In [21]:
def normalize_speakers(segments: List[Dict]) -> List[Dict]:
    """
    Map speaker IDs to standardized SPEAKER_0 and SPEAKER_1 labels.

    With one or two distinct speakers, labels are assigned in sorted order
    of the original IDs. With more than two, only the two speakers with the
    most total speaking time are kept (longest first) and every segment of
    the remaining speakers is dropped as noise/crosstalk.

    The input list and its dicts are left untouched; relabelled copies are
    returned, so the caller's original speaker IDs remain available.

    Args:
        segments: Dicts with at least 'speaker', 'start' and 'end' keys.

    Returns:
        New segment dicts (all original keys preserved) whose 'speaker' is
        'SPEAKER_0' or 'SPEAKER_1', in the same order as the input.
    """
    if not segments:
        return []

    kept_speakers = sorted(set(s['speaker'] for s in segments))

    if len(kept_speakers) > 2:
        # Rank speakers by accumulated speaking time and keep the top two
        speaker_times = defaultdict(float)
        for s in segments:
            speaker_times[s['speaker']] += s['end'] - s['start']

        kept_speakers = sorted(
            speaker_times.keys(),
            key=lambda x: speaker_times[x],
            reverse=True
        )[:2]

    # original_id -> SPEAKER_0 / SPEAKER_1
    speaker_map = {spk: f"SPEAKER_{i}" for i, spk in enumerate(kept_speakers)}

    # Build copies instead of rewriting the caller's dicts in place
    return [
        {**seg, 'speaker': speaker_map[seg['speaker']]}
        for seg in segments
        if seg['speaker'] in speaker_map
    ]

print("‚úì Speaker normalization function defined")

‚úì Speaker normalization function defined


In [22]:
def infer_roles(segments: List[Dict]) -> Tuple[str, str]:
    """
    Guess which speaker is the interviewer and which is the candidate.

    The speaker with the most turns (runs of consecutive segments by the
    same speaker) is taken to be the interviewer; when only one speaker is
    present, the opening speaker is used. The candidate is whichever of
    'SPEAKER_0' / 'SPEAKER_1' is not the interviewer.

    Returns: (interviewer_id, candidate_id); ('SPEAKER_0', 'SPEAKER_1')
    for empty input.
    """
    label_pair = ('SPEAKER_0', 'SPEAKER_1')
    if not segments:
        return label_pair

    def counterpart(label):
        # First standard label that differs from the given one
        return next(name for name in label_pair if name != label)

    opener = segments[0]['speaker']

    # Count one turn each time the active speaker changes
    turns_by_speaker = defaultdict(int)
    active = None
    for segment in segments:
        current = segment['speaker']
        if current == active:
            continue
        turns_by_speaker[current] += 1
        active = current

    # Only one speaker ever took a turn: fall back to the opening speaker
    if len(turns_by_speaker) < 2:
        return opener, counterpart(opener)

    # Most turns wins. With exactly two alternating labels the opener can
    # never have fewer turns than the other speaker, and max() keeps the
    # first-inserted key on ties, so this pick coincides with the opener;
    # it can only differ when more than two labels are present.
    interviewer = max(turns_by_speaker, key=turns_by_speaker.get)
    return interviewer, counterpart(interviewer)

print("‚úì Role inference function defined")

‚úì Role inference function defined


In [23]:
def compute_talk_ratio(segments: List[Dict], interviewer: str, candidate: str) -> Dict[str, float]:
    """
    Measure how speaking time is split between the two roles.

    Returns the interviewer's and candidate's total speaking time in
    seconds (rounded to 2 decimals) and 'candidate_ratio', the candidate's
    share of their combined time (0.0 to 1.0, rounded to 3 decimals).
    All three values are 0 when neither role has any speaking time.

    Rule of thumb: a candidate share of 40-60% is healthy; below 30% the
    candidate is barely engaged, above 70% they dominate the call.
    """
    def seconds_spoken(role_label):
        # Total duration of every segment attributed to this label
        return sum(
            seg['end'] - seg['start']
            for seg in segments
            if seg['speaker'] == role_label
        )

    spoken_by_interviewer = seconds_spoken(interviewer)
    spoken_by_candidate = seconds_spoken(candidate)
    combined = spoken_by_interviewer + spoken_by_candidate

    # Nothing to divide by: report an all-zero result
    if combined == 0:
        return {'interviewer_time': 0, 'candidate_time': 0, 'candidate_ratio': 0}

    share = spoken_by_candidate / combined
    return {
        'interviewer_time': round(spoken_by_interviewer, 2),
        'candidate_time': round(spoken_by_candidate, 2),
        'candidate_ratio': round(share, 3)
    }

print("‚úì Talk ratio function defined")

‚úì Talk ratio function defined


Interpretation:

candidate_ratio = 0.45 → Candidate spoke 45% of the time (good balance)

candidate_ratio = 0.20 → Candidate spoke only 20% of the time (red flag: disengaged)

candidate_ratio = 0.80 → Candidate spoke 80% of the time (yellow flag: over-talking)

In [24]:
def compute_turn_balance(segments: List[Dict], interviewer: str, candidate: str) -> Dict[str, Any]:
    """
    Measure how evenly the two roles take turns.

    A turn is a run of consecutive segments by the same speaker. The
    balance score is 1 - |interviewer_turns - candidate_turns| / total_turns,
    rounded to 3 decimals: 1.0 means perfectly even, 0.0 fully one-sided.
    With fewer than two turns, both per-role counts and the score are 0.
    """
    # A segment opens a new turn when its speaker differs from the one
    # before it (the first segment is compared against None)
    turn_sequence = [
        seg['speaker']
        for idx, seg in enumerate(segments)
        if seg['speaker'] != (segments[idx - 1]['speaker'] if idx else None)
    ]
    n_turns = len(turn_sequence)

    # Not enough back-and-forth to speak of balance
    if n_turns < 2:
        return {
            'total_turns': n_turns,
            'interviewer_turns': 0,
            'candidate_turns': 0,
            'balance_score': 0
        }

    n_interviewer = sum(1 for who in turn_sequence if who == interviewer)
    n_candidate = sum(1 for who in turn_sequence if who == candidate)

    # Share of turns by which one side out-talks the other
    imbalance = abs(n_interviewer - n_candidate) / n_turns

    return {
        'total_turns': n_turns,
        'interviewer_turns': n_interviewer,
        'candidate_turns': n_candidate,
        'balance_score': round(1.0 - imbalance, 3)
    }

print("‚úì Turn balance function defined")

‚úì Turn balance function defined


In [25]:
def compute_response_latency(segments: List[Dict], interviewer: str, candidate: str) -> Dict[str, float]:
    """
    Average the silence between an interviewer segment and the candidate
    segment that immediately follows it.

    Only adjacent segment pairs are considered, and pairs where the
    candidate starts before the interviewer has finished (negative gap,
    i.e. an overlap) are left out. Returns the mean gap in seconds
    (rounded to 3 decimals) and the number of gaps measured; both are 0
    when no gap qualifies.

    Rough reading: under 0.5s is quick, 0.5-1.5s is normal thinking time,
    over 2s suggests hesitation.
    """
    # Gap (candidate start - interviewer end) for each interviewer-to-candidate hand-over
    handover_gaps = [
        following['start'] - current['end']
        for current, following in zip(segments, segments[1:])
        if current['speaker'] == interviewer and following['speaker'] == candidate
    ]

    # Overlapping hand-overs are interruptions, not response time
    measured = [gap for gap in handover_gaps if gap >= 0]

    if not measured:
        return {'avg_latency': 0, 'latency_count': 0}

    return {
        'avg_latency': round(np.mean(measured), 3),
        'latency_count': len(measured)
    }

print("‚úì Response latency function defined")

‚úì Response latency function defined


In [26]:
def compute_overlaps(segments: List[Dict]) -> Dict[str, int]:
    """
    Count moments where two different speakers talk at the same time.

    Each segment is compared with the segments after it until one starts
    at or after its end (the scan stops there, so the input is expected
    to be ordered by start time). A pair of different speakers sharing a
    positive amount of time is one overlap; it is also an interruption
    when the later segment starts more than 0.5s before the earlier one
    ends.

    Returns {'total_overlaps': ..., 'interruptions': ...}.
    """
    overlap_total = 0
    interruption_total = 0
    n_segments = len(segments)

    for idx, earlier in enumerate(segments[:-1]):
        for later_idx in range(idx + 1, n_segments):
            later = segments[later_idx]

            # Later segment begins once the earlier one is over: stop scanning
            if later['start'] >= earlier['end']:
                break

            # Same speaker talking over themselves is not an overlap
            if later['speaker'] == earlier['speaker']:
                continue

            shared_time = min(earlier['end'], later['end']) - later['start']
            if not (shared_time > 0):
                continue

            overlap_total += 1

            # Cutting in well before the earlier speaker finishes
            if later['start'] < earlier['end'] - 0.5:
                interruption_total += 1

    return {
        'total_overlaps': overlap_total,
        'interruptions': interruption_total
    }

print("‚úì Overlap detection function defined")

‚úì Overlap detection function defined


In [27]:
def calculate_call_score(metrics: Dict[str, Any]) -> int:
    """
    Compute a 0-100 call quality score from the conversation metrics.

    The score starts at 100 and is adjusted as follows:
    - Talk ratio: -20 if the candidate spoke < 30%, -10 if < 40%,
      -15 if > 70%, -5 if > 60%.
    - Turn balance: (balance_score - 0.5) * 20, i.e. -10 to +10 points.
    - Response latency (only when at least one response gap was measured):
      -15 if the average exceeds 2.0s, -8 if it exceeds 1.5s, +5 if it is
      under 0.3s. With no measured gaps the average is a placeholder 0 and
      must not earn the "quick response" bonus.
    - Overlaps (only when total_turns > 0): -20 if overlaps per turn exceed
      0.3, -10 if they exceed 0.2; a further -10 if interruptions exceed
      15% of the turns.

    Args:
        metrics: Dict with 'talk_ratio' (candidate_ratio), 'turn_balance'
            (balance_score, total_turns), 'response_latency' (avg_latency
            and, optionally, latency_count) and 'overlaps'
            (total_overlaps, interruptions). A missing latency_count is
            treated as "measured" so older metric dicts score as before.

    Returns:
        The score truncated to an int and clamped to 0-100.
        80-100 excellent, 60-79 good, 40-59 fair, 0-39 poor.
    """
    score = 100  # Start with perfect score

    # --- TALK RATIO PENALTIES ---
    talk_ratio = metrics['talk_ratio']['candidate_ratio']

    if talk_ratio < 0.3:
        score -= 20  # Severe: Candidate barely spoke
    elif talk_ratio < 0.4:
        score -= 10  # Moderate: Candidate under-engaged
    elif talk_ratio > 0.7:
        score -= 15  # Severe: Candidate dominating
    elif talk_ratio > 0.6:
        score -= 5   # Slight: Candidate talking a bit much

    # --- TURN BALANCE ADJUSTMENT ---
    balance = metrics['turn_balance']['balance_score']
    # Scale balance score from 0-1 to -10 to +10 points
    score += (balance - 0.5) * 20

    # --- RESPONSE LATENCY PENALTIES ---
    latency_metrics = metrics['response_latency']
    latency = latency_metrics['avg_latency']

    # Skip latency scoring entirely when no response gap was measured;
    # otherwise the placeholder average of 0 would be rewarded as "quick"
    if latency_metrics.get('latency_count', 1) > 0:
        if latency > 2.0:
            score -= 15  # Severe: Very slow responses
        elif latency > 1.5:
            score -= 8   # Moderate: Somewhat slow
        elif latency < 0.3:
            score += 5   # Bonus: Very quick, engaged responses

    # --- OVERLAP/INTERRUPTION PENALTIES ---
    overlaps = metrics['overlaps']['total_overlaps']
    interruptions = metrics['overlaps']['interruptions']
    total_turns = metrics['turn_balance']['total_turns']

    if total_turns > 0:
        # Overlap rate relative to turn count
        overlap_rate = overlaps / total_turns

        if overlap_rate > 0.3:
            score -= 20  # Severe: Chaotic conversation
        elif overlap_rate > 0.2:
            score -= 10  # Moderate: Frequent overlaps

        # Additional penalty for interruptions
        if interruptions > total_turns * 0.15:
            score -= 10  # Too many interruptions

    # Truncate to int, then clamp to the 0-100 range
    return max(0, min(100, int(score)))

print("‚úì Score calculation function defined")

‚úì Score calculation function defined


In [28]:
def score_call(diarization_path: str) -> Dict[str, Any]:
    """
    Run the whole scoring pipeline on one diarization file.

    Pipeline: load the file, normalize speaker labels, infer the
    interviewer/candidate roles, compute the talk-ratio, turn-balance,
    response-latency and overlap metrics, then derive the final score.

    Returns a dict with 'call_score', 'metrics' and 'segment_count'. When
    the file yields no segments, or fewer than two remain after
    normalization, returns {'error': ..., 'call_score': 0, 'metrics': {}}.
    """
    def failure(reason):
        # Uniform shape for "could not score" results
        return {
            'error': reason,
            'call_score': 0,
            'metrics': {}
        }

    # Load
    loaded = load_diarization(diarization_path)
    if not loaded:
        return failure('No valid segments found')

    # Normalize to SPEAKER_0 / SPEAKER_1
    normalized = normalize_speakers(loaded)
    if len(normalized) < 2:
        return failure('Insufficient segments for analysis')

    # Roles
    interviewer, candidate = infer_roles(normalized)

    # Metrics, computed in the order they appear in the result
    metrics = {
        'roles': {
            'interviewer': interviewer,
            'candidate': candidate
        }
    }
    metrics['talk_ratio'] = compute_talk_ratio(normalized, interviewer, candidate)
    metrics['turn_balance'] = compute_turn_balance(normalized, interviewer, candidate)
    metrics['response_latency'] = compute_response_latency(normalized, interviewer, candidate)
    metrics['overlaps'] = compute_overlaps(normalized)

    # Final score
    return {
        'call_score': calculate_call_score(metrics),
        'metrics': metrics,
        'segment_count': len(normalized)
    }

print("‚úì Main pipeline function defined")

‚úì Main pipeline function defined


In [29]:
# Build a small two-speaker sample call: (speaker, start, end) in seconds
_SAMPLE_TURNS = [
    ("spk_0", 0.0, 3.5),
    ("spk_1", 3.8, 8.2),
    ("spk_0", 8.5, 12.1),
    ("spk_1", 12.4, 18.7),
    ("spk_0", 19.0, 22.3),
    ("spk_1", 22.5, 30.2),
    ("spk_0", 30.8, 35.1),
    ("spk_1", 35.3, 42.6),
]

sample_data = {
    "segments": [
        {"speaker": label, "start": begin, "end": finish}
        for label, begin, finish in _SAMPLE_TURNS
    ]
}

# Write the sample to disk so the pipeline can load it like a real file
with open('diarization.json', 'w') as f:
    f.write(json.dumps(sample_data, indent=2))

print("‚úì Sample diarization file created: diarization.json")

‚úì Sample diarization file created: diarization.json


In [30]:
# Run the full pipeline on the sample file
diarization_file = 'diarization.json'
result = score_call(diarization_file)

# Show the raw result as indented JSON between 60-character rules
banner_rule = "=" * 60
print("\n" + banner_rule)
print("CALL SCORING RESULTS")
print(banner_rule)
print(json.dumps(result, indent=2))
print(banner_rule)


CALL SCORING RESULTS
{
  "call_score": 100,
  "metrics": {
    "roles": {
      "interviewer": "SPEAKER_0",
      "candidate": "SPEAKER_1"
    },
    "talk_ratio": {
      "interviewer_time": 14.7,
      "candidate_time": 25.7,
      "candidate_ratio": 0.636
    },
    "turn_balance": {
      "total_turns": 8,
      "interviewer_turns": 4,
      "candidate_turns": 4,
      "balance_score": 1.0
    },
    "response_latency": {
      "avg_latency": 0.25,
      "latency_count": 4
    },
    "overlaps": {
      "total_overlaps": 0,
      "interruptions": 0
    }
  },
  "segment_count": 8
}


In [31]:
# Print a section-by-section summary of the result (skipped when scoring
# returned no metrics, e.g. on a load error)
if 'metrics' in result and result['metrics']:
    print("\nüìä DETAILED METRICS BREAKDOWN\n")

    # Which speaker was assigned which role
    roles = result['metrics']['roles']
    print("üë• Identified Roles:")
    print("   Interviewer: {}".format(roles['interviewer']))
    print("   Candidate: {}\n".format(roles['candidate']))

    # Speaking time per role and the candidate's share
    talk = result['metrics']['talk_ratio']
    print("üó£Ô∏è  Talk Ratio:")
    print("   Interviewer time: {}s".format(talk['interviewer_time']))
    print("   Candidate time: {}s".format(talk['candidate_time']))
    print("   Candidate ratio: {:.1%}\n".format(talk['candidate_ratio']))

    # Turn counts and how evenly they are split
    turns = result['metrics']['turn_balance']
    print("üîÑ Turn Balance:")
    print("   Total turns: {}".format(turns['total_turns']))
    print("   Interviewer turns: {}".format(turns['interviewer_turns']))
    print("   Candidate turns: {}".format(turns['candidate_turns']))
    print("   Balance score: {:.2f}/1.00\n".format(turns['balance_score']))

    # Average gap before the candidate answers
    latency = result['metrics']['response_latency']
    print("‚è±Ô∏è  Response Latency:")
    print("   Average: {:.2f}s".format(latency['avg_latency']))
    print("   Measured transitions: {}\n".format(latency['latency_count']))

    # Simultaneous speech
    overlap = result['metrics']['overlaps']
    print("üîÄ Overlaps:")
    print("   Total overlaps: {}".format(overlap['total_overlaps']))
    print("   Interruptions: {}\n".format(overlap['interruptions']))

    # Headline number
    print("üéØ FINAL SCORE: {}/100".format(result['call_score']))

    # Map the score to its quality band: first threshold reached wins
    score = result['call_score']
    quality = "Poor ‚ùå"
    for band_floor, band_label in (
        (80, "Excellent ‚úÖ"),
        (60, "Good ‚úì"),
        (40, "Fair ‚ö†Ô∏è"),
    ):
        if score >= band_floor:
            quality = band_label
            break

    print("   Quality: {}".format(quality))


üìä DETAILED METRICS BREAKDOWN

üë• Identified Roles:
   Interviewer: SPEAKER_0
   Candidate: SPEAKER_1

üó£Ô∏è  Talk Ratio:
   Interviewer time: 14.7s
   Candidate time: 25.7s
   Candidate ratio: 63.6%

üîÑ Turn Balance:
   Total turns: 8
   Interviewer turns: 4
   Candidate turns: 4
   Balance score: 1.00/1.00

‚è±Ô∏è  Response Latency:
   Average: 0.25s
   Measured transitions: 4

üîÄ Overlaps:
   Total overlaps: 0
   Interruptions: 0

üéØ FINAL SCORE: 100/100
   Quality: Excellent ‚úÖ
