# Atomus TAM Research - Scoring Analysis

This notebook provides tools for analyzing and optimizing the Atomus scoring algorithm.

## Features:
- Interactive scoring analysis
- Weight optimization
- Keyword effectiveness analysis
- Tier distribution analysis
- Comparative scoring scenarios

In [None]:
# Setup and imports
# Third-party and standard-library dependencies used by the cells below.
import sys
import os
from pathlib import Path
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import yaml
from collections import Counter

# Add src to path
# NOTE(review): '../src' is a relative path, so the project imports below only
# resolve when the kernel's working directory sits next to src/ (presumably the
# notebooks folder - confirm).
sys.path.append('../src')

# Import our modules
# These three modules are project code loaded via the sys.path entry above.
from scoring_engine import AtomScoringEngine
from data_processing import AtomDataProcessor
from utils import get_logger

# Set up plotting
# Matplotlib's default style plus seaborn's "husl" colour palette.
plt.style.use('default')
sns.set_palette("husl")

# Banner confirming that every import above succeeded.
print("📊 Atomus Scoring Analysis")
print("=" * 30)
print("✅ All modules imported successfully")

## 🔧 Initialize Scoring Engine

In [None]:
# Build the shared objects that the rest of the notebook relies on.
logger = get_logger()
scoring_engine = AtomScoringEngine()
data_processor = AtomDataProcessor()

print("🔧 Initializing scoring engine...")

# Configuration summary reported by the engine; the keys read below are
# 'weights', 'tier_thresholds' and 'keywords'.
config = scoring_engine.get_config_summary()

print(f"\n📏 Current Configuration:")
print(f"   Scoring Weights: {config['weights']}")
print(f"   Tier Thresholds: {config['tier_thresholds']}")
print(f"   Keyword Categories: {len(config['keywords'])} categories")

# Per-category keyword count, followed by a preview of at most three entries.
print(f"\n🔤 Keyword Distribution:")
for category, keywords in config['keywords'].items():
    print(f"   {category}: {len(keywords)} keywords")
    if len(keywords) >= 3:
        sample = keywords[:3]
    else:
        sample = keywords
    print(f"      Sample: {sample}")

## 📋 Load Test Data

In [None]:
# Fetch the prospect records and wrap them in a DataFrame.
print("📋 Loading test dataset...")

test_companies = data_processor.load_prospect_database()
df_companies = pd.DataFrame(test_companies)

# Quick shape/column summary before showing the table itself.
company_count = len(df_companies)
column_names = df_companies.columns.tolist()
print(f"✅ Loaded {company_count} companies")
print(f"📊 Columns: {column_names}")

# Render the full table with the notebook's rich display.
print("\n🏢 Test Companies:")
display(df_companies)

## 🧮 Interactive Scoring Analysis

In [None]:
# Score all test companies
#
# Each DataFrame row is converted to a plain dict and passed to the scoring
# engine. Scoring is best-effort: a failure for one company is reported and
# recorded, and the loop carries on with the next company.
print("🧮 Scoring all test companies...")

scoring_results = []
failed_companies = []  # display names of companies that could not be scored

for _, company_row in df_companies.iterrows():
    company_data = company_row.to_dict()

    # Resolve the display name once, with a fallback, so the error path below
    # can never raise a second KeyError when the 'name' field is missing.
    company_name = company_data.get('name', '<unnamed>')

    try:
        result = scoring_engine.calculate_company_score(company_data)

        # Tag the result with the company name; later cells look it up by
        # result['company_name'].
        result['company_name'] = company_name

        # Format the summary line before accepting the result: a result
        # without 'total_score' / 'tier_classification' fails here and is
        # kept out of scoring_results instead of breaking later cells.
        summary_line = f"   ✅ {company_name}: {result['total_score']:.1f} ({result['tier_classification']})"
    except Exception as e:
        failed_companies.append(company_name)
        print(f"   ❌ {company_name}: Error - {str(e)}")
    else:
        scoring_results.append(result)
        print(summary_line)

print(f"\n📊 Successfully scored {len(scoring_results)} companies")
if failed_companies:
    print(f"⚠️ {len(failed_companies)} companies could not be scored: {failed_companies}")

In [None]:
# Flatten every scoring result into one table row and show the table.
if not scoring_results:
    print("❌ No scoring results to analyze")
else:
    detailed_results = []

    for result in scoring_results:
        # Fixed identification columns come first.
        row = {
            'Company': result['company_name'],
            'Total_Score': result['total_score'],
            'Tier': result['tier_classification'],
        }

        # One "<component>_Score" column per scoring component.
        row.update(
            (f'{component}_Score', score)
            for component, score in result['component_scores'].items()
        )

        # One "<category>_Keywords" column holding the number of matches;
        # an empty or missing keyword list counts as zero.
        row.update(
            (f'{category}_Keywords', len(keywords) if keywords else 0)
            for category, keywords in result['keywords_found'].items()
        )

        detailed_results.append(row)

    df_results = pd.DataFrame(detailed_results)

    print("📊 Detailed Scoring Results:")
    display(df_results)

## 📈 Score Distribution Analysis

In [None]:
# Score distribution visualization
#
# Draws a 2x2 dashboard (histogram, per-company bars, tier pie, average
# component scores) and prints summary statistics. Everything is derived from
# `scoring_results` and `config`, so the cell does not depend on any other
# analysis cell having been run first.
if scoring_results:
    scores = [r['total_score'] for r in scoring_results]
    company_names = [r['company_name'] for r in scoring_results]
    tiers = [r['tier_classification'] for r in scoring_results]

    # Create figure with subplots
    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 12))

    # 1. Score distribution histogram
    ax1.hist(scores, bins=10, alpha=0.7, edgecolor='black')
    ax1.set_xlabel('Total Score')
    ax1.set_ylabel('Number of Companies')
    ax1.set_title('Score Distribution')
    ax1.grid(True, alpha=0.3)

    # Overlay every configured tier threshold except 'tier_4'.
    tier_thresholds = config['tier_thresholds']
    for tier, threshold in tier_thresholds.items():
        if tier != 'tier_4':  # Skip the lowest threshold
            ax1.axvline(threshold, color='red', linestyle='--', alpha=0.5, label=f'{tier}: {threshold}')
    # Only build a legend when at least one labelled line was drawn;
    # otherwise Matplotlib warns about an empty legend.
    if ax1.get_legend_handles_labels()[0]:
        ax1.legend()

    # 2. Company scores bar chart, coloured by tier.
    # The colours are passed to bar() directly: calling bar.set_color()
    # afterwards would also overwrite the black edge colour.
    tier_colors = {'tier_1': 'darkgreen', 'tier_2': 'green', 'tier_3': 'orange', 'tier_4': 'red', 'excluded': 'gray'}
    bar_colors = [tier_colors.get(tier_name, 'skyblue') for tier_name in tiers]
    ax2.bar(range(len(scores)), scores, color=bar_colors, edgecolor='black')
    ax2.set_xlabel('Companies')
    ax2.set_ylabel('Total Score')
    ax2.set_title('Individual Company Scores')
    ax2.set_xticks(range(len(company_names)))
    ax2.set_xticklabels(company_names, rotation=45, ha='right')
    ax2.grid(True, alpha=0.3)

    # 3. Tier distribution pie chart
    tier_counts = Counter(tiers)
    ax3.pie(list(tier_counts.values()), labels=list(tier_counts.keys()), autopct='%1.1f%%', startangle=90)
    ax3.set_title('Tier Distribution')

    # 4. Component score comparison
    # Averages are computed straight from the scoring results rather than
    # from `df_results`, so this panel no longer stays blank when the
    # detailed-results cell was skipped.
    component_means = pd.DataFrame([r['component_scores'] for r in scoring_results]).mean()
    if component_means.empty:
        # No component data at all: hide the panel instead of leaving an
        # empty frame in the figure.
        ax4.set_visible(False)
    else:
        ax4.bar(range(len(component_means)), component_means.values, color='lightcoral', edgecolor='black')
        ax4.set_xlabel('Score Components')
        ax4.set_ylabel('Average Score')
        ax4.set_title('Average Component Scores')
        ax4.set_xticks(range(len(component_means)))
        ax4.set_xticklabels(list(component_means.index), rotation=45, ha='right')
        ax4.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Print statistics
    print(f"\n📊 Score Statistics:")
    print(f"   Mean: {np.mean(scores):.2f}")
    print(f"   Median: {np.median(scores):.2f}")
    print(f"   Std Dev: {np.std(scores):.2f}")
    print(f"   Min: {np.min(scores):.2f}")
    print(f"   Max: {np.max(scores):.2f}")

    print(f"\n🏆 Tier Breakdown:")
    for tier, count in tier_counts.items():
        percentage = (count / len(tiers)) * 100
        print(f"   {tier}: {count} companies ({percentage:.1f}%)")

## 🔤 Keyword Analysis

In [None]:
# Keyword effectiveness analysis
#
# Collects every matched keyword per category, reports the five most frequent
# keywords of each category (with the companies that matched them) and plots
# one bar chart per category that has at least one match.
print("🔤 Keyword Effectiveness Analysis")
print("=" * 40)

if scoring_results:
    # category -> flat list of matched keywords (one entry per match)
    all_keywords_found = {}
    # category -> keyword -> list of companies that matched it
    keyword_company_mapping = {}

    for result in scoring_results:
        company = result['company_name']

        for category, keywords in result['keywords_found'].items():
            category_keywords = all_keywords_found.setdefault(category, [])
            category_mapping = keyword_company_mapping.setdefault(category, {})

            # `keywords` may be empty or None; the other cells already guard
            # against that, so treat a falsy value as "no matches" here too.
            for keyword in (keywords or []):
                category_keywords.append(keyword)
                category_mapping.setdefault(keyword, []).append(company)

    # Count once and reuse the counters for both the report and the charts.
    keyword_frequencies = {
        category: Counter(keywords)
        for category, keywords in all_keywords_found.items()
    }

    # Analyze keyword frequency
    print("📈 Keyword Frequency Analysis:")

    for category, frequency in keyword_frequencies.items():
        if frequency:
            print(f"\n   {category.upper()}:")
            for keyword, count in frequency.most_common(5):
                companies = keyword_company_mapping[category][keyword]
                print(f"     '{keyword}': {count} companies ({companies})")
        else:
            print(f"\n   {category.upper()}: No keywords found")

    # Visualize keyword usage
    # Only categories with matches get a panel, and the grid grows with the
    # number of such categories (never smaller than the original 2x2), so no
    # category is dropped and no blank panel is left visible.
    plottable = [
        (category, frequency.most_common(5))
        for category, frequency in keyword_frequencies.items()
        if frequency
    ]

    if plottable:
        n_cols = 2
        n_rows = max(2, (len(plottable) + n_cols - 1) // n_cols)
        fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows))
        axes = axes.flatten()

        for ax, (category, top_keywords) in zip(axes, plottable):
            words, counts = zip(*top_keywords)
            ax.bar(range(len(words)), counts, color='lightblue', edgecolor='black')
            ax.set_title(f'{category.title()} Keywords')
            ax.set_xticks(range(len(words)))
            ax.set_xticklabels(words, rotation=45, ha='right')
            ax.set_ylabel('Frequency')
            ax.grid(True, alpha=0.3)

        # Hide unused subplots
        for ax in axes[len(plottable):]:
            ax.set_visible(False)

        plt.tight_layout()
        plt.show()
    else:
        print("\n   No keyword matches to plot")

else:
    print("❌ No keyword data to analyze")

## ⚖️ Weight Sensitivity Analysis

In [None]:
# Weight sensitivity analysis
#
# Re-weights each company's stored component scores under several weight
# scenarios, then compares the resulting score distributions and rankings.
# Each scenario score is the plain weighted sum of the component scores;
# components missing from a result contribute 0.
print("⚖️ Weight Sensitivity Analysis")
print("=" * 35)

if scoring_results:
    # Derived here (instead of reusing the list built by the distribution
    # cell) so this cell also works when that cell has not been run.
    company_names = [r['company_name'] for r in scoring_results]

    # Test different weight configurations
    weight_scenarios = {
        'Current': config['weights'],
        'Defense Heavy': {
            'defense_contract_score': 0.5,
            'technology_relevance': 0.2,
            'compliance_indicators': 0.2,
            'firmographics': 0.1
        },
        'Tech Heavy': {
            'defense_contract_score': 0.2,
            'technology_relevance': 0.5,
            'compliance_indicators': 0.2,
            'firmographics': 0.1
        },
        'Balanced': {
            'defense_contract_score': 0.25,
            'technology_relevance': 0.25,
            'compliance_indicators': 0.25,
            'firmographics': 0.25
        }
    }

    # Calculate scores for each scenario (same company order as scoring_results)
    scenario_results = {}

    for scenario_name, weights in weight_scenarios.items():
        print(f"\n📊 Testing scenario: {scenario_name}")
        print(f"   Weights: {weights}")

        scenario_scores = []
        for result in scoring_results:
            component_scores = result['component_scores']
            scenario_scores.append(sum(
                component_scores.get(component, 0) * weight
                for component, weight in weights.items()
            ))

        scenario_results[scenario_name] = scenario_scores

        # Print summary stats
        print(f"   Mean score: {np.mean(scenario_scores):.2f}")
        print(f"   Score range: {np.min(scenario_scores):.2f} - {np.max(scenario_scores):.2f}")

    # Visualize weight sensitivity
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

    scenario_names = list(scenario_results.keys())
    scenario_data = [scenario_results[name] for name in scenario_names]

    # Box plot of score distributions. Tick labels are set separately because
    # boxplot's `labels` keyword is deprecated in Matplotlib 3.9 in favour of
    # `tick_labels`; set_xticklabels works on both older and newer versions.
    ax1.boxplot(scenario_data)
    ax1.set_xticklabels(scenario_names)
    ax1.set_title('Score Distributions by Weight Scenario')
    ax1.set_ylabel('Total Score')
    ax1.grid(True, alpha=0.3)
    ax1.tick_params(axis='x', rotation=45)

    # Line plot showing score changes for each company
    for i, company in enumerate(company_names):
        company_scores = [scenario_results[scenario][i] for scenario in scenario_names]
        ax2.plot(scenario_names, company_scores, marker='o', label=company, alpha=0.7)

    ax2.set_title('Company Score Changes Across Scenarios')
    ax2.set_ylabel('Total Score')
    ax2.set_xlabel('Weight Scenario')
    ax2.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    ax2.grid(True, alpha=0.3)
    ax2.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()

    # Show ranking changes (1 = highest score; ties share the lower rank)
    print(f"\n🏆 Ranking Changes Across Scenarios:")
    ranking_df = pd.DataFrame(
        {
            scenario_name: pd.Series(scenario_scores).rank(ascending=False, method='min').to_numpy()
            for scenario_name, scenario_scores in scenario_results.items()
        },
        index=company_names,
    )

    display(ranking_df.astype(int))

else:
    print("❌ No scoring results available for weight analysis")

## 🎯 Tier Optimization Analysis

In [None]:
# Tier threshold optimization
#
# Re-classifies the existing total scores under the configured thresholds and
# under three alternative threshold sets, then prints and plots the resulting
# tier distributions side by side.
print("🎯 Tier Threshold Optimization")
print("=" * 35)


def classify_tier(score, thresholds):
    """Return the first tier whose threshold `score` meets, else 'excluded'.

    Args:
        score: Total score to classify.
        thresholds: Mapping with the keys 'tier_1' .. 'tier_4' giving the
            minimum score of each tier.

    Returns:
        'tier_1', 'tier_2', 'tier_3', 'tier_4' or 'excluded'.
    """
    # Tiers are tested in this fixed order, so the first match wins.
    for tier in ('tier_1', 'tier_2', 'tier_3', 'tier_4'):
        if score >= thresholds[tier]:
            return tier
    return 'excluded'


def _print_tier_distribution(distribution, total, indent):
    """Print one '<tier>: <count> companies (<pct>%)' line per tier."""
    for tier, count in distribution.items():
        percentage = (count / total) * 100
        print(f"{indent}{tier}: {count} companies ({percentage:.1f}%)")


if scoring_results:
    scores = [r['total_score'] for r in scoring_results]
    current_thresholds = config['tier_thresholds']

    print(f"Current thresholds: {current_thresholds}")

    # Analyze current tier distribution
    current_distribution = Counter(classify_tier(score, current_thresholds) for score in scores)

    print(f"\n📊 Current Distribution:")
    _print_tier_distribution(current_distribution, len(scores), "   ")

    # Test alternative threshold configurations
    alternative_thresholds = {
        'More Selective': {'tier_1': 95, 'tier_2': 85, 'tier_3': 70, 'tier_4': 55},
        'Less Selective': {'tier_1': 85, 'tier_2': 70, 'tier_3': 55, 'tier_4': 40},
        'Even Distribution': {'tier_1': 90, 'tier_2': 75, 'tier_3': 60, 'tier_4': 45}
    }

    print(f"\n🔄 Alternative Threshold Scenarios:")

    threshold_results = {'Current': current_distribution}

    for scenario_name, thresholds in alternative_thresholds.items():
        alt_distribution = Counter(classify_tier(score, thresholds) for score in scores)
        threshold_results[scenario_name] = alt_distribution

        print(f"\n   {scenario_name} ({thresholds}):")
        _print_tier_distribution(alt_distribution, len(scores), "     ")

    # Visualize threshold scenarios
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    axes = axes.flatten()

    tier_order = ['tier_1', 'tier_2', 'tier_3', 'tier_4', 'excluded']
    colors = ['darkgreen', 'green', 'orange', 'red', 'gray']

    for ax, (scenario_name, distribution) in zip(axes, threshold_results.items()):
        # Keep only tiers that contain companies: zero-width wedges would
        # still get a label and a "0.0%" caption, all stacked on one spot.
        # Each tier keeps its own colour regardless of which ones are shown.
        populated = [
            (tier, distribution.get(tier, 0), color)
            for tier, color in zip(tier_order, colors)
            if distribution.get(tier, 0) > 0
        ]
        wedge_labels, wedge_sizes, wedge_colors = zip(*populated)

        ax.pie(wedge_sizes, labels=wedge_labels, colors=wedge_colors, autopct='%1.1f%%', startangle=90)
        ax.set_title(f'{scenario_name} Distribution')

    # Hide unused subplot
    for ax in axes[len(threshold_results):]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()

else:
    print("❌ No scoring results available for tier analysis")

## 🔬 Company Deep Dive Analysis

In [None]:
# Section banner for the company deep dive
print("🔬 Company Deep Dive Analysis", "=" * 35, sep="\n")

def analyze_company_scoring(company_name, results=None, weights=None):
    """Print a detailed scoring breakdown for one company.

    Args:
        company_name: Value of 'company_name' to look for; the first matching
            result is used.
        results: Scoring results to search. Defaults to the notebook-level
            `scoring_results`.
        weights: Mapping of component name to weight, used to show each
            component's weighted contribution. Defaults to
            `config['weights']`.

    Returns:
        The matching result dict, or None when no result has that name.
    """
    if results is None:
        results = scoring_results
    if weights is None:
        weights = config['weights']

    # Find the company result (first match wins)
    company_result = next(
        (candidate for candidate in results if candidate['company_name'] == company_name),
        None,
    )

    if company_result is None:
        print(f"❌ Company '{company_name}' not found in results")
        return None

    print(f"\n🏢 DEEP DIVE: {company_name}")
    print("=" * 50)

    # Basic info
    total_score = company_result['total_score']
    print(f"🎯 Total Score: {total_score:.2f}")
    print(f"🏆 Tier: {company_result['tier_classification']}")

    # Component breakdown; a component without a configured weight is shown
    # with weight 0.
    component_scores = company_result['component_scores']
    print(f"\n📊 Component Scores:")
    for component, score in component_scores.items():
        weight = weights.get(component, 0)
        weighted_contribution = score * weight
        print(f"   {component}: {score:.2f} (weight: {weight}) = {weighted_contribution:.2f}")

    # Keywords found
    print(f"\n🔤 Keywords Found:")
    total_keywords = 0
    for category, keywords in company_result['keywords_found'].items():
        if keywords:
            print(f"   {category}: {keywords}")
            total_keywords += len(keywords)
        else:
            print(f"   {category}: None")

    print(f"\n📈 Total Keywords Found: {total_keywords}")

    # Calculation details (optional section of the result)
    if 'calculation_details' in company_result:
        print(f"\n🧮 Detailed Calculation:")
        for component, details in company_result['calculation_details'].items():
            print(f"   {component}:")
            print(f"     Raw Score: {details.get('raw_score', 'N/A')}")
            print(f"     Weight: {details.get('weight', 'N/A')}")
            print(f"     Weighted Score: {details.get('weighted_score', 'N/A')}")

    # Recommendations
    # NOTE: these cut-offs (60 / 75 / 90) are fixed in this function and are
    # not read from config['tier_thresholds'].
    print(f"\n💡 Recommendations:")

    if total_score < 60:
        print("   - Low priority: Consider for future nurturing campaigns")
    elif total_score < 75:
        print("   - Medium priority: Good for targeted marketing")
    elif total_score < 90:
        print("   - High priority: Strong prospect for outreach")
    else:
        print("   - Top priority: Immediate sales engagement recommended")

    # Scoring improvement suggestion: the weakest component. Skipped when the
    # result has no component scores, because min() raises on an empty input.
    if component_scores:
        lowest_name, lowest_score = min(component_scores.items(), key=lambda item: item[1])
        print(f"   - Focus area: {lowest_name} (score: {lowest_score:.2f})")

    return company_result

# Run the deep dive for the best- and the worst-scoring company.
if not scoring_results:
    print("❌ No scoring results available for deep dive analysis")
else:
    # Order a copy of the results from highest to lowest total score.
    sorted_results = list(scoring_results)
    sorted_results.sort(key=lambda entry: entry['total_score'], reverse=True)

    print("🏆 TOP SCORING COMPANY:")
    best_name = sorted_results[0]['company_name']
    top_company = analyze_company_scoring(best_name)

    print("\n" + "=" * 60)

    print("📉 LOWEST SCORING COMPANY:")
    worst_name = sorted_results[-1]['company_name']
    bottom_company = analyze_company_scoring(worst_name)

## 🛠️ Custom Analysis Tools

In [None]:
# Section banner for the custom analysis helpers
print("🛠️ Custom Analysis Tools", "=" * 25, sep="\n")

def test_custom_weights(custom_weights):
    """Re-score every company with `custom_weights` and show the comparison.

    Each custom score is the weighted sum of the stored component scores;
    a component absent from a result counts as 0. The table is ordered by
    custom score, highest first, displayed, and returned as a DataFrame.
    """
    print(f"\n🧪 Testing custom weights: {custom_weights}")

    def _weighted_total(component_scores):
        # Same left-to-right accumulation as sum() starting from 0.
        total = 0
        for component, weight in custom_weights.items():
            total = total + component_scores.get(component, 0) * weight
        return total

    custom_scores = []
    for result in scoring_results:
        rescored = _weighted_total(result['component_scores'])
        custom_scores.append({
            'company': result['company_name'],
            'original_score': result['total_score'],
            'custom_score': rescored,
            'difference': rescored - result['total_score'],
        })

    # Highest custom score first
    ranked = sorted(custom_scores, key=lambda entry: entry['custom_score'], reverse=True)

    df_custom = pd.DataFrame(ranked)
    display(df_custom)

    return df_custom

def analyze_keyword_gaps():
    """List companies by number of keyword matches, fewest first.

    Builds one row per scoring result with the company name, total score,
    total number of matched keywords across all categories and the
    'technology_relevance' component score (0 when absent). The five rows
    with the fewest matches are displayed.

    Returns:
        The full DataFrame, sorted ascending by 'total_keywords'.
    """
    print(f"\n🔍 Keyword Gap Analysis")

    keyword_gaps = []

    for result in scoring_results:
        # A category's keyword list may be empty or None; the other cells
        # treat that as "no matches", so skip falsy values instead of
        # calling len() on None.
        total_keywords = sum(
            len(keywords)
            for keywords in result['keywords_found'].values()
            if keywords
        )

        keyword_gaps.append({
            'company': result['company_name'],
            'total_score': result['total_score'],
            'total_keywords': total_keywords,
            'tech_score': result['component_scores'].get('technology_relevance', 0)
        })

    # Columns are named explicitly so that an empty result list still yields
    # a frame with a 'total_keywords' column for sort_values below.
    df_gaps = pd.DataFrame(
        keyword_gaps,
        columns=['company', 'total_score', 'total_keywords', 'tech_score'],
    )
    df_gaps = df_gaps.sort_values('total_keywords')

    print("Companies with fewest keyword matches:")
    display(df_gaps.head())

    return df_gaps

# List the helper functions defined in this notebook
print(
    "💡 Available Tools:",
    "   - test_custom_weights(weights_dict)",
    "   - analyze_keyword_gaps()",
    "   - analyze_company_scoring(company_name)",
    sep="\n",
)

# Demonstrate both helpers when there is something to analyse
if scoring_results:
    print("\n🧪 Example: Defense-Heavy Weighting")
    defense_heavy = dict(
        defense_contract_score=0.6,
        technology_relevance=0.2,
        compliance_indicators=0.15,
        firmographics=0.05,
    )
    test_custom_weights(defense_heavy)

    print("\n🔍 Keyword Gap Analysis:")
    analyze_keyword_gaps()

## 💾 Export Analysis Results

In [None]:
# Export analysis results
#
# Writes up to four files into ../data/research_results: the detailed scoring
# table (CSV), the weight analysis and keyword analysis (JSON) and a plain-text
# summary. The first three are written only when the corresponding analysis
# cell has been run. Each export is attempted independently, so one failing
# export no longer prevents the remaining files from being written.
print("💾 Exporting Analysis Results")
print("=" * 30)


def _run_export(description, path, write_fn):
    """Call write_fn(path); report success or failure and return True/False."""
    try:
        write_fn(path)
    except Exception as e:
        # Best-effort export: report the problem and let the caller continue.
        print(f"❌ Error saving {description.lower()}: {str(e)}")
        return False
    print(f"✅ {description} saved: {path}")
    return True


def _write_json(path, payload):
    """Write `payload` to `path` as indented JSON."""
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(payload, f, indent=2)


def _write_summary(path, generated_at):
    """Write the plain-text summary (score statistics, tiers, configuration)."""
    with open(path, 'w', encoding='utf-8') as f:
        f.write(f"ATOMUS SCORING ANALYSIS SUMMARY\n")
        f.write(f"Generated: {generated_at.strftime('%Y-%m-%d %H:%M:%S')}\n\n")

        if scoring_results:
            # Function-local names, so the notebook-level `scores`, `tiers`
            # and `tier_counts` variables are left untouched.
            summary_scores = [r['total_score'] for r in scoring_results]
            f.write(f"SCORE STATISTICS:\n")
            f.write(f"  Companies Analyzed: {len(scoring_results)}\n")
            f.write(f"  Mean Score: {np.mean(summary_scores):.2f}\n")
            f.write(f"  Median Score: {np.median(summary_scores):.2f}\n")
            f.write(f"  Score Range: {np.min(summary_scores):.2f} - {np.max(summary_scores):.2f}\n\n")

            summary_tiers = [r['tier_classification'] for r in scoring_results]
            f.write(f"TIER DISTRIBUTION:\n")
            for tier, count in Counter(summary_tiers).items():
                percentage = (count / len(summary_tiers)) * 100
                f.write(f"  {tier}: {count} companies ({percentage:.1f}%)\n")

        f.write(f"\nCONFIGURATION USED:\n")
        f.write(f"  Weights: {config['weights']}\n")
        f.write(f"  Tier Thresholds: {config['tier_thresholds']}\n")


try:
    # Create results directory
    results_dir = Path("../data/research_results")
    results_dir.mkdir(parents=True, exist_ok=True)
except OSError as e:
    # Without the output directory nothing can be written.
    print(f"❌ Error saving analysis results: {str(e)}")
else:
    # One timestamp for the file names and the "Generated:" line.
    generated_at = datetime.now()
    timestamp = generated_at.strftime("%Y%m%d_%H%M%S")
    export_outcomes = []

    # Export detailed scoring results
    if 'df_results' in locals():
        scoring_csv = results_dir / f"scoring_analysis_{timestamp}.csv"
        export_outcomes.append(_run_export(
            "Scoring results",
            scoring_csv,
            lambda path: df_results.to_csv(path, index=False),
        ))

    # Export weight analysis
    if 'scenario_results' in locals():
        weight_analysis_file = results_dir / f"weight_analysis_{timestamp}.json"
        export_outcomes.append(_run_export(
            "Weight analysis",
            weight_analysis_file,
            lambda path: _write_json(path, {
                'scenarios': weight_scenarios,
                'results': scenario_results,
                'companies': company_names
            }),
        ))

    # Export keyword analysis
    if 'all_keywords_found' in locals():
        keyword_file = results_dir / f"keyword_analysis_{timestamp}.json"
        export_outcomes.append(_run_export(
            "Keyword analysis",
            keyword_file,
            lambda path: _write_json(path, {
                'keywords_by_category': all_keywords_found,
                'keyword_company_mapping': keyword_company_mapping
            }),
        ))

    # Create analysis summary
    summary_file = results_dir / f"scoring_analysis_summary_{timestamp}.txt"
    export_outcomes.append(_run_export(
        "Analysis summary",
        summary_file,
        lambda path: _write_summary(path, generated_at),
    ))

    if all(export_outcomes):
        print(f"\n📁 All analysis files saved to: {results_dir}")
    else:
        failed_count = export_outcomes.count(False)
        print(f"\n⚠️ {failed_count} export(s) failed; remaining files saved to: {results_dir}")

## 🎯 Analysis Complete

This notebook has provided comprehensive analysis of the Atomus scoring algorithm including:

- ✅ **Score Distribution Analysis** - Understanding how companies are scored
- ✅ **Keyword Effectiveness** - Which keywords are matched most often, and by which companies
- ✅ **Weight Sensitivity** - How weight changes affect rankings
- ✅ **Tier Optimization** - How alternative threshold configurations change the tier distribution
- ✅ **Company Deep Dives** - Detailed individual analysis
- ✅ **Custom Analysis Tools** - Functions for ongoing optimization

Use the insights from this analysis to refine your scoring algorithm and improve prospect identification!