# AI-Driven Company Intelligence Analysis

This notebook provides an interactive environment for analyzing company data and generating insights.

In [None]:
# Import required libraries
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from dotenv import load_dotenv

from company_intelligence import CompanyIntelligence

# Load variables from a local .env file (if present) into the process
# environment, so the optional OPENAI_API_KEY can be read from the environment
# instead of being hard-coded in the notebook
load_dotenv()

In [None]:
# Set up the analyzer.
# The OpenAI key is optional (used for the LLM insights); the lookup yields
# None when the variable is not set.
api_key = os.environ.get('OPENAI_API_KEY')

# Point this at your own workbook if it lives under a different path
data_path = 'champions_group_data.xlsx'

analyzer = CompanyIntelligence(data_path, api_key=api_key)

## Step 1: Explore the Data

**Note:** Inactive companies are automatically filtered out during data loading. Only companies with "Active" status are included in the analysis.

In [None]:
# Run the analyzer's exploration step; its return value is kept in
# df_explored but is not used again in the cells that follow
df_explored = analyzer.explore_data()

# Show the first ten rows of the loaded data as the cell's output
analyzer.df.head(10)

In [None]:
# Column dtypes and non-null counts
print("Data Info:")
analyzer.df.info()

# Per-column null counts, keeping only the columns that actually have gaps
print("\nMissing Values:")
missing = analyzer.df.isna().sum()
print(missing.loc[missing.gt(0)])

## Step 2: Preprocess Data

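**Note:** Before analysis, all numeric data is automatically transformed with $\log_{10}(1 + |x|)$. The log scale helps normalize skewed distributions and reduces the influence of outliers. Adding 1 keeps zeros defined (they map to 0), and the absolute value keeps negative inputs defined. Because the absolute value discards the sign, $x$ and $-x$ map to the same transformed value.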

In [None]:
# Preprocess data
# List any columns that should be left out of the analysis (e.g. IDs, notes);
# the list is handed to preprocess_data as exclude_cols.
exclude_cols = []  # e.g. ['Company_ID', 'Notes']; an empty list excludes nothing explicitly

analyzer.preprocess_data(exclude_cols=exclude_cols)

## Step 3: Determine Optimal Clusters

In [None]:
# Ask the analyzer for a recommended number of clusters, passing max_k=10
# (presumably the largest k it will consider — confirm in CompanyIntelligence)
optimal_k = analyzer.determine_optimal_clusters(max_k=10)
print(f"\nRecommended number of clusters: {optimal_k}")

## Step 4: Perform Clustering

In [None]:
# Perform clustering
# Option 1 (recommended): call with no arguments so the analyzer picks the
# number of clusters itself.
# NOTE(review): presumably this reuses the value found by
# determine_optimal_clusters in the previous step — confirm in CompanyIntelligence.
analyzer.perform_clustering()

# Option 2: force a specific number of clusters instead
# analyzer.perform_clustering(n_clusters=5)

In [None]:
# View cluster distribution: number of companies per cluster label,
# ordered by label rather than by count
cluster_dist = analyzer.df['Cluster'].value_counts().sort_index()
print("Cluster Distribution:")
print(cluster_dist)

# Visualize the counts as a bar chart.
# An explicit Figure/Axes pair is used instead of pyplot's implicit "current
# axes", so the chart cannot end up on a figure left open by another cell.
# matplotlib (plt) comes from the notebook's import cell.
fig, ax = plt.subplots(figsize=(10, 6))
cluster_dist.plot(kind='bar', ax=ax, rot=0)  # rot=0 keeps cluster labels horizontal
ax.set_title('Company Distribution Across Segments')
ax.set_xlabel('Cluster')
ax.set_ylabel('Number of Companies')
fig.tight_layout()
plt.show()

## Step 5: Analyze Clusters

In [None]:
# Summarise every cluster, then print one banner per cluster showing its
# company count and its share of the total
cluster_analysis = analyzer.analyze_clusters()

rule = '=' * 60  # separator line printed above and below each heading
for cluster_id in cluster_analysis:
    info = cluster_analysis[cluster_id]
    print(f"\n{rule}")
    print(f"Cluster {cluster_id}: {info['size']} companies ({info['percentage']:.1f}%)")
    print(rule)

## Step 6: Compare Clusters

In [None]:
# Build the cross-cluster comparison with the analyzer's default arguments
# and show it as the cell's output
comparison = analyzer.compare_clusters()
comparison

In [None]:
# Compare a single feature across clusters.
# By default the first numeric column is used; to inspect a particular column
# (e.g. 'Revenue'), assign its name to `feature` instead.
# The 'Cluster' label column is removed from the candidates so the label is
# never compared against itself (the custom-visualization cell filters it the
# same way).
numeric_cols = [
    col
    for col in analyzer.df.select_dtypes(include=[np.number]).columns
    if col != 'Cluster'
]
if numeric_cols:
    feature = numeric_cols[0]
    print(f"Comparing '{feature}' across clusters:")
    feature_comparison = analyzer.compare_clusters(feature=feature)
    print(feature_comparison)
else:
    # Say so explicitly instead of silently producing no output
    print("No numeric feature columns available to compare.")

## Step 7: Identify Patterns

In [None]:
# Run pattern detection, then list at most the first ten outlier entries,
# one line per feature with the affected company count and percentage
patterns = analyzer.identify_patterns()

print("Outliers Detected:")
top_outliers = patterns['outliers'][:10]
for outlier in top_outliers:
    print(f"  {outlier['feature']}: {outlier['count']} companies ({outlier['percentage']:.1f}%)")

## Step 8: Generate Insights

In [None]:
# Generate the narrative insights from the cluster summaries and the detected
# patterns; cluster_analysis (Step 5) and patterns (Step 7) must already exist.
# NOTE(review): presumably a rule-based fallback is used when no LLM/API key is
# available — confirm in CompanyIntelligence.generate_llm_insights.
insights = analyzer.generate_llm_insights(cluster_analysis, patterns)
print(insights)

## Step 9: Create Visualizations

In [None]:
# Generate all of the analyzer's built-in visualizations
# NOTE(review): the message below says the figures are saved; the output
# location is decided inside visualize_results — confirm where they land.
analyzer.visualize_results()
print("Visualizations saved!")

In [None]:
# Custom visualization: box plots of up to four numeric features, split by cluster.
# matplotlib (plt) and seaborn (sns) come from the notebook's import cell.

# Candidate features: every numeric column except the cluster label itself
numeric_cols = analyzer.df.select_dtypes(include=[np.number]).columns.tolist()
numeric_cols = [col for col in numeric_cols if col != 'Cluster']

if numeric_cols:
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    axes = axes.flatten()
    plotted = numeric_cols[:4]  # the 2x2 grid has room for four features

    for ax, feature in zip(axes, plotted):
        analyzer.df.boxplot(column=feature, by='Cluster', ax=ax)
        ax.set_title(f'{feature} by Cluster')
        ax.set_xlabel('Cluster')

    # With fewer than four numeric features some panels would stay as empty
    # frames; hide them so the figure only shows real plots
    for ax in axes[len(plotted):]:
        ax.set_visible(False)

    # pandas adds its own figure-level title when `by` is used; replace it
    fig.suptitle('Feature Comparison Across Clusters', y=1.02)
    fig.tight_layout()
    plt.show()

## Step 10: Generate Report

In [None]:
# Assemble the final report from the cluster summaries (Step 5), detected
# patterns (Step 7) and insights (Step 8); all three must already exist.
# NOTE(review): the message below assumes generate_report writes
# company_intelligence_report.txt — the write happens inside the analyzer; confirm.
report = analyzer.generate_report(cluster_analysis, patterns, insights)
print("Report generated and saved to company_intelligence_report.txt")

## Step 11: Export Results

In [None]:
# Export the full table, including each company's cluster label
output_file = 'companies_with_segments.csv'
analyzer.df.to_csv(output_file, index=False)
print(f"Results exported to {output_file}")

# Preview: cluster label first, followed by the first five other columns.
# 'Cluster' is skipped when picking those five so it can never be selected
# twice (which would show a duplicated column) if it sits among the leading
# columns of the frame.
preview_cols = [col for col in analyzer.df.columns if col != 'Cluster'][:5]
analyzer.df[['Cluster'] + preview_cols].head(10)

## Quick Full Analysis

Alternatively, run the complete analysis pipeline in one go:

In [None]:
# Run the complete analysis pipeline in a single call.
# NOTE(review): this reuses the `analyzer` that the step-by-step cells above may
# already have preprocessed and clustered — confirm run_full_analysis is safe to
# call on it, or create a fresh CompanyIntelligence instance first.
results = analyzer.run_full_analysis(n_clusters=None)  # None = auto-determine

# Report how many distinct cluster labels were assigned
print("\nAnalysis complete!")
print(f"Clusters identified: {len(set(analyzer.clusters))}")
print("\nCheck generated files for detailed results.")