# Company Email Scraper

This notebook runs the email scraper, which uses the Google Custom Search API to find contact information for the companies listed in `companies.txt`.

In [None]:
from scraper import scrape_companies
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

## Run the Scraper

In [None]:
# Time the scrape by taking a timestamp on either side of the call.
start_time = datetime.now()
results_df = scrape_companies()
runtime = datetime.now() - start_time

# Nothing is saved or shown when the scraper hands back None.
if results_df is not None:
    # The output filename embeds the current time (second resolution).
    timestamp = format(datetime.now(), '%Y%m%d_%H%M%S')
    filename = 'sponsorship_contacts_' + timestamp + '.csv'
    results_df.to_csv(filename, index=False)

    # Run summary: where the CSV went, how long the scrape took, rows written.
    print(
        f"\nResults saved to {filename}\n"
        f"Runtime: {runtime}\n"
        f"Processed {len(results_df)} companies"
    )

    # Preview the first rows of the result table.
    display(results_df.head())

## Analyze Results

In [None]:
if results_df is not None:
    # Classify every row into exactly one outcome so the four counts always
    # sum to `total` and the pie chart slices do not overlap.
    raw_emails = results_df['Emails']
    # Missing values are replaced before using .str so that neither the
    # accessor nor the boolean negation below can fail on NaN.
    emails = raw_emails.fillna('').astype(str)

    is_error = emails.str.startswith('Error')
    is_no_website = emails == 'No website found'
    # 'Not found' must be tested explicitly: it does NOT start with 'No '
    # (note the trailing space), so a prefix test alone would misfile it as
    # "emails found". Missing/blank cells and any other 'No ...' placeholder
    # are also treated as "no emails".
    is_no_emails = (
        raw_emails.isna()
        | (emails.str.strip() == '')
        | (emails == 'Not found')
        | emails.str.startswith('No ')
    ) & ~is_no_website
    has_emails = ~(is_error | is_no_website | is_no_emails)

    # Calculate statistics
    total = len(results_df)
    emails_found = int(has_emails.sum())
    no_website = int(is_no_website.sum())
    no_emails = int(is_no_emails.sum())
    errors = int(is_error.sum())

    # Create pie chart
    labels = ['Emails Found', 'No Website', 'No Emails', 'Errors']
    sizes = [emails_found, no_website, no_emails, errors]
    colors = ['#2ecc71', '#e74c3c', '#f1c40f', '#95a5a6']

    if total > 0:
        plt.figure(figsize=(10, 7))
        plt.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%')
        plt.title('Scraping Results Distribution')
        plt.axis('equal')
        plt.show()
    else:
        # All slice sizes are zero; a pie chart cannot be normalised.
        print("No results to plot.")

    print("\nDetailed Results:")
    print(f"Total companies processed: {total}")
    print(f"Companies with emails found: {emails_found}")
    print(f"Companies with no website: {no_website}")
    print(f"Companies with no emails: {no_emails}")
    print(f"Companies with errors: {errors}")

## Email Analysis

In [None]:
if results_df is not None:
    raw_emails = results_df['Emails']
    # Replace missing values up front so .str operations and the boolean
    # negation below cannot fail on NaN.
    emails = raw_emails.fillna('').astype(str)

    # A row holds real addresses only if it is none of the placeholder values.
    # 'Not found' is checked explicitly because it does not begin with 'No '
    # (note the trailing space); relying on the prefix test alone would count
    # every such company as having one email.
    is_placeholder = (
        raw_emails.isna()
        | (emails.str.strip() == '')
        | (emails == 'Not found')
        | emails.str.startswith(('No ', 'Error'))
    )

    # Get rows with actual emails
    email_rows = results_df[~is_placeholder]

    if len(email_rows) > 0:
        # Emails per company, assuming addresses are comma-separated:
        # number of commas + 1. Computed once and reused below.
        email_counts = emails[~is_placeholder].str.count(',') + 1

        # Calculate average emails per company
        avg_emails = email_counts.mean()

        # Find companies with most emails
        top_companies = pd.DataFrame({
            'Company': email_rows['Company'],
            'Email Count': email_counts
        }).sort_values('Email Count', ascending=False).head(10)

        print("\nEmail Statistics:")
        print(f"Average emails per company: {avg_emails:.1f}")
        print("\nTop 10 Companies by Number of Emails Found:")
        display(top_companies)