In [3]:
# Autonomous AI Sales Agent
# Created: April 29, 2025

# Install required packages
!pip install -q playwright openai google-auth-oauthlib google-auth-httplib2 google-api-python-client gspread oauth2client tqdm bs4
!playwright install chromium

# Import libraries
import os
import time
import random
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import re
import openai
from google.colab import auth
import gspread
from oauth2client.client import GoogleCredentials
from googleapiclient.discovery import build
from email.mime.text import MIMEText
import base64
from tqdm import tqdm
import datetime

# Import playwright after installation
from playwright.sync_api import sync_playwright

# Configuration - Update these variables
OPENAI_API_KEY = ""  # Your OpenAI API key
GOOGLE_SHEET_NAME = "AI_Sales_Agent_DB"  # Name of your Google Sheet
MAX_EMAILS_PER_DAY = 15                  # Limit emails to avoid spam flags
EMAIL_WAIT_MIN = 5                       # Minimum minutes between emails
EMAIL_WAIT_MAX = 15                      # Maximum minutes between emails
FOLLOW_UP_DAYS = 5                       # Days to wait before follow-up
YOUR_NAME = "Your Name"                  # Your name for the email signature
YOUR_COMPANY = "Your Company"            # Your company name
YOUR_EMAIL = "your.email@gmail.com"      # Your Gmail address

# Initialize APIs
openai.api_key = OPENAI_API_KEY

# 1. Authenticate with Google APIs
print("Authenticating with Google...")
auth.authenticate_user()
gc = gspread.authorize(GoogleCredentials.get_application_default())
service = build('gmail', 'v1', credentials=GoogleCredentials.get_application_default())

# Create or open Google Sheet
try:
    sheet = gc.open(GOOGLE_SHEET_NAME)
    leads_sheet = sheet.worksheet("Leads")
    emails_sheet = sheet.worksheet("Emails")
    print(f"Connected to existing Google Sheet '{GOOGLE_SHEET_NAME}'")
except:
    print(f"Creating new Google Sheet '{GOOGLE_SHEET_NAME}'...")
    sheet = gc.create(GOOGLE_SHEET_NAME)

    # Create Leads worksheet
    leads_sheet = sheet.worksheet("Sheet1")
    leads_sheet.update_title("Leads")
    leads_sheet.update([
        ["Company", "Website", "Industry", "Contact Name", "Contact Email",
         "Pain Points", "Status", "Last Contact", "Notes", "Source"]
    ])

    # Create Emails worksheet
    emails_sheet = sheet.add_worksheet(title="Emails", rows=1000, cols=10)
    emails_sheet.update([
        ["Timestamp", "Company", "Recipient", "Subject", "Email Body",
         "Email Type", "Status", "Reply Date", "Reply Content", "Notes"]
    ])

    print("Google Sheet created with 'Leads' and 'Emails' worksheets")

# 2. Lead Discovery Functions
def scrape_crunchbase_companies(keyword, num_results=10):
    """
    Scrape companies from Crunchbase based on keyword
    Note: In a production system, you would use the Crunchbase API or a service like SerpAPI
    """
    print(f"Scraping Crunchbase for '{keyword}'...")
    companies = []

    # This is a simplified example - in production use Crunchbase API or SerpAPI
    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            page = browser.new_page()

            try:
                page.goto(f"https://www.crunchbase.com/discover/organization.companies/field/organizations/categories/{keyword}")
                time.sleep(5)  # Wait for page to load

                # Extract company data
                company_elements = page.query_selector_all(".component--grid-column-0")

                for i, element in enumerate(company_elements):
                    if i >= num_results:
                        break

                    company_name = element.query_selector(".identifier-label").inner_text()
                    website = "https://example.com"  # Placeholder; would extract real URL in production
                    industry = keyword

                    companies.append({
                        "Company": company_name,
                        "Website": website,
                        "Industry": industry,
                        "Contact Name": "",
                        "Contact Email": "",
                        "Pain Points": "",
                        "Status": "New",
                        "Last Contact": "",
                        "Notes": f"Scraped from Crunchbase search for '{keyword}'",
                        "Source": "Crunchbase"
                    })
            except Exception as e:
                print(f"Error during scraping: {e}")
            finally:
                browser.close()
    except Exception as e:
        print(f"Playwright error: {e}")
        # Fallback to sample data for testing
        return get_sample_companies(keyword, num_results)

    print(f"Found {len(companies)} companies")

    # If we didn't find any companies, use sample data
    if len(companies) == 0:
        return get_sample_companies(keyword, num_results)

    return companies

def get_sample_companies(keyword, num_results=5):
    """
    Generate sample company data for testing when scraping fails
    """
    print(f"Generating sample companies for '{keyword}'...")

    # Sample companies by industry
    sample_companies = {
        "saas": ["Salesforce", "HubSpot", "Slack", "Zoom", "Asana"],
        "fintech": ["Stripe", "Square", "Robinhood", "Coinbase", "Plaid"],
        "healthtech": ["Oscar Health", "Teladoc", "Hims & Hers", "23andMe", "Zocdoc"],
        "ecommerce": ["Shopify", "BigCommerce", "Etsy", "Wayfair", "Chewy"],
        "ai": ["OpenAI", "DeepMind", "Anthropic", "Scale AI", "Databricks"]
    }

    # Get companies for the keyword, or use generic ones if keyword not found
    company_names = sample_companies.get(keyword.lower(),
        ["TechCorp", "InnovateAI", "DataSystems", "CloudSolutions", "NextGenTech"])

    # Limit to requested number
    company_names = company_names[:num_results]

    companies = []
    for company_name in company_names:
        domain = company_name.lower().replace(" ", "").replace("&", "and") + ".com"
        companies.append({
            "Company": company_name,
            "Website": f"https://{domain}",
            "Industry": keyword,
            "Contact Name": "",
            "Contact Email": f"info@{domain}",
            "Pain Points": "",
            "Status": "New",
            "Last Contact": "",
            "Notes": f"Sample data for testing - {keyword} industry",
            "Source": "Sample Data"
        })

    print(f"Generated {len(companies)} sample companies")
    return companies

def find_company_emails(company_website):
    """
    Find potential email addresses from company website
    """
    try:
        response = requests.get(f"https://{company_website.replace('https://', '').replace('http://', '')}")
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find emails using regex
        email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
        emails = re.findall(email_pattern, response.text)

        # Filter for company domain emails
        domain = company_website.replace('https://', '').replace('http://', '').split('/')[0]
        company_emails = [email for email in emails if domain in email]

        return company_emails[0] if company_emails else ""
    except:
        return ""

def analyze_company_website(url):
    """
    Analyze company website to extract relevant information using GPT
    """
    try:
        # Clean URL
        if not url.startswith('http'):
            url = 'https://' + url

        # Get website content
        response = requests.get(url, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract text from main content areas
        text_content = ""
        for tag in soup.find_all(['p', 'h1', 'h2', 'h3', 'li']):
            text_content += tag.get_text() + "\n"

        # Truncate if too long
        if len(text_content) > 4000:
            text_content = text_content[:4000]

        # Use GPT to analyze the content
        prompt = f"""
        The following is text from a company website. Analyze it and provide:
        1. A brief description of what the company does (1-2 sentences)
        2. The industry they're in
        3. Potential pain points they might have that AI solutions could help with
        4. The target audience/customers of this company

        Website content:
        {text_content}
        """

        completion = openai.chat.completions.create(
            model="gpt-4-turbo",
            messages=[{"role": "user", "content": prompt}]
        )

        analysis = completion.choices[0].message.content
        return analysis
    except Exception as e:
        return f"Error analyzing website: {str(e)}"

def add_leads_to_sheet(companies):
    """
    Add the discovered leads to Google Sheet
    """
    # Get existing companies to avoid duplicates
    existing_data = leads_sheet.get_all_values()
    existing_companies = [row[0] for row in existing_data[1:]] if len(existing_data) > 1 else []

    # Filter out duplicates
    new_companies = [c for c in companies if c["Company"] not in existing_companies]

    if not new_companies:
        print("No new companies to add")
        return 0

    # Prepare rows for Google Sheet
    rows = []
    for company in new_companies:
        rows.append([
            company["Company"],
            company["Website"],
            company["Industry"],
            company["Contact Name"],
            company["Contact Email"],
            company["Pain Points"],
            company["Status"],
            company["Last Contact"],
            company["Notes"],
            company["Source"]
        ])

    # Add to sheet
    leads_sheet.append_rows(rows)
    print(f"Added {len(rows)} new companies to the sheet")
    return len(rows)

# 3. Email Generation Functions
def generate_email(company_name, contact_name, industry, pain_points):
    """
    Generate a personalized cold email using GPT
    """
    # Default values for empty fields
    contact_greeting = f"Hi {contact_name}" if contact_name else "Hi there"
    company_context = f" at {company_name}" if company_name else ""
    industry_context = f" in the {industry} industry" if industry else ""
    pain_point_context = f", particularly with {pain_points}" if pain_points else ""

    prompt = f"""
    Write a short, personalized cold email:
    - To: A potential client{company_context}{industry_context}
    - From: {YOUR_NAME} at {YOUR_COMPANY}
    - Purpose: Offering AI automation solutions{pain_point_context}
    - Tone: Professional, helpful, not pushy
    - Length: Short (4-5 sentences maximum)
    - Include: One specific value proposition and a simple call to action

    The email should start with "{contact_greeting}," and should be written in a way that shows I've done my research on their company. Don't use obvious templated language. Make it sound personal and targeted.
    """

    try:
        completion = openai.chat.completions.create(
            model="gpt-4-turbo",
            messages=[{"role": "user", "content": prompt}]
        )

        email_body = completion.choices[0].message.content

        # Add signature
        email_body += f"\n\nBest regards,\n{YOUR_NAME}\n{YOUR_COMPANY}"

        return email_body
    except Exception as e:
        print(f"Error generating email: {e}")
        return ""

def generate_follow_up_email(company_name, original_email):
    """
    Generate a follow-up email based on the original one
    """
    prompt = f"""
    Write a short follow-up email based on this original email I sent to {company_name}:

    "{original_email}"

    The follow-up should:
    - Be brief (2-3 sentences)
    - Politely reference the previous email
    - Add a new angle or value proposition
    - Include a simple call to action
    - Not sound desperate or pushy

    Don't include a new signature, I'll add that myself.
    """

    try:
        completion = openai.chat.completions.create(
            model="gpt-4-turbo",
            messages=[{"role": "user", "content": prompt}]
        )

        email_body = completion.choices[0].message.content

        # Add signature
        email_body += f"\n\nBest regards,\n{YOUR_NAME}\n{YOUR_COMPANY}"

        return email_body
    except Exception as e:
        print(f"Error generating follow-up email: {e}")
        return ""

# 4. Email Sending Functions
def send_email(recipient, subject, body):
    """
    Send email via Gmail API
    """
    try:
        message = MIMEText(body)
        message['to'] = recipient
        message['subject'] = subject

        # Encode the message
        raw_message = base64.urlsafe_b64encode(message.as_string().encode()).decode()

        # Send the message
        send_message = service.users().messages().send(userId='me', body={'raw': raw_message}).execute()

        print(f"Email sent to {recipient}, Message ID: {send_message['id']}")
        return True, send_message['id']
    except Exception as e:
        print(f"Error sending email: {e}")
        return False, str(e)

def log_email(company, recipient, subject, body, email_type, status):
    """
    Log the sent email in Google Sheet
    """
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    emails_sheet.append_row([
        timestamp,
        company,
        recipient,
        subject,
        body,
        email_type,  # Initial or Follow-up
        status,
        "",  # Reply Date
        "",  # Reply Content
        ""   # Notes
    ])

# 5. Main Execution Functions
def discover_new_leads(keywords, results_per_keyword=5):
    """
    Discover new leads based on keywords
    """
    all_companies = []

    for keyword in keywords:
        companies = scrape_crunchbase_companies(keyword, results_per_keyword)
        all_companies.extend(companies)

    # Add the new leads to the sheet
    added = add_leads_to_sheet(all_companies)
    print(f"Added {added} new leads to the database")

def enrich_lead_data():
    """
    Enrich existing lead data with email addresses and additional info
    """
    # Get all leads
    leads_data = leads_sheet.get_all_records()

    # Filter for leads that need enrichment
    leads_to_enrich = [lead for lead in leads_data if
                      (not lead['Contact Email'] or not lead['Pain Points']) and
                      lead['Website'] and lead['Status'] == 'New']

    if not leads_to_enrich:
        print("No leads to enrich")
        return

    print(f"Enriching data for {len(leads_to_enrich)} leads...")

    for i, lead in enumerate(tqdm(leads_to_enrich)):
        row_index = i + 2  # +2 because Google Sheets is 1-indexed and we have a header row

        # Find email if missing
        if not lead['Contact Email'] and lead['Website']:
            email = find_company_emails(lead['Website'])
            if email:
                leads_sheet.update_cell(row_index, 5, email)  # Column E
                lead['Contact Email'] = email

        # Analyze website for pain points if missing
        if not lead['Pain Points'] and lead['Website']:
            analysis = analyze_company_website(lead['Website'])
            leads_sheet.update_cell(row_index, 6, analysis)  # Column F
            lead['Pain Points'] = analysis

        # Update status
        leads_sheet.update_cell(row_index, 7, "Enriched")  # Column G

        # Don't hit APIs too fast
        time.sleep(random.uniform(1, 3))

def send_initial_emails():
    """
    Send initial cold emails to enriched leads
    """
    # Get all leads
    leads_data = leads_sheet.get_all_records()

    # Filter for leads ready for initial contact
    leads_to_contact = [lead for lead in leads_data if
                       lead['Status'] == 'Enriched' and
                       lead['Contact Email']]

    if not leads_to_contact:
        print("No leads ready for initial contact")
        return

    # Count how many emails we've already sent today
    today = datetime.datetime.now().strftime("%Y-%m-%d")
    emails_today = [e for e in emails_sheet.get_all_records()
                   if e['Timestamp'].startswith(today)]
    emails_sent_today = len(emails_today)

    remaining_quota = MAX_EMAILS_PER_DAY - emails_sent_today
    if remaining_quota <= 0:
        print(f"Email sending quota ({MAX_EMAILS_PER_DAY}) reached for today")
        return

    # Limit the number of emails to send
    leads_to_contact = leads_to_contact[:remaining_quota]

    print(f"Sending initial emails to {len(leads_to_contact)} leads...")

    for i, lead in enumerate(leads_to_contact):
        # Generate personalized email
        email_body = generate_email(
            lead['Company'],
            lead['Contact Name'],
            lead['Industry'],
            lead['Pain Points']
        )

        if not email_body:
            continue

        # Create subject line
        subject = f"AI automation solutions for {lead['Company']}"

        # Send the email
        success, message_id = send_email(lead['Contact Email'], subject, email_body)

        if success:
            # Log the email
            log_email(lead['Company'], lead['Contact Email'], subject, email_body, 'Initial', 'Sent')

            # Update lead status and last contact date
            row_index = i + 2  # +2 because Google Sheets is 1-indexed and we have a header row
            today_str = datetime.datetime.now().strftime("%Y-%m-%d")
            leads_sheet.update_cell(row_index, 7, "Contacted")  # Status
            leads_sheet.update_cell(row_index, 8, today_str)    # Last Contact

            # Wait between emails
            wait_time = random.uniform(EMAIL_WAIT_MIN, EMAIL_WAIT_MAX)
            print(f"Waiting {wait_time:.1f} minutes before next email...")
            time.sleep(wait_time * 60)  # Convert to seconds

def send_follow_up_emails():
    """
    Send follow-up emails to leads who haven't responded
    """
    # Get all leads
    leads_data = leads_sheet.get_all_records()

    # Filter for leads that need follow-up
    today = datetime.datetime.now().date()

    leads_to_follow_up = []
    for lead in leads_data:
        if lead['Status'] == 'Contacted' and lead['Last Contact']:
            try:
                last_contact = datetime.datetime.strptime(lead['Last Contact'], "%Y-%m-%d").date()
                days_since_contact = (today - last_contact).days

                if days_since_contact >= FOLLOW_UP_DAYS:
                    leads_to_follow_up.append(lead)
            except:
                pass

    if not leads_to_follow_up:
        print("No leads need follow-up today")
        return

    # Count how many emails we've already sent today
    today_str = datetime.datetime.now().strftime("%Y-%m-%d")
    emails_today = [e for e in emails_sheet.get_all_records()
                   if e['Timestamp'].startswith(today_str)]
    emails_sent_today = len(emails_today)

    remaining_quota = MAX_EMAILS_PER_DAY - emails_sent_today
    if remaining_quota <= 0:
        print(f"Email sending quota ({MAX_EMAILS_PER_DAY}) reached for today")
        return

    # Limit the number of emails to send
    leads_to_follow_up = leads_to_follow_up[:remaining_quota]

    print(f"Sending follow-up emails to {len(leads_to_follow_up)} leads...")

    # Get previous emails sent to these companies
    all_emails = emails_sheet.get_all_records()

    for i, lead in enumerate(leads_to_follow_up):
        # Find the original email
        original_emails = [e for e in all_emails if e['Company'] == lead['Company'] and e['Email Type'] == 'Initial']

        if not original_emails:
            continue

        original_email = original_emails[0]['Email Body']

        # Generate follow-up email
        follow_up_body = generate_follow_up_email(lead['Company'], original_email)

        if not follow_up_body:
            continue

        # Create subject line - include "Re:" to make it look like a reply
        subject = f"Re: AI automation solutions for {lead['Company']}"

        # Send the email
        success, message_id = send_email(lead['Contact Email'], subject, follow_up_body)

        if success:
            # Log the email
            log_email(lead['Company'], lead['Contact Email'], subject, follow_up_body, 'Follow-up', 'Sent')

            # Update lead status and last contact date
            row_index = leads_data.index(lead) + 2  # +2 for header row and 1-indexing
            today_str = datetime.datetime.now().strftime("%Y-%m-%d")
            leads_sheet.update_cell(row_index, 7, "Follow-up")  # Status
            leads_sheet.update_cell(row_index, 8, today_str)    # Last Contact

            # Wait between emails
            wait_time = random.uniform(EMAIL_WAIT_MIN, EMAIL_WAIT_MAX)
            print(f"Waiting {wait_time:.1f} minutes before next email...")
            time.sleep(wait_time * 60)  # Convert to seconds

# 6. Main Execution Block
def main():
    print("Starting Autonomous AI Sales Agent...")

    # 1. Discover new leads from specific industries
    keywords = ["saas", "fintech", "healthtech", "ecommerce", "ai"]
    discover_new_leads(keywords, results_per_keyword=3)

    # 2. Enrich lead data
    enrich_lead_data()

    # 3. Send initial emails
    send_initial_emails()

    # 4. Send follow-up emails
    send_follow_up_emails()

    print("AI Sales Agent run completed!")

# Test functions
def test_openai_connection():
    """Test if OpenAI API connection is working"""
    try:
        print("Testing OpenAI API connection...")
        response = openai.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": "Hello, this is a test message. Please respond with 'API connection successful'."}]
        )
        result = response.choices[0].message.content
        print(f"OpenAI API test result: {result}")
        return True
    except Exception as e:
        print(f"OpenAI API connection error: {e}")
        return False

def test_google_connection():
    """Test if Google API connections are working"""
    try:
        print("Testing Google Sheets connection...")
        # Test Google Sheets
        test_values = leads_sheet.get_all_values()
        print(f"Google Sheets connection successful: Retrieved {len(test_values)} rows")

        print("Testing Gmail API connection...")
        # Test Gmail API
        profile = service.users().getProfile(userId='me').execute()
        print(f"Gmail API connection successful: Connected to {profile['emailAddress']}")
        return True
    except Exception as e:
        print(f"Google API connection error: {e}")
        return False

def test_gpt_email_generation():
    """Test email generation with GPT"""
    try:
        print("Testing email generation with GPT...")
        test_email = generate_email(
            company_name="Example Corp",
            contact_name="John Smith",
            industry="Software",
            pain_points="customer support automation"
        )
        print("\nSample generated email:")
        print("=" * 50)
        print(test_email)
        print("=" * 50)
        return True
    except Exception as e:
        print(f"Email generation error: {e}")
        return False

def run_test_mode():
    """Run tests to verify all connections are working"""
    print("==== RUNNING DIAGNOSTICS ====")

    # Test OpenAI API
    openai_ok = test_openai_connection()

    # Test Google APIs
    try:
        google_ok = test_google_connection()
    except NameError:
        print("Google authentication not completed. Please run the authentication cell first.")
        google_ok = False

    # Test email generation
    if openai_ok:
        email_ok = test_gpt_email_generation()
    else:
        email_ok = False

    # Overall status
    if openai_ok and google_ok and email_ok:
        print("\n✅ All systems operational!")
        print("You can now run the main() function to start the sales agent.")
    else:
        print("\n⚠️ Some tests failed. Check the errors above.")

        if not openai_ok:
            print("- OpenAI API connection failed. Check your API key.")

        if not google_ok:
            print("- Google API connection failed. Make sure you've completed authentication.")

        if not email_ok and openai_ok:
            print("- Email generation failed. Check the API model availability.")

def quick_test():
    """Run a simplified test that only checks the OpenAI connection"""
    print("Running OpenAI API quick test...")

    try:
        response = openai.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": "Generate a single sentence describing an AI sales agent."}]
        )
        result = response.choices[0].message.content
        print("\n✅ OpenAI API test successful!")
        print(f"Response: {result}")
    except Exception as e:
        print(f"\n❌ OpenAI API test failed: {e}")
        if "incorrect api key provided" in str(e).lower():
            print("The API key appears to be invalid. Please check your API key.")
        elif "rate limit" in str(e).lower():
            print("You've hit a rate limit. Please try again in a few moments.")

# Run the main function or test mode
if __name__ == "__main__":
    # Choose which mode to run:
    # - "quick_test": Just test the OpenAI API connection (no Google auth needed)
    # - "test": Run all diagnostics (requires Google auth)
    # - "main": Run the full system (requires Google auth)
    MODE = "quick_test"

    if MODE == "quick_test":
        quick_test()
    elif MODE == "test":
        run_test_mode()
    elif MODE == "main":
        main()

# For scheduling, you can either:
# 1. Run this notebook manually on a daily basis
# 2. Use Google Colab Pro's scheduled runs
# 3. Set up a trigger using a cloud function or service like pythonanywhere

Authenticating with Google...


TypeError: Credentials need to be from either oauth2client or from google-auth.