In [29]:

import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import time

In [30]:
# Define a separate function for each column to be filled.

def get_company_description(company_name: str, response: "requests.Response") -> str:
    """Extract the company description from a fetched organization page.

    Args:
        company_name: Name used only in the "not found" message.
        response: HTTP response for the organization page; only its
            ``status_code`` and ``text`` attributes are read.

    Returns:
        The stripped text of the first ``<span>`` found with the class
        string ``description ng-star-inserted``. If the request did not
        return 200, the span is missing, or the span has no visible text,
        a human-readable message describing that situation is returned
        instead.
    """
    if response.status_code != 200:
        return f"Failed to retrieve information. Status code: {response.status_code}"

    soup = BeautifulSoup(response.text, 'html.parser')
    about_section = soup.find('span', {'class': 'description ng-star-inserted'})
    if not about_section:
        return f"No Description found for {company_name}"

    # Strip *before* testing for emptiness so a whitespace-only span yields
    # the fallback message rather than an empty string.
    description = (about_section.text or "").strip()
    return description or "No company description available"

In [31]:
def find_sector_info(company_name: str, response: "requests.Response") -> str:
    """Extract the sector field from a fetched organization page.

    Args:
        company_name: Name used only in the "not found" message.
        response: HTTP response for the organization page; only its
            ``status_code`` and ``text`` attributes are read.

    Returns:
        The stripped text of the first ``<span>`` found with the class
        string ``component--field-formatter field-type-enum
        ng-star-inserted``. If the request did not return 200, the span is
        missing, or the span has no visible text, a human-readable message
        describing that situation is returned instead.
    """
    if response.status_code != 200:
        return f"Failed to retrieve information. Status code: {response.status_code}"

    soup = BeautifulSoup(response.text, 'html.parser')
    sector = soup.find('span', {'class': 'component--field-formatter field-type-enum ng-star-inserted'})
    if not sector:
        return f"No information found for {company_name}"

    # Strip *before* testing for emptiness so a whitespace-only span yields
    # the fallback message rather than an empty string.
    description = (sector.text or "").strip()
    return description or "No sector information available"


In [32]:
def find_industry_info(company_name, response):
    """Collect the industry chips of an organization page into one string.

    Args:
        company_name: Name used only in the "not found" message.
        response: HTTP response for the organization page; only its
            ``status_code`` and ``text`` attributes are read.

    Returns:
        The stripped text of every ``<div class="chip-text">`` element,
        joined with ``", "`` in document order. When the request did not
        return 200 or no such element exists, a human-readable message is
        returned instead.
    """
    # Guard clause: nothing to parse unless the page was fetched successfully.
    if response.status_code != 200:
        return f"Failed to retrieve information. Status code: {response.status_code}"

    parsed_page = BeautifulSoup(response.text, 'html.parser')
    chips = parsed_page.find_all('div', {'class': 'chip-text'})
    if not chips:
        return f"No Information found for {company_name}"

    return ', '.join(chip.text.strip() for chip in chips)

In [33]:
def get_last_funding(company_name, response):
    """Read the funding field from a fetched organization page.

    Args:
        company_name: Name used only in the "not found" message.
        response: HTTP response for the organization page; only its
            ``status_code`` and ``text`` attributes are read.

    Returns:
        The stripped text of the *second* ``<a>`` element found with the
        highlighted enum-field class string (presumably the last funding
        type on the page layout this was written against; verify against
        the live markup). When the request did not return 200 or fewer than
        two such anchors exist, a human-readable message is returned.
    """
    if response.status_code != 200:
        return f"Failed to retrieve information. Status code: {response.status_code}"

    funding_class = 'component--field-formatter field-type-enum accent highlight-color-contrast-light ng-star-inserted'
    links = BeautifulSoup(response.text, 'html.parser').find_all('a', {'class': funding_class})

    # An empty result can never hold two entries, so a single length check
    # covers both the "nothing found" and "only one found" cases.
    if len(links) < 2:
        return f"No Funding found for {company_name}"

    return links[1].text.strip()


In [34]:
def find_competing_companies(company_name: str, timeout: float = 10) -> str:
    """Scrape the similar-companies page and list the named competitors.

    Fetches ``/organization/<company_name>/org_similarity_overview`` and
    searches every ``reasons-container`` block for text of the form
    ``"<profile name> and <word>"``, collecting ``<word>``.

    Args:
        company_name: Organization slug; it is lower-cased and inserted
            into the URL as-is (no further escaping is applied here).
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The captured competitor names joined with ``", "`` (this may be an
        empty string when the blocks exist but none match the pattern), or
        a human-readable message when the request fails, returns a non-200
        status, or the expected elements are missing.

    Note:
        Only a single ``\\w+`` token is captured after "and", so a
        multi-word competitor name is reduced to its first word.
    """
    url = f'https://www.crunchbase.com/organization/{company_name.lower()}/org_similarity_overview'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        # Without a timeout a stalled connection would block forever.
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Report network failures the same way as HTTP failures (as text for
        # the output cell) instead of aborting the caller's whole batch.
        return f"Failed to retrieve information. Error: {exc}"

    print(response.status_code)
    if response.status_code != 200:
        return f"Failed to retrieve information. Status code: {response.status_code}"

    soup = BeautifulSoup(response.text, 'html.parser')
    matching_elements = soup.find_all('div', {'class': 'reasons-container ng-star-inserted'})
    com_name = soup.find('h1', {'class': 'profile-name'})
    if not (matching_elements and com_name):
        return f"No Competitors found for {company_name}"

    # The profile name is scraped text, not a regex: escape it so names that
    # contain metacharacters ("C++", "Yahoo!", "A.B") match literally instead
    # of raising re.error or matching the wrong text. Compile once, outside
    # the loop, because the pattern does not depend on the element.
    pattern = re.compile(re.escape(com_name.text.strip()) + r'\s+and\s+(\w+)')

    competitors = []
    for element in matching_elements:
        competitors.extend(pattern.findall(element.text.strip()))
    return ', '.join(competitors)

In [35]:
def get_products_and_services(company_name: str, response: "requests.Response") -> str:
    """Extract the products/services blurb from a fetched organization page.

    Args:
        company_name: Name used only in the "not found" message.
        response: HTTP response for the organization page; only its
            ``status_code`` and ``text`` attributes are read.

    Returns:
        The stripped text of the first ``<span>`` found with the class
        string ``description has-overflow ng-star-inserted``. If the
        request did not return 200, the span is missing, or the span has no
        visible text, a human-readable message describing that situation is
        returned instead.
    """
    if response.status_code != 200:
        return f"Failed to retrieve information. Status code: {response.status_code}"

    soup = BeautifulSoup(response.text, 'html.parser')
    p_s = soup.find('span', {'class': 'description has-overflow ng-star-inserted'})
    if not p_s:
        return f"No Products/Services found for {company_name}"

    # Strip *before* testing for emptiness so a whitespace-only span yields
    # the fallback message rather than an empty string.
    description = (p_s.text or "").strip()
    return description or "No product/service information available"


In [42]:
def retrieve_company_info(name: str, timeout: float = 10):
    """Fetch one organization page and assemble its spreadsheet row.

    The name is turned into a URL slug by lower-casing it and replacing
    spaces with hyphens. The organization page is downloaded once and the
    same response is handed to each field extractor; the competitor lookup
    issues its own request using the slug.

    Args:
        name: Company name as it should appear in the output row.
        timeout: Seconds to wait for the organization-page request.

    Returns:
        A dict with the keys ``Company Name``, ``Company Description``,
        ``Sector``, ``Industry``, ``Funding``,
        ``Products/Services Description`` and ``Competitors``; or ``None``
        when the page could not be fetched (network error or non-200
        status), after printing a failure message.
    """
    temp_name = name.lower().replace(' ', '-')
    url = f'https://www.crunchbase.com/organization/{temp_name}'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        # Without a timeout a stalled connection would block forever.
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Treat a network failure like any other failed lookup so one bad
        # company does not abort a whole batch run.
        print(f"Failed to retrieve information for {name}")
        print(f"Request error: {exc}")
        return None

    print(response.status_code)
    if response.status_code != 200:
        print(f"Failed to retrieve information for {name}")
        return None

    return {
        'Company Name': name,
        'Company Description': get_company_description(name, response),
        'Sector': find_sector_info(name, response),
        'Industry': find_industry_info(name, response),
        'Funding': get_last_funding(name, response),
        'Products/Services Description': get_products_and_services(name, response),
        # The competitor page lives under the slug, not the display name.
        'Competitors': find_competing_companies(temp_name),
    }

In [43]:
def generate_excel(company_names, excel_filename='op_file.xlsx', delay_seconds=4):
    """Scrape every company in turn and write the rows to an Excel file.

    Args:
        company_names: Iterable of company names to look up.
        excel_filename: Path of the workbook to write.
        delay_seconds: Pause inserted *between* consecutive lookups to
            avoid hammering the site; no pause follows the final lookup.

    Returns:
        None. The workbook is written to ``excel_filename`` and a
        confirmation line is printed. Companies whose lookup returns a
        falsy result are left out of the sheet.
    """
    data = []
    for position, company_name in enumerate(company_names):
        # Throttle only between requests: sleeping after the last company
        # would just delay the export for no benefit.
        if position and delay_seconds > 0:
            time.sleep(delay_seconds)
        company_info = retrieve_company_info(company_name)
        if company_info:
            data.append(company_info)

    # Build the frame once from the collected records.
    df = pd.DataFrame(data)
    df.to_excel(excel_filename, index=False)
    print(f'Excel file "{excel_filename}" generated successfully.')

# Companies to look up; running this cell performs the scrape (network
# access) and writes the workbook.
company_names = ['Microsoft', 'Amazon', 'Alphabet', 'Facebook', 'Tesla']
generate_excel(company_names)


200
200
200
200
200
200
200
200
200
200
Excel file "output_file.xlsx" generated successfully.
