In [253]:
import requests
import seaborn
import pandas as pd
from bs4 import BeautifulSoup

# YCombinator Top 100 Companies
## Ethnic Profiling of the founders of the Top 100 Companies

### Process
    - Data Extraction
        - Scrape the data from the website and store it.
    - Data Processing
        - Extract all the founders.
    - Data Analysis
        - Profile using names of founders.

In [254]:
"""
    YCombinator Top 100 List of Companies
        Ranked by Valuation
            - arush15june 2019/01/24
"""

# Address of the page that the cells below download and scrape.
URL = 'https://www.ycombinator.com/topcompanies/'

In [255]:
def get_text(URL, timeout=10):
    """
        Get HTML Content from `URL`

        :param URL: address of the page to download.
        :param timeout: seconds to wait for the server before giving up
            (forwarded to `requests.get`; defaults to 10).
        :return str: body of the response as text.
        :raises requests.exceptions.HTTPError: if the server answers with a
            4xx/5xx status code.
        :raises requests.exceptions.Timeout: if the server does not answer
            within `timeout` seconds.
    """
    # Without a timeout `requests.get` can block indefinitely on a stalled server.
    r = requests.get(URL, timeout=timeout)
    # Fail loudly instead of handing an error page to the HTML parser.
    r.raise_for_status()
    return r.text

In [256]:
def get_soup(text):
    """
        Get Soup from HTML Content

        :param text: HTML markup as a string.
        :return: BeautifulSoup object built from `text`.

        Any error raised while building the soup propagates to the caller.
        It is no longer returned as a `(None, error)` tuple, which callers
        would mistake for a soup object.
    """
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed (and to avoid bs4's
    # "no parser was explicitly specified" warning).
    return BeautifulSoup(text, 'html.parser')

In [257]:
"""
    lambdas for extracting item values from <td>
"""
def _text(cell, tag):
    """Return the text of the first `tag` element found inside `cell`."""
    return cell.find(tag).text


# One extractor per column, keyed by column name. Each callable receives the
# <td> cell of that column and returns the value to store for it.
process = {
    'name': lambda cell: _text(cell, 'b'),
    'rank': lambda cell: _text(cell, 'span'),
    'description': lambda cell: _text(cell, 'p'),
    # One founder per <li> entry.
    'founders': lambda cell: [entry.text for entry in cell.find_all('li')],
    'sector': lambda cell: _text(cell, 'p'),
    # Thousands separators are removed before converting to int.
    'jobs created': lambda cell: int(_text(cell, 'span').replace(',', '')),
    'batch': lambda cell: _text(cell, 'span'),
}

def handle_row(row):
    """
        Convert one <tr> of the listing <table> into a dictionary.

        Cells are matched to the entries of `process` by position: the i-th
        <td> is handed to the i-th extractor and stored under that
        extractor's key.
        :param row: BeautifulSoup <tr> element
        :return dict: extracted value for every <td> found in the row.
    """
    column_names = list(process)

    extracted = {}
    for position, cell in enumerate(row.find_all('td')):
        column = column_names[position]
        extractor = process[column]
        extracted[column] = extractor(cell)

    return extracted

In [258]:
# Download the listing page and parse its HTML into a soup object.
soup = get_soup(get_text(URL))

In [259]:
# <table> element looked up in the parsed page (presumably the Top 100 listing).
top_table = soup.find('table')

In [260]:
def process_table(table):
    """
        Turn the listing <table> into column-oriented data.

        Column names are the stripped, lower-cased texts of the <th> cells.
        Every <tr> after the first is passed to `handle_row` and its values
        are appended to the columns with the matching names.
        :param table: beautifulsoup object of the table containing data.
        :return table_data: dict mapping each column name to its list of values.
    """
    columns = {th.text.strip().lower(): [] for th in table.find_all('th')}

    # The first <tr> is skipped (presumably the header row - confirm against the page).
    body_rows = table.find_all('tr')[1:]
    for tr in body_rows:
        for column, value in handle_row(tr).items():
            columns[column].append(value)

    return columns

In [261]:
# Column-oriented dict of the scraped table: {column name: list of values}.
data = process_table(top_table)

In [262]:
# Build a DataFrame from the column dict, save a CSV copy, then display the frame.
tableDF = pd.DataFrame(data)
tableDF.to_csv('top100.csv', index=False)  # index=False: the row index is not written
tableDF

Unnamed: 0,name,rank,description,founders,sector,jobs created,batch
0,Airbnb,#01,Airbnb is a global travel community that offer...,"[Brian Chesky, Joe Gebbia, Nathan Blecharczyk]",Consumer Goods and Services,4000,W2009
1,Stripe,#02,Stripe builds economic infrastructure for the ...,"[John Collison, Patrick Collison]",Financial Technology and Services,1300,S2010
2,Cruise,#03,Cruise is building autonomous vehicles to safe...,"[Kyle Vogt, Daniel Kan]",Automotive,1000,W2014
3,Dropbox,#04,Dropbox is a global collaboration platform whe...,"[Arash Ferdowsi, Drew Houston]",B2B Software and Services,1858,S2007
4,Coinbase,#05,Coinbase is a digital currency wallet and plat...,"[Brian Armstrong, Fred Ehrsam]",Financial Technology and Services,500,S2012
5,Instacart,#06,Instacart is building the best way for people ...,"[Apoorva Mehta, Max Mullen, Brandon Leonardo]",Consumer Goods and Services,600,S2012
6,Machine Zone,#07,MZ builds massive mobile games that break down...,"[Mike Sherrill, Gabriel Leydon, Halbert Nakagawa]",Consumer Media,900,W2008
7,DoorDash,#08,DoorDash connects customers with their favorit...,"[Tony Xu, Stanley Tang, Andy Fang]",Consumer Goods and Services,900,S2013
8,Zenefits,#09,Zenefits provides HR solutions for the new wor...,"[Laks Srini, Parker Conrad]",B2B Software and Services,500,W2013
9,Gusto,#10,"Gusto automates and simplifies payroll, benefi...","[Josh Reeves, Tomer London, Edward Kim]",B2B Software and Services,700,W2012


In [263]:
# Each entry of the 'founders' column is the list of founder names of one company.
for founder_list in tableDF['founders']:
    print(founder_list)

['Brian Chesky', 'Joe Gebbia', 'Nathan Blecharczyk']
['John Collison', 'Patrick Collison']
['Kyle Vogt', 'Daniel Kan']
['Arash Ferdowsi', 'Drew Houston']
['Brian Armstrong', 'Fred Ehrsam']
['Apoorva Mehta', 'Max Mullen', 'Brandon Leonardo']
['Mike Sherrill', 'Gabriel Leydon', 'Halbert Nakagawa']
['Tony Xu', 'Stanley Tang', 'Andy Fang']
['Laks Srini', 'Parker Conrad']
['Josh Reeves', 'Tomer London', 'Edward Kim']
['Steve Huffman', 'Alexis Ohanian']
['Baskar Puvanathasan', 'Andrew Miklas', 'Alex Solomon']
['Solomon Hykes', 'Sebastien Pahl']
['Reshma Shetty', 'Jason Kelly', 'Barry Canton', 'Austin Che', 'Tom Knight']
['Felipe Villamarin', 'Simon Borrero', 'Sebastian Mejia']
['Henrique Dubugras', 'Pedro Franceschi']
['Sid Sijbrandij', 'Dmitriy Zaporozhets']
['Emmett Shear', 'Justin Kan', 'Michael Seibel', 'Kyle Vogt']
['Ryan Petersen']
['Suhail Doshi', 'Tim Trefren']
['Jonathan Perichon', 'Daniel Yanisse']
['Pete Koomen', 'Dan Siroker']
['Jacob Rosenberg', 'Sasha Orloff']
['Calvin French-O

# Process Method: Step-by-Step Walkthrough

In [264]:
# Header cells of the listing table. The table object is bound to `top_table`;
# a bare `table` only exists as the parameter of `process_table`, so using it
# here raises NameError on a fresh kernel.
headers = top_table.find_all('th')

In [265]:
# Print each header text normalised the same way `process_table` does
# (surrounding whitespace stripped, lower-cased).
for header in headers:
    print(header.text.strip().lower())

name
rank
description
founders
sector
jobs created
batch


In [266]:
# All <tr> rows of the listing table. The table object is bound to
# `top_table`; a bare `table` is undefined at notebook scope.
# NOTE: this rebinds `data`, which earlier held the column dict passed to
# pd.DataFrame.
data = top_table.find_all('tr')

In [267]:
# Walk every row after the first and print the value extracted from each cell.
# The position -> column-name mapping is the same for every row, so it is
# built once outside the loop.
keys_list = list(process.keys())
last_index = len(keys_list) - 1  # index of the final column (was a hard-coded 6)
for row in data[1:]:
    row_data = row.find_all('td')
    for index, (item) in enumerate(row_data):
        key = keys_list[index]
        print(process[key](item))
        if index == last_index:
            # Blank line after the last column separates one company from the next.
            print()

Airbnb
#01
Airbnb is a global travel community that offers end-to-end trips, including where you stay, what you do and the people you meet. Valued at over $30 Billion.
['Brian Chesky', 'Joe Gebbia', 'Nathan Blecharczyk']
Consumer Goods and Services
4000
W2009

Stripe
#02
Stripe builds economic infrastructure for the internet. Businesses of every size—from new startups to public companies—use their software to accept payments and manage their businesses online. Valued at over $20 Billion.
['John Collison', 'Patrick Collison']
Financial Technology and Services
1300
S2010

Cruise
#03
Cruise is building autonomous vehicles to safely connect people to the places, things, and experiences they care about. Acquired by GM in 2016. Valued at over $14B.
['Kyle Vogt', 'Daniel Kan']
Automotive
1000
W2014

Dropbox
#04
Dropbox is a global collaboration platform where content is created, accessed, and shared. Market cap: over $10 Billion.
['Arash Ferdowsi', 'Drew Houston']
B2B Software and Services
18