In [1]:
import requests
import seaborn
import pandas as pd
from bs4 import BeautifulSoup

# YCombinator Top 100 Companies
## Ethnic Profiling of the founders of the Top 100 Companies

### Process
    - Data Extraction
        - Scrape the data from the website and store it.
    - Data Processing
        - Extract all the founders.
    - Data Analysis
        - Profile using names of founders.

In [254]:
"""
    YCombinator Top 100 List of Companies
        Ranked by Valuation
            - arush15june 2019/01/24
"""

URL = 'https://www.ycombinator.com/topcompanies/'

In [255]:
def get_text(URL):
    """
        Get HTML Content from `URL`
    """
    r = requests.get(URL)
    return r.text

In [256]:
def get_soup(text):
    """
        Get Soup from HTML Content
    """
    try:
        soup = BeautifulSoup(text)
        return soup
    except Exception as e:
        return None, e

In [257]:
"""
    lambdas from extracting item values from <td>
"""
process = {
    'name': (lambda item: item.find('b').text),
    'rank': (lambda item: item.find('span').text),
    'description': (lambda item: item.find('p').text),
    'founders': (lambda item: [li.text for li in item.find_all('li')]),
    'sector': (lambda item: item.find('p').text),
    'jobs created': (lambda item: int(item.find('span').text.replace(',', ''))),
    'batch': (lambda item: item.find('span').text),
    
}

def handle_row(row):
    """
        Handle each <tr> of the <table> containing the list.
        :param row: BeautifulSoup <tr> element
        :return dict row_data: dictionary of values from row.
    """
    row_data = {}
    keys = list(process.keys())
    
    for i, (item) in enumerate(row.find_all('td')):
        key = keys[i]
        row_data[key] = process[key](item)
    
    return row_data    

In [258]:
soup = get_soup(get_text(URL))

In [259]:
top_table = soup.find('table')

In [260]:
def process_table(table):
    """
        Extract Headers from <thead>.
        Extract data from the rows in <tr> inside the table.
        :param table: beautifulsoup object of the table containing data.
        :return table_data: dict of lists of data extracted.
    """
    table_headers = table.find_all('th')
    table_data = {}
    for header in table_headers:
            text = header.text.strip().lower()
            table_data[text] = []
            
    data_rows = table.find_all('tr')
    
    for row in data_rows[1:]:
        row_data = handle_row(row)
        for key, val in row_data.items():
            table_data[key].append(val)
    
    return table_data

In [261]:
data = process_table(top_table)

In [262]:
tableDF = pd.DataFrame(data)
tableDF.to_csv('top100.csv', index=False)
tableDF

Unnamed: 0,name,rank,description,founders,sector,jobs created,batch
0,Airbnb,#01,Airbnb is a global travel community that offer...,"[Brian Chesky, Joe Gebbia, Nathan Blecharczyk]",Consumer Goods and Services,4000,W2009
1,Stripe,#02,Stripe builds economic infrastructure for the ...,"[John Collison, Patrick Collison]",Financial Technology and Services,1300,S2010
2,Cruise,#03,Cruise is building autonomous vehicles to safe...,"[Kyle Vogt, Daniel Kan]",Automotive,1000,W2014
3,Dropbox,#04,Dropbox is a global collaboration platform whe...,"[Arash Ferdowsi, Drew Houston]",B2B Software and Services,1858,S2007
4,Coinbase,#05,Coinbase is a digital currency wallet and plat...,"[Brian Armstrong, Fred Ehrsam]",Financial Technology and Services,500,S2012
5,Instacart,#06,Instacart is building the best way for people ...,"[Apoorva Mehta, Max Mullen, Brandon Leonardo]",Consumer Goods and Services,600,S2012
6,Machine Zone,#07,MZ builds massive mobile games that break down...,"[Mike Sherrill, Gabriel Leydon, Halbert Nakagawa]",Consumer Media,900,W2008
7,DoorDash,#08,DoorDash connects customers with their favorit...,"[Tony Xu, Stanley Tang, Andy Fang]",Consumer Goods and Services,900,S2013
8,Zenefits,#09,Zenefits provides HR solutions for the new wor...,"[Laks Srini, Parker Conrad]",B2B Software and Services,500,W2013
9,Gusto,#10,"Gusto automates and simplifies payroll, benefi...","[Josh Reeves, Tomer London, Edward Kim]",B2B Software and Services,700,W2012


In [263]:
for founder_list in tableDF['founders']:
    print(founder_list)

['Brian Chesky', 'Joe Gebbia', 'Nathan Blecharczyk']
['John Collison', 'Patrick Collison']
['Kyle Vogt', 'Daniel Kan']
['Arash Ferdowsi', 'Drew Houston']
['Brian Armstrong', 'Fred Ehrsam']
['Apoorva Mehta', 'Max Mullen', 'Brandon Leonardo']
['Mike Sherrill', 'Gabriel Leydon', 'Halbert Nakagawa']
['Tony Xu', 'Stanley Tang', 'Andy Fang']
['Laks Srini', 'Parker Conrad']
['Josh Reeves', 'Tomer London', 'Edward Kim']
['Steve Huffman', 'Alexis Ohanian']
['Baskar Puvanathasan', 'Andrew Miklas', 'Alex Solomon']
['Solomon Hykes', 'Sebastien Pahl']
['Reshma Shetty', 'Jason Kelly', 'Barry Canton', 'Austin Che', 'Tom Knight']
['Felipe Villamarin', 'Simon Borrero', 'Sebastian Mejia']
['Henrique Dubugras', 'Pedro Franceschi']
['Sid Sijbrandij', 'Dmitriy Zaporozhets']
['Emmett Shear', 'Justin Kan', 'Michael Seibel', 'Kyle Vogt']
['Ryan Petersen']
['Suhail Doshi', 'Tim Trefren']
['Jonathan Perichon', 'Daniel Yanisse']
['Pete Koomen', 'Dan Siroker']
['Jacob Rosenberg', 'Sasha Orloff']
['Calvin French-O

# Process Method

In [264]:
headers = table.find_all('th')

In [265]:
for header in headers:
    print(header.text.strip().lower())

name
rank
description
founders
sector
jobs created
batch


In [266]:
data = table.find_all('tr')

In [3]:
for row in data[1:]:
    row_data = row.find_all('td')
    keys_list = list(process.keys())
    for index, (item) in enumerate(row_data):
        key = keys_list[index]
        print(process[key](item))
        if index == 6:
            print()

NameError: name 'data' is not defined

# Determine Ethnicity

In [23]:
from ethnicolr import pred_wiki_name, pred_fl_reg_ln, pred_fl_reg_name

In [2]:
companyDF = pd.read_csv('top100.csv')

In [3]:
founder_names = {
    'fname': [],
    'lname': []
}
for founders in companyDF['founders']:
    for founder in eval(founders):
        name_parts = founder.split()
        founder_names['fname'].append(name_parts[0])
        founder_names['lname'].append(name_parts[-1])

founderDF = pd.DataFrame(founder_names)

In [35]:
founderDF.to_csv('founder_names.csv')

In [39]:
founderDF

Unnamed: 0,fname,lname,race
0,Brian,Chesky,"GreaterEuropean,British"
1,Joe,Gebbia,"GreaterEuropean,British"
2,Nathan,Blecharczyk,"GreaterEuropean,EastEuropean"
3,John,Collison,"GreaterEuropean,British"
4,Patrick,Collison,"GreaterEuropean,British"
5,Kyle,Vogt,"GreaterEuropean,British"
6,Daniel,Kan,"GreaterEuropean,WestEuropean,Germanic"
7,Arash,Ferdowsi,"GreaterEuropean,Jewish"
8,Drew,Houston,"GreaterEuropean,British"
9,Brian,Armstrong,"GreaterEuropean,British"


In [47]:
wiki_model = pred_wiki_name(founderDF, 'fname', 'lname')

In [49]:
fl_model = pred_fl_reg_name(founderDF, 'fname', 'lname')

In [50]:
wiki_model

Unnamed: 0,fname,lname,race,"Asian,GreaterEastAsian,EastAsian","Asian,GreaterEastAsian,Japanese","Asian,IndianSubContinent","GreaterAfrican,Africans","GreaterAfrican,Muslim","GreaterEuropean,British","GreaterEuropean,EastEuropean","GreaterEuropean,Jewish","GreaterEuropean,WestEuropean,French","GreaterEuropean,WestEuropean,Germanic","GreaterEuropean,WestEuropean,Hispanic","GreaterEuropean,WestEuropean,Italian","GreaterEuropean,WestEuropean,Nordic"
0,Brian,Chesky,"GreaterEuropean,British",0.004353,0.005968,0.003369,0.000825,0.000751,0.556137,0.184008,0.212501,0.005456,0.006024,0.004780,0.011595,0.004232
1,Joe,Gebbia,"GreaterEuropean,British",0.007367,0.012254,0.005926,0.010182,0.000739,0.778087,0.002993,0.038986,0.019354,0.017451,0.011644,0.047135,0.047881
2,Nathan,Blecharczyk,"GreaterEuropean,EastEuropean",0.000101,0.000122,0.000399,0.000095,0.009863,0.004896,0.579968,0.401214,0.000419,0.000658,0.001023,0.001191,0.000050
3,John,Collison,"GreaterEuropean,British",0.022823,0.007290,0.055444,0.048662,0.003820,0.565648,0.003762,0.028271,0.019975,0.013617,0.176884,0.044561,0.009243
4,Patrick,Collison,"GreaterEuropean,British",0.013453,0.003557,0.014278,0.015327,0.001577,0.850341,0.000959,0.015842,0.004439,0.001938,0.059774,0.017280,0.001235
5,Kyle,Vogt,"GreaterEuropean,British",0.018964,0.048704,0.013872,0.006448,0.002542,0.552326,0.010666,0.039674,0.011320,0.022268,0.076104,0.081112,0.116001
6,Daniel,Kan,"GreaterEuropean,WestEuropean,Germanic",0.024678,0.018439,0.056703,0.012558,0.009394,0.076942,0.096715,0.176379,0.062767,0.291903,0.021017,0.023620,0.128885
7,Arash,Ferdowsi,"GreaterEuropean,Jewish",0.000298,0.000674,0.019562,0.016381,0.285941,0.101463,0.054785,0.378196,0.011948,0.036638,0.046106,0.045801,0.002206
8,Drew,Houston,"GreaterEuropean,British",0.020926,0.004714,0.015896,0.024752,0.001221,0.734294,0.002724,0.045032,0.075856,0.010516,0.043441,0.009118,0.011510
9,Brian,Armstrong,"GreaterEuropean,British",0.095452,0.050453,0.026185,0.008445,0.001744,0.377077,0.004273,0.103926,0.169334,0.029893,0.046213,0.027370,0.059634


In [51]:
fl_model

Unnamed: 0,fname,lname,race,asian,hispanic,nh_black,nh_white
0,Brian,Chesky,nh_white,0.002622,0.005705,0.248811,0.742862
1,Joe,Gebbia,nh_black,0.004761,0.009486,0.690917,0.294836
2,Nathan,Blecharczyk,nh_white,0.027819,0.107694,0.041242,0.823245
3,John,Collison,nh_white,0.005410,0.007686,0.191275,0.795628
4,Patrick,Collison,nh_white,0.003557,0.006833,0.237164,0.752446
5,Kyle,Vogt,nh_white,0.064106,0.030373,0.282492,0.623029
6,Daniel,Kan,nh_white,0.027278,0.024320,0.083423,0.864980
7,Arash,Ferdowsi,nh_black,0.040107,0.094474,0.481797,0.383621
8,Drew,Houston,nh_white,0.016522,0.026283,0.060810,0.896386
9,Brian,Armstrong,nh_black,0.152414,0.011188,0.615115,0.221283


# OSINT for Ethnicity

In [4]:
import wikipedia

In [5]:
name = '{} {}'.format(founder_names['fname'][0], founder_names['lname'][0])
name

'Brian Chesky'

In [6]:
page = wikipedia.page(wikipedia.search(name)[0])

In [7]:
soup = BeautifulSoup(page.html())

In [8]:
print(soup.prettify())

<div class="mw-parser-output">
 <table class="infobox biography vcard" style="width:22em">
  <tbody>
   <tr>
    <th colspan="2" style="text-align:center;font-size:125%;font-weight:bold">
     <div class="fn" style="display:inline">
      Brian Chesky
     </div>
    </th>
   </tr>
   <tr>
    <td colspan="2" style="text-align:center">
     <a class="image" href="/wiki/File:Brian_Chesky,_2016_(cropped).jpg">
      <img alt="Brian Chesky, 2016 (cropped).jpg" data-file-height="818" data-file-width="578" decoding="async" height="311" src="//upload.wikimedia.org/wikipedia/commons/thumb/0/06/Brian_Chesky%2C_2016_%28cropped%29.jpg/220px-Brian_Chesky%2C_2016_%28cropped%29.jpg" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/0/06/Brian_Chesky%2C_2016_%28cropped%29.jpg/330px-Brian_Chesky%2C_2016_%28cropped%29.jpg 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/0/06/Brian_Chesky%2C_2016_%28cropped%29.jpg/440px-Brian_Chesky%2C_2016_%28cropped%29.jpg 2x" width="220"/>
     </a>
     <d

In [9]:
birthplace_div = soup.select('.birthplace')[0]

In [10]:
birthplace_div.text

'Niskayuna, New York, U.S.'

In [11]:
for fname, lname in zip(founder_names['fname'], founder_names['lname']):
    name = '{} {}'.format(fname, lname)
    print(name)
    suggestion = wikipedia.search(name)
    print(suggestion)
    try:
        page = wikipedia.page(suggestion[0], auto_suggest=True)
        print(page)
    except:
        print("failed getting page")
        continue
    try:
        soup = BeautifulSoup(page.html(), "html.parser")
        birthplace_div = soup.select('.birthplace')[0]
        print(soup.select('.birthplace'))
        print(birthplace_div.text)
        print()
    except:
        print('couldnt find birthplace')
        print()

Brian Chesky
['Brian Chesky', 'Joe Gebbia', 'Airbnb', 'Timeline of Airbnb', 'Nathan Blecharczyk', 'Andrew Yang (entrepreneur)', 'Sam Altman', '42 (school)', 'Belinda Johnson', 'Chesky']
<WikipediaPage 'Brian Chesky'>
[<div class="birthplace" style="display:inline"><a href="/wiki/Niskayuna,_New_York" title="Niskayuna, New York">Niskayuna, New York</a>, U.S.</div>]
Niskayuna, New York, U.S.

Joe Gebbia
['Joe Gebbia', 'Airbnb', 'The Real Housewives of Beverly Hills', 'Nathan Blecharczyk', 'Gebbia', 'Brian Chesky', 'Timeline of Airbnb', 'Unicorn (finance)', 'Hospitality service', 'How I Built This']
<WikipediaPage 'Joe Gebbia'>
[<div class="birthplace" style="display:inline"><a class="mw-redirect" href="/wiki/Atlanta,_Georgia" title="Atlanta, Georgia">Atlanta, Georgia</a>, U.S.</div>]
Atlanta, Georgia, U.S.

Nathan Blecharczyk
['Nathan Blecharczyk', 'Airbnb', 'Joe Gebbia', 'Brian Chesky', 'Timeline of Airbnb', 'Unicorn (finance)', 'The Imitation Game', 'Caroline Hyde', 'List of Hillary Cli



  lis = BeautifulSoup(html).find_all('li')


failed getting page
Fred Ehrsam
['Fred Ehrsam', 'Coinbase', 'Duke University', 'List of Internet entrepreneurs', 'List of Sigma Alpha Epsilon members', 'List of Duke University people', 'Concord-Carlisle High School', 'Alpha Kappa Psi', 'Zora G. Clevenger', 'Vince Gibson']
<WikipediaPage 'Fred Ehrsam'>
couldnt find birthplace

Apoorva Mehta
['Student of the Year 2', 'Dhadak', 'Simmba', 'Raazi', 'Brahmastra (film)', 'Kesari (film)', 'Instacart', 'List of Dharma Productions films', 'Calling Sehmat', 'Kapoor & Sons']
<WikipediaPage 'Student of the Year 2'>
couldnt find birthplace

Max Mullen
['Instacart', 'Jungleland (film)', 'Thomas Mullen (author)', 'Mission: Impossible (film)', 'Dustin Milligan', 'Mission: Impossible (film series)', 'The Last Town on Earth', 'Flying Home (film)', 'MaxDiff', 'Laura Mullen']
<WikipediaPage 'Instacart'>
couldnt find birthplace

Brandon Leonardo
['Leonardo DiCaprio filmography', 'Instacart', 'The Departed', 'Shutter Island (film)', 'Ryan McPartlin', 'Joker

KeyboardInterrupt: 