In [1]:
#Packages to install

# pretty printer
import pprint

# set up the pretty printer
pp = pprint.PrettyPrinter(indent=4)

# BeautifulSoup for scraping
from bs4 import BeautifulSoup

# for making HTTP requests
import requests

# Pandas/numpy for data manipulation
import pandas as pd
import numpy as np

In [2]:
# The URL for the Wikipedia page we're scraping
WIKI_URL = 'https://en.wikipedia.org/wiki/S%26P_100'

# Retrieve the page
wiki_page = requests.get(WIKI_URL).text

In [14]:
# parse the HTML text into a tree
soup = BeautifulSoup(wiki_page, 'html.parser')

# print the tree to screen
#print(soup.prettify())

In [15]:
# extract the table containin the S&P companies
sandp_table = soup.find('table', {"class" : "wikitable sortable"})

In [16]:
# snps array will hold an array of tuples of the form (Symbol, Name)
snps = []

# scan the table for each row ('tr' is the HTML tag for a table row)
for row in sandp_table.find_all('tr'):
    
    # scan the row for table cells ('td' is the tag for table data)
    cols = row.find_all('td')
    
    if len(cols) == 2: # skip the header row
        snps.append((cols[0].text.strip(), cols[1].text.strip()))

# convert the array of tuples into a Pandas DataFrame        
snps_df = pd.DataFrame(snps, columns=['Symbol', 'Name'])

snps_df.head()

Unnamed: 0,Symbol,Name
0,AAPL,Apple Inc.
1,ABBV,AbbVie Inc.
2,ABT,Abbott Laboratories
3,ACN,Accenture plc
4,AGN,Allergan plc


In [75]:
REUTERS_BASE_URL = 'http://www.reuters.com/finance/stocks/company-officers/'

symbol_array = snps_df['Symbol'].values

# board_members will hold an array of tuples, one for each board member
board_members = []

# for simplicity only look at the first five companies in class
for (index, co) in snps_df.iterrows():
    sym = co['Symbol']
    reuters_page = requests.get(REUTERS_BASE_URL+sym).text
    soup = BeautifulSoup(reuters_page, 'html.parser')
    company_news_table = soup.find(id="companyNews").find("tbody",{"class" : "dataSmall"})
    for row in company_news_table.find_all('tr'):
        cols = row.find_all('td')
        if(len(cols)==4):
            board_members.append((sym, cols[0].text.strip(),cols[0].find('a').get('href'), cols[1].text.strip(), cols[2].text.strip()))
    
df = pd.DataFrame(board_members, columns=['Symbol', 'Name','Link','Age', 'Year'])
df.head()


Unnamed: 0,Symbol,Name,Link,Age,Year
0,AAPL,Art Levinson,/finance/stocks/officer-profile/AAPL.O/156560,67,2011
1,AAPL,Timothy Cook,/finance/stocks/officer-profile/AAPL.O/88090,57,2011
2,AAPL,Luca Maestri,/finance/stocks/officer-profile/AAPL.O/2486890,54,2014
3,AAPL,Katherine Adams,/finance/stocks/officer-profile/AAPL.O/2871597,53,2017
4,AAPL,Phil Schiller,/finance/stocks/officer-profile/AAPL.O/346990,57,2002


In [66]:
df.set_index(keys=['Symbol','Name'],inplace=True)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Link,Age,Year
Symbol,Name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AAPL,Art Levinson,/finance/stocks/officer-profile/AAPL.O/156560,67,2011
AAPL,Timothy Cook,/finance/stocks/officer-profile/AAPL.O/88090,57,2011
AAPL,Luca Maestri,/finance/stocks/officer-profile/AAPL.O/2486890,54,2014
AAPL,Katherine Adams,/finance/stocks/officer-profile/AAPL.O/2871597,53,2017
AAPL,Phil Schiller,/finance/stocks/officer-profile/AAPL.O/346990,57,2002
AAPL,Angela Ahrendts,/finance/stocks/officer-profile/AAPL.O/2581904,57,2014
AAPL,Eddy Cue,/finance/stocks/officer-profile/AAPL.O/1677766,54,2011
AAPL,Craig Federighi,/finance/stocks/officer-profile/AAPL.O/1768087,48,2012
AAPL,Daniel Riccio,/finance/stocks/officer-profile/AAPL.O/1747873,55,2012
AAPL,Johny Srouji,/finance/stocks/officer-profile/AAPL.O/2682545,53,2015


In [67]:
df.loc['GOOG']

Unnamed: 0_level_0,Link,Age,Year
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
John Hennessy,/finance/stocks/officer-profile/GOOG.O/480659,65,2018
Sergey Brin,/finance/stocks/officer-profile/GOOG.O/480647,44,2015
Lawrence Page,/finance/stocks/officer-profile/GOOG.O/480651,45,2015
Ruth Porat,/finance/stocks/officer-profile/GOOG.O/2594505,60,2015
Sundar Pichai,/finance/stocks/officer-profile/GOOG.O/2657229,46,2017
David Drummond,/finance/stocks/officer-profile/GOOG.O/480654,55,2015
Diane Greene,/finance/stocks/officer-profile/GOOG.O/1678945,62,2015
Eric Schmidt,/finance/stocks/officer-profile/GOOG.O/480644,62,2018
L. John Doerr,/finance/stocks/officer-profile/GOOG.O/480658,66,2016
Roger Ferguson,/finance/stocks/officer-profile/GOOG.O/2735140,66,2016


In [77]:
df['Age'] = df['Age'].apply(pd.to_numeric, errors='coerce')

df.groupby('Symbol').mean()

Unnamed: 0_level_0,Age
Symbol,Unnamed: 1_level_1
AAPL,59.000000
ABBV,57.578947
ABT,59.600000
ACN,56.416667
AGN,57.470588
AIG,59.708333
ALL,59.291667
AMGN,60.750000
AMZN,59.066667
AXP,59.160000


In [91]:
new_df = df.groupby('Symbol').mean()
new_df.loc[new_df.min()]
#print(new_df.loc[new_df.max()])

KeyError: 'None of [Age    49.526316\ndtype: float64] are in the [index]'