# S&P 500 companies and their historical price

In [None]:
!pip install yfinance

In [3]:
import bs4 as bs
import requests
import yfinance as yf
import datetime
import pandas as pd
import pdb
import time
import numpy as np


def get_company_price():
  #set to the most recent quarter
  start = datetime.datetime(2020,1,1)
  end = datetime.datetime(2020,3,31)
  resp = requests.get('http://en.wikipedia.org/wiki/List_of_S%26P_500_companies')
  soup = bs.BeautifulSoup(resp.text, 'lxml')
  table = soup.find('table', {'class': 'wikitable sortable'})
  tickers = []
  for row in table.findAll('tr')[1:]:
      ticker = row.findAll('td')[0].text.replace('\n', '').replace('.','-')
      tickers.append(ticker)
  data = yf.download(tickers, start=start, end=end,group_by='ticker',auto_adjust=False)
  return data

def get_company_performance():
  #set to the most recent quarter
  df_OHLC = get_company_price()
  col_drop =[i for i in df_OHLC.columns if i[1]!='Close']
  df_close = df_OHLC.drop(columns=col_drop)
  df_close.columns = [i[0] for i in df_close.columns]
  df_close = df_close.fillna(method = 'bfill').fillna(method = 'ffill')
  df_close = df_close.transpose()
  df_close['performance'] = (df_close[df_close.columns[-1]] - df_close[df_close.columns[0]])/df_close[df_close.columns[0]]
  df_performance = df_close.iloc[:,-1:].reset_index()
  df_performance.columns =['code','performance']
  return df_performance

calculate their performance

In [None]:
df_performance = get_company_performance()
df_performance.head()

[*********************100%***********************]  505 of 505 completed


Unnamed: 0,code,performance
0,HBI,-0.446808
1,DOW,-0.466977
2,PBCT,-0.313246
3,HSY,-0.042929
4,TIF,-0.040045


# S&P 500 companies and their sector

In [4]:
def get_company_sector():
  tickers_sector = []
  resp = requests.get('http://en.wikipedia.org/wiki/List_of_S%26P_500_companies')
  soup = bs.BeautifulSoup(resp.text, 'lxml')
  table = soup.find('table', {'class': 'wikitable sortable'})
  for row in table.findAll('tr')[1:]:
      ticker1 = row.findAll('td')[0].text.replace('\n', '').replace('.','-')
      sector = row.findAll('td')[3].text
      tickers_sector.append((ticker1,sector))
  df = pd.DataFrame(tickers_sector)
  df.columns = ['code','sector']
  return df


In [34]:
df_sector = get_company_sector()
df_sector.head()

Unnamed: 0,code,sector
0,MMM,Industrials
1,ABT,Health Care
2,ABBV,Health Care
3,ABMD,Health Care
4,ACN,Information Technology


# TOP 5 company in each sector based on market cap

In [192]:
def get_quaterly_market_cap(code):
  page = 'https://finance.yahoo.com/quote/'+code+'/key-statistics?p='+code
  resp = requests.get(page)
  soup = bs.BeautifulSoup(resp.text, 'lxml')
  table = soup.find('tbody')
  l = []
  for i in table.findAll('tr')[0].findAll('td')[1:]:
    if i=='N/A':
      l.append(np.nan)
    else:  
      l.append(int(i.text.replace('.','').replace('T','0000000000').replace('B','0000000')))
  return l

In [175]:
def get_top5_by_cap(sector,quater):
  #quater represent which quater's data we are looking at,
  #It could be 0-5,0 represent current market cap data. 1-5 represent the Nth recent quater's data
  #For example, when quater = 2,we are ordering the company based on the second recent qauter market cap
  df = get_company_sector()        
  df = df[df['sector']==sector]    
  df['market_cap'] = df['code'].apply(get_quaterly_market_cap)
  df = pd.DataFrame(df.market_cap.values.tolist(), df.code).add_prefix('quater_')
  df = df.sort_values(by='quater_'+str(quater), ascending=False)
  return df.index[:6]

In [None]:
get_top5_by_cap('Industrials',0)

Index(['UNP', 'BA', 'LMT', 'HON', 'RTX', 'UPS'], dtype='object', name='code')

In [None]:
get_top5_by_cap('Consumer Discretionary',0)

Index(['AMZN', 'HD', 'NKE', 'MCD', 'LOW', 'SBUX'], dtype='object', name='code')

In [179]:
get_top5_by_cap('Information Technology',0)

Index(['AAPL', 'MSFT', 'V', 'MA', 'INTC', 'NVDA'], dtype='object', name='code')

In [180]:
get_top5_by_cap('Financials',0)

Index(['BRK-B', 'JPM', 'BAC', 'WFC', 'C', 'AXP'], dtype='object', name='code')

In [181]:
get_top5_by_cap('Health Care',0)

Index(['JNJ', 'UNH', 'MRK', 'PFE', 'ABBV', 'ABT'], dtype='object', name='code')

In [182]:
get_top5_by_cap('Consumer Staples',0)

Index(['WMT', 'PG', 'KO', 'PEP', 'COST', 'PM'], dtype='object', name='code')

In [183]:
get_top5_by_cap('Real Estate',0)

Index(['AMT', 'PLD', 'CCI', 'EQIX', 'DLR', 'PSA'], dtype='object', name='code')

In [184]:
get_top5_by_cap('Utilities',0)

Index(['NEE', 'D', 'DUK', 'SO', 'AEP', 'EXC'], dtype='object', name='code')

In [185]:
get_top5_by_cap('Materials',0)

Index(['ECL', 'APD', 'DD', 'DOW', 'BLL', 'CTVA'], dtype='object', name='code')

In [186]:
get_top5_by_cap('Energy',0)

Index(['APA', 'MRO', 'DVN', 'NBL', 'HAL', 'HES'], dtype='object', name='code')

In [193]:
get_top5_by_cap('Communication Services',0)

Index(['GOOG', 'GOOGL', 'FB', 'T', 'DIS', 'NFLX'], dtype='object', name='code')

# Find most important metrics for each sector and ranking

In [167]:
def get_metrics(code,q):
  page = 'https://finance.yahoo.com/quote/'+code+'/key-statistics?p='+code
  resp = requests.get(page)
  soup = bs.BeautifulSoup(resp.text, 'lxml')
  table = soup.find('tbody')
  l = []
  for i in table.findAll('tr')[2:]:
    l.append(i.findAll('td')[q+1].text)
  return l

def get_performance_metrics(df_sub,sector,q):
  #df_performance = get_company_performance()
  #df_sector = get_company_sector()
  #df_sub = pd.merge(df_performance,df_sector,on='code')

  df_sub = df_sub[df_sub['sector'] == sector]
  df_sub['metrics'] = df_sub['code'].apply(get_metrics,args=(q,))
  df_sub[['Trailing P/E','Forward P/E','PEG Ratio','Price/Sales (ttm)','Price/Book (mrq)','Enterprise Value/Revenue','Enterprise Value/EBITDA']]= pd.DataFrame(df_sub.metrics.values.tolist(),df_sub.index)
  df_sub.drop(columns=['metrics'],inplace=True)
  return df_sub


def analyze_df(df):
  top20 = df['performance'].quantile(0.8)
  buttom20 = df['performance'].quantile(0.2)
  t = df[df['performance']>top20]\
      .iloc[:,3:]\
      .replace(['2.50k','-1.09k','N/A','-2.58k','1.08k'],['2500','-1090',np.nan,'-2580','1080'])\
      .astype('float')\
      .mean(axis = 0,skipna = 'True')
  b = df[df['performance']<buttom20]\
      .iloc[:,3:]\
      .replace(['2.50k','-1.09k','N/A','-2.58k','1.08k'],['2500','-1090',np.nan,'-2580','1080'])\
      .astype('float')\
      .mean(axis = 0,skipna = 'True')
  avg = df.iloc[:,3:]\
      .replace(['2.50k','-1.09k','N/A','-2.58k','1.08k'],['2500','-1090',np.nan,'-2580','1080'])\
      .astype('float')\
      .mean(axis = 0,skipna = 'True')

  t = pd.DataFrame(t)
  t.columns = ['top_20_percent']
  b = pd.DataFrame(b)
  b.columns = ['bottom_20_percent']
  avg = pd.DataFrame(avg)
  avg.columns = ['average']

  res = t.join(avg).join(b)
  res['top and avg diff'] = (res['top_20_percent']-res['average'])/(res['average'])
  res['top and bottom diff'] = (res['top_20_percent']-res['bottom_20_percent'])/(res['bottom_20_percent'])
  return res

def get_top_5(df,metrics,asc=False):
  return df.sort_values(by=metrics, ascending=asc)['code'].head(5).reset_index()['code']


In [10]:
  df_performance = get_company_performance()
  df_sector = get_company_sector()
  
  df_all = pd.merge(df_performance,df_sector,on='code')

[*********************100%***********************]  505 of 505 completed


## Industrials

In [None]:
df = get_performance_metrics(df_all,'Industrials',1)

In [147]:
df.columns

Index(['code', 'performance', 'sector', 'Trailing P/E', 'Forward P/E',
       'PEG Ratio', 'Price/Sales (ttm)', 'Price/Book (mrq)',
       'Enterprise Value/Revenue', 'Enterprise Value/EBITDA'],
      dtype='object')

In [124]:
analyze_df(df)

Unnamed: 0,top_20_percent,average,bottom_20_percent,top and avg diff,top and bottom diff
Trailing P/E,25.217143,24.258235,11.672143,0.039529,1.160455
Forward P/E,20.554615,15.282857,11.306667,0.344946,0.81792
PEG Ratio,2.117,12.649821,68.017778,-0.832646,-0.968876
Price/Sales (ttm),3.022667,2.1175,0.954667,0.42747,2.166201
Price/Book (mrq),7.206,5.080882,1.906667,0.418258,2.779371
Enterprise Value/Revenue,13.132667,10.524861,6.552,0.247776,1.004375
Enterprise Value/EBITDA,52.836,34.394306,13.972667,0.536185,2.781383


In [152]:
get_top_5(df,'Enterprise Value/EBITDA')

0      TT
1     LUV
2      GE
3     LHX
4    CPRT
Name: code, dtype: object

## IT

In [None]:
df_it = get_performance_metrics(df_all,'Information Technology',1)

In [125]:
analyze_df(df_it)

Unnamed: 0,top_20_percent,average,bottom_20_percent,top and avg diff,top and bottom diff
Trailing P/E,47.03,49.207536,18.42,-0.044252,1.553203
Forward P/E,31.333571,20.948873,11.177857,0.495716,1.803182
PEG Ratio,2.700769,2.102222,1.352,0.284721,0.99761
Price/Sales (ttm),9.341429,5.437324,1.882857,0.71802,3.961305
Price/Book (mrq),12.537692,8.104091,5.719286,0.547082,1.192178
Enterprise Value/Revenue,33.522143,21.420563,9.536429,0.564952,2.515167
Enterprise Value/EBITDA,124.037857,81.839577,37.170714,0.515622,2.336978


In [153]:
get_top_5(df_it,'Price/Sales (ttm)')

0    INTU
1    MSFT
2    FTNT
3     CRM
4    CDNS
Name: code, dtype: object

## Financials

In [None]:
df_f = get_performance_metrics(df_all,'Financials',1)

In [127]:
analyze_df(df_f)

Unnamed: 0,top_20_percent,average,bottom_20_percent,top and avg diff,top and bottom diff
Trailing P/E,26.082308,11.979848,5.013077,1.177182,4.202854
Forward P/E,22.020769,10.992923,4.531538,1.003177,3.859447
PEG Ratio,2.276364,1.854222,1.307143,0.227665,0.74148
Price/Sales (ttm),7.397692,2.794848,1.001538,1.646903,6.386329
Price/Book (mrq),19.589167,4.493846,0.510769,3.359109,37.352284
Enterprise Value/Revenue,26.036923,7.771389,4.3275,2.350356,5.01662
Enterprise Value/EBITDA,63.401818,55.689375,57.82,0.13849,0.096538


In [159]:
get_top_5(df_f,'Price/Book (mrq)')

0    MSCI
1     MCO
2     MMC
3     PGR
4    TROW
Name: code, dtype: object

## Consumer Discretionary

In [None]:
df_cd = get_performance_metrics(df_all,'Consumer Discretionary',1)

In [128]:
analyze_df(df_cd)

Unnamed: 0,top_20_percent,average,bottom_20_percent,top and avg diff,top and bottom diff
Trailing P/E,27.173077,25.064677,15.126923,0.084118,0.796339
Forward P/E,24.258462,16.54371,17.249231,0.466325,0.40635
PEG Ratio,2.331538,1.763889,1.831,0.321817,0.273369
Price/Sales (ttm),2.648462,1.42,0.547692,0.865114,3.835674
Price/Book (mrq),7.583,5.92963,1.358462,0.278832,4.58205
Enterprise Value/Revenue,14.180769,9.187302,6.659231,0.543518,1.129491
Enterprise Value/EBITDA,-28.557692,-29.699841,-39.344615,-0.038456,-0.274165


In [160]:
get_top_5(df_cd,'Price/Book (mrq)')

0     DPZ
1     YUM
2    SBUX
3     MCD
4      LB
Name: code, dtype: object

## Health Care

In [None]:
df_hc = get_performance_metrics(df_all,'Health Care',1)

In [129]:
analyze_df(df_hc)

Unnamed: 0,top_20_percent,average,bottom_20_percent,top and avg diff,top and bottom diff
Trailing P/E,51.933,42.049153,75.950833,0.235055,-0.316229
Forward P/E,29.5375,20.312167,14.44,0.454178,1.045533
PEG Ratio,4.49,3.930926,1.89625,0.142225,1.367831
Price/Sales (ttm),5.690833,4.318361,2.531667,0.317823,1.24786
Price/Book (mrq),12.235833,8.577414,3.443636,0.426518,2.553172
Enterprise Value/Revenue,20.466667,18.159836,12.886667,0.127029,0.588205
Enterprise Value/EBITDA,90.594167,66.471034,32.307273,0.362912,1.804142


In [161]:
get_top_5(df_hc,'Price/Book (mrq)')

0     HCA
1     WAT
2    ABBV
3     RMD
4      EW
Name: code, dtype: object

## Consumer Staples

In [None]:
df_cs = get_performance_metrics(df_all,'Consumer Staples',1)

In [130]:
analyze_df(df_cs)

Unnamed: 0,top_20_percent,average,bottom_20_percent,top and avg diff,top and bottom diff
Trailing P/E,23.168571,24.427419,19.014,-0.051534,0.218501
Forward P/E,21.104286,18.429091,11.335714,0.145162,0.861752
PEG Ratio,7.208571,5.784545,12.582857,0.246178,-0.427112
Price/Sales (ttm),2.068571,2.622121,1.334286,-0.211108,0.550321
Price/Book (mrq),77.89,24.190645,12.64,2.21984,5.162184
Enterprise Value/Revenue,9.162857,12.906667,8.367143,-0.290068,0.0951
Enterprise Value/EBITDA,57.448571,61.955455,32.054286,-0.072744,0.792227


In [163]:
get_top_5(df_cs,'Price/Book (mrq)')

0     KMB
1      PM
2     SYY
3    COST
4       K
Name: code, dtype: object

## Real Estate

In [None]:
df_re = get_performance_metrics(df_all,'Real Estate',1)

In [131]:
analyze_df(df_re)

Unnamed: 0,top_20_percent,average,bottom_20_percent,top and avg diff,top and bottom diff
Trailing P/E,88.96,44.636,11.76,0.99301,6.564626
Forward P/E,73.418333,131.151111,27.04,-0.4402,1.715175
PEG Ratio,2.512,7.121818,2.2075,-0.647281,0.137939
Price/Sales (ttm),11.571667,7.248387,2.926667,0.596447,2.953872
Price/Book (mrq),7.394,3.29,1.988333,1.247416,2.718692
Enterprise Value/Revenue,61.543333,41.84129,26.055,0.470876,1.362055
Enterprise Value/EBITDA,168.678333,89.204194,73.441667,0.890924,1.296766


In [164]:
get_top_5(df_re,'Trailing P/E')

0     WY
1    CCI
2    HST
3    SPG
4    DLR
Name: code, dtype: object

## Utilities

In [None]:
df_u = get_performance_metrics(df_all,'Utilities',1)

In [132]:
analyze_df(df_u)

Unnamed: 0,top_20_percent,average,bottom_20_percent,top and avg diff,top and bottom diff
Trailing P/E,27.44,20.506786,13.67,0.338094,1.007315
Forward P/E,24.278333,17.281429,10.471667,0.40488,1.318478
PEG Ratio,3.185,3.434815,2.436,-0.07273,0.307471
Price/Sales (ttm),4.038333,2.555,1.393333,0.580561,1.898325
Price/Book (mrq),2.853333,2.143929,2.186667,0.33089,0.304878
Enterprise Value/Revenue,23.948333,17.594643,11.901667,0.361115,1.012183
Enterprise Value/EBITDA,56.02,47.516429,24.145,0.178961,1.320149


In [165]:
get_top_5(df_u,'Price/Sales (ttm)')

0    NEE
1    AWK
2    ATO
3    WEC
4      D
Name: code, dtype: object

## Materials

In [None]:
df_m = get_performance_metrics(df_all,'Materials',1)

In [133]:
analyze_df(df_m)

Unnamed: 0,top_20_percent,average,bottom_20_percent,top and avg diff,top and bottom diff
Trailing P/E,26.02,18.791304,8.685,0.384683,1.99597
Forward P/E,21.371667,15.392143,15.863333,0.388479,0.347237
PEG Ratio,2.3,2.164286,1.446667,0.062706,0.589862
Price/Sales (ttm),3.0,1.636429,0.775,0.833261,2.870968
Price/Book (mrq),4.953333,2.746296,1.296667,0.803641,2.820051
Enterprise Value/Revenue,13.968333,8.972143,6.126667,0.556856,1.279924
Enterprise Value/EBITDA,64.626667,36.3575,-15.251667,0.777533,-5.237351


In [166]:
get_top_5(df_m,'Enterprise Value/EBITDA'	)

0    MLM
1    BLL
2    VMC
3    SHW
4    ECL
Name: code, dtype: object

## Energy

In [None]:
df_e = get_performance_metrics(df_all,'Energy',1)

In [134]:
analyze_df(df_e)

Unnamed: 0,top_20_percent,average,bottom_20_percent,top and avg diff,top and bottom diff
Trailing P/E,20.666,14.654375,11.7,0.410227,0.766325
Forward P/E,24.27,19.529048,43.903333,0.242764,-0.447195
PEG Ratio,6.156,2.88375,1.71,1.13472,2.6
Price/Sales (ttm),1.912,0.895769,0.594,1.134478,2.218855
Price/Book (mrq),1.436,0.814615,0.36,0.762795,2.988889
Enterprise Value/Revenue,14.158,7.306154,7.256,0.937818,0.951213
Enterprise Value/EBITDA,-124.098,20.316154,13.72,-7.108341,-10.045044


In [172]:
get_top_5(df_e,'Enterprise Value/EBITDA',asc = True)

0    BKR
1    FTI
2    CXO
3    VLO
4    PSX
Name: code, dtype: object

## Communication Services

In [None]:
df_cser = get_performance_metrics(df_all,'Communication Services',1)

In [135]:
analyze_df(df_cser)

Unnamed: 0,top_20_percent,average,bottom_20_percent,top and avg diff,top and bottom diff
Trailing P/E,48.186,20.975238,5.473333,1.29728,7.803776
Forward P/E,32.758,18.2125,12.102,0.798655,1.706825
PEG Ratio,1.808,15.693478,1.654,-0.884793,0.093108
Price/Sales (ttm),4.794,2.66,0.67,0.802256,6.155224
Price/Book (mrq),7.266,3.3224,0.726,1.186973,9.008264
Enterprise Value/Revenue,18.49,11.57,5.132,0.598099,2.602884
Enterprise Value/EBITDA,45.788,33.3252,18.664,0.373975,1.453279


In [173]:
get_top_5(df_cser,'Price/Book (mrq)')

0     LYV
1    TTWO
2      FB
3     OMC
4      EA
Name: code, dtype: object