# Data Wrangling - MarketWatch Company Fundamental Data Using Beautiful Soup

Note: this workbook is 1 of 4 scraping and extracting processes that ultimately aggregate into the Technical Indicators workbook in the Data Wrangling phase of capstone 3


* <span style="color:red"> **Fundamental Scraper (this book)** </span> - uses Beautiful Soup to scrape 5 years' worth of fundamental company financial data from MarketWatch for every company on the S&P 500 list
* Fundamental Calcs - imports scraped data from the scraper tool, converts text data to numeric - i.e. 5.00M to 5000000 - using regular expressions, and calculates additional financial metrics
* Analyst Scraper - scrapes analyst buy, sell, hold ratings for all S&P 500 stocks and downloads to .csv file
* Mass Yahoo Download and Technical Analysis - downloads 5 years of daily stock pricing data for the S&P 500, runs complex Directional Index, ADX, Bollinger Band, and other financial charting calculations, and merges in the data from the fundamental and analyst scrapers

In [18]:
# Input folder, ticker-list file name, and base name for the exported CSVs
import os

exportname = 'Fundamentals'
filename = 'SandP.csv'
importpath = r'C:\Users\nmur1\Google Drive\Springboard\Capstone2\Stock Import Lists'

# Switch into the import folder so `filename` can be opened by its bare name,
# then echo the working directory as a sanity check.
os.chdir(importpath)
print(os.getcwd())

C:\Users\nmur1\Google Drive\Springboard\Capstone2\Stock Import Lists


In [19]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
from datetime import date
import re

#got started with code from medium article: https://link.medium.com/siL0vVKhp7

# Load the S&P constituent list and keep its ticker column for the scrape loops


companies = pd.read_csv(
    filename,
    encoding='unicode_escape',
)
Symbol = companies['Symbol']


def _row_values(title):
    """Return the non-empty texts of the 'valueCell' siblings that follow *title*."""
    return [td.text for td in title.find_next_siblings(attrs={'class': 'valueCell'}) if td.text]


def _collect_rows(soup, labels):
    """Map each label to the value rows of every 'rowTitle' cell containing it.

    Matching is by substring, so a single row title can feed several labels and
    a single label can collect several rows; rows are kept in page order.
    """
    rows = {label: [] for label in labels}
    for title in soup.find_all('td', {'class': 'rowTitle'}):
        title_text = title.text
        for label in labels:
            if label in title_text:
                rows[label].append(_row_values(title))
    return rows


def getfinancialreportingdf(ticker, timeframe = 'A', timeout = 30):
    """Scrape MarketWatch income-statement and balance-sheet rows for one ticker.

    Parameters
    ----------
    ticker : str
        Symbol inserted into the MarketWatch URL.
    timeframe : str
        'Q' requests the quarterly pages and labels the rows 'MRQ-4' .. 'MRQ';
        any other value requests the annual pages and labels the rows with the
        five calendar years preceding the current one.
    timeout : float
        Seconds passed to ``requests.get`` for each page, so a stalled
        connection raises instead of blocking the caller indefinitely.

    Returns
    -------
    pandas.DataFrame
        One row per period with the columns eps, epsgrowth, netincome,
        shareholderequity, roa, longtermdebt, interestexpense, ebitda,
        'ST Debt', 'Cash', 'Sales' and 'Shares'. Values are the raw cell texts;
        a metric whose row title is not found on the page becomes 0 for every
        period (see ``getelementinlist``).

    Notes
    -----
    Every metric takes the first row whose title contains its label, except
    'epsgrowth' and 'roa', which take the second row matching 'EPS (Basic)' and
    "Total Shareholders' Equity" respectively.
    NOTE(review): the second equity row looks like an equity-to-assets ratio
    rather than return on assets -- confirm; the column name is kept because
    downstream workbooks read it.
    pandas raises ValueError when a scraped row's length differs from the
    number of period labels.
    """
    base = 'https://www.marketwatch.com/investing/stock/' + ticker + '/financials'
    if timeframe == 'Q':
        urlfinancials = base + '/income/quarter'
        urlbalancesheet = base + '/balance-sheet/quarter'
        tfIndex = ['MRQ-4', 'MRQ-3', 'MRQ-2', 'MRQ-1', 'MRQ']
    else:
        urlfinancials = base
        urlbalancesheet = base + '/balance-sheet'
        # Read the clock once so both ends of the range use the same year.
        this_year = date.today().year
        tfIndex = range(this_year - 5, this_year)

    # Download both pages before parsing either one.
    text_soup_financials = BeautifulSoup(requests.get(urlfinancials, timeout=timeout).text, "lxml")
    text_soup_balancesheet = BeautifulSoup(requests.get(urlbalancesheet, timeout=timeout).text, "lxml")

    # Income statement
    income = _collect_rows(
        text_soup_financials,
        ('EPS (Basic)', 'Net Income', 'Interest Expense', 'EBITDA', 'Sales',
         'Basic Shares Outstanding'),
    )

    # Balance sheet
    balance = _collect_rows(
        text_soup_balancesheet,
        ('Total Shareholders\' Equity', 'Long-Term Debt',
         'ST Debt & Current Portion LT Debt', 'Cash'),
    )

    eps_rows = income['EPS (Basic)']
    equity_rows = balance['Total Shareholders\' Equity']

    # Don't forget to add in roe, interest coverage ratio

    # Column order is part of the exported CSV layout; keep it stable.
    return pd.DataFrame(
        {
            'eps': getelementinlist(eps_rows, 0),
            'epsgrowth': getelementinlist(eps_rows, 1),
            'netincome': getelementinlist(income['Net Income'], 0),
            'shareholderequity': getelementinlist(equity_rows, 0),
            'roa': getelementinlist(equity_rows, 1),
            'longtermdebt': getelementinlist(balance['Long-Term Debt'], 0),
            'interestexpense': getelementinlist(income['Interest Expense'], 0),
            'ebitda': getelementinlist(income['EBITDA'], 0),
            'ST Debt': getelementinlist(balance['ST Debt & Current Portion LT Debt'], 0),
            'Cash': getelementinlist(balance['Cash'], 0),
            'Sales': getelementinlist(income['Sales'], 0),
            'Shares': getelementinlist(income['Basic Shares Outstanding'], 0),
        },
        index=tfIndex,
    )

def getelementinlist(list,element):
    """Return ``list[element]``, or 0 when that lookup is not possible.

    Parameters
    ----------
    list : sequence or mapping
        Container to read from (the name shadows the builtin; it is kept so
        existing keyword callers keep working).
    element : index or key
        Position or key to look up.

    Returns
    -------
    The stored item, or 0 when the index/key is missing or the container
    cannot be subscripted (for example ``None``).
    """
    try:
        return list[element]
    # Only lookup failures fall back to 0; KeyboardInterrupt and other
    # unrelated errors propagate instead of being swallowed.
    except (IndexError, KeyError, TypeError):
        return 0





In [20]:
# Spot-check the scraper on a single ticker (quarterly view) before the full run
getfinancialreportingdf('FB', 'Q')

Unnamed: 0,eps,epsgrowth,netincome,shareholderequity,roa,longtermdebt,interestexpense,ebitda,ST Debt,Cash,Sales,Shares
MRQ-4,0.92,-,2.62B,88.76B,75.86%,7.47B,-,8.13B,731M,48.61B,16.89B,2.86B
MRQ-3,2.13,132.91%,6.09B,94B,75.55%,8.76B,-,8.6B,802M,52.28B,17.65B,2.85B
MRQ-2,2.58,20.70%,7.35B,101.05B,75.77%,9.93B,-,10.33B,1.15B,54.86B,21.08B,2.85B
MRQ-1,1.72,-33.25%,4.9B,105.3B,76.10%,9.93B,-,7.49B,880M,60.43B,17.74B,2.85B
MRQ,1.82,5.66%,5.18B,110.45B,79.07%,10.07B,-,7.67B,937M,58.55B,18.69B,2.85B


<font color=red> **Note: each loop takes approximately 10 to 15 minutes to complete. The code prints which stock it is on in the output, e.g. '1 of 505', so you can just let it run in the background.** </font>

In [21]:
# run quarterly financials
# Scrape every ticker in Symbol and collect one DataFrame per ticker in SPListQ.

SPListQ = []
counter = 0

for counter, ticker in enumerate(Symbol, start=1):
    try:
        df = getfinancialreportingdf(ticker, timeframe = 'Q')
    # Exception (not a bare except) so a kernel interrupt can still stop the
    # long-running loop; the reason is printed so failures can be diagnosed.
    except Exception as exc:
        # str() keeps the message from failing on a non-string ticker (e.g. NaN).
        print('Error Downloading: ' + str(ticker) + ' (' + repr(exc) + ')')
        continue

    df['Ticker'] = ticker
    SPListQ.append(df)
    print(str(counter) + ' of ' + str(len(Symbol)))
        





1 of 505
2 of 505
3 of 505
4 of 505
5 of 505
6 of 505
7 of 505
8 of 505
9 of 505
10 of 505
11 of 505
12 of 505
13 of 505
14 of 505
15 of 505
16 of 505
17 of 505
18 of 505
19 of 505
20 of 505
21 of 505
22 of 505
23 of 505
24 of 505
25 of 505
26 of 505
27 of 505
28 of 505
29 of 505
30 of 505
31 of 505
32 of 505
33 of 505
34 of 505
35 of 505
36 of 505
37 of 505
38 of 505
39 of 505
40 of 505
41 of 505
42 of 505
43 of 505
44 of 505
45 of 505
46 of 505
47 of 505
48 of 505
49 of 505
50 of 505
51 of 505
52 of 505
53 of 505
54 of 505
55 of 505
56 of 505
57 of 505
58 of 505
59 of 505
60 of 505
61 of 505
62 of 505
63 of 505
64 of 505
65 of 505
66 of 505
67 of 505
68 of 505
69 of 505
70 of 505
71 of 505
72 of 505
73 of 505
74 of 505
75 of 505
76 of 505
77 of 505
78 of 505
79 of 505
80 of 505
81 of 505
82 of 505
83 of 505
84 of 505
85 of 505
86 of 505
87 of 505
88 of 505
89 of 505
90 of 505
91 of 505
92 of 505
93 of 505
94 of 505
95 of 505
96 of 505
97 of 505
98 of 505
99 of 505
100 of 505
101 of 5

In [22]:
# run annual financials
# Scrape every ticker in Symbol and collect one DataFrame per ticker in SPList.

SPList = []
counter = 0

for counter, ticker in enumerate(Symbol, start=1):
    try:
        df = getfinancialreportingdf(ticker)
    # Exception (not a bare except) so a kernel interrupt can still stop the
    # long-running loop; the reason is printed so failures can be diagnosed.
    except Exception as exc:
        # str() keeps the message from failing on a non-string ticker (e.g. NaN).
        print('Error Downloading: ' + str(ticker) + ' (' + repr(exc) + ')')
        continue

    df['Ticker'] = ticker
    SPList.append(df)
    print(str(counter) + ' of ' + str(len(Symbol)))
        

1 of 505
2 of 505
3 of 505
4 of 505
5 of 505
6 of 505
7 of 505
8 of 505
9 of 505
10 of 505
11 of 505
12 of 505
13 of 505
14 of 505
15 of 505
16 of 505
17 of 505
18 of 505
19 of 505
20 of 505
21 of 505
22 of 505
23 of 505
24 of 505
25 of 505
26 of 505
27 of 505
28 of 505
29 of 505
30 of 505
31 of 505
32 of 505
33 of 505
34 of 505
35 of 505
36 of 505
37 of 505
38 of 505
39 of 505
40 of 505
41 of 505
42 of 505
43 of 505
44 of 505
45 of 505
46 of 505
47 of 505
48 of 505
49 of 505
50 of 505
51 of 505
52 of 505
53 of 505
54 of 505
55 of 505
56 of 505
57 of 505
58 of 505
59 of 505
60 of 505
61 of 505
62 of 505
63 of 505
64 of 505
65 of 505
66 of 505
67 of 505
68 of 505
69 of 505
70 of 505
71 of 505
72 of 505
73 of 505
74 of 505
75 of 505
76 of 505
77 of 505
78 of 505
79 of 505
80 of 505
81 of 505
82 of 505
83 of 505
84 of 505
85 of 505
86 of 505
87 of 505
88 of 505
89 of 505
90 of 505
91 of 505
92 of 505
93 of 505
94 of 505
95 of 505
96 of 505
97 of 505
98 of 505
99 of 505
100 of 505
101 of 5

In [23]:
# Switch to the clean-data folder so the CSV exports below are written there,
# and echo the working directory to confirm the move.
filepath = r'C:\Users\nmur1\Google Drive\Springboard\Capstone2\CleanData'
os.chdir(filepath)
print(os.getcwd())

C:\Users\nmur1\Google Drive\Springboard\Capstone2\CleanData


In [24]:
# Stack the per-ticker frames row-wise: annual into StockDF, quarterly into StockDFQ
StockDF = pd.concat(objs=SPList, axis=0)
StockDFQ = pd.concat(objs=SPListQ, axis=0)

In [25]:
# Write the annual (_A) and quarterly (_Q) tables into the current working directory
StockDF.to_csv('{}_A.csv'.format(exportname))
StockDFQ.to_csv('{}_Q.csv'.format(exportname))