# Data Wrangling - Fundamental Data Cleaning and Conversion

Note: this workbook is 1 of 4 scraping and extracting processes that ultimately aggregate into the Technical Indicators workbook in the Data Wrangling phase of capstone 3


* Fundamental Scraper - scrapes 5 years worth of fundamental company financial data from MarketWatch using Beautiful Soup from the S&P 500 list
* <span style="color:red"> **Fundamental Calcs (this workbook)** </span> - imports scraped data from the scraper tool, converts text data to numeric using regular expressions - e.g. 5.00M becomes 5000, because M/B-suffixed amounts are expressed in thousands - and calculates additional financial metrics
* Analyst Scraper - scrapes analyst buy, sell, hold ratings for all S&P 500 stocks and downloads to .csv file
* Mass Yahoo Download and Technical Analysis - downloads 5 years of daily stock pricing data for the S&P 500, runs complex Directional Index, ADX, Bollinger Band, and other financial charting calculations, and merges data from the fundamental and analyst scrapers.

In [18]:
import pandas as pd
from datetime import date
import re
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os

# Put pandas' float display format back to its default, in case an earlier
# run in this session changed it.
pd.reset_option('display.float_format')

In [19]:
# --- File names -------------------------------------------------------------
# Output written by the last cell of this workbook.
exportname = 'Fundamental_Final.csv'
# Company list that is read alongside the scraped data.
companylist = 'SandP.csv'
# Scraper output: annual (A) and quarterly (Q) fundamentals.
filenameA = 'Fundamentals_A.csv'
filenameQ = 'Fundamentals_Q.csv'

# --- Folders ----------------------------------------------------------------
# Where the scraped CSVs are read from.
filepath = r'C:\Users\nmur1\Google Drive\Springboard\Capstone2\CleanData'
# Where the company list lives.
companylistpath = r'C:\Users\nmur1\Google Drive\Springboard\Capstone2\Stock Import Lists'
# Where the final CSV is written.
exportpath = r'C:\Users\nmur1\Google Drive\Springboard\Capstone2\CleanData'

# Start in the scraped-data folder so the next cell can open the CSVs by name.
os.chdir(filepath)

**Run the Scraper Notebook first to export Fundamentals_A.csv and Fundamentals_Q.csv to the working directory**

In [20]:
#import scraped stocks

# Open the files through their full paths rather than relying on the current
# working directory: this cell changes directory further down, so a re-run
# would otherwise look for the Fundamentals files in the wrong folder.
df = pd.read_csv(os.path.join(filepath, filenameA), index_col = 0) # annual data (original note: YE2019)
dfQ = pd.read_csv(os.path.join(filepath, filenameQ), index_col = 0) # quarterly data (original note: MRQ)
#PY_PRICE = pd.read_csv('YEPrices.csv',encoding= 'unicode_escape')
#Rolling = pd.read_csv('RollingPrices.csv', encoding = 'unicode_escape')

# Cells holding a bare '-' (presumably the scraper's "no value" placeholder)
# become 0. reset_index() turns the first CSV column, read as the index above,
# back into an ordinary column; later cells address it as 'index'.
df = df.replace('-', 0).reset_index()
dfQ = dfQ.replace('-', 0).reset_index()

#Get List of S&P
# The chdir is kept so the working directory after this cell is unchanged.
os.chdir(companylistpath)
companies = pd.read_csv(os.path.join(companylistpath, companylist), encoding= 'unicode_escape')
# Grab the ticker column under its original header before the headers are renamed.
Symbol = companies.Symbol
companies.columns = ['Ticker', 'Name', 'Sector']

### Functions to Convert Numerical Data Displayed as Text to Actual Numbers

In [21]:

def pconvert(x):
    """Return ``x`` as text with every percent sign and comma removed.

    The result is still a string; callers cast it to a number themselves.
    """
    text = str(x)
    for unwanted in ('%', ','):
        text = text.replace(unwanted, '')
    return text


import math

# First number in the text (digits with an optional decimal part, or a bare
# ".5"), optionally followed by a magnitude suffix: B (billions) or M (millions).
_AMOUNT_PATTERN = re.compile(r'(\d+(?:\.\d+)?|\.\d+)\s*([BM])?')


def convert(string):
    """Convert a scraped financial figure into a float.

    Parameters
    ----------
    string : str or number
        Either a value that is already numeric, or text such as ``'5.00M'``,
        ``'(1.50B)'``, ``'-2.50M'`` or ``'0.45'``.

    Returns
    -------
    float
        * Numeric input is returned unchanged as a float; NaN, infinities and
          ``None`` become ``0.0``.
        * Text is parsed as ``sign * number * multiplier`` and rounded to two
          decimals. The multiplier is 1,000 for an ``M`` suffix and 1,000,000
          for a ``B`` suffix, so suffixed amounts come out in thousands;
          text without a suffix is taken at face value.
        * The value is negative when the number is wrapped in parentheses
          (accounting notation) or preceded by a minus sign.
        * Text that contains no number at all yields ``0.0``.
    """
    if not isinstance(string, str):
        # Already a number (pandas parsed the cell): pass it straight through.
        try:
            number = float(string)
        except (TypeError, ValueError):
            # None / pd.NA and other non-numeric objects count as "no value".
            return 0.0
        # Missing data must not leak NaN/inf into the integer casts downstream.
        return number if math.isfinite(number) else 0.0

    # Thousands separators would otherwise cut the number at the comma.
    text = string.replace(',', '')
    found = _AMOUNT_PATTERN.search(text)
    if found is None:
        return 0.0

    digits, suffix = found.groups()
    before = text[:found.start()]
    after = text[found.end():]

    # Negative if enclosed in parentheses or prefixed with a minus sign.
    negative = ('(' in before and ')' in after) or '-' in before
    sign = -1 if negative else 1

    if suffix == 'B':
        # Billions, expressed in thousands.
        mult = 1000000
    elif suffix == 'M':
        # Millions, expressed in thousands.
        mult = 1000
    else:
        mult = 1

    return round(float(digits) * sign * mult, 2)

In [22]:
#apply conversion functions and other key financial metric calculations including Debt to Cash, ROE, Total Debt

def DataConversions(data):
    """Turn the scraped text columns into numbers and add derived metrics.

    Parameters
    ----------
    data : pandas.DataFrame
        Scraped fundamentals. Must contain the columns ``epsgrowth``, ``roa``,
        ``eps``, ``netincome``, ``shareholderequity``, ``longtermdebt``,
        ``interestexpense``, ``ebitda``, ``ST Debt``, ``Cash``, ``Sales`` and
        ``Shares``.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is left untouched) in which

        * ``epsgrowth`` and ``roa`` are floats with '%' and ',' stripped,
        * ``eps``, ``Sales`` and ``Shares`` are floats produced by ``convert``,
        * the remaining balance-sheet/income columns are ``int64``,
        * rows whose ``Cash`` is not strictly positive are dropped,
        * ``TotalDebt`` = ``longtermdebt`` + ``ST Debt``,
        * ``D2C`` = ``TotalDebt`` / ``Cash``,
        * ``ROE`` = ``netincome`` / ``shareholderequity`` (not guarded
          against a zero denominator).
    """
    # Work on a private copy so the caller's frame is never modified in place.
    data = data.copy()

    # Percentage-style columns: strip the formatting, then cast.
    for column in ('epsgrowth', 'roa'):
        data[column] = data[column].apply(pconvert).astype(float)

    data['eps'] = data['eps'].apply(convert).astype(float)

    # Amount columns are stored as whole numbers.
    whole_number_columns = (
        'netincome',
        'shareholderequity',
        'longtermdebt',
        'interestexpense',
        'ebitda',
        'ST Debt',
        'Cash',
    )
    for column in whole_number_columns:
        data[column] = data[column].apply(convert).astype('int64')

    # Remove companies with no cash (this also keeps D2C's denominator positive).
    # The explicit copy makes the filtered frame independent, so the column
    # assignments below are not chained assignments on a slice.
    data = data[data['Cash'] > 0].copy()

    data['TotalDebt'] = data['longtermdebt'] + data['ST Debt']
    data['D2C'] = data['TotalDebt'] / data['Cash']
    data['ROE'] = data['netincome'] / data['shareholderequity']

    # Converted after the filter, as before, so only surviving rows are parsed.
    for column in ('Sales', 'Shares'):
        data[column] = data[column].apply(convert).astype(float)

    return data

In [23]:
# convert the text to numbers based on my functions
import warnings

# Silence warnings only while the conversions run. A bare
# warnings.filterwarnings('ignore') would hide every warning raised by any
# later cell for the rest of the session.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    df = DataConversions(df)
    dfQ = DataConversions(dfQ)


In [24]:
# Trailing-12-month EPS.
# Drop the rows labelled 'MRQ-4' (presumably the fifth-oldest quarter, leaving
# four quarters per ticker - confirm against the scraper's labels), then add
# up the remaining quarterly EPS for each ticker.
dfQ = dfQ[~(dfQ['index'] == 'MRQ-4')].sort_values(by = 'Ticker')
epsQ = dfQ[['Ticker', 'eps']].groupby('Ticker').sum()
epsQ.columns = ['epsTTM']

# Most recent quarter only: debt-to-cash ratio, total debt and EPS growth,
# renamed with an MRQ_ prefix so they do not collide with the annual columns.
MRQ_D2C = dfQ.loc[dfQ['index'] == 'MRQ', ['Ticker', 'D2C', 'TotalDebt', 'epsgrowth']]
MRQ_D2C.columns = ['Ticker', 'MRQ_D2C', 'MRQ_TotalDebt', 'MRQ_epsGrowth']


In [25]:
# Attach the quarterly-derived columns to the annual data. Left joins keep
# every annual row even when a ticker has no quarterly figures.
new = pd.merge(
    left = pd.merge(left = df, right = epsQ, how = 'left', on = 'Ticker'),
    right = MRQ_D2C,
    how = 'left',
    on = 'Ticker',
)

# Add company name and sector. This is pandas' default inner join, so tickers
# missing from the company list are dropped.
new = pd.merge(left = new, right = companies, on = 'Ticker')

# Write the result into the export folder.
os.chdir(exportpath)
new.to_csv(exportname)