# Data Wrangling - Mass Stock Price Download using yfinance package, Technical Analysis, and Final Data Merge

Note: This workbook is one of four scraping and extraction processes that ultimately feed into the Technical Indicators workbook in the Data Wrangling phase of Capstone 3.


* Fundamental Scraper - scrapes 5 years worth of fundamental company financial data from MarketWatch using Beautiful Soup from the S&P 500 list
* Fundamental Calcs - imports scraped data from the scraper tool, converts text data to numeric values (e.g. 5.00M to 5000000) using regular expressions, and calculates additional financial metrics
* Analyst Scraper - scrapes analyst buy, sell, hold ratings for all S&P 500 stocks and downloads to .csv file

* <span style="color:red"> **Mass Yahoo Download (this book)** </span> - downloads 5 years of daily stock pricing data for the S&P 500, runs complex Directional Index, ADX, Bollinger Band, and other financial charting calculations, and merges in the data from the fundamental and analyst scrapers

In [3]:
import pandas as pd
import yfinance as yf
import numpy as np
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from matplotlib import pyplot as plt
import seaborn as sns
import os




In [4]:
# Folder/file locations for this run. The process working directory is switched
# to the import folder, so the relative file names used below resolve there.
importpath = r'C:\Users\nmur1\Google Drive\Springboard\Capstone2\Stock Import Lists'
importfile = 'SandP.csv'
exportpath = r'C:\Users\nmur1\Google Drive\Springboard\Capstone2\CleanData'
# NOTE(review): exportfile is not referenced anywhere in the visible cells.
exportfile = 'Momentum'
os.chdir(importpath)

# Ticker list; the download loop reads its `Symbol` column.
stocks = pd.read_csv(importfile, encoding= 'unicode_escape')
# Passed to yf.download as the second positional argument in the download loop.
sdate = '2015-01-01'

In [5]:
# Downloads stock data one ticker at a time and collects the frames in `m`.
# yfinance has the option to take an entire list at once; however, that method
# tends to time out when a large set of stocks is requested, so each symbol is
# fetched separately.

m = []
tickers = stocks.Symbol
n = 1
for t in tickers:
    try:
        # Only the network call sits inside the try so unrelated errors surface.
        df = yf.download(t, sdate, progress = False, group_by = 'Ticker')
    except Exception as exc:
        # `Exception` (not a bare except) lets Ctrl-C / kernel interrupts stop
        # the loop, and the reason for the failure is reported.
        print(t, ' not found:', exc)
        continue

    if df.empty:
        # yfinance reports a failed symbol by returning an empty frame rather
        # than raising; skip it instead of appending an empty frame.
        print(t, ' not found')
        continue

    df['Ticker'] = t
    m.append(df)
    n = n+1
    if n % 100 == 0:
        print('Status Check: ', n, ' printed')


1 Failed download:
- BRK.B: No data found, symbol may be delisted

1 Failed download:
- BF.B: No data found for this date range, symbol may be delisted
Status Check:  100  printed
Status Check:  200  printed
Status Check:  300  printed
Status Check:  400  printed
Status Check:  500  printed


### Quantitative Analysis

In [6]:
# Make an archive of the daily pricing before running the Quantitative Analysis
# functions, because they take a while. All per-ticker frames collected in `m`
# are stacked into one frame and written to the current working directory.
DailyPrices = pd.concat(m)
DailyPrices.to_csv('Daily Pricing Detail.csv')

In [11]:
# Move the index into a regular column; Momentum() later calls
# set_index('Date'), so it expects `Date` to be a column.
DailyPrices = DailyPrices.reset_index()

In [12]:
#Quantitative Analysis Functions



def TR(row, axis = 1):
    """Return the true range for a single price row.

    The true range is the largest of: today's high minus low, the absolute
    gap between today's high and the previous close, and the absolute gap
    between today's low and the previous close.

    Parameters
    ----------
    row : mapping
        Must provide ``'High'``, ``'Low'`` and ``'yC'`` (previous close).
    axis : int, optional
        Not used by the body; kept so existing call sites keep working.

    Returns
    -------
    The true range, in the same units as the prices.
    """
    high = row['High']
    low = row['Low']
    prev_close = row['yC']

    # (high - low) must stay first: when prev_close is NaN every comparison
    # against NaN is False, so max() keeps its first argument and the row
    # falls back to the plain high-low range instead of NaN.
    return max((high - low), abs(high - prev_close), abs(low - prev_close))

def OBV(row, axis = 1):
    """Return one row's signed volume contribution to On-Balance Volume.

    The original definition was unfinished: the ``def`` line lacked its
    colon (a SyntaxError), the body added an int to a list, and nothing was
    returned. This version returns the per-row step, so a caller can build
    the running OBV with a cumulative sum of the results.
    NOTE(review): no visible cell calls OBV - confirm this is the intended
    contract before wiring it in.

    Parameters
    ----------
    row : mapping
        Must provide ``'Close'``, ``'yC'`` (previous close) and ``'Volume'``.
    axis : int, optional
        Not used by the body; kept for signature parity with the other
        row-wise helpers.

    Returns
    -------
    ``+Volume`` when the close rose, ``-Volume`` when it fell, and ``0``
    when it was unchanged or the previous close is NaN.
    """
    close = row['Close']
    prev_close = row['yC']
    volume = row['Volume']

    if close > prev_close:
        return volume
    if close < prev_close:
        return -volume
    # Unchanged close, or a NaN previous close (both comparisons are False).
    return 0

def DM(row, axis = 1, d = 'PDM'):
    """Return the positive or negative directional movement for one row.

    The upward move is today's high minus yesterday's high; the downward
    move is yesterday's low minus today's low. A move only counts when it is
    positive and strictly larger than the opposite move; otherwise 0 is
    returned.

    Parameters
    ----------
    row : mapping
        Must provide ``'High'``, ``'yH'``, ``'Low'`` and ``'yL'``.
    axis : int, optional
        Not used by the body.
    d : str, optional
        ``'PDM'`` selects the positive movement; any other value selects the
        negative movement.
    """
    up_move = row['High'] - row['yH']
    down_move = row['yL'] - row['Low']

    # Pick which move is being asked for and which one it competes against.
    if d == 'PDM':
        wanted, opposite = up_move, down_move
    else:
        wanted, opposite = down_move, up_move

    if wanted > 0 and wanted > opposite:
        return wanted
    return 0

    
def Smoothed(Metric, period, ADX = False):
    """Apply Wilder-style smoothing to a series.

    The first ``period - 1`` outputs are 0, position ``period - 1`` holds a
    seed value, and every later output is
    ``(previous * (period - 1) + current) / period``.

    Parameters
    ----------
    Metric : pandas.Series
        Values to smooth, in chronological order. Any index type is accepted;
        the seed is read by position.
    period : int
        Smoothing period.
    ADX : bool, optional
        When false, the seed is the rolling mean ending at position
        ``period - 1``. When true, the seed is the rolling mean ending at
        position ``2 * period - 1``.

    Returns
    -------
    numpy.ndarray
        Smoothed values, the same length as ``Metric``.

    Raises
    ------
    IndexError
        If ``Metric`` is too short to contain the seed position.
    """
    # NOTE(review): in ADX mode the seed is taken from the window ending at
    # 2*period-1 but is written at position period-1, so the first outputs
    # depend on later rows. Preserved as-is; confirm whether that look-ahead
    # is intended.
    seed_position = period * 2 - 1 if ADX else period - 1

    # .iloc makes the lookup positional; plain [] with an integer on a Series
    # that has a non-integer index (e.g. dates) relies on a deprecated fallback.
    seed = Metric.rolling(window = period).mean().iloc[seed_position]

    values = list(Metric)
    weight = period - 1
    lstSmoothed = np.zeros(len(values))  # warm-up rows stay 0
    lstSmoothed[weight] = seed

    for i in range(weight + 1, len(values)):
        lstSmoothed[i] = (lstSmoothed[i - 1] * weight + values[i]) / (weight + 1)

    return lstSmoothed

def OLS_Slope(y, lookback = 7):
    """Return the ordinary-least-squares slope of ``y`` against 0..lookback-1.

    Uses the closed form ``cov(x, y) / var(x)`` for a one-regressor model
    with an intercept. This gives the same slope as fitting the model with a
    regression library, without building a full fit for every rolling window
    and without indexing a labelled parameter vector by position.

    Parameters
    ----------
    y : array-like
        Exactly ``lookback`` observations, oldest first.
    lookback : int, optional
        Number of observations in the window.

    Returns
    -------
    float
        Change in ``y`` per step.

    Raises
    ------
    ValueError
        If ``y`` does not contain exactly ``lookback`` values.
    """
    y = np.asarray(y, dtype = float).ravel()
    if y.size != lookback:
        raise ValueError(
            'OLS_Slope expected %d observations, got %d' % (lookback, y.size)
        )

    x = np.arange(lookback, dtype = float)
    x_dev = x - x.mean()
    y_dev = y - y.mean()

    return float(np.dot(x_dev, y_dev) / np.dot(x_dev, x_dev))

def OLS_R(y, lookback = 7):
    """Return R-squared of the least-squares line of ``y`` against 0..lookback-1.

    For a one-regressor model with an intercept, R-squared equals the squared
    correlation between ``x`` and ``y``; that closed form is used here so no
    full regression fit is built for every rolling window.

    Parameters
    ----------
    y : array-like
        Exactly ``lookback`` observations, oldest first.
    lookback : int, optional
        Number of observations in the window.

    Returns
    -------
    float
        R-squared in [0, 1], or NaN when ``y`` (or ``x``) has zero variance,
        since the ratio is 0/0 there.

    Raises
    ------
    ValueError
        If ``y`` does not contain exactly ``lookback`` values.
    """
    y = np.asarray(y, dtype = float).ravel()
    if y.size != lookback:
        raise ValueError(
            'OLS_R expected %d observations, got %d' % (lookback, y.size)
        )

    x = np.arange(lookback, dtype = float)
    x_dev = x - x.mean()
    y_dev = y - y.mean()

    sxx = np.dot(x_dev, x_dev)
    syy = np.dot(y_dev, y_dev)
    if sxx == 0 or syy == 0:
        # A flat window has no variance to explain.
        return float('nan')

    sxy = np.dot(x_dev, y_dev)
    return float(sxy * sxy / (sxx * syy))

In [57]:
def Momentum(dfPrices):
    """Compute directional-movement and Bollinger-band indicators per ticker.

    For every ticker the function derives the previous-day columns, the
    directional movements and true range (via ``DM`` and ``TR``), smooths
    them over 14 rows with ``Smoothed``, and builds DI+, DI- and ADX. It then
    adds the 7-row regression slope and R-squared of DI+ (``OLS_Slope`` and
    ``OLS_R``) and 20-row Bollinger statistics of the close.

    Parameters
    ----------
    dfPrices : pandas.DataFrame
        Long-format daily prices with the columns ``Date``, ``Ticker``,
        ``High``, ``Low``, ``Close`` and ``Volume``. Rows are not re-sorted
        here, so they are assumed to be chronological within each ticker.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``Date`` with the columns ``Close``, ``DI_Plus``,
        ``DI_Neg``, ``ADX``, ``DI_Plus_Slope``, ``DI_Plus_R``, ``SMA``,
        ``UpperB``, ``LowerB``, ``Off_SMA``, ``Off_LB`` and ``Symbol``.
        Warm-up rows contain NaN and are left for the caller to drop.
    """
    m = []
    # value_counts() orders tickers by descending row count; the output keeps
    # that order.
    tickers = list(dfPrices.Ticker.value_counts().index)
    dfPrices = dfPrices.set_index('Date')

    for t in tickers:
        # drop() returns a new frame, so the column assignments below do not
        # write into a view of dfPrices.
        df = dfPrices[dfPrices.Ticker == t].drop(columns = 'Ticker')

        # Previous-row values needed by DM / TR.
        df['yH'] = df['High'].shift(1)
        df['yL'] = df['Low'].shift(1)
        df['yC'] = df['Close'].shift(1)
        df['yv'] = df['Volume'].shift(1)

        df['PDM'] = df.apply(DM, axis = 1, d = 'PDM')
        df['NDM'] = df.apply(DM, axis = 1, d = 'NDM')
        df['TR'] = df.apply(TR, axis = 1)

        ATR = Smoothed(df['TR'], 14)
        PDM_Smooth = Smoothed(df['PDM'], 14)
        NDM_Smooth = Smoothed(df['NDM'], 14)

        # The warm-up rows are 0/0; they become NaN on purpose, so silence the
        # numpy warnings locally instead of relying on a global filter.
        with np.errstate(divide = 'ignore', invalid = 'ignore'):
            DI_Plus = PDM_Smooth / ATR * 100
            DI_Neg = NDM_Smooth / ATR * 100
            DI_Index = abs(DI_Plus - DI_Neg) / abs(DI_Plus + DI_Neg) * 100
        ADX = Smoothed(pd.Series(DI_Index), 14, ADX = True)

        # Build the result as a fresh frame, not a column slice of `df`, so
        # the assignments below never hit a chained-assignment warning.
        indicators = pd.DataFrame(
            {'DI_Plus': DI_Plus, 'DI_Neg': DI_Neg, 'ADX': ADX},
            index = df.index,
        )
        dfNew = pd.concat([df[['Close']], indicators], axis = 1)

        dfNew['DI_Plus_Slope'] = dfNew.DI_Plus.rolling(7).apply(OLS_Slope)
        dfNew['DI_Plus_R'] = dfNew.DI_Plus.rolling(7).apply(OLS_R)

        # Bollinger bands: 20-row mean +/- 2 population standard deviations.
        # The deviation is computed once with the built-in rolling std and
        # reused for both bands.
        close_window = dfNew['Close'].rolling(window = 20)
        band_width = close_window.std(ddof = 0) * 2
        dfNew['SMA'] = close_window.mean()
        dfNew['UpperB'] = dfNew.SMA + band_width
        dfNew['LowerB'] = dfNew.SMA - band_width

        # Percentage distance of the close from the mean and the lower band.
        dfNew['Off_SMA'] = (dfNew.Close - dfNew.SMA)/dfNew.SMA * 100
        dfNew['Off_LB'] = (dfNew.Close - dfNew.LowerB)/dfNew.LowerB * 100
        dfNew['Symbol'] = t
        m.append(dfNew)

    return pd.concat(m)

In [5]:
# Import here if you already have a stock download: reloads the archive
# written by the earlier to_csv call instead of downloading again.
os.chdir(importpath)
DailyPrices = pd.read_csv('Daily Pricing Detail.csv')

In [58]:
import warnings
# NOTE(review): this silences every warning for the rest of the session, not
# only the ones raised while Momentum() runs.
warnings.filterwarnings('ignore')
# Show floats with four decimals in notebook output.
pd.options.display.float_format = '{:.4f}'.format

# Run the per-ticker indicator calculations on the daily prices.
Quant = Momentum(DailyPrices)

# Run Final Analysis and Calculations. PE Ratio, Sales to Earnings, Industry Benchmarks

In [59]:
# Remove every row that has a NaN in any column (e.g. rolling-window warm-up
# rows), then display the remaining shape.
Quant = Quant.dropna()
Quant.shape

(683855, 12)

In [60]:
# Turn the Date index back into a column; the merge cell reads Quant['Date'].
Quant = Quant.reset_index()

In [61]:


importpath = r'C:\Users\nmur1\Google Drive\Springboard\Capstone2\CleanData'
os.chdir(importpath)

# Import datasets from cleaning.
fund = 'Fundamental_Final.csv'
analyst = 'Analysts.csv'
# NOTE(review): `quant` is not read in this cell; the in-memory Quant frame is
# used instead.
quant = 'Historical Quant Prices.csv'

dfFund = pd.read_csv(fund).drop(columns = ['Unnamed: 0'])
dfanalyst = pd.read_csv(analyst).drop(columns = ['Unnamed: 0'])

# rename() returns a new frame, so the columns added below do not touch Quant.
dfquant = Quant.rename(columns = {'Symbol':'Ticker'})

# Join key "<year>_<ticker>", built with vectorised string concatenation
# instead of a row-wise apply.
dfFund['Key'] = dfFund['index'].astype(str) + '_' + dfFund['Ticker'].astype(str)

# Use prior-year fundamentals to avoid future bias in the model.
# Note we're pulling based on year - 1.
dfquant['Year'] = pd.DatetimeIndex(dfquant['Date']).year - 1
dfquant['Key'] = dfquant['Year'].astype(str) + '_' + dfquant['Ticker'].astype(str)

# Filter specific fields from the fundamental sheet. copy() makes this an
# independent frame so the ROE assignment below is not a write onto a slice.
fields = ['Key','eps', 'ROE', 'Sector','D2C','epsgrowth', 'Sales', 'Shares']
dfFund = dfFund[fields].copy()

# Replace inf values from the ROE calculation.
dfFund['ROE'] = dfFund.ROE.replace([np.inf, -np.inf], 0)

# Pull percent buy and ticker from the analyst rating data source. rename()
# is used rather than assigning .columns on a slice of dfanalyst.
dfRating = dfanalyst[['Symbol', 'Percent_Buy']].rename(columns = {'Symbol': 'Ticker'})

# Merge everyone together. Both merges use the default inner join, so rows
# without a matching fundamentals key or analyst rating are dropped.
df = pd.merge(dfquant, dfFund, on = 'Key')
df = pd.merge(df, dfRating, on = 'Ticker')



In [62]:
# Parse Date into datetime values; the later cells build the date key and the
# month from it.
df['Date'] = pd.to_datetime(df['Date'])

In [70]:
# Flag the rows that fall on an earnings report date (E = 1, otherwise 0).
earnings_cal = pd.read_csv('Earnings_Calendar.csv')

# Key is ticker + date string on both sides.
# NOTE(review): this only matches when report_date is stored in the same text
# format that df.Date produces when cast to str - confirm against the CSV.
df['datekey'] = df['Ticker'] + df.Date.astype('str')
earnings_cal['datekey'] = earnings_cal['Ticker'] + earnings_cal.report_date.astype('str')

earnings_cal['E'] = 1
# Keep one row per key: a repeated calendar entry would otherwise duplicate
# the matching price row in the left merge below.
earnings_cal = earnings_cal[['E', 'datekey']].drop_duplicates(subset = 'datekey')
df = df.merge(earnings_cal, how = 'left', on = 'datekey')
df['E'] = df.E.fillna(0)



In [74]:
# Mark an "earnings season" window around every earnings row: the six rows
# before it, the row itself, and the six rows after it, per ticker.
df['E_Season'] = 0
stocks = df.Ticker.value_counts().index
for t in stocks:
    ticker_rows = df['Ticker'] == t
    e = df.loc[ticker_rows, 'E'].to_numpy()
    szn = np.zeros(len(e))

    for pos in np.flatnonzero(e == 1):
        # The start is clamped at 0 (a negative start would wrap around), and
        # numpy truncates a slice that runs past the end. That lets events in
        # the first or last rows of a ticker get their window too; the
        # previous boundary guards skipped them, including the event row.
        szn[max(pos - 6, 0):pos + 7] = 1

    df.loc[ticker_rows, 'E_Season'] = szn

In [75]:
# Price/earnings ratio, using the SMA column as the price and the eps column
# merged in from the fundamentals; infinite results (zero eps) are set to 0.
df['PE_Ratio'] = df.SMA/df.eps
df.PE_Ratio = df.PE_Ratio.replace([np.inf, -np.inf], 0)

# Attach the sector mean P/E to every row, then express each row's P/E
# relative to it.
AverageSectorPE = pd.DataFrame(df.groupby('Sector')['PE_Ratio'].mean()).reset_index()
AverageSectorPE.columns = ['Sector', 'AverageSectorPE']
df = pd.merge(df,AverageSectorPE, on = 'Sector')
df['Relative_PE'] = (df.PE_Ratio - df.AverageSectorPE) / df.AverageSectorPE
# Price-to-sales: SMA divided by sales per share; infinite results are set to 0.
df['Sales_Ratio'] = df.SMA/ (df.Sales/df.Shares)
df.Sales_Ratio = df.Sales_Ratio.replace([np.inf, -np.inf], 0)

# Same sector-relative treatment for the sales ratio.
AverageSectorSR = pd.DataFrame(df.groupby('Sector')['Sales_Ratio'].mean()).reset_index()
AverageSectorSR.columns = ['Sector', 'AverageSectorSR']
df = pd.merge(df,AverageSectorSR, on = 'Sector')
df['Relative_SR'] = (df.Sales_Ratio - df.AverageSectorSR) / df.AverageSectorSR
# Missing relative values are treated as "at the sector average" (0).
df['Relative_SR'] = df['Relative_SR'].fillna(0)
df['Relative_PE'] = df['Relative_PE'].fillna(0)
# NOTE(review): 'month' is listed in `todrop` in the next cell, so it does not
# reach dfFinal.
df['month'] = pd.DatetimeIndex(df['Date']).month



In [79]:
# Drop helper/intermediate columns; dfFinal is the frame written out by the
# export cell. The last line displays the remaining column names.
todrop = ['DI_Neg','Year', 'Key','Shares','datekey','E','AverageSectorPE','AverageSectorSR','month']
dfFinal = df.drop(columns = todrop)
pd.DataFrame(dfFinal.columns)

Unnamed: 0,0
0,Date
1,Close
2,DI_Plus
3,ADX
4,DI_Plus_Slope
5,DI_Plus_R
6,SMA
7,UpperB
8,LowerB
9,Off_SMA


In [None]:
def _forward_mean_label(prices, horizon):
    """Return 0/1 labels: 1 where the mean of the next `horizon` prices exceeds the current price.

    The last `horizon` positions have no complete future window and get 0.
    """
    last_labelled = len(prices) - horizon
    labels = []
    for i, price in enumerate(prices):
        if i < last_labelled:
            # The window is strictly in the future: rows i+1 .. i+horizon.
            future_mean = sum(prices[i + 1:i + 1 + horizon]) / horizon
            labels.append(1 if future_mean > price else 0)
        else:
            labels.append(0)
    return labels


stocks = df.Ticker.value_counts().index

# copy() so the label columns are written to an independent frame rather than
# to a slice of df.
data = df.dropna().copy()

data['Five_Day'] = 0
data['Thirty_Day'] = 0
data['Sixty_Day'] = 0

# NOTE(review): the labels are stored on `data`, while the export cell writes
# `dfFinal`; confirm which frame is meant to carry the labels.
for ticker in stocks:
    ticker_rows = data['Ticker'] == ticker
    close_prices = data.loc[ticker_rows, 'Close'].tolist()
    if not close_prices:
        # Every row of this ticker was removed by dropna().
        continue

    # All three horizons now use the same future-only window. The 30- and
    # 60-day windows used to start at the current row, so they included
    # today's price and stopped one day short.
    data.loc[ticker_rows, 'Five_Day'] = _forward_mean_label(close_prices, 5)
    data.loc[ticker_rows, 'Thirty_Day'] = _forward_mean_label(close_prices, 30)
    data.loc[ticker_rows, 'Sixty_Day'] = _forward_mean_label(close_prices, 60)
    #print('Data for ' + interval + ' complete' )

In [None]:
# Write the final feature table to the export folder.
# NOTE(review): dfFinal was created before the Five_Day / Thirty_Day /
# Sixty_Day columns were added to `data`, so those labels are not in this file.
os.chdir(exportpath)
dfFinal.to_csv('Historical Quant Prices_0816.csv')