# Data Wrangling - Mass Stock Price Download using yfinance package, Technical Analysis, and Final Data Merge

Note: this workbook is 1 of 4 scraping and extracting processes that ultimately aggregate into the Technical Indicators workbook in the Data Wrangling phase of capstone 3


* Fundamental Scraper - scrapes 5 years' worth of fundamental company financial data from MarketWatch, using Beautiful Soup, for the companies on the S&P 500 list
* Fundamental Calcs - imports scraped data from the scraper tool, converts text data to numeric (e.g. 5.00M to 5000000) using regular expressions, and calculates additional financial metrics
* Analyst Scraper - scrapes analyst buy, sell, hold ratings for all S&P 500 stocks and downloads to .csv file

* <span style="color:red"> **Mass Yahoo Download (this book)** </span> - downloads 5 years of daily stock pricing data for the S&P 500, computes the Directional Index, ADX, Bollinger Bands, and other financial charting data, and merges in the data from the fundamental and analyst scrapers

In [1]:
import pandas as pd
import yfinance as yf
import numpy as np
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from matplotlib import pyplot as plt
import seaborn as sns
import os




In [2]:
# Notebook configuration: where the ticker list is read from and where the
# cleaned output is written. Paths are machine-specific absolute paths.
importpath = r'C:\Users\nmur1\Google Drive\Springboard\Capstone2\Stock Import Lists'
importfile = 'SandP.csv'
exportpath = r'C:\Users\nmur1\Google Drive\Springboard\Capstone2\CleanData'
exportfile = 'Momentum'
# Relative file names below (and in later cells) resolve against this directory.
os.chdir(importpath)

# Ticker list; the download loop reads its 'Symbol' column.
stocks = pd.read_csv(importfile, encoding= 'unicode_escape')
# Start date passed to yf.download for every ticker.
sdate = '2015-01-01'

In [3]:
# Downloads stock data one ticker at a time and collects the per-ticker frames in ``m``.
# yfinance has the option to take an entire list at once; however, I've found that method
# tends to time out when using a large set of stocks.

m = []
tickers = stocks.Symbol
total = len(tickers)
for n, t in enumerate(tickers, start=1):
    try:
        df = yf.download(t, sdate, progress = False, group_by = 'Ticker')
    except Exception as exc:
        # Exception (not a bare except) so Ctrl-C / SystemExit still stop the loop,
        # and the reason for the failure is shown instead of being swallowed.
        print(t, ' not found', repr(exc))
        continue

    # NOTE(review): yfinance may return an empty frame instead of raising for an
    # unknown symbol - confirm for the installed version and filter if needed.
    df['Ticker'] = t
    m.append(df)
    # n is the position in the ticker list, so progress stays accurate even
    # when some downloads fail.
    print(n, ' out of ', total)

1  out of  505
2  out of  505
3  out of  505
4  out of  505
5  out of  505
6  out of  505
7  out of  505
8  out of  505
9  out of  505
10  out of  505
11  out of  505
12  out of  505
13  out of  505
14  out of  505
15  out of  505
16  out of  505
17  out of  505
18  out of  505
19  out of  505
20  out of  505
21  out of  505
22  out of  505
23  out of  505
24  out of  505
25  out of  505
26  out of  505
27  out of  505
28  out of  505
29  out of  505
30  out of  505
31  out of  505
32  out of  505
33  out of  505
34  out of  505
35  out of  505
36  out of  505
37  out of  505
38  out of  505
39  out of  505
40  out of  505
41  out of  505
42  out of  505
43  out of  505
44  out of  505
45  out of  505
46  out of  505
47  out of  505
48  out of  505
49  out of  505
50  out of  505
51  out of  505
52  out of  505
53  out of  505
54  out of  505
55  out of  505
56  out of  505
57  out of  505
58  out of  505
59  out of  505
60  out of  505
61  out of  505
62  out of  505
63  out of  505
6

481  out of  505
482  out of  505
483  out of  505
484  out of  505
485  out of  505
486  out of  505
487  out of  505
488  out of  505
489  out of  505
490  out of  505
491  out of  505
492  out of  505
493  out of  505
494  out of  505
495  out of  505
496  out of  505
497  out of  505
498  out of  505
499  out of  505
500  out of  505
501  out of  505
502  out of  505
503  out of  505
504  out of  505
505  out of  505


In [4]:
# Make an archive of the daily pricing before running the Quantitative Analysis
# functions; they take a while, so the download does not have to be repeated.
# The file is written relative to the current working directory.
DailyPrices = pd.concat(m)
DailyPrices.to_csv('Daily Pricing Detail.csv')

In [3]:
#Quantitative Analysis Functions



def TR(row, axis = 1):
    """Return the true range for one price row.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'High', 'Low' and 'yC' (the previous row's close).
    axis : int, optional
        Unused; kept so existing calls that pass it keep working.

    Returns
    -------
    The largest of ``High - Low``, ``|High - yC|`` and ``|Low - yC|``.
    """
    high = row['High']
    low = row['Low']
    prev_close = row['yC']

    # High - Low must stay the first argument: max() only replaces its running
    # candidate when a later value compares greater, and every comparison with
    # NaN is False, so a NaN 'yC' (no previous row) falls back to High - Low.
    return max(high - low, abs(high - prev_close), abs(low - prev_close))



def DM(row, axis = 1, d = 'PDM'):
    """Return the positive or negative directional movement for one row.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'High', 'yH', 'Low' and 'yL' (yH / yL are the previous
        row's high and low).
    axis : int, optional
        Unused.
    d : str, optional
        'PDM' selects the positive movement; any other value selects the
        negative movement.

    Returns
    -------
    The selected move when it is positive and strictly larger than the
    opposite move, otherwise 0 (this includes rows where a previous value is
    NaN, because comparisons with NaN are False).
    """
    up_move = row['High'] - row['yH']
    down_move = row['yL'] - row['Low']

    # Pick which move is being measured and which one it has to beat.
    if d == 'PDM':
        candidate, opposite = up_move, down_move
    else:
        candidate, opposite = down_move, up_move

    return candidate if (candidate > 0 and candidate > opposite) else 0

    
def Smoothed(Metric, period, ADX = False):
    """Smooth a series with Wilder's recursive average.

    The output is 0 before the seed position, the plain mean of the ``period``
    values ending at the seed position on the seed row, and afterwards
    ``(previous * (period - 1) + current) / period``.

    Parameters
    ----------
    Metric : sequence of float (pandas.Series, list, ndarray)
        Values to smooth, oldest first. Only positions are used, so the index
        of a Series is irrelevant.
    period : int
        Smoothing period.
    ADX : bool, optional
        False: the seed sits at position ``period - 1``.
        True: the seed sits at position ``2 * period - 1``, for inputs whose
        first ``period - 1`` values are themselves undefined.

    Returns
    -------
    numpy.ndarray
        Array with the same length as ``Metric``. All zeros when the input is
        too short to compute a seed.
    """
    values = np.asarray(Metric, dtype=float)
    count = len(values)
    smoothed = np.zeros(count)

    seed_pos = period * 2 - 1 if ADX else period - 1
    if count <= seed_pos:
        # Not enough observations for a seed; nothing can be smoothed.
        return smoothed

    # Positional slice instead of label-based `[...]` on a Series, which only
    # works on a non-integer (e.g. Date) index through a deprecated fallback.
    # A NaN inside the seed window yields a NaN seed.
    smoothed[seed_pos] = values[seed_pos - period + 1:seed_pos + 1].mean()

    # The seed is stored at the position whose window it summarises, so rows
    # before it never contain information from later rows.
    for i in range(seed_pos + 1, count):
        smoothed[i] = (smoothed[i - 1] * (period - 1) + values[i]) / period

    return smoothed

def Slope(Metric, lookback):
    """Return the rolling least-squares slope of ``Metric`` against time.

    For each position ``i`` the slope of a straight line fitted to the last
    ``lookback`` values (positions ``i - lookback + 1 .. i``) against
    ``0, 1, ..., lookback - 1`` is computed.

    Parameters
    ----------
    Metric : sequence of float (pandas.Series, list, ndarray)
        Values, oldest first. Only positions are used.
    lookback : int
        Number of consecutive values in each fit.

    Returns
    -------
    numpy.ndarray
        Same length as ``Metric``. Entries are 0 where no full window exists
        yet, where the window contains a NaN, or when ``lookback < 2``.
    """
    values = np.asarray(Metric, dtype=float)
    count = len(values)
    # Zero-initialised so positions without a full window are well defined.
    slopes = np.zeros(count)

    if lookback < 2:
        # A slope needs at least two points.
        return slopes

    # Closed-form OLS slope: sum((t - mean(t)) * y) / sum((t - mean(t))**2).
    # Centering y is unnecessary because the centered times sum to zero.
    centered_time = np.arange(lookback, dtype=float)
    centered_time -= centered_time.mean()
    denom = np.dot(centered_time, centered_time)

    for end in range(lookback - 1, count):
        window = values[end - lookback + 1:end + 1]
        if np.isnan(window).any():
            continue
        slopes[end] = np.dot(centered_time, window) / denom

    return slopes

In [4]:
def Momentum(dfPrices):
    """Compute directional-movement and Bollinger-band indicators per ticker.

    Parameters
    ----------
    dfPrices : pandas.DataFrame
        Daily prices with the columns 'Ticker', 'Date', 'High', 'Low' and
        'Close'. Rows of each ticker are assumed to be in chronological order
        (the indicators use the previous row as "yesterday").

    Returns
    -------
    pandas.DataFrame
        One frame indexed by 'Date' with all tickers stacked, holding the
        columns Close, DI_Plus, DI_Neg, ADX, DI_Neg_Slope, DI_Plus_Slope, SMA,
        UpperB, LowerB, Off_SMA, Off_LB and Symbol.

    Raises
    ------
    ValueError
        From ``pd.concat`` when ``dfPrices`` contains no tickers.
    """
    frames = []
    # value_counts() order (tickers with the most rows first) decides the
    # order in which tickers are stacked in the result.
    tickers = list(dfPrices.Ticker.value_counts().index)
    dfPrices = dfPrices.set_index('Date')

    for t in tickers:
        # Independent copy so the helper columns below never write into a
        # slice of dfPrices.
        df = dfPrices[dfPrices.Ticker == t].drop(columns = 'Ticker').copy()

        # Previous row's high / low / close (NaN on the first row).
        df['yH'] = df['High'].shift(1)
        df['yL'] = df['Low'].shift(1)
        df['yC'] = df['Close'].shift(1)

        df['PDM'] = df.apply(DM, axis = 1, d = 'PDM')
        df['NDM'] = df.apply(DM, axis = 1, d = 'NDM')
        df['TR'] = df.apply(TR, axis = 1)

        # 14-period smoothed true range and directional movements.
        ATR = Smoothed(df['TR'], 14)
        PDM_Smooth = Smoothed(df['PDM'], 14)
        NDM_Smooth = Smoothed(df['NDM'], 14)

        # Rows before the smoothing seed have ATR == 0, so the ratios there
        # are NaN (0 / 0) rather than meaningful values.
        DI_Plus = PDM_Smooth / ATR * 100
        DI_Neg = NDM_Smooth / ATR * 100
        DI_Index = abs(DI_Plus - DI_Neg) / abs(DI_Plus + DI_Neg) * 100
        ADX = Smoothed(pd.Series(DI_Index), 14, ADX = True)

        # Assemble the output frame directly; the arrays are positional and
        # have one entry per row of df.
        dfNew = df[['Close']].copy()
        dfNew['DI_Plus'] = DI_Plus
        dfNew['DI_Neg'] = DI_Neg
        dfNew['ADX'] = ADX
        dfNew['DI_Neg_Slope'] = Slope(dfNew['DI_Neg'], 7)
        dfNew['DI_Plus_Slope'] = Slope(dfNew['DI_Plus'], 7)

        # Bollinger bands: 20-row mean +/- 2 population standard deviations.
        closes = dfNew['Close'].rolling(window = 20)
        band_width = closes.std(ddof = 0) * 2
        dfNew['SMA'] = closes.mean()
        dfNew['UpperB'] = dfNew.SMA + band_width
        dfNew['LowerB'] = dfNew.SMA - band_width

        # Percentage distance of the close from the SMA and from the lower band.
        dfNew['Off_SMA'] = (dfNew.Close - dfNew.SMA) / dfNew.SMA * 100
        dfNew['Off_LB'] = (dfNew.Close - dfNew.LowerB) / dfNew.LowerB * 100
        dfNew['Symbol'] = t
        frames.append(dfNew)

    return pd.concat(frames)

In [5]:
# Reload the archived daily prices so the analysis can be re-run without
# downloading again. The archive was written with the Date index as its first
# column, so it comes back here as a regular 'Date' column.
os.chdir(importpath)
DailyPrices = pd.read_csv('Daily Pricing Detail.csv')

In [6]:
import warnings
# NOTE(review): this silences every warning for the rest of the session, not
# only the ones raised while the indicators are computed.
warnings.filterwarnings('ignore')
# Display floats with four decimals.
pd.options.display.float_format = '{:.4f}'.format

# Compute the technical indicators for every ticker in the archive.
Quant = Momentum(DailyPrices)

# Run Final Analysis and Calculations. PE Ratio, Sales to Earnings, Industry Benchmarks

In [28]:
# Drop every row that has a NaN in any column (for example the leading rows of
# each ticker, where the 20-row rolling statistics are not yet available).
Quant = Quant.dropna()

In [39]:
# Add 7 day forward looking price change
#
# Rows are sorted newest-first within each symbol, so the row "7 periods back"
# in this order is the row 7 trading rows LATER in time. pct_change is run per
# symbol; without the groupby the last 7 rows of every symbol would be compared
# with prices that belong to the neighbouring symbol. Those rows are now NaN.
#
# D = (Close[t+7] - Close[t]) / Close[t+7]
# NOTE(review): the denominator is the future close, not today's close -
# confirm this is the intended definition of the forward change.
QuantRev = Quant.sort_values(by = ['Symbol', 'Date'], ascending = [False, False])
QuantRev['D'] = QuantRev.groupby('Symbol')['Close'].pct_change(periods = 7) * -1
Quant = QuantRev.sort_values(by = ['Symbol', 'Date'], ascending = [False, True])

importpath = r'C:\Users\nmur1\Google Drive\Springboard\Capstone2\CleanData'
os.chdir(importpath)

#import datasets from cleaning
fund = 'Fundamental_Final.csv'
analyst = 'Analysts.csv'
quant = 'Historical Quant Prices.csv'

dfFund = pd.read_csv(fund).drop(columns = ['Unnamed: 0'])
dfanalyst = pd.read_csv(analyst).drop(columns = ['Unnamed: 0'])

dfquant = Quant
# Momentum() returns its frame indexed by Date; the steps below need Date as a
# regular column (and the merges would otherwise discard it with the index).
if 'Date' not in dfquant.columns and dfquant.index.name == 'Date':
    dfquant = dfquant.reset_index()
dfquant = dfquant.rename(columns = {'Symbol':'Ticker'})

# Join key "<index>_<Ticker>" on the fundamentals side.
# presumably dfFund['index'] holds the reporting year - verify against the
# Fundamental Calcs workbook.
dfFund['Key'] = dfFund['index'].astype(str) + '_' + dfFund['Ticker'].astype(str)

# Each price row is keyed to the calendar year BEFORE its date, i.e. it is
# matched with the prior year's fundamentals row.
dfquant['Year'] = pd.DatetimeIndex(dfquant['Date']).year - 1
dfquant['Key'] = dfquant['Year'].astype(str) + '_' + dfquant['Ticker'].astype(str)

#filter specific fields from fundamental sheet
fields = ['Key','eps', 'ROE', 'Sector','D2C','epsgrowth', 'Sales', 'Shares']
# .copy() so the ROE clean-up below writes to an independent frame.
dfFund = dfFund[fields].copy()

#replace inf values from my ROE calculation
dfFund['ROE'] = dfFund.ROE.replace([np.inf, -np.inf], 0)


#pull percent buy and ticker from the analyst rating data source
dfRating = dfanalyst[['Symbol', 'Percent_Buy']].rename(columns = {'Symbol':'Ticker'})


#merge everyone together
# Both merges are inner joins (pandas default): price rows without a matching
# fundamentals key or analyst rating are dropped.
df = pd.merge(dfquant, dfFund, on = 'Key')
df = pd.merge(df, dfRating, on = 'Ticker')

In [40]:
# Price-to-earnings ratio using the 20-row moving average as the price.
# Division by a zero eps gives +/-inf, which is replaced with 0.
df['PE_Ratio'] = (df.SMA / df.eps).replace([np.inf, -np.inf], 0)

# Per-sector benchmarks: mean, quartiles and the 1.5 * IQR outlier fences.
sector_pe = df.groupby('Sector')['PE_Ratio']
peSector = pd.DataFrame({
    'AverageSectorPE': sector_pe.mean(),
    'Quartile1': sector_pe.quantile(.25),
    'Quartile3': sector_pe.quantile(.75),
}).reset_index()
peSector['IQRPE'] = peSector.Quartile3 - peSector.Quartile1
peSector['Upper'] = peSector.Quartile3 + 1.5 * peSector.IQRPE
# Lower fence is Q1 - 1.5 * IQR (the mirror image of the upper fence).
peSector['Lower'] = peSector.Quartile1 - 1.5 * peSector.IQRPE

peSector = peSector[['Sector','AverageSectorPE', 'Upper', 'Lower']]
df = pd.merge(df,peSector, on = 'Sector')
# P/E relative to the sector average, as a fraction of that average.
df['Relative_PE'] = (df.PE_Ratio - df.AverageSectorPE) / df.AverageSectorPE

In [41]:
# Price-to-sales ratio: 20-row moving-average price over sales per share.
# Division by zero gives +/-inf, which is replaced with 0.
df['Sales_Ratio'] = (df.SMA / (df.Sales / df.Shares)).replace([np.inf, -np.inf], 0)

# Per-sector benchmarks: mean, quartiles and the 1.5 * IQR outlier fences.
sector_sr = df.groupby('Sector')['Sales_Ratio']
srSector = pd.DataFrame({
    'AverageSectorSR': sector_sr.mean(),
    'Quartile1': sector_sr.quantile(.25),
    'Quartile3': sector_sr.quantile(.75),
}).reset_index()
srSector['IQRSR'] = srSector.Quartile3 - srSector.Quartile1
srSector['UpperSR'] = srSector.Quartile3 + 1.5 * srSector.IQRSR
# Lower fence is Q1 - 1.5 * IQR (the mirror image of the upper fence).
srSector['LowerSR'] = srSector.Quartile1 - 1.5 * srSector.IQRSR

SR = srSector[['Sector', 'AverageSectorSR', 'UpperSR', 'LowerSR']]
df = pd.merge(df,SR, on = 'Sector')
# Sales ratio relative to the sector average; undefined values become 0.
df['Relative_SR'] = (df.Sales_Ratio - df.AverageSectorSR) / df.AverageSectorSR
df['Relative_SR'] = df['Relative_SR'].fillna(0)
# Calendar month of each price row.
df['month'] = pd.DatetimeIndex(df['Date']).month

In [42]:
# Write the merged dataset to the export directory. The frame's index is
# written as the first (unnamed) column of the CSV.
os.chdir(exportpath)
df.to_csv('Historical Quant Prices.csv')