In [2]:
# Importing Libraries

In [3]:
from collections import Counter
import bs4 as bs
import datetime as dt
import requests
import pickle
import pandas_datareader as web
import os
import yfinance as yf
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
import numpy as np

In [4]:
# NOTE(review): presumably this patches pandas_datareader so that Yahoo
# requests are served by yfinance -- confirm against the installed yfinance
# version, since the call's effect is not visible in this notebook.
yf.pdr_override()

In [5]:
# Build the list of S&P 500 tickers

In [6]:
def save_sp500_tickers():
    """Scrape the S&P 500 ticker symbols from Wikipedia and cache them.

    Downloads the "List of S&P 500 companies" page, takes the first cell of
    every data row of the first ``wikitable sortable`` table, replaces dots
    with dashes in each symbol and pickles the resulting list to
    ``sp500tickers.pickle`` in the current working directory.

    Returns:
        list[str]: ticker symbols in table order.

    Raises:
        requests.HTTPError: if the page request returns an error status.
        ValueError: if the expected table is not found in the page.
    """
    resp=requests.get('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies',timeout=30)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    resp.raise_for_status()
    soup=bs.BeautifulSoup(resp.text,"lxml")
    table=soup.find('table',{'class':'wikitable sortable'})
    if table is None:
        raise ValueError("S&P 500 constituents table not found on the Wikipedia page")
    tickers=[]
    for row in table.find_all('tr')[1:]:
        cells=row.find_all('td')
        if not cells:
            # Rows made only of <th> cells carry no ticker.
            continue
        # strip() removes the trailing newline without eating a real character
        # when the cell has no trailing whitespace (a blind [:-1] slice would).
        ticker=cells[0].text.strip().replace('.', '-')
        tickers.append(ticker)
    with open("sp500tickers.pickle","wb") as f:
        pickle.dump(tickers,f)
    return tickers

In [7]:
#save_sp500_tickers()

In [8]:
def get_data_from_yahoo(reload_sp500=False):
    """Download price history for every S&P 500 ticker into ``stock_dfs/``.

    For each ticker a CSV named ``stock_dfs/<ticker>.csv`` is written unless it
    already exists. The requested range starts on 2010-01-01 and ends at the
    moment of the call.

    Args:
        reload_sp500: when true, re-scrape the ticker list with
            ``save_sp500_tickers()``; otherwise read it from
            ``sp500tickers.pickle``.

    A failed download is reported and skipped so that one bad ticker does not
    abort the whole run.
    """
    if reload_sp500:
        tickers=save_sp500_tickers()
    else:
        # NOTE: pickle.load can execute arbitrary code -- only load a file
        # that this notebook wrote itself.
        with open("sp500tickers.pickle","rb") as f:
            tickers=pickle.load(f)
    os.makedirs("stock_dfs",exist_ok=True)
    # Summarise instead of dumping the whole ticker list into the output.
    print("Fetching data for {} tickers".format(len(tickers)))
    start=dt.datetime(2010,1,1)
    end=dt.datetime.now()
    for ticker in tickers:
        csv_path="stock_dfs/{}.csv".format(ticker)
        # Decide "already cached" from the file system, not from whether the
        # try block happened to finish without an exception.
        if os.path.exists(csv_path):
            print("Already have {}".format(ticker))
            continue
        try:
            df=web.DataReader(ticker,"yahoo",start,end)
            df.to_csv(csv_path)
        except Exception as exc:
            # Best effort: keep going, but say which ticker failed and why.
            print("Could not download {}: {}".format(ticker,exc))

In [9]:
# get_data_from_yahoo()

In [10]:
def compile_data():
    """Join every ticker's adjusted close into one wide CSV.

    Reads the ticker list from ``sp500tickers.pickle`` and, for each ticker,
    ``stock_dfs/<ticker>.csv``. Each file is indexed by ``Date``; its
    ``Adj Close`` column is renamed to the ticker and the ``Open``, ``High``,
    ``Low``, ``Close`` and ``Volume`` columns are dropped. The frames are
    outer-joined on the date index and written to ``sp500_joined_closes.csv``.

    Tickers whose file is missing or malformed are reported and skipped.
    """
    # NOTE: pickle.load can execute arbitrary code -- only load a file that
    # this notebook wrote itself.
    with open('sp500tickers.pickle',"rb") as f:
        tickers=pickle.load(f)
    main_df=pd.DataFrame()
    for count,ticker in enumerate(tickers):
        try:
            df=pd.read_csv('stock_dfs/{}.csv'.format(ticker))
            df=df.set_index('Date').rename(columns={'Adj Close':ticker})
            # Use the ``columns=`` keyword: passing the axis positionally is
            # rejected by pandas 2.x, and the old bare ``except`` hid that
            # error, leaving the joined frame empty.
            df=df.drop(columns=['Open',"High","Low","Close","Volume"])
            if main_df.empty:
                main_df=df
            else:
                main_df=main_df.join(df,how='outer')
        except Exception as exc:
            # Best effort: skip this ticker but make the reason visible.
            print("Skipping {}: {}".format(ticker,exc))
        if count%10==0:
            print(count)
    print(main_df.head())
    main_df.to_csv('sp500_joined_closes.csv')

In [11]:
#compile_data()

In [12]:
# Select matplotlib's 'ggplot' style sheet for the figures drawn below.
style.use('ggplot')
def visualize_data():
    """Plot a heatmap of pairwise correlations between ticker columns.

    Reads ``sp500_joined_closes.csv``, computes the correlation matrix of its
    columns, saves that matrix to ``sp500corr.csv`` and shows it as a
    red-yellow-green heatmap with the colour scale fixed to [-1, 1].
    """
    # Load the first column as the index so that only numeric price columns
    # reach corr(); a string date column makes corr() raise on pandas 2.x.
    df=pd.read_csv('sp500_joined_closes.csv',index_col=0)
    df_corr=df.corr()
    df_corr.to_csv('sp500corr.csv')
    data1=df_corr.values
    # Making fig and axis
    fig1=plt.figure()
    axis1=fig1.add_subplot(111)
    # Set Heatmap (Range of colors) Red to Green
    heatmap1=axis1.pcolor(data1,cmap=plt.cm.RdYlGn)
    fig1.colorbar(heatmap1)
    # Setting company ticks at the centre of each cell
    axis1.set_xticks(np.arange(data1.shape[1])+0.5,minor=False)
    axis1.set_yticks(np.arange(data1.shape[0])+0.5,minor=False)
    # Put the first row at the top and the x labels above the plot
    axis1.invert_yaxis()
    axis1.xaxis.tick_top()
    # Naming labels: x runs over the matrix columns, y over its rows
    column_labels=df_corr.columns
    row_labels=df_corr.index
    axis1.set_xticklabels(column_labels)
    axis1.set_yticklabels(row_labels)
    # Rotating for better visualization
    plt.xticks(rotation=90)
    heatmap1.set_clim(-1,1)
    plt.tight_layout()
    plt.show()

In [13]:
#visualize_data()

In [14]:
def process_data_for_labels(ticker,hm_days=7):
    """Add forward percent-change columns for one ticker.

    Reads ``sp500_joined_closes.csv`` (first column as index), fills missing
    prices with 0 and, for every horizon ``i`` in ``1..hm_days``, adds a column
    ``<ticker>_<i>d`` holding ``(price[t+i] - price[t]) / price[t]``.

    Args:
        ticker: name of the column to compute forward changes for.
        hm_days: number of forward horizons to generate (default 7, the value
            that used to be hard-coded).

    Returns:
        tuple: ``(tickers, df)`` where ``tickers`` is the list of original
        column names and ``df`` is the frame with the added columns.

    Raises:
        KeyError: if ``ticker`` is not a column of the CSV.
    """
    df=pd.read_csv('sp500_joined_closes.csv',index_col=0)
    tickers=df.columns.values.tolist()
    df=df.fillna(0)
    price=df[ticker]
    for i in range(1,hm_days+1):
        # shift(-i) aligns the price i rows ahead with the current row.
        df['{}_{}d'.format(ticker,i)]=(price.shift(-i)-price)/price
    # The last rows have no future price, so their NaN changes become 0.
    df=df.fillna(0)
    return(tickers,df)

In [15]:
def buy_sell_hold(*args):
    """Turn a sequence of percent changes into a buy/sell/hold label.

    The changes are scanned in order; the first one that moves more than 2%
    in either direction decides the label.

    Returns:
        int: 1 if that change is above +0.02, -1 if it is below -0.02, and
        0 when no change crosses either threshold.
    """
    threshold=0.02
    # Lazily yield a label for each change that crosses a threshold; the
    # first such label wins, and 0 is the fallback when none does.
    signals=(
        1 if change > threshold else -1
        for change in args
        if change > threshold or change < -threshold
    )
    return next(signals,0)

In [25]:
def extract_featuresets(ticker):
    """Build the feature matrix and labels used to model one ticker.

    Labels come from ``buy_sell_hold`` applied row-wise to the ticker's 1- to
    7-day forward change columns produced by ``process_data_for_labels``.
    Features are the row-over-row percent changes of every ticker column.

    Returns:
        tuple: ``(X, y, df)`` -- the feature array, the label array and the
        cleaned frame (including the target column) they were taken from.
    """
    tickers,df=process_data_for_labels(ticker)
    target_col='{}_target'.format(ticker)
    horizon_cols=['{}_{}d'.format(ticker,day) for day in range(1,8)]
    # One label per row, decided from that row's seven forward changes.
    df[target_col]=[
        buy_sell_hold(*changes)
        for changes in zip(*(df[col] for col in horizon_cols))
    ]
    labels=df[target_col].values.tolist()
    print("Data Spread:",Counter(str(label) for label in labels))
    # Cleaning: fill gaps with 0, then drop any row that holds an infinity
    df=df.fillna(0).replace([np.inf,-np.inf],np.nan).dropna()
    # Converting stock price into percent change; infinities and NaN become 0
    df_vals=df[tickers].pct_change().replace([np.inf,-np.inf],0).fillna(0)
    X=df_vals.values
    y=df[target_col].values
    return(X,y,df)

In [31]:
#extract_featuresets('XOM')

In [32]:
from sklearn import model_selection,svm,neighbors
from sklearn.ensemble import VotingClassifier,RandomForestClassifier
def do_ml(ticker,random_state=0):
    """Train and score a voting classifier that predicts buy/sell/hold.

    Features and labels come from ``extract_featuresets(ticker)``. 20% of the
    rows are held out for testing. The ensemble combines a linear SVC, a
    k-nearest-neighbours classifier and a random forest.

    Args:
        ticker: ticker symbol to model.
        random_state: seed used for the train/test split and for the
            estimators that accept one, so repeated runs give the same
            accuracy (default 0, matching the split seed used before).

    Returns:
        float: accuracy of the ensemble on the held-out rows.
    """
    X,y,_=extract_featuresets(ticker)
    # NOTE(review): train_test_split presumably shuffles rows by default, so
    # test rows are interleaved in time with training rows -- confirm this is
    # acceptable for time-series data.
    X_train,X_test,Y_train,Y_test=model_selection.train_test_split(
        X,y,test_size=0.2,random_state=random_state)
    # Seed the stochastic estimators; unseeded, the score changed run to run.
    clf=VotingClassifier([('lsvc',svm.LinearSVC(random_state=random_state)),
                          ('knn',neighbors.KNeighborsClassifier()),
                          ('rfor',RandomForestClassifier(random_state=random_state))])
    clf.fit(X_train,Y_train)
    confidence=clf.score(X_test,Y_test)
    print('Accuracy:',confidence)
    predictions=clf.predict(X_test)
    print('Predicted Class Counts:',Counter(predictions))
    return(confidence)

In [28]:
# Train and score the voting classifier for AAPL; the returned accuracy is
# displayed as the cell output.
do_ml('AAPL')

Data Spread: Counter({'1': 1179, '-1': 914, '0': 577})
Accuracy: 0.45318352059925093
Predicted Class Counts: Counter({1: 364, -1: 139, 0: 31})


0.45318352059925093

In [29]:
# Train and score the voting classifier for XOM; the returned accuracy is
# displayed as the cell output.
do_ml('XOM')

Data Spread: Counter({'0': 1101, '1': 831, '-1': 738})
Accuracy: 0.398876404494382
Predicted Class Counts: Counter({0: 388, -1: 78, 1: 68})


0.398876404494382

In [30]:
# Train and score the voting classifier for ABT; the returned accuracy is
# displayed as the cell output.
do_ml('ABT')

Data Spread: Counter({'1': 990, '0': 970, '-1': 710})
Accuracy: 0.40074906367041196
Predicted Class Counts: Counter({0: 234, 1: 190, -1: 110})


0.40074906367041196