In [1]:
import numpy as np
import requests
import os
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
from collections import Counter
import sys
from sklearn import svm, cross_validation, neighbors
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
style.use('ggplot')
#inline vs qt
%matplotlib inline

  from numpy.core.umath_tests import inner1d


## Process S&P 500 Data

In [2]:
def progressBar(value, endvalue, bar_length=20):
    """Draw a one-line text progress bar on stdout, redrawing over the last one.

    Keyword arguments:
    value -- number of units completed so far
    endvalue -- total number of units; when it is 0 or negative there is
                nothing to do and the bar is drawn as complete
    bar_length -- width of the bar in characters (default 20)
    """
    # Guard against ZeroDivisionError for an empty workload, and clamp the
    # ratio so an out-of-range value cannot make the arrow overflow the bar.
    if endvalue <= 0:
        percent = 1.0
    else:
        percent = min(max(float(value) / endvalue, 0.0), 1.0)

    arrow = '-' * int(round(percent * bar_length) - 1) + '>'
    spaces = ' ' * (bar_length - len(arrow))

    # The leading carriage return rewinds to the start of the line so that
    # successive calls overwrite each other instead of scrolling.
    sys.stdout.write("\rProgress: [{0}] {1}%".format(arrow + spaces, int(round(percent * 100))))
    sys.stdout.flush()

In [3]:
def make_individual():
    """Split the combined price file into one CSV per ticker.

    Reads 'all_stocks_5yr.csv' from the working directory and, for every
    distinct value in its 'Name' column, writes that ticker's rows (with the
    'Name' column removed and the row index reset) to
    'SP500_Individual_Data/<ticker>.csv'.
    """
    df = pd.read_csv('all_stocks_5yr.csv') # from kaggle.com
    save_to_folder = 'SP500_Individual_Data'
    # to_csv does not create missing directories, so make sure the target exists.
    os.makedirs(save_to_folder, exist_ok=True)

    # One groupby pass replaces re-filtering the whole frame once per ticker.
    groups = df.groupby('Name')
    total = groups.ngroups
    for count, (ticker, individual_df) in enumerate(groups, start=1):
        # axis is passed by keyword: the positional form drop('Name', 1) is
        # no longer accepted by recent pandas releases.
        individual_df = individual_df.drop('Name', axis=1).reset_index(drop=True)
        individual_df.to_csv(os.path.join(save_to_folder, ticker + '.csv'), index=False)
        # Counting from 1 lets the bar reach 100% on the last ticker.
        progressBar(count, total)
        
#make_individual()

In [4]:
def compile_close_price_data():
    """Combine every per-ticker close price into one wide, date-indexed CSV.

    Reads each '.csv' file in 'SP500_Individual_Data', keeps its 'close'
    column renamed to the ticker (the file name without its extension), and
    writes the outer join of all of them on 'date' to
    'sp500_joined_close.csv'.
    """
    data_folder = 'SP500_Individual_Data'
    # Sort for a deterministic column order and skip stray non-CSV entries.
    file_names = sorted(name for name in os.listdir(data_folder) if name.endswith('.csv'))

    close_columns = []
    for count, file_name in enumerate(file_names, start=1):
        # splitext strips only the extension; str.replace would also remove
        # a '.csv' that happened to occur in the middle of a name.
        ticker = os.path.splitext(file_name)[0]
        df = pd.read_csv(os.path.join(data_folder, file_name), index_col='date')
        close_columns.append(df['close'].rename(ticker))
        # Progress is measured against the files actually being read, so the
        # source Kaggle CSV does not have to be re-loaded just for a count.
        progressBar(count, len(file_names))

    if close_columns:
        # A single concat aligns all series on the union of their dates
        # without re-copying a growing frame on every iteration.
        main_df = pd.concat(close_columns, axis=1).sort_index()
    else:
        main_df = pd.DataFrame()
    main_df.to_csv('sp500_joined_close.csv')
        
#compile_close_price_data()

## SP500 Heatmap

In [5]:
def visualise_data():
    """Plots heatmap of correlation between SP500 companies using close price

    Reads 'sp500_joined_close.csv', computes the pairwise correlation of its
    columns and shows it as a red-yellow-green heatmap scaled to [-1, 1].
    """
    # Load the first column as the index so that only the per-ticker price
    # columns are passed to corr(); left as a regular column, the date
    # strings are non-numeric input that recent pandas refuses to correlate.
    df = pd.read_csv('sp500_joined_close.csv', index_col=0)
    df_corr = df.corr()
    data = df_corr.values

    fig, ax = plt.subplots()
    heatmap = ax.pcolor(data, cmap=plt.cm.RdYlGn)
    fig.colorbar(heatmap)
    # Put each tick in the middle of its cell: x runs over columns
    # (shape[1]) and y over rows (shape[0]).
    ax.set_xticks(np.arange(data.shape[1]) + 0.5, minor=False)
    ax.set_yticks(np.arange(data.shape[0]) + 0.5, minor=False)
    # Flip y and move the x labels to the top so the plot reads like a table.
    ax.invert_yaxis()
    ax.xaxis.tick_top()

    ax.set_xticklabels(df_corr.columns)
    ax.set_yticklabels(df_corr.index)
    plt.xticks(rotation=90)
    heatmap.set_clim(-1, 1)
    plt.tight_layout()
    plt.show()
    
#visualise_data()

## ML Model

In [6]:
def process_data_for_labels(ticker, hm_days=7):
    """
    Returns list of tickers and a df with %change projections of ticker price up to hm_days

    Reads 'sp500_joined_close.csv', adds one '<ticker>_<i>d' column for each
    i in 1..hm_days holding the fractional change between a row's price and
    the price i rows later, and also saves the resulting df to
    '<ticker>.csv' (ticker lower-cased).

    Keyword arguments:
    ticker -- string of the ticker you want future projections for
    hm_days -- how many rows ahead to project (default 7)

    Raises:
    KeyError -- if ticker is not a column of the price file
    """
    df = pd.read_csv('sp500_joined_close.csv', index_col=0)
    tickers = df.columns.values.tolist()
    if ticker not in tickers:
        raise KeyError('{} is not a column of sp500_joined_close.csv'.format(ticker))
    df.fillna(0, inplace=True)

    for i in range(1, hm_days + 1):
        # shift(-i) lines each row up with the price i rows later
        df['{}_{}d'.format(ticker, i)] = (df[ticker].shift(-i) - df[ticker]) / df[ticker]

    # The last rows have no future price to compare with (and 0/0 gives NaN)
    df.fillna(0, inplace=True)
    # Name the dump after the requested ticker instead of always 'aapl.csv'
    df.to_csv('{}.csv'.format(ticker.lower()))
    return tickers, df
# Example run; the returned (tickers, df) tuple is echoed as the cell output.
process_data_for_labels('AAPL')

(['A',
  'AAL',
  'AAP',
  'AAPL',
  'ABBV',
  'ABC',
  'ABT',
  'ACN',
  'ADBE',
  'ADI',
  'ADM',
  'ADP',
  'ADS',
  'ADSK',
  'AEE',
  'AEP',
  'AES',
  'AET',
  'AFL',
  'AGN',
  'AIG',
  'AIV',
  'AIZ',
  'AJG',
  'AKAM',
  'ALB',
  'ALGN',
  'ALK',
  'ALL',
  'ALLE',
  'ALXN',
  'AMAT',
  'AMD',
  'AME',
  'AMG',
  'AMGN',
  'AMP',
  'AMT',
  'AMZN',
  'ANDV',
  'ANSS',
  'ANTM',
  'AON',
  'AOS',
  'APA',
  'APC',
  'APD',
  'APH',
  'APTV',
  'ARE',
  'ARNC',
  'ATVI',
  'AVB',
  'AVGO',
  'AVY',
  'AWK',
  'AXP',
  'AYI',
  'AZO',
  'BA',
  'BAC',
  'BAX',
  'BBT',
  'BBY',
  'BDX',
  'BEN',
  'BF.B',
  'BHF',
  'BHGE',
  'BIIB',
  'BK',
  'BLK',
  'BLL',
  'BMY',
  'BRK.B',
  'BSX',
  'BWA',
  'BXP',
  'C',
  'CA',
  'CAG',
  'CAH',
  'CAT',
  'CB',
  'CBG',
  'CBOE',
  'CBS',
  'CCI',
  'CCL',
  'CDNS',
  'CELG',
  'CERN',
  'CF',
  'CFG',
  'CHD',
  'CHK',
  'CHRW',
  'CHTR',
  'CI',
  'CINF',
  'CL',
  'CLX',
  'CMA',
  'CMCSA',
  'CME',
  'CMG',
  'CMI',
  'CMS',
  'CNC'

In [7]:
def buy_sell_hold(*args, requirement=0.025):
    """
    Returns:
        1: buy
        -1: sell
        0: hold

    Keyword arguments:
    *args -- fractional price changes, examined in the order given; the
             first one beyond the threshold decides the result
    requirement -- threshold a change must strictly exceed: above
                   +requirement is a buy, below -requirement is a sell
                   (default 0.025)
    """
    for change in args:
        if change > requirement:
            return 1
        if change < -requirement:
            return -1
    # Nothing moved far enough in either direction (also the no-argument case)
    return 0

In [8]:
def extract_feature_set(ticker):
    """
    Returns the features, the labels, and the labelled df for ticker

    Keyword arguments:
    ticker -- ticker of the company whose feature set you want

    Returns:
    X -- 2-D array with the row-over-row % change of every ticker column
    y -- 1-D array of buy_sell_hold labels (1, -1 or 0), one per row of X
    df -- the cleaned df that X and y were derived from
    """
    # Number of '<ticker>_<i>d' projection columns process_data_for_labels
    # produces by default.
    hm_days = 7
    tickers, df = process_data_for_labels(ticker)

    target_col = '{}_target'.format(ticker)
    horizon_cols = ['{}_{}d'.format(ticker, i) for i in range(1, hm_days + 1)]
    # map() walks the horizon columns in parallel, so buy_sell_hold receives
    # one row's 1d..7d changes per call.
    df[target_col] = list(map(buy_sell_hold, *(df[col] for col in horizon_cols)))

    labels = df[target_col].values.tolist()
    print('Data spread:', Counter(str(label) for label in labels))

    df.fillna(0, inplace=True)
    # if something has price of 0, then price of 1, % change is infinite
    df = df.replace([np.inf, -np.inf], np.nan)
    df.dropna(inplace=True)

    # %change for all companies (including ticker in question)
    # (indexing with the list directly avoids a comprehension whose loop
    # variable shadowed the `ticker` argument)
    df_vals = df[tickers].pct_change()
    df_vals = df_vals.replace([np.inf, -np.inf], 0)
    df_vals.fillna(0, inplace=True)

    X = df_vals.values
    y = df[target_col].values

    return X, y, df
# extract_feature_set('XOM')

In [9]:
def do_ml(ticker, test_size=0.2, random_state=None):
    """
    Returns the accuracy of a voting classifier trained for ticker

    Fits a VotingClassifier (LinearSVC, k-nearest neighbours and random
    forest) on a random train split of the feature set and scores it on the
    remaining rows. Prints the accuracy and the spread of predicted labels.

    Keyword arguments:
    ticker -- ticker for which we want to generate a ML model
    test_size -- fraction of rows held out for scoring (default 0.2)
    random_state -- seed for the split and the seedable estimators; the
                    default None keeps every run randomised
    """
    # sklearn.cross_validation was deprecated in 0.18 and removed in 0.20;
    # sklearn.model_selection provides the same train_test_split.
    from sklearn.model_selection import train_test_split

    X, y, _ = extract_feature_set(ticker)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    clf = VotingClassifier([('lsvc', svm.LinearSVC(random_state=random_state)),
                            ('knn', neighbors.KNeighborsClassifier()),
                            ('rfor', RandomForestClassifier(random_state=random_state))])

    clf.fit(X_train, y_train)
    # score() reports mean accuracy on the held-out rows
    confidence = clf.score(X_test, y_test)
    print('Accuracy:', confidence)

    predictions = clf.predict(X_test)
    print('Predicted spread:', Counter(predictions))
    return confidence

# Example run for ticker 'D'; the returned accuracy is echoed as the cell output.
do_ml('D')

Data spread: Counter({'0': 680, '1': 318, '-1': 261})


  if diff:


Accuracy: 0.5515873015873016
Predicted spread: Counter({0: 227, -1: 15, 1: 10})


  if diff:


0.5515873015873016