In [1]:
from collections import Counter

In [2]:
import numpy as np
import pandas as pd
import pickle

In [3]:
from sklearn import svm, neighbors
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingClassifier, RandomForestClassifier

In [7]:
def process_data_for_labels(ticker, hm_days=7, csv_path='sp500_joined_closes.csv'):
    """Load the joined close prices and add forward-return columns for one ticker.

    For every horizon ``i`` in ``1..hm_days`` a column named
    ``'<ticker>_<i>d'`` is added, holding ``(price[t+i] - price[t]) / price[t]``
    where ``t+i`` means ``i`` rows further down the table.

    Parameters
    ----------
    ticker : str
        Column of the price table to build the forward returns for.
    hm_days : int, default 7
        Number of look-ahead horizons to generate.
    csv_path : str, default 'sp500_joined_closes.csv'
        CSV file to read; its first column is used as the index.

    Returns
    -------
    tickers : list
        Column names of the CSV as read, i.e. before the forward-return
        columns are appended.
    df : pandas.DataFrame
        The price table plus the ``hm_days`` forward-return columns, with
        missing values replaced by 0.

    Raises
    ------
    KeyError
        If ``ticker`` is not a column of the CSV.
    """
    df = pd.read_csv(csv_path, index_col=0)
    tickers = df.columns.values.tolist()

    # Missing prices become 0. A base price of 0 makes the division below
    # yield inf (or NaN for 0/0); the final fillna zeroes the NaN values but
    # leaves inf untouched, so callers still have to handle it.
    df.fillna(0, inplace=True)

    base = df[ticker]
    for i in range(1, hm_days + 1):
        # shift(-i) brings the value i rows ahead onto the current row; the
        # last i rows have nothing to look ahead to and come out as NaN.
        df['{}_{}d'.format(ticker, i)] = (base.shift(-i) - base) / base

    df.fillna(0, inplace=True)
    return tickers, df

# Run the labelling step for AAPL; the cell output is the returned
# (tickers, df) tuple.
process_data_for_labels('AAPL')

(['MMM',
  'ABT',
  'ABBV',
  'ABMD',
  'ACN',
  'ATVI',
  'ADBE',
  'AMD',
  'AAP',
  'AES',
  'AMG',
  'AFL',
  'A',
  'APD',
  'AKAM',
  'ALK',
  'ALB',
  'ARE',
  'ALXN',
  'ALGN',
  'ALLE',
  'AGN',
  'ADS',
  'LNT',
  'ALL',
  'GOOGL',
  'GOOG',
  'MO',
  'AMZN',
  'AMCR',
  'AEE',
  'AAL',
  'AEP',
  'AXP',
  'AIG',
  'AMT',
  'AWK',
  'AMP',
  'ABC',
  'AME',
  'AMGN',
  'APH',
  'ADI',
  'ANSS',
  'ANTM',
  'AON',
  'AOS',
  'APA',
  'AIV',
  'AAPL'],
                    MMM        ABT       ABBV        ABMD         ACN  \
 Date                                                                   
 2009-12-31   64.376617  20.377853   0.000000    8.730000   33.685593   
 2010-01-04   64.649193  20.555250   0.000000    8.740000   34.148254   
 2010-01-05   64.244255  20.389175   0.000000    8.530000   34.359306   
 2010-01-06   65.155350  20.502405   0.000000    8.400000   34.724575   
 2010-01-07   65.202065  20.672255   0.000000    8.400000   34.692101   
 ...                ... 

In [10]:
def buy_sell_hold(*args, requirement=0.05):
    """Classify one row of fractional price changes as buy, sell or hold.

    The values are scanned in the order given and the first one that lies
    outside ``[-requirement, requirement]`` decides the label.

    Parameters
    ----------
    *args : float
        Fractional price changes (0.05 means +5%).
    requirement : float, default 0.05
        Keyword-only threshold a change must exceed, in either direction,
        to trigger a buy or sell label.

    Returns
    -------
    int
        ``1`` if the first change beyond the threshold is a rise, ``-1`` if
        it is a fall, ``0`` if no change exceeds the threshold (including
        when no values are passed).
    """
    for change in args:
        # NaN compares False both ways, so it is skipped like a small move.
        if change > requirement:
            return 1
        if change < -requirement:
            return -1
    return 0
    

In [11]:
def extract_featuresets(ticker):
    """Build the feature matrix and label vector for one ticker.

    Labels come from applying ``buy_sell_hold`` row by row to the seven
    ``'<ticker>_<i>d'`` columns produced by ``process_data_for_labels``.
    Features are the row-over-row percentage changes of every ticker column.

    Parameters
    ----------
    ticker : str
        Ticker whose forward-return columns are turned into labels.

    Returns
    -------
    X : numpy.ndarray
        Percentage changes of all ticker columns, with inf and NaN set to 0.
    y : numpy.ndarray
        Values of the ``'<ticker>_target'`` column.
    df : pandas.DataFrame
        The cleaned frame including the target column.
    """
    tickers, df = process_data_for_labels(ticker)

    target_col = '{}_target'.format(ticker)
    horizon_cols = ['{}_{}d'.format(ticker, day) for day in range(1, 8)]

    # One label per row: buy_sell_hold receives that row's 1d..7d values.
    df[target_col] = list(map(buy_sell_hold, *(df[name] for name in horizon_cols)))

    # The spread is reported before any rows are dropped below.
    labels_as_text = [str(label) for label in df[target_col].values.tolist()]
    print('Data spread:', Counter(labels_as_text))

    # Zero the gaps, then discard every row that still holds an infinity.
    df = df.fillna(0).replace([np.inf, -np.inf], np.nan).dropna()

    price_changes = (
        df[list(tickers)]
        .pct_change()
        .replace([np.inf, -np.inf], 0)
        .fillna(0)
    )

    X = price_changes.values
    y = df[target_col].values

    return X, y, df

# Build features and labels for AAPL; the call prints the label counts and
# the cell output is the returned (X, y, df) tuple.
extract_featuresets('AAPL')

Data spread: Counter({'1': 1020, '-1': 854, '0': 391})


(array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.00423407,  0.00870537,  0.        , ...,  0.02617034,
          0.00251249,  0.01556463],
        [-0.00626362, -0.00807943,  0.        , ...,  0.01180706,
          0.02506256,  0.00172911],
        ...,
        [ 0.02383763,  0.01450722,  0.00977089, ..., -0.01109465,
          0.0020723 , -0.0064897 ],
        [-0.00697424,  0.00651279,  0.013458  , ..., -0.01495893,
         -0.00091921,  0.00051225],
        [ 0.00617838,  0.01744283,  0.01174275, ..., -0.0034169 ,
          0.00919971,  0.00966532]]),
 array([ 0, -1, -1, ...,  0,  0,  0], dtype=int64),
                    MMM        ABT       ABBV        ABMD         ACN  \
 Date                                                                   
 2009-12-31   64.376617  20.377853   0.000000    8.730000   33.685593   
 2010-01-04   64.649193  20.555250   0.000000    8.740000   34.148254   
 2010-01-05   64.244255  20.3

In [13]:
def do_ml(ticker, random_state=None):
    """Train a voting ensemble on one ticker's feature set and report accuracy.

    The data from ``extract_featuresets`` is split 75/25 into train and test
    parts, a hard-voting ensemble of LinearSVC, k-nearest-neighbours and a
    random forest is fitted, and the test accuracy and predicted label
    counts are printed.

    Parameters
    ----------
    ticker : str
        Ticker to build the labels for.
    random_state : int or None, default None
        Seed passed to the train/test split and to the estimators that
        accept one. ``None`` keeps the previous unseeded behaviour; pass an
        integer to make the printed accuracy reproducible between runs.

    Returns
    -------
    float
        Mean accuracy of the ensemble on the held-out 25%.
    """
    X, y, df = extract_featuresets(ticker)

    # NOTE(review): the rows appear to be date-ordered and train_test_split
    # shuffles by default, so test rows are interleaved with training rows.
    # Consider a chronological split if look-ahead leakage is a concern.
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.25,
        random_state=random_state,
    )

    clf = VotingClassifier([
        ('lsvc', svm.LinearSVC(random_state=random_state)),
        ('knn', neighbors.KNeighborsClassifier()),
        ('rfor', RandomForestClassifier(random_state=random_state)),
    ])

    clf.fit(X_train, y_train)
    confidence = clf.score(X_test, y_test)
    print('Accuracy:', confidence)
    predictions = clf.predict(X_test)
    print('Predicted spread:', Counter(predictions))

    return confidence

# Train and score the ensemble on AAPL; the cell output is the returned
# accuracy.
do_ml('AAPL')

Data spread: Counter({'1': 1020, '-1': 854, '0': 391})




Accuracy: 0.43033509700176364
Predicted spread: Counter({1: 357, -1: 198, 0: 12})


0.43033509700176364