In [31]:
import pandas as pd
import numpy as np
import pickle

In [32]:
def processData(ticker, hm_days=7, csv_path='../data/sp500_joined_closes.csv'):
    """Load the joined price table and append forward-return columns for one ticker.

    For every horizon ``i`` in ``1..hm_days`` a column named
    ``'<ticker>_<i>d'`` is added, holding
    ``(value i rows ahead - current value) / current value``.

    Parameters
    ----------
    ticker : str
        Name of the CSV column to compute forward returns for.
    hm_days : int, optional
        Number of look-ahead horizons to generate (default 7).
    csv_path : str, optional
        Location of the CSV file; its first column is used as the index.

    Returns
    -------
    tickers : list
        Column names of the CSV as loaded, i.e. before the forward-return
        columns are appended.
    df : pandas.DataFrame
        The loaded table plus the ``hm_days`` forward-return columns, with
        every NaN replaced by 0.

    Raises
    ------
    KeyError
        If ``ticker`` is not a column of the CSV.

    Notes
    -----
    Missing values are zero-filled *before* the returns are computed, so a
    row whose value for ``ticker`` was missing divides by zero and can
    produce ``inf``; those values are left in place for the caller to handle.
    The last ``i`` rows of each ``'<ticker>_<i>d'`` column have no future
    value to compare with and therefore end up as 0.
    """
    df = pd.read_csv(csv_path, index_col=0)
    tickers = df.columns.values.tolist()
    if ticker not in tickers:
        raise KeyError('{} is not a column of {}'.format(ticker, csv_path))

    df = df.fillna(0)

    base = df[ticker]
    for i in range(1, hm_days + 1):
        # shift(-i) aligns the value i rows ahead with the current row.
        df['{}_{}d'.format(ticker, i)] = (base.shift(-i) - base) / base

    # The shift leaves NaN in the trailing rows (and 0/0 gives NaN); zero them.
    df = df.fillna(0)
    return tickers, df

In [33]:
# Load the joined table and add AAPL's forward-return columns.
# t is the list of original ticker columns, d the augmented DataFrame.
t,d = processData('AAPL')

In [34]:
# Sanity check: show the AAPL column next to its 1-day and 2-day forward returns.
dx = d[['AAPL', 'AAPL_1d', 'AAPL_2d']]
dx.head()

Unnamed: 0_level_0,AAPL,AAPL_1d,AAPL_2d
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2007-01-03,10.857091,0.022196,0.014916
2007-01-04,11.098071,-0.007121,-0.002218
2007-01-05,11.019039,0.004938,0.088419
2007-01-08,11.073455,0.08307,0.134901
2007-01-09,11.993328,0.047856,0.034892


In [35]:
def buy_sell_hold(*args, requirement=0.02):
    """Turn a sequence of fractional changes into a buy / sell / hold label.

    Parameters
    ----------
    *args : float
        Fractional changes to inspect, in order.
    requirement : float, optional (keyword-only)
        Threshold a change has to exceed in magnitude to count
        (default 0.02). The comparison is strict.

    Returns
    -------
    int
        1 if the last change beyond the threshold is above ``requirement``,
        -1 if it is below ``-requirement``, and 0 when no change exceeds the
        threshold (including when no arguments are given).

    Notes
    -----
    Every value is examined and later values overwrite earlier ones, so the
    *last* change that breaks the threshold decides the label. NaN values
    compare false on both tests and are therefore ignored.
    """
    label = 0
    for change in args:
        if change > requirement:
            label = 1
        if change < -requirement:
            label = -1
    return label

In [36]:
from collections import Counter

In [37]:
def extract_features(ticker):
    """Build the feature matrix and label vector for one ticker.

    The frame returned by ``processData`` gets a ``'<ticker>_target'`` column
    computed by ``buy_sell_hold`` over the 7 forward-return columns, and the
    label distribution is printed. Rows that contain an infinite value are
    then dropped.

    Returns
    -------
    X : numpy.ndarray
        Row-over-row percentage change of every original ticker column of the
        cleaned frame, with inf and NaN replaced by 0.
    y : numpy.ndarray
        Values of the ``'<ticker>_target'`` column of the cleaned frame.
    df : pandas.DataFrame
        The cleaned frame itself.
    """
    tickers, df = processData(ticker)
    hm_days = 7
    target_col = '{}_target'.format(ticker)

    # One Series per look-ahead horizon; map() walks them row by row.
    horizon_cols = []
    for day in range(1, hm_days + 1):
        horizon_cols.append(df['{}_{}d'.format(ticker, day)])
    df[target_col] = list(map(buy_sell_hold, *horizon_cols))

    # Labels are counted as strings, before any row is dropped.
    labels_as_text = list(map(str, df[target_col].values.tolist()))
    print("Data spread:", Counter(labels_as_text))

    # Zero-fill NaN first so that only rows holding +/-inf get dropped.
    df = df.fillna(0).replace([np.inf, -np.inf], np.nan).dropna()

    df_vals = df[tickers].pct_change().replace([np.inf, -np.inf], 0).fillna(0)

    return df_vals.values, df[target_col].values, df

In [41]:
# Build the feature matrix, labels and cleaned frame for AAPL
# (extract_features also prints the label distribution).
X, y, df = extract_features('AAPL')

Data spread: Counter({'1': 1255, '-1': 952, '0': 311})


In [42]:
from sklearn import svm, cross_validation, neighbors
from sklearn.ensemble import VotingClassifier, RandomForestClassifier



In [46]:
def buildModel(ticker, test_size=0.25, random_state=None):
    """Train a voting ensemble for one ticker and report its hold-out accuracy.

    The features and labels come from ``extract_features(ticker)``. They are
    split into train and test parts, a ``VotingClassifier`` combining
    LinearSVC, k-nearest-neighbours and a random forest is fitted on the
    training part, and the accuracy and the distribution of predicted labels
    on the test part are printed.

    Parameters
    ----------
    ticker : str
        Ticker to build the model for.
    test_size : float, optional
        Fraction of rows held out for testing (default 0.25).
    random_state : int or None, optional
        Seed forwarded to the split and to the estimators that accept one.
        The default ``None`` keeps the original unseeded behaviour; pass an
        int to make runs reproducible.

    Returns
    -------
    float
        Mean accuracy of the ensemble on the held-out rows.
    """
    # sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed
    # in 0.20; prefer its replacement and fall back only on old installs.
    try:
        from sklearn.model_selection import train_test_split
    except ImportError:
        from sklearn.cross_validation import train_test_split

    X, y, _ = extract_features(ticker)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    clf = VotingClassifier([
        ('linearSVC', svm.LinearSVC(random_state=random_state)),
        ('knn', neighbors.KNeighborsClassifier()),
        ('randomForest', RandomForestClassifier(random_state=random_state)),
    ])
    clf.fit(X_train, y_train)

    confidence = clf.score(X_test, y_test)
    print("Accuracy = ", confidence)

    preds = clf.predict(X_test)
    print("Predictions Spread", Counter(preds))

    return confidence

In [50]:
# Train and score the ensemble for GOOG; the returned accuracy is the cell's output.
buildModel('GOOG')

Data spread: Counter({'1': 1140, '-1': 944, '0': 434})
Accuracy =  0.401587301587
Predictions Spread Counter({1: 373, -1: 245, 0: 12})


0.4015873015873016