# Single Stock Classification and Incremental Learning
Use a window of length k to predict whether one should buy or sell a certain stock

In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

Parameters for stock analysis

In [2]:
stock = 'AAL'
window_size = 90

## Load Data

In [3]:
path = f'../data/individual_stocks_5yr/{stock}_data.csv'

In [4]:
df = pd.read_csv(path)
df.head()

Unnamed: 0,date,open,high,low,close,volume,Name
0,2013-02-08,15.07,15.12,14.63,14.75,8407500,AAL
1,2013-02-11,14.89,15.01,14.26,14.46,8882000,AAL
2,2013-02-12,14.45,14.51,14.1,14.27,8126000,AAL
3,2013-02-13,14.3,14.94,14.25,14.66,10259500,AAL
4,2013-02-14,14.94,14.96,13.16,13.99,31879900,AAL


## Create Training and Testing Data

In [5]:
closings = df.drop(['date', 'open', 'high', 'low','volume', 'Name'], axis=1).values
openings = df.drop(['date', 'close', 'high', 'low','volume', 'Name'], axis=1).values

In [6]:
X, y_reg, y_class = [],[],[]

for i in range(closings.shape[0]-window_size):
    end = i + window_size
    window = openings[i:end]
    X.append(window.flatten())
    y_reg.append(openings[end])
    y_class.append( 1 if openings[end] > closings[end - 1] else 0)
X,y_reg, y_class = np.array(X), np.array(y_reg), np.array(y_class)

print(X.shape, y_reg.shape, y_class.shape)

(1169, 90) (1169, 1) (1169,)


# We can create various classifiers trained incrementally in different schemes
Initially, we train the model on all of the data and explore the results

In [7]:
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix as cm

In [8]:
def train(X, y, subset=None, loss='log'):
    print(loss)
    subset = X.shape[0] if subset is None else subset 
    data = X[:subset]
    labels = y[:subset]
    model = linear_model.SGDClassifier(loss=loss) 
    model.fit(data, labels)
    return model

In [9]:
for X in (X,):
    for y in (y_class,):
        X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y,
                                                    test_size=0.20,
                                                    random_state=123)
        for loss in [('log', 'Logistic Regression'), ('hinge', 'Linear SVM')]:

            model = train(X_train,y_train,loss=loss[0])
            
            print(f"Training Accuracy for {loss[1]}")
            print(model.score(X_train,y_train))
            print(cm(y_train, model.predict(X_train))) 
            
            print(f"Testing Accuracy for {loss[1]}")
            print(model.score(X_test,y_test))
            print(cm(y_test, model.predict(X_test)))

       

log
Training Accuracy for Logistic Regression
0.46203208556149733
[[432   0]
 [503   0]]
Testing Accuracy for Logistic Regression
0.4230769230769231
[[ 99   0]
 [135   0]]
hinge
Training Accuracy for Linear SVM
0.46203208556149733
[[432   0]
 [503   0]]
Testing Accuracy for Linear SVM
0.4230769230769231
[[ 99   0]
 [135   0]]




## 1. Sequentially introduce new data to the model

## 2. Randomly introduce new data to the model

## 3. Completey retrain model with new batch of data 