## Goal of this Notebook
Measure the R squared (and R) values of the two-day and three-day linear models on held-out SPY data.

In [1]:
from stock_utils import *

### 1. Define Linear Models 

In [2]:
class ThreeDayLinearModel:
    """Lookup-table model keyed on the categories of two consecutive days.

    Training stores, for every ordered pair of movement categories
    (``'bd_bd'``, ``'bd_sd'``, ...), the mean of the movements returned by
    ``get_movements_after_trend`` for that pair. Prediction looks up that
    mean for every consecutive pair in the input sequence.

    Attributes:
        trained: True once ``train`` has completed.
        verbose: When True, progress messages are printed.
        mean_movements_after_category: dict mapping ``'<cat1>_<cat2>'`` to
            the mean movement learned for that pair. It is created per
            instance, so separately trained models do not overwrite each
            other's tables.
    """
    trained=False
    verbose=False
    # Category labels, in the order used to build the pair keys.
    # NOTE(review): presumably big/small drop and small/big gain -- confirm
    # against categorize_movements in stock_utils.
    _CATEGORIES = ('bd', 'sd', 'sg', 'bg')

    def __init__(self, data=None):
        """Create the model, optionally training it straight away.

        Args:
            data: Optional ``(movement_categories, daily_movements)`` pair
                that is forwarded to ``train``. When None the model starts
                untrained.
        """
        if self.verbose: print('Initializing Model...')
        # A fresh dict per instance; a class-level dict would be shared by
        # every instance of the class.
        self.mean_movements_after_category = {}
        if data is not None:
            # train() needs both sequences, so ``data`` carries them as a pair.
            movement_categories, daily_movements = data
            self.train(movement_categories, daily_movements)
        else:
            print('Inintializing without training data')

    def __str__(self):
        return 'Three Day Linear Model'

    def train(self, movement_categories, daily_movements):
        """Learn the mean movement that follows every pair of categories.

        Args:
            movement_categories: Category label for each day, passed to
                ``get_trends_linear``.
            daily_movements: Movement value for each day, passed to
                ``get_trends_linear``.
        """
        if self.verbose: print('Training...')
        three_day_trends_linear = get_trends_linear(movement_categories, daily_movements, 3)

        means = {}
        for cat1 in self._CATEGORIES:
            for cat2 in self._CATEGORIES:
                trend = cat1 + '_' + cat2
                movements = get_movements_after_trend(trend, three_day_trends_linear)
                means[trend] = np.mean(np.asarray(movements))

        # Rebind rather than update in place so a retrain never leaves
        # entries from an earlier run behind.
        self.mean_movements_after_category = means
        self.trained=True

    def predict(self, input_sequence, raw=False):
        """Predict one movement for every consecutive pair of categories.

        Args:
            input_sequence: Sequence of category labels.
            raw: Accepted for interface compatibility; not used.

        Returns:
            Float array with ``len(input_sequence) - 1`` entries (empty when
            the sequence has fewer than two labels).

        Raises:
            KeyError: If a pair has no entry in the learned table.
        """
        # Materialise once so the lookup is positional for any iterable.
        categories = list(input_sequence)
        table = self.mean_movements_after_category
        return np.array(
            [table[first + '_' + second] for first, second in zip(categories, categories[1:])],
            dtype=float,
        )

In [3]:
class TwoDayLinearModel:
    """Lookup-table model keyed on a single day's movement category.

    Training stores, for each movement category, the mean of the movements
    returned by ``get_movements_after_trend`` for that category. Prediction
    looks up that mean for every label in the input sequence.

    Attributes:
        trained: True once ``train`` has completed.
        verbose: When True, progress messages are printed.
        mean_movements_after_category: dict mapping a category label to the
            mean movement learned for it. It is created per instance, so
            separately trained models do not overwrite each other's tables.
    """
    trained=False
    verbose=False
    # Category labels the table is built for.
    # NOTE(review): presumably big/small drop and small/big gain -- confirm
    # against categorize_movements in stock_utils.
    _CATEGORIES = ('bd', 'sd', 'sg', 'bg')

    def __init__(self, data=None):
        """Create the model, optionally training it straight away.

        Args:
            data: Optional ``(movement_categories, daily_movements)`` pair
                that is forwarded to ``train``. When None the model starts
                untrained.
        """
        if self.verbose: print('Initializing Model...')
        # A fresh dict per instance; a class-level dict would be shared by
        # every instance of the class.
        self.mean_movements_after_category = {}
        if data is not None:
            # train() needs both sequences, so ``data`` carries them as a pair.
            movement_categories, daily_movements = data
            self.train(movement_categories, daily_movements)
        else:
            print('Inintializing without training data')

    def __str__(self):
        return 'Local Two Day Model'

    def train(self, movement_categories, daily_movements):
        """Learn the mean movement that follows each category.

        Args:
            movement_categories: Category label for each day, passed to
                ``get_trends_linear``.
            daily_movements: Movement value for each day, passed to
                ``get_trends_linear``.
        """
        if self.verbose: print('Training...')
        two_day_trends_linear = get_trends_linear(movement_categories, daily_movements, 2)

        means = {}
        for cat in self._CATEGORIES:
            movements = get_movements_after_trend(cat, two_day_trends_linear)
            means[cat] = np.mean(np.asarray(movements))

        # Rebind rather than update in place so a retrain never leaves
        # entries from an earlier run behind.
        self.mean_movements_after_category = means
        self.trained=True

    def predict(self, input_sequence, raw=False):
        """Predict one movement for every category label in the sequence.

        Args:
            input_sequence: Sequence of category labels.
            raw: Accepted for interface compatibility; not used.

        Returns:
            Float array with one entry per label in ``input_sequence``.

        Raises:
            KeyError: If a label has no entry in the learned table.
        """
        table = self.mean_movements_after_category
        # Iterating (rather than indexing by position) keeps the lookup
        # positional for any iterable of labels.
        return np.array([table[cat] for cat in input_sequence], dtype=float)

#### Split the data into training and validation sets, then train the models on the training set

In [4]:
# DataFrame.from_csv was deprecated and then removed from pandas. read_csv with
# index_col=0 and parse_dates=True reproduces from_csv's defaults.
df = pd.read_csv('stock_data/spy.csv', index_col=0, parse_dates=True)

daily_movements = get_price_movement_percentages(df)
movement_categories = categorize_movements(daily_movements, n_cats=4)

# The first four fifths of the series are used for training and the last
# fifth for validation.
# NOTE(review): the validation slices start at 4*period_len+1 and stop at
# 5*period_len, so the element at index 4*period_len and any remainder past
# 5*period_len belong to neither set. Kept as-is so the recorded results
# remain reproducible -- confirm whether the gap is intentional.
period_len = int(len(daily_movements) / 5)
train_daily_movements = daily_movements[0:4*period_len]
valid_daily_movements = daily_movements[4*period_len+1:5*period_len]

train_movement_categories = movement_categories[0:4*period_len]
valid_movement_categories = movement_categories[4*period_len+1:5*period_len]

In [5]:
# Build each model without data (the constructor prints a notice in that
# case), then fit it on the training split only.
two_L = TwoDayLinearModel()
two_L.train(train_movement_categories, train_daily_movements)

three_L = ThreeDayLinearModel()
three_L.train(train_movement_categories, train_daily_movements)

Inintializing without training data
Inintializing without training data


In [6]:
# Spot-check both models on a short hand-written category sequence; each
# prediction array is printed on its own line.
print(
    two_L.predict(['bg', 'bd']),
    three_L.predict(['bg', 'bd', 'bd']),
    sep='\n',
)

[-0.1318665   0.22140209]
[-0.5647466   0.94179326]


In [7]:
# Mean daily movement over the validation split, shown for reference.
np.mean(valid_daily_movements)

0.052737318559556379

### 2. Get Linear Model Predictions on validation set

In [8]:
# Run both models over the validation categories. The three-day model needs a
# pair of labels per prediction, so it yields one prediction fewer than the
# two-day model.
two_L_predictions, three_L_predictions = (
    model.predict(valid_movement_categories) for model in (two_L, three_L)
)

In [9]:
# Constant baselines with the same length as the two-day predictions:
# baseline 1 always predicts the training mean, baseline 2 the validation mean.
baseline_prediction_1, baseline_prediction_2 = (
    np.full(len(two_L_predictions), np.mean(movements))
    for movements in (train_daily_movements, valid_daily_movements)
)

### 3. Get R squared values for two day and three day linear models on SPY
#### Verify that the R squared function works correctly: predicting the mean movement should give an R squared close to 0

In [24]:
def get_r_squared(y, y_hat):
    """Return the coefficient of determination of predictions ``y_hat`` for ``y``.

    Computed as ``1 - SSR / SST`` where ``SSR`` is the sum of squared
    residuals ``(y_hat - y) ** 2`` and ``SST`` is the sum of squared
    deviations of ``y`` from its own mean.

    Args:
        y: Observed values (any array-like of numbers).
        y_hat: Predicted values, same number of elements as ``y``.

    Returns:
        The R squared value. When ``y`` is constant (``SST == 0``) the ratio
        is undefined; 1.0 is returned for a perfect prediction and 0.0
        otherwise, the convention sklearn's ``r2_score`` uses.

    Raises:
        ValueError: If the inputs differ in size or are empty.
    """
    # Flat float arrays: accepts lists and other array-likes, and keeps all
    # arithmetic positional (no label-based indexing on pandas objects).
    y = np.asarray(y, dtype=float).ravel()
    y_hat = np.asarray(y_hat, dtype=float).ravel()

    if (y_hat.size != y.size):
        raise ValueError("Size of y and y_hat don't match")
    if y.size == 0:
        raise ValueError("Cannot compute R squared of empty input")

    SSR = np.sum((y_hat - y) ** 2)
    SST = np.sum((np.mean(y) - y) ** 2)

    if SST == 0:
        # Constant target: avoid dividing by zero.
        return 1.0 if SSR == 0 else 0.0

    return 1 - (SSR / SST)

In [25]:
# Each prediction is scored against the movement that follows its input
# window: one day later for the two-day model and the baselines, two days
# later (relative to the first label of the pair) for the three-day model.
# The final prediction has no following movement, so it is dropped.
r2_two_L = get_r_squared(valid_daily_movements[1:], two_L_predictions[:-1])
r2_three_L = get_r_squared(valid_daily_movements[2:], three_L_predictions[:-1])

r2_b1 = get_r_squared(valid_daily_movements[1:], baseline_prediction_1[:-1])
r2_b2 = get_r_squared(valid_daily_movements[1:], baseline_prediction_2[:-1])

print(
    'Two Day Linear Model: r squared is ',
    '{0:.4f}'.format(r2_two_L),
    '. r value is ',
    '{0:.4f}'.format((np.sqrt(r2_two_L))),
    sep='',
)
print(
    'Three Day Linear Model: r squared is ',
    '{0:.4f}'.format(r2_three_L),
    '. r value is ',
    '{0:.4f}'.format((np.sqrt(r2_three_L))),
    sep='',
)
print()
print('Baseline 1 (mean of training data): r squared is ', '{0:.4f}'.format(r2_b1), sep='')
print('Baseline 2 (mean of validation data): r squared is ', '{0:.4f}'.format(r2_b2), sep='')

Two Day Linear Model: r squared is 0.0073. r value is 0.0852
Three Day Linear Model: r squared is 0.0108. r value is 0.1038

Baseline 1 (mean of training data): r squared is -0.0015
Baseline 2 (mean of validation data): r squared is -0.0000


In [30]:
## Confirm
from sklearn.metrics import r2_score

# Small hand-made example first, followed by a blank line.
y_true = [3, -0.5, 2, 7]
y_pred = [2.5, 0.0, 2, 8]
print(r2_score(y_true, y_pred), end='\n\n')

# Same slices as used with get_r_squared, so the two results can be compared.
print(
    r2_score(valid_daily_movements[1:], two_L_predictions[:-1]),
    r2_score(valid_daily_movements[2:], three_L_predictions[:-1]),
    sep='\n',
)

0.948608137045

0.00726397187253
0.0107837674562


## Design Flaws
#### In real-life scenarios, we cannot categorize all the daily movements together
So we would have to categorize our training data first, then categorize each new movement as it comes in for prediction.

## TODO implement this