In [157]:
#import models
from sklearn.model_selection import GridSearchCV
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import os
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import plot_confusion_matrix, precision_score, recall_score, f1_score, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
import StockMetrics

In [158]:
## bring in final clean dataset

import os
importpath = r'C:\Users\nmur1\Google Drive\Springboard\Capstone2\CleanData'
os.chdir(importpath)
dfmodel = pd.read_csv('FinalModelingSet.csv', index_col = None).drop(columns = 'Unnamed: 0')
dfmodel.shape

(566789, 26)

In [161]:
dfmodel.Date.max()

'2020-08-14'

In [162]:
dfmodel[dfmodel.loc[:,:'E_Season'].duplicated()]
duped = dfmodel.loc[:,:'E_Season'].columns
dfmodel = dfmodel.drop_duplicates(subset = duped)
dfmodel.shape

(566789, 26)

In [163]:
y = dfmodel['Thirty_Day'].ravel()
X = dfmodel.drop(columns = [ 'Thirty_Day', 'Sixty_Day', 'Five_Day', 'Date',
                            'Sector','DI_Plus_R', 'Sales', 'Percent_Buy', 'E_Season', 'Pos_Coef'])

In [164]:


model = RandomForestClassifier(100, n_jobs = -1, random_state = 42, min_samples_leaf = 1, criterion = "gini", min_samples_split = 2, bootstrap = False)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .25, random_state = 42)

scaler = StandardScaler().fit(X_train.drop(columns = ['Ticker']))
X_scale = scaler.transform(X_train.drop(columns = ['Ticker']))


model.fit(X_scale,y_train)


RandomForestClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [165]:
ypred = model.predict(scaler.transform(X_test.drop(columns = 'Ticker')))
yprob = model.predict_proba(scaler.transform(X_test.drop(columns = 'Ticker')))
accuracy_score(y_test, ypred)

0.9267879574870499

In [166]:
probno = []
probyes = []
for y, x in yprob:
    probno.append(y)
    probyes.append(x)

In [167]:
TestResults = pd.DataFrame(zip(X_test.Ticker, y_test,ypred, probno, probyes), columns = ['Ticker', 'Actual', 'Predicted', 'ProbN', 'ProbY'])

TestResults['Accuracy'] = 1 - abs(TestResults.Actual - TestResults.Predicted)
TestSummary = TestResults.groupby('Ticker').Accuracy.mean()
TestResults.to_csv('TestResults.csv')

## Input new data and drop columns not used in model

In [172]:
model_input = pd.read_csv('08_29_Model Input.csv')
drop = ['Unnamed: 0', 'index','Close','UpperB', 'LowerB', 'EMA','SMA', 'AverageSectorPE', 'AverageSectorSR']

In [173]:
# pull up Netflix and spot check techn indicators for accuracy
model_input[model_input.Ticker == 'NFLX']

Unnamed: 0.1,Unnamed: 0,index,Date,Close,DI_Plus,ADX,CMF,DI_Plus_Slope,DI_Plus_R,SMA,...,Sales_Ratio,AverageSectorSR,Relative_SR,E_Season,Five_Day,Thirty_Day,Sixty_Day,Pos_Coef,EMA,SignalVar
49759,61356,19070,2020-03-20,332.829987,15.772029,23.287133,-0.025247,-0.755776,0.653403,351.756998,...,7.638850,3.019982,1.529436,0.0,1,1,1,-0.493826,0.000000,0.000000
49760,61357,19071,2020-03-23,360.269989,18.931702,21.698232,0.003184,-0.510797,0.344555,351.335497,...,7.629696,3.019982,1.526405,0.0,0,1,1,-0.175998,0.000000,0.000000
49761,61358,19072,2020-03-24,357.320007,19.871869,20.472651,-0.006597,-0.279199,0.106357,351.196997,...,7.626689,3.019982,1.525409,0.0,1,1,1,-0.029695,0.000000,0.000000
49762,61359,19073,2020-03-25,342.390015,18.473042,19.491547,-0.071164,-0.042338,0.002902,349.354498,...,7.586677,3.019982,1.512160,0.0,1,1,1,-0.000123,0.000000,0.000000
49763,61360,19074,2020-03-26,362.989990,17.778763,18.464724,0.015357,0.135646,0.033154,348.918498,...,7.577208,3.019982,1.509025,0.0,1,1,1,0.004497,351.159998,1.006424
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49866,61463,19177,2020-08-21,492.309998,21.368306,12.607427,-0.065883,-0.417310,0.381347,489.812498,...,10.636900,3.019982,2.522174,0.0,0,0,0,-0.159140,490.350705,1.001099
49867,61464,19178,2020-08-24,488.809998,19.511251,11.745496,-0.153298,-0.308947,0.296458,489.470499,...,10.629473,3.019982,2.519715,0.0,0,0,0,-0.091590,489.837136,1.000749
49868,61465,19179,2020-08-25,490.579987,18.783992,10.945132,-0.071880,-0.217490,0.222706,489.573997,...,10.631721,3.019982,2.520459,0.0,0,0,0,-0.048436,490.084753,1.001043
49869,61466,19180,2020-08-26,547.530029,37.598120,13.348606,0.168443,0.402321,0.128519,492.726498,...,10.700182,3.019982,2.543128,0.0,0,0,0,0.051706,509.233178,1.033501


In [176]:
#drop columns 
model_input = model_input.drop(columns = drop)
model_input = model_input[(model_input.Date >= '2020-08-20') & (model_input.Date <='2020-08-28')]

In [177]:
mod_y = model_input['Thirty_Day'].ravel()
mod_X = model_input.drop(columns = [ 'Thirty_Day', 'Sixty_Day', 'Five_Day', 'Date',
                            'Sector','DI_Plus_R', 'Sales', 'Percent_Buy', 'E_Season', 'Pos_Coef'])

In [178]:
scaler = StandardScaler().fit(mod_X.drop(columns = ['Ticker']))
X_scale = scaler.transform(mod_X.drop(columns = ['Ticker']))

modpred = model.predict(X_scale)
modprob = model.predict_proba(X_scale)

probno = []
probyes = []
for y, x in yprob:
    probno.append(y)
    probyes.append(x)

In [179]:
Results = pd.DataFrame(zip(mod_X.Ticker, modpred, probno, probyes), columns = ['Ticker', 'Predicted', 'ProbN', 'ProbY'])

In [188]:
ResultsSum = Results.groupby('Ticker').mean()
ResultsSum.Predicted = ResultsSum.ProbY.apply(lambda x: 1 if x > .5 else 0)
ResultsSum

Unnamed: 0_level_0,Predicted,ProbN,ProbY
Ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,1,0.148333,0.851667
AAL,1,0.288333,0.711667
AAP,0,0.603333,0.396667
AAPL,1,0.398333,0.601667
ABBV,1,0.270000,0.730000
...,...,...,...
YUM,1,0.396667,0.603333
ZBH,1,0.246667,0.753333
ZBRA,1,0.461667,0.538333
ZION,1,0.288333,0.711667


In [185]:
TestSummary = TestSummary.reset_index()
TestSummary.columns = ['Ticker', 'Test Accuracy']

In [186]:
Results_Final = ResultsSum.merge(TestSummary, on = 'Ticker')
Results_Final.to_excel('08_27_Predicitons.xlsx')