In [3]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import time
import sklearn

In [4]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Binarizer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn import preprocessing

In [5]:
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn import tree

In [110]:
def model_acc_time(X_train, X_test, y_train, y_test, model):
    # creates a model based on training data and returns the accuracy on the held out test set and the time it took to train the model

    # data should be pre processed already
    # records the time and accuracy for each model, as well as saving the model itself
    total_time = []
    accuracy = []
    models = []

    # manual 5 fold on test data to select the best model
    for i in range(5):
        # splitting the training data to train and evaluate the model
        kfX_train, kfX_test, kfy_train, kfy_test = train_test_split(
    X_train, y_train, test_size=0.2,shuffle=True)

        # creates a new unfitted model with the inputted parameters
        curModel = sklearn.base.clone(model)


        # times the training of the model and calcualtes the accuracy on the validation set
        t1 = time.time()
        curModel.fit(kfX_train,kfy_train)
        accuracy.append((curModel.predict(kfX_test)==np.array(kfy_test)[0]).mean())
        t2 = time.time()
        models.append(curModel)
        total_time.append(t2-t1)

    # selects the model with the highest accuracy on the validation set and calcuates the accuracy on the held out set
    best_index = accuracy.index(max(accuracy))
    t = total_time[best_index]
    most_acc_model = models[best_index]
    acc = (most_acc_model.predict(X_test)==np.array(y_test)[0]).mean()*100

    return "Accuracy: {0:.2f}% || Time to Train: {1:.3f} seconds".format(acc, t)

In [7]:
bitcoin = pd.read_csv("data/btc.csv")
bitcoin

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2016-02-01,369.350006,378.071991,367.957001,373.056000,373.056000,5.165670e+07
1,2016-02-02,372.920013,375.882996,372.920013,374.447998,374.447998,4.037870e+07
2,2016-02-03,374.645996,374.950012,368.045013,369.949005,369.949005,4.593340e+07
3,2016-02-04,370.174011,391.608002,369.993011,389.593994,389.593994,6.928550e+07
4,2016-02-05,388.898010,391.093994,385.571991,386.549011,386.549011,4.382500e+07
...,...,...,...,...,...,...,...
1823,2021-01-28,30441.041016,31891.300781,30023.207031,31649.605469,31649.605469,7.894816e+10
1824,2021-01-29,34318.671875,38406.261719,32064.814453,34316.386719,34316.386719,1.178946e+11
1825,2021-01-30,34295.933594,34834.707031,32940.187500,34269.523438,34269.523438,6.514183e+10
1826,2021-01-31,34270.878906,34288.332031,32270.175781,33114.359375,33114.359375,5.275454e+10


In [8]:
bitcoin["increase"] = bitcoin.Close > bitcoin.Open
bitcoin.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,increase
0,2016-02-01,369.350006,378.071991,367.957001,373.056,373.056,51656700.0,True
1,2016-02-02,372.920013,375.882996,372.920013,374.447998,374.447998,40378700.0,True
2,2016-02-03,374.645996,374.950012,368.045013,369.949005,369.949005,45933400.0,False
3,2016-02-04,370.174011,391.608002,369.993011,389.593994,389.593994,69285504.0,True
4,2016-02-05,388.89801,391.093994,385.571991,386.549011,386.549011,43825000.0,False


In [9]:
btc = bitcoin.drop(["Date","Adj Close", "Close"], axis=1)
btc

Unnamed: 0,Open,High,Low,Volume,increase
0,369.350006,378.071991,367.957001,5.165670e+07,True
1,372.920013,375.882996,372.920013,4.037870e+07,True
2,374.645996,374.950012,368.045013,4.593340e+07,False
3,370.174011,391.608002,369.993011,6.928550e+07,True
4,388.898010,391.093994,385.571991,4.382500e+07,False
...,...,...,...,...,...
1823,30441.041016,31891.300781,30023.207031,7.894816e+10,True
1824,34318.671875,38406.261719,32064.814453,1.178946e+11,False
1825,34295.933594,34834.707031,32940.187500,6.514183e+10,False
1826,34270.878906,34288.332031,32270.175781,5.275454e+10,False


In [10]:
bitcoin.increase.mean()

0.5579868708971554

In [11]:
# save as a csv
btc.to_csv("bitcoin.csv")

In [12]:
# need to adjust the dataset because we only know the data for the previous days and not the ones before
# the increase column will be shifted one up, because if the price increases, it should be dependent on the previous day

In [13]:
# only using the previous day to determine of the the price of bitcoin will go up

In [14]:
# drop the first increase, because we do not have data for it and shift it one up
inc = btc.increase[1:].reset_index(drop=True)
inc

0        True
1       False
2        True
3       False
4       False
        ...  
1822     True
1823    False
1824    False
1825    False
1826    False
Name: increase, Length: 1827, dtype: bool

In [15]:
# drop last row for last day
btc_one_day = btc.iloc[:-1,:]
btc_one_day

Unnamed: 0,Open,High,Low,Volume,increase
0,369.350006,378.071991,367.957001,5.165670e+07,True
1,372.920013,375.882996,372.920013,4.037870e+07,True
2,374.645996,374.950012,368.045013,4.593340e+07,False
3,370.174011,391.608002,369.993011,6.928550e+07,True
4,388.898010,391.093994,385.571991,4.382500e+07,False
...,...,...,...,...,...
1822,32564.029297,32564.029297,29367.138672,6.257676e+10,False
1823,30441.041016,31891.300781,30023.207031,7.894816e+10,True
1824,34318.671875,38406.261719,32064.814453,1.178946e+11,False
1825,34295.933594,34834.707031,32940.187500,6.514183e+10,False


In [16]:
btc_one_day.increase = inc
btc_one_day

Unnamed: 0,Open,High,Low,Volume,increase
0,369.350006,378.071991,367.957001,5.165670e+07,True
1,372.920013,375.882996,372.920013,4.037870e+07,False
2,374.645996,374.950012,368.045013,4.593340e+07,True
3,370.174011,391.608002,369.993011,6.928550e+07,False
4,388.898010,391.093994,385.571991,4.382500e+07,False
...,...,...,...,...,...
1822,32564.029297,32564.029297,29367.138672,6.257676e+10,True
1823,30441.041016,31891.300781,30023.207031,7.894816e+10,False
1824,34318.671875,38406.261719,32064.814453,1.178946e+11,False
1825,34295.933594,34834.707031,32940.187500,6.514183e+10,False


In [17]:
btc_one_day = btc_one_day.dropna(axis=0)

In [18]:
btc_one_day.to_csv("oneDayBTC.csv")

### models for the one day slider

In [98]:
#Split Data into training and testing
x1 = btc_one_day[["Open","High","Low","Volume"]]
y1 = btc_one_day[["increase"]]
#Split Data into training and testing
X_train1, X_test1, y_train1, y_test1 = train_test_split(x1, y1, test_size=0.2)

In [111]:
# DT classifier, using entropy and a max depth of 2
DT = tree.DecisionTreeClassifier(criterion="entropy",max_depth=2)

# using the function above, we can determine the accuracy and training time of a deciosion tree model
model_acc_time(X_train1, X_test1, y_train1, y_test1, DT)

              Open          High           Low        Volume
1608   9167.824219   9207.810547   8998.216797  1.727309e+10
359     891.924011    903.252014    891.687012  1.208310e+08
1051   3544.761475   3701.349365   3487.169189  5.911325e+09
134     704.504028    704.504028    662.804016  1.866940e+08
1724  11913.077148  13184.566406  11900.928711  4.341471e+10
1210   8055.206055   8687.520508   7924.670410  2.667797e+10
1655  11588.405273  11796.396484  11216.872070  2.752220e+10
1006   6387.240234   6400.069824   6342.370117  3.658640e+09
15      401.432007    408.945007    401.432007  7.309310e+07
633    5747.950195   5976.799805   5721.220215  1.905040e+09


'Accuracy: 91.51% || Time to Train: 0.003 seconds'

In [109]:
# MLP Classifier 
MLP = MLPClassifier(max_iter=300, hidden_layer_sizes=(3,3), solver="sgd")

# calculating the accuracy and training time of an MLP Classifier
model_acc_time(X_train1, X_test1, y_train1, y_test1, MLP)

'Accuracy: 100.00% || Time to Train: 0.027 seconds'

In [112]:
# Random Forest
RF = RandomForestClassifier(n_estimators=200, max_depth=4,criterion="entropy")

# calculating the accuracy and training time of the random forest Classifier
model_acc_time(X_train1, X_test1, y_train1, y_test1, RF)

              Open          High           Low        Volume
1608   9167.824219   9207.810547   8998.216797  1.727309e+10
359     891.924011    903.252014    891.687012  1.208310e+08
1051   3544.761475   3701.349365   3487.169189  5.911325e+09
134     704.504028    704.504028    662.804016  1.866940e+08
1724  11913.077148  13184.566406  11900.928711  4.341471e+10
1210   8055.206055   8687.520508   7924.670410  2.667797e+10
1655  11588.405273  11796.396484  11216.872070  2.752220e+10
1006   6387.240234   6400.069824   6342.370117  3.658640e+09
15      401.432007    408.945007    401.432007  7.309310e+07
633    5747.950195   5976.799805   5721.220215  1.905040e+09


'Accuracy: 92.88% || Time to Train: 0.348 seconds'

In [113]:
# Extra Trees Classifier
ET = ExtraTreesClassifier(n_estimators=100, max_depth=3,criterion="entropy")

# calculating the accuracy and training time of the extra trees Classifier
model_acc_time(X_train1, X_test1, y_train1, y_test1, ET)

              Open          High           Low        Volume
1608   9167.824219   9207.810547   8998.216797  1.727309e+10
359     891.924011    903.252014    891.687012  1.208310e+08
1051   3544.761475   3701.349365   3487.169189  5.911325e+09
134     704.504028    704.504028    662.804016  1.866940e+08
1724  11913.077148  13184.566406  11900.928711  4.341471e+10
1210   8055.206055   8687.520508   7924.670410  2.667797e+10
1655  11588.405273  11796.396484  11216.872070  2.752220e+10
1006   6387.240234   6400.069824   6342.370117  3.658640e+09
15      401.432007    408.945007    401.432007  7.309310e+07
633    5747.950195   5976.799805   5721.220215  1.905040e+09


'Accuracy: 99.73% || Time to Train: 0.082 seconds'

In [114]:
# SGD Classifier
SGD = SGDClassifier(loss="hinge", max_iter=200)

# calculating the accuracy and training time of the SGD Classifier
model_acc_time(X_train1, X_test1, y_train1, y_test1, SGD)

              Open          High           Low        Volume
1608   9167.824219   9207.810547   8998.216797  1.727309e+10
359     891.924011    903.252014    891.687012  1.208310e+08
1051   3544.761475   3701.349365   3487.169189  5.911325e+09
134     704.504028    704.504028    662.804016  1.866940e+08
1724  11913.077148  13184.566406  11900.928711  4.341471e+10
1210   8055.206055   8687.520508   7924.670410  2.667797e+10
1655  11588.405273  11796.396484  11216.872070  2.752220e+10
1006   6387.240234   6400.069824   6342.370117  3.658640e+09
15      401.432007    408.945007    401.432007  7.309310e+07
633    5747.950195   5976.799805   5721.220215  1.905040e+09


'Accuracy: 0.00% || Time to Train: 0.006 seconds'

In [115]:
# Gaussian Naive Bayes
GNB = GaussianNB()
# calculating the accuracy and training time of the Gaussian Naive Bayes
model_acc_time(X_train1, X_test1, y_train1, y_test1, GNB)

              Open          High           Low        Volume
1608   9167.824219   9207.810547   8998.216797  1.727309e+10
359     891.924011    903.252014    891.687012  1.208310e+08
1051   3544.761475   3701.349365   3487.169189  5.911325e+09
134     704.504028    704.504028    662.804016  1.866940e+08
1724  11913.077148  13184.566406  11900.928711  4.341471e+10
1210   8055.206055   8687.520508   7924.670410  2.667797e+10
1655  11588.405273  11796.396484  11216.872070  2.752220e+10
1006   6387.240234   6400.069824   6342.370117  3.658640e+09
15      401.432007    408.945007    401.432007  7.309310e+07
633    5747.950195   5976.799805   5721.220215  1.905040e+09


'Accuracy: 91.78% || Time to Train: 0.003 seconds'

In [116]:
# SVM
SVM = svm.SVC(kernel="sigmoid")

# calculating the accuracy and training time of the SVM
model_acc_time(X_train1, X_test1, y_train1, y_test1, SVM)

              Open          High           Low        Volume
1608   9167.824219   9207.810547   8998.216797  1.727309e+10
359     891.924011    903.252014    891.687012  1.208310e+08
1051   3544.761475   3701.349365   3487.169189  5.911325e+09
134     704.504028    704.504028    662.804016  1.866940e+08
1724  11913.077148  13184.566406  11900.928711  4.341471e+10
1210   8055.206055   8687.520508   7924.670410  2.667797e+10
1655  11588.405273  11796.396484  11216.872070  2.752220e+10
1006   6387.240234   6400.069824   6342.370117  3.658640e+09
15      401.432007    408.945007    401.432007  7.309310e+07
633    5747.950195   5976.799805   5721.220215  1.905040e+09


'Accuracy: 58.08% || Time to Train: 0.023 seconds'

### models for two day slider

In [85]:
n = 3

In [23]:
testBTC = pd.DataFrame({"Date":["2016-5-20","2016-5-21","2016-5-22","2016-5-23"], "Open":[5,10,7,15],"Close":[10,9,8,12],"Volume":[20,30,25,15]})
testBTC

Unnamed: 0,Date,Open,Close,Volume
0,2016-5-20,5,10,20
1,2016-5-21,10,9,30
2,2016-5-22,7,8,25
3,2016-5-23,15,12,15


In [24]:
testBTC["Inc"] = testBTC.Close > testBTC.Open
testBTC

Unnamed: 0,Date,Open,Close,Volume,Inc
0,2016-5-20,5,10,20,True
1,2016-5-21,10,9,30,False
2,2016-5-22,7,8,25,True
3,2016-5-23,15,12,15,False


In [25]:
incTest = testBTC.Inc[1:].reset_index(drop=True) 
incTest

0    False
1     True
2    False
Name: Inc, dtype: bool

In [94]:
n = 3

In [95]:
openSum = []
closeSum = []
volSum = []
priceIncrease = []
for i in range(n-1,len(testBTC)):
    open = 0
    close = 0
    volume = 0

    for j in range(n):
        open += testBTC["Open"][i-j]
        close += testBTC["Close"][i-j]
        volume += testBTC["Volume"][i-j]

    openSum.append(open)
    closeSum.append(close)
    volSum.append(volume)
    # moves increase labels up one
    priceIncrease.append(testBTC["Inc"][i-1])

In [96]:
openSum, closeSum, volSum, priceIncrease

([22, 32], [27, 29], [75, 70], [False, True])