# imports

In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# load

In [3]:
df = pd.read_csv('datasets/crypto/coinmarke_bitcoin_365d_2024-Jun-6.csv', delimiter=';')
# df = pd.read_csv('datasets/FinalBalancedDataset.csv')

In [4]:
df.shape

(364, 12)

In [6]:
df.head()

Unnamed: 0,timeOpen,timeClose,timeHigh,timeLow,name,open,high,low,close,volume,marketCap,timestamp
0,2024-06-06T00:00:00.000Z,2024-06-06T23:59:59.999Z,2024-06-06T15:29:00.000Z,2024-06-06T20:03:00.000Z,2781,71082.840431,71625.737249,70119.123034,70757.165852,25223150000.0,1394552000000.0,2024-06-06T23:59:59.999Z
1,2024-06-05T00:00:00.000Z,2024-06-05T23:59:59.999Z,2024-06-05T16:15:00.000Z,2024-06-05T14:17:00.000Z,2781,70568.350292,71735.41098,70390.712691,71082.822704,32810770000.0,1400901000000.0,2024-06-05T23:59:59.999Z
2,2024-06-04T00:00:00.000Z,2024-06-04T23:59:59.999Z,2024-06-04T18:26:00.000Z,2024-06-04T09:54:00.000Z,2781,68804.567223,71047.406014,68564.643221,70567.768904,33149700000.0,1390744000000.0,2024-06-04T23:59:59.999Z
3,2024-06-03T00:00:00.000Z,2024-06-03T23:59:59.999Z,2024-06-03T13:50:00.000Z,2024-06-03T00:39:00.000Z,2781,67753.895414,70230.82085,67589.836415,68804.780085,32401290000.0,1355956000000.0,2024-06-03T23:59:59.999Z
4,2024-06-02T00:00:00.000Z,2024-06-02T23:59:59.999Z,2024-06-02T14:35:00.000Z,2024-06-02T19:19:00.000Z,2781,67710.270892,68409.16583,67315.526314,67751.602575,17110590000.0,1335228000000.0,2024-06-02T23:59:59.999Z


In [30]:
df['timestamp'][0]

'2024-06-06T23:59:59.999Z'

In [7]:
# Convert the 'timestamp' column to datetime and extract the date
df['date'] = pd.to_datetime(df['timestamp']).dt.date
df['date']

0      2024-06-06
1      2024-06-05
2      2024-06-04
3      2024-06-03
4      2024-06-02
          ...    
359    2023-06-13
360    2023-06-12
361    2023-06-11
362    2023-06-10
363    2023-06-09
Name: date, Length: 364, dtype: object

The ordinal number for a date is the number of days since a fixed reference date, which in the case of Python's datetime module is January 1, 1. This method allows us to convert dates into a numerical format that can be used in mathematical calculations or machine learning algorithms.

In [8]:
# Convert the date to an ordinal number
df['date_ordinal'] = df['date'].apply(lambda x: x.toordinal())

In [9]:
df['date_ordinal'].head()

0    739043
1    739042
2    739041
3    739040
4    739039
Name: date_ordinal, dtype: int64

In [10]:
df.drop(columns=['timeOpen', 'timeClose', 'timestamp', 'name'], inplace=True)
df.head(2)

Unnamed: 0,timeHigh,timeLow,open,high,low,close,volume,marketCap,date,date_ordinal
0,2024-06-06T15:29:00.000Z,2024-06-06T20:03:00.000Z,71082.840431,71625.737249,70119.123034,70757.165852,25223150000.0,1394552000000.0,2024-06-06,739043
1,2024-06-05T16:15:00.000Z,2024-06-05T14:17:00.000Z,70568.350292,71735.41098,70390.712691,71082.822704,32810770000.0,1400901000000.0,2024-06-05,739042


In [11]:
df_high = df[['date_ordinal', 'open', 'low', 'close', 'volume', 'marketCap', 'high']]
df_high.head(3)

Unnamed: 0,date_ordinal,open,low,close,volume,marketCap,high
0,739043,71082.840431,70119.123034,70757.165852,25223150000.0,1394552000000.0,71625.737249
1,739042,70568.350292,70390.712691,71082.822704,32810770000.0,1400901000000.0,71735.41098
2,739041,68804.567223,68564.643221,70567.768904,33149700000.0,1390744000000.0,71047.406014


In [12]:
df_low = df[['date_ordinal', 'open', 'high', 'close', 'volume', 'marketCap', 'low']]
df_low.head(3)

Unnamed: 0,date_ordinal,open,high,close,volume,marketCap,low
0,739043,71082.840431,71625.737249,70757.165852,25223150000.0,1394552000000.0,70119.123034
1,739042,70568.350292,71735.41098,71082.822704,32810770000.0,1400901000000.0,70390.712691
2,739041,68804.567223,71047.406014,70567.768904,33149700000.0,1390744000000.0,68564.643221


# Linear Regression model to predict High

In [13]:
X = df[['date_ordinal', 'open', 'low', 'close', 'volume', 'marketCap']]
y = df['high']

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42) #85 percent training

In [15]:
model = LinearRegression()
model.fit(X_train, y_train)

## predict and evaluate

In [16]:
# Make predictions
y_pred = model.predict(X_test)

In [17]:
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# Print out the model coefficients
print("Model Coefficients:", model.coef_)
print("Model Intercept:", model.intercept_)

Mean Squared Error: 133673.92608235386
Model Coefficients: [-1.79964776e+00  4.61918195e-01 -6.71807714e-02 -1.25683958e+00
  3.50414253e-08  9.46208723e-08]
Model Intercept: 1329937.3893956975


## predict a single date 

In [19]:
from datetime import datetime, timedelta

In [20]:
def date_to_ordinal(date_str):
    date_obj = datetime.strptime(date_str, '%Y-%m-%d')
    return date_obj.toordinal()

tomorrow_date = '2024-06-08'
date_ordinal = date_to_ordinal(tomorrow_date)

In [21]:
# Create a new data point for tomorrow using historical averages or predictions
# Example values: Using historical averages for simplicity
new_data_point = pd.DataFrame({
    'date_ordinal': [date_ordinal],
    'open': [70759],  # Historical average or future prediction value
    'low': [68507],   # Historical average or future prediction value
    'close': [69342], # Historical average or future prediction value
    'volume': [36188381096],   # Historical average or future prediction value
    'marketCap': [1,366686633129] # Historical average or future prediction value
})

In [22]:
predicted_high = model.predict(new_data_point)
print(f"Predicted 'high' for {tomorrow_date}: {predicted_high[0]}")

Predicted 'high' for 2024-06-08: 69935.52253678488
