In [0]:
# Load the SQL table into a Spark DataFrame.
df = spark.table('workspace.csv.btc_usd_YTD')

# Build the modelling frame in one chained expression (no in-place mutation),
# so re-running this cell always produces the same `pdf` from `df`.
pdf = (
    df.toPandas()
    # Keep only the calendar date of each snapshot timestamp.
    .assign(snapped_at=lambda frame: frame['snapped_at'].dt.date)
    # Rename the snapshot column to 'date'.
    .rename(columns={'snapped_at': 'date'})
    # total_volume is not used as a feature for now.
    .drop(columns=['total_volume'])
    # Sort chronologically BEFORE building the target: shift() works on row
    # position, so the rows must already be in date order for "next row" to
    # mean "next day".
    .sort_values(by='date')
    # Each row's target is the following row's price, i.e. the value the model
    # learns to predict from the current row's features.
    .assign(target=lambda frame: frame['price'].shift(-1))
    # The last row has no following price (target is NaN), so it is dropped,
    # along with any other row that contains a missing value.
    .dropna()
    .reset_index(drop=True)
)

display(pdf)

In [0]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Model inputs (features) and the value the model should predict (target).
features = ['price', 'market_cap']
x, y = pdf[features], pdf['target']

# Hold out 20% of the rows for testing and train on the remaining 80%.
# shuffle=False keeps the existing row order instead of sampling rows at random,
# which matters because the rows are ordered by date.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, shuffle=False)

# Fit a linear regression on the training rows.
model = LinearRegression().fit(x_train, y_train)

# Predict the target for every held-out test row.
predictions = model.predict(x_test)


In [0]:
import pandas as pd
# Reset the index on the test features and targets so they align positionally
# with `predictions` (which has no index) when the table is assembled.
x_test_reset = x_test.reset_index(drop=True)
y_test_reset = y_test.reset_index(drop=True)

# Extract the date values for the test-set rows from the original dataframe,
# then reset the index so the dates align 1-to-1 with x_test after splitting.
date_col = pdf.loc[x_test.index, 'date'].reset_index(drop=True)

# Build the results table.
# 'price' is the input feature (price on 'date'); 'actual_next_price' is the
# target the model was trained to predict (the following row's price), so it is
# the value 'predicted_price' must be compared against.
results = pd.DataFrame({
    'date': date_col,
    'price': x_test_reset['price'].round(2),
    'actual_next_price': y_test_reset.round(2),
    'predicted_price': predictions.round(2)
})

# Signed prediction error: predicted minus actual target.
results['difference'] = (results['predicted_price'] - results['actual_next_price']).round(2)
# Absolute prediction error as a percentage of the actual target.
results['% Error'] = (
    (results['actual_next_price'] - results['predicted_price']).abs()
    / results['actual_next_price'] * 100
).round(2)

# Show results.
display(results)


