In [1]:
# Load the data

from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv('/content/drive/Shareddrives/CMPE 257 - BitSmart/BTC-USD5years.csv')
df['Date'] = pd.to_datetime(df['Date'])
# Dropping NaN values
df.dropna(inplace=True)

# Add Moving Average for better result
df['MA7_Close'] = df['Close'].rolling(window=7).mean().fillna(method='bfill')

print(df.columns)
df.head()


Mounted at /content/drive
Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume',
       'MA7_Close'],
      dtype='object')


Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,MA7_Close
0,2019-05-09,5982.316406,6183.039063,5982.316406,6174.528809,6174.528809,16784645411,7249.288574
1,2019-05-10,6175.822754,6434.617676,6161.519043,6378.849121,6378.849121,19419875368,7249.288574
2,2019-05-11,6379.666992,7333.00293,6375.69873,7204.771484,7204.771484,28867562329,7249.288574
3,2019-05-12,7203.507324,7503.87207,6815.770996,6972.371582,6972.371582,27773333680,7249.288574
4,2019-05-13,6971.178223,8047.413086,6898.282227,7814.915039,7814.915039,28677672181,7249.288574


In [2]:
# Sort DataFrame by date
df = df.sort_values(by='Date')
df.set_index('Date', inplace=True)

# Reindex the DataFrame to fill in any missing dates
all_dates = pd.date_range(start=df.index.min(), end=df.index.max(), freq='D')
df = df.reindex(all_dates, method='ffill')

# Reset index if necessary
df.reset_index(inplace=True)
df.rename(columns={'index': 'Date'}, inplace=True)

df.tail()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,MA7_Close
1821,2024-05-03,59122.300781,63320.503906,58848.3125,62889.835938,62889.835938,33172023048,61611.089844
1822,2024-05-04,62891.03125,64494.957031,62599.351563,63891.472656,63891.472656,20620477992,61678.565848
1823,2024-05-05,63892.453125,64610.890625,62955.304688,64031.132813,64031.132813,18296164805,61809.694755
1824,2024-05-06,64038.3125,65494.902344,62746.238281,63161.949219,63161.949219,28697928697,61712.670201
1825,2024-05-07,63162.761719,64390.457031,62285.980469,62334.816406,62334.816406,25930730982,61955.236049


In [5]:
# Further preprocess the dateset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split the dataset into training and testing set

features = df[['Open', 'High', 'Low', 'Volume', 'MA7_Close']]
target = df['Close']

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=88)

# Scaling features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [6]:
# Function to evaluate multiple models
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

def evaluate_models(models, X, y, num_folds=5):
    results = {}
    for name, model in models.items():
        cv_scores = cross_val_score(model(), X, y, cv=num_folds, scoring='neg_mean_squared_error')
        cv_scores_mean = -cv_scores.mean()
        results[name] = cv_scores_mean
        print(f"{name} - MSE scores: {-cv_scores}, Mean MSE: {cv_scores_mean}")
    return results

In [8]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

def linear_regression_factory():
  return LinearRegression()

def ridge_factory():
  return Ridge()

def lasso_factory():
  return Lasso(alpha=0.1, max_iter=10000, tol=0.01)

def random_forest_regressor_factory():
  return RandomForestRegressor(random_state=42)

def gradient_boosting_regressor_factory():
  return GradientBoostingRegressor(random_state=42)

# Models to evaluate
models = {
    "Linear Regression": linear_regression_factory,
    "Ridge Regression": ridge_factory,
    "Lasso Regression": lasso_factory,
    "Random Forest": random_forest_regressor_factory,
    "Gradient Boosting": gradient_boosting_regressor_factory
}

# Evaluate and find the best model using cross-validation
model_results = evaluate_models(models, X_train_scaled, y_train)
best_model = min(model_results, key=model_results.get)
selected_model = models[best_model]
print(f"Best model: {best_model} with Mean MSE: {model_results[best_model]}")

Linear Regression - MSE scores: [229295.23468077 207965.20790074 199017.74908475 213469.01116665
 216706.97098074], Mean MSE: 213290.83476272976
Ridge Regression - MSE scores: [380131.60746328 243527.61963687 250102.29722313 363470.27384439
 301456.00771946], Mean MSE: 307737.56117742683
Lasso Regression - MSE scores: [1790857.02849859 1162293.63334306  921705.18760126 1788771.80562607
 1239493.82833697], Mean MSE: 1380624.2966811894
Random Forest - MSE scores: [575221.12076059 333099.15190833 381093.27218989 482519.93402495
 375366.56013306], Mean MSE: 429460.00780336343
Gradient Boosting - MSE scores: [584294.48774602 412499.18694207 380430.10345624 571943.52515337
 448424.59431816], Mean MSE: 479518.3795231728
Best model: Linear Regression with Mean MSE: 213290.83476272976


In [9]:
# Prediction of the next 7 days
# Creating lag features for 7-day predictions
for shift in range(1, 8):
    df[f'lag_close_{shift}'] = df['Close'].shift(shift)

# Dropping rows with NaN values resulting from lag creation
df.dropna(inplace=True)

# Ensure the 'Date' column is in datetime format
df['Date'] = pd.to_datetime(df['Date'])

# Calculate future target values
df['future_high'] = df['High'].rolling(window=7).max().shift(-7)  # Highest price in the next 7 days
df['future_low'] = df['Low'].rolling(window=7).min().shift(-7)    # Lowest price in the next 7 days
df['future_avg'] = df['Close'].rolling(window=7).mean().shift(-7) # Average closing price in the next 7 days
df['future_open'] = df['Open'].shift(-7)  # Open price of the 7th day from today
df['future_close'] = df['Close'].shift(-7)  # Close price of the 7th day from today

# Replace NaNs in future target values with the mean of their respective columns
df['future_high'].fillna(df['future_high'].mean(), inplace=True)
df['future_low'].fillna(df['future_low'].mean(), inplace=True)
df['future_avg'].fillna(df['future_avg'].mean(), inplace=True)
df['future_open'].fillna(df['future_open'].mean(), inplace=True)
df['future_close'].fillna(df['future_close'].mean(), inplace=True)

# Define features and targets
features = df[[f'lag_close_{shift}' for shift in range(1, 8)] + ['Volume']]
target_high = df['future_high']
target_low = df['future_low']
target_avg = df['future_avg']
target_open = df['future_open']
target_close = df['future_close']

# Splitting data into training and test sets
X_train_high, X_test_high, y_train_high, y_test_high = train_test_split(features, target_high, test_size=0.3, random_state=88)
X_train_low, X_test_low, y_train_low, y_test_low = train_test_split(features, target_low, test_size=0.3, random_state=88)
X_train_avg, X_test_avg, y_train_avg, y_test_avg = train_test_split(features, target_avg, test_size=0.3, random_state=88)
X_train_open, X_test_open, y_train_open, y_test_open = train_test_split(features, target_open, test_size=0.3, random_state=88)
X_train_close, X_test_close, y_train_close, y_test_close = train_test_split(features, target_close, test_size=0.3, random_state=88)

# Train models
model_high = selected_model().fit(X_train_high, y_train_high)
model_low = selected_model().fit(X_train_low, y_train_low)
model_avg = selected_model().fit(X_train_avg, y_train_avg)
model_open = selected_model().fit(X_train_open, y_train_open)
model_close = selected_model().fit(X_train_close, y_train_close)

In [55]:
df.tail()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,MA7_Close,lag_close_1,lag_close_2,lag_close_3,lag_close_4,lag_close_5,lag_close_6,lag_close_7,future_high,future_low,future_avg
1821,2024-05-03,59122.300781,63320.503906,58848.3125,62889.835938,62889.835938,33172023048,61611.089844,59123.433594,58254.011719,60636.855469,63841.121094,63113.230469,63419.140625,63755.320313,30160.093261,26371.718016,28355.606125
1822,2024-05-04,62891.03125,64494.957031,62599.351563,63891.472656,63891.472656,20620477992,61678.565848,62889.835938,59123.433594,58254.011719,60636.855469,63841.121094,63113.230469,63419.140625,30160.093261,26371.718016,28355.606125
1823,2024-05-05,63892.453125,64610.890625,62955.304688,64031.132813,64031.132813,18296164805,61809.694755,63891.472656,62889.835938,59123.433594,58254.011719,60636.855469,63841.121094,63113.230469,30160.093261,26371.718016,28355.606125
1824,2024-05-06,64038.3125,65494.902344,62746.238281,63161.949219,63161.949219,28697928697,61712.670201,64031.132813,63891.472656,62889.835938,59123.433594,58254.011719,60636.855469,63841.121094,30160.093261,26371.718016,28355.606125
1825,2024-05-07,63162.761719,64390.457031,62285.980469,62334.816406,62334.816406,25930730982,61955.236049,63161.949219,64031.132813,63891.472656,62889.835938,59123.433594,58254.011719,60636.855469,30160.093261,26371.718016,28355.606125


In [12]:
# Function for Date Selection and Prediction
def predict_for_selected_date(selected_date, data, model_high, model_low, model_avg):
    selected_date = pd.to_datetime(selected_date)
    if selected_date not in data['Date'].values:
        raise Exception("Selected date is not available in the dataset.")

    # Extract the record for the selected date
    selected_record = data[data['Date'] == selected_date]
    if selected_record.empty:
        raise Exception("No data available for the selected date.")


    # Prepare the feature vector
    feature_names = ['lag_close_1', 'lag_close_2', 'lag_close_3', 'lag_close_4',
                     'lag_close_5', 'lag_close_6', 'lag_close_7', 'Volume']
    feature_vector = selected_record[feature_names].values.reshape(1, -1)

    feature_vector_df = pd.DataFrame(feature_vector, columns=feature_names)

    # Make predictions
    predicted_high = model_high.predict(feature_vector_df)[0]
    predicted_low = model_low.predict(feature_vector_df)[0]
    predicted_avg = model_avg.predict(feature_vector_df)[0]


    return predicted_high, predicted_low, predicted_avg

In [14]:
# Example date to test
test_date = '2024-02-23'

# Call the prediction function
try:
    predictions = predict_for_selected_date(test_date, df, model_high, model_low, model_avg)
    predicted_high, predicted_low, predicted_avg = predictions
    print(f"Predictions for {test_date}:")
    print(f"Highest Price: {predicted_high}, Lowest Price: {predicted_low}, Average Closing Price: {predicted_avg}")
except Exception as e:
    print(e)

Predictions for 2024-02-23:
Highest Price: 53631.70962358406, Lowest Price: 47267.73615569122, Average Closing Price: 50596.064654148286


In [16]:
def predict_next_7_days(selected_date, data, model_open, model_high, model_low, model_close):
    selected_date = pd.to_datetime(selected_date)
    prediction_results = []

    # Check if the selected date is available in the dataset
    if selected_date not in data['Date'].values:
        raise ValueError("Selected date is not available in the dataset.")

    # Ensure that there are at least 7 days of data available after the selected date for prediction
    max_date_in_data = data['Date'].max()
    if selected_date + pd.Timedelta(days=7) > max_date_in_data:
        raise ValueError("Not enough data available to predict for the next 7 days.")

    for i in range(7):
        future_date = selected_date + pd.Timedelta(days=i)
        if future_date > max_date_in_data:
            break  # Stop if we run out of actual data to base predictions on

        # Prepare feature vector for this future date
        record = data[data['Date'] == future_date]
        if record.empty:
            continue  # Skip if no data available for this future date

        feature_vector = record[['lag_close_1', 'lag_close_2', 'lag_close_3', 'lag_close_4', 'lag_close_5', 'lag_close_6', 'lag_close_7', 'Volume']].values.reshape(1, -1)
        feature_vector_df = pd.DataFrame(feature_vector, columns=['lag_close_1', 'lag_close_2', 'lag_close_3', 'lag_close_4', 'lag_close_5', 'lag_close_6', 'lag_close_7', 'Volume'])

        # Predict using the models
        predicted_open = model_open.predict(feature_vector_df)[0]
        predicted_high = model_high.predict(feature_vector_df)[0]
        predicted_low = model_low.predict(feature_vector_df)[0]
        predicted_close = model_close.predict(feature_vector_df)[0]

        # Append the predictions for this date to the results list
        prediction_results.append({
            'Date': future_date.strftime('%Y-%m-%d'),
            'Open': predicted_open,
            'High': predicted_high,
            'Low': predicted_low,
            'Close': predicted_close
        })

    return prediction_results

In [17]:
# Example usage
selected_date = '2024-04-21'
try:
    predictions = predict_next_7_days(selected_date, df, model_open, model_high, model_low, model_close)
    for day_prediction in predictions:
        print(day_prediction)
except Exception as e:
    print(e)

{'Date': '2024-04-21', 'Open': 63180.01992203719, 'High': 67341.39676740873, 'Low': 59111.97311031085, 'Close': 63202.09671518376}
{'Date': '2024-04-22', 'Open': 63817.640081534424, 'High': 67770.3804635609, 'Low': 59634.19173110796, 'Close': 63947.391486755994}
{'Date': '2024-04-23', 'Open': 65572.12210731528, 'High': 69706.34473089722, 'Low': 61180.01286674071, 'Close': 65472.93668853245}
{'Date': '2024-04-24', 'Open': 65810.72402821411, 'High': 69971.69497633025, 'Low': 61457.90399666331, 'Close': 65891.60263960148}
{'Date': '2024-04-25', 'Open': 63432.74157646354, 'High': 67603.49327340742, 'Low': 58931.53804830728, 'Close': 63478.55874526154}
{'Date': '2024-04-26', 'Open': 64039.19678678134, 'High': 67598.58193585438, 'Low': 59627.996484545, 'Close': 63853.85283075537}
{'Date': '2024-04-27', 'Open': 63208.67684687524, 'High': 66774.82693543959, 'Low': 58880.18091589218, 'Close': 63124.7834325715}


In [18]:
# The Swing Trading Strategy Function Normal
def swing_trading_strategy(selected_date, data, model_high, model_low, model_avg):
    selected_date = pd.to_datetime(selected_date)
    last_possible_date = data['Date'].max()

    # Calculate how many days of data are available after the selected date
    available_days = (last_possible_date - selected_date).days + 1

    if available_days < 1:
        return "Selected date is not available or no data available after the selected date."

    # Determine the number of days to predict based on available data
    prediction_days = min(available_days, 7)

    # Initialize trading state
    initial_open_price = data.loc[data['Date'] == selected_date, 'Open'].values[0]
    bitcoins = 100000 / initial_open_price
    cash = 0

    sell_executed = False
    load_executed = False
    sell_day = None
    load_day = None

    # Loop through available days to make predictions and decide on actions
    for i in range(prediction_days):
        prediction_date = selected_date + pd.Timedelta(days=i)
        predicted_high, predicted_low, predicted_avg = predict_for_selected_date(prediction_date, data, model_high, model_low, model_avg)

        # Decide when to sell: if predicted high price is 5% higher than the open price
        if not sell_executed and predicted_high > initial_open_price * 1.05:
            sell_executed = True
            sell_day = prediction_date
            cash = bitcoins * predicted_high
            bitcoins = 0

        # Decide when to load: after selling, if the predicted low price is 5% lower
        if sell_executed and not load_executed and predicted_low < predicted_high * 0.95:
            if prediction_date != sell_day:  # Ensure not loading on the sell day
                load_executed = True
                load_day = prediction_date
                bitcoins = cash / predicted_low
                cash = 0

    # Final valuation at the end of the prediction period or the last available day
    final_day = selected_date + pd.Timedelta(days=prediction_days-1)
    final_close_price = data.loc[data['Date'] == final_day, 'Close'].values[0]
    final_cash = cash if cash > 0 else bitcoins * final_close_price

    return {
        "sell_day": sell_day,
        "load_day": load_day,
        "final_cash": final_cash,
        "final_bitcoins": bitcoins if cash == 0 else 0,
        "final_value": final_cash
    }


result = swing_trading_strategy(test_date, df, model_high, model_low, model_avg)
print(result)

{'sell_day': Timestamp('2024-02-26 00:00:00'), 'load_day': Timestamp('2024-02-27 00:00:00'), 'final_cash': 131430.52796334177, 'final_bitcoins': 2.1476143963631467, 'final_value': 131430.52796334177}


In [19]:
# The Swing Trading Strategy Function Greedy
def swing_trading_strategy(selected_date, data, model_high, model_low, model_avg):
    selected_date = pd.to_datetime(selected_date)
    last_possible_date = data['Date'].max()

    # Calculate how many days of data are available after the selected date
    available_days = (last_possible_date - selected_date).days + 1

    if available_days < 1:
        return "Selected date is not available or no data available after the selected date."

    # Determine the number of days to predict based on available data
    prediction_days = min(available_days, 7)

    # Initialize trading state
    initial_open_price = data.loc[data['Date'] == selected_date, 'Open'].values[0]
    bitcoins = 100000 / initial_open_price
    cash = 0

    sell_executed = False
    load_executed = False
    sell_day = None
    load_day = None

    # Loop through available days to make predictions and decide on actions
    for i in range(prediction_days):
        prediction_date = selected_date + pd.Timedelta(days=i)
        predicted_high, predicted_low, predicted_avg = predict_for_selected_date(prediction_date, data, model_high, model_low, model_avg)
        print(f'{prediction_date}, {predicted_high}, {predicted_low}, {predicted_avg}')

        # Decide when to sell: if predicted high price is 15% higher than the open price
        if not sell_executed and predicted_high > initial_open_price * 1.15:
            sell_executed = True
            sell_day = prediction_date
            cash = bitcoins * predicted_high
            bitcoins = 0

        # Decide when to load: after selling, if the predicted low price is 15% lower
        if sell_executed and not load_executed and predicted_low < predicted_high * 0.85:
            if prediction_date != sell_day:  # Ensure not loading on the sell day
                load_executed = True
                load_day = prediction_date
                bitcoins = cash / predicted_low
                cash = 0

    # Final valuation at the end of the prediction period or the last available day
    final_day = selected_date + pd.Timedelta(days=prediction_days-1)
    final_close_price = data.loc[data['Date'] == final_day, 'Close'].values[0]
    final_cash = cash if cash > 0 else bitcoins * final_close_price

    return {
        "sell_day": sell_day,
        "load_day": load_day,
        "final_cash": final_cash,
        "final_bitcoins": bitcoins if cash == 0 else 0,
        "final_value": final_cash
    }


result = swing_trading_strategy(test_date, df, model_high, model_low, model_avg)
print(result)

2024-02-23 00:00:00, 53631.70962358406, 47267.73615569122, 50596.064654148286
2024-02-24 00:00:00, 52956.485917732614, 47121.04942095118, 50182.48514630771
2024-02-25 00:00:00, 53741.90102900483, 47732.20610211675, 50867.03194203708
2024-02-26 00:00:00, 54412.868211569265, 47472.8975979284, 51159.047964199155
2024-02-27 00:00:00, 57455.68683400501, 49404.23909939236, 53696.74005662734
2024-02-28 00:00:00, 61074.68654699964, 51077.756362633896, 56475.56443824612
2024-02-29 00:00:00, 66256.39278295418, 56497.36352608861, 61604.56874336205
{'sell_day': Timestamp('2024-02-28 00:00:00'), 'load_day': None, 'final_cash': 119091.33100991041, 'final_bitcoins': 0, 'final_value': 119091.33100991041}
