<a href="https://colab.research.google.com/github/bonareri/Bitcoin-Prediction-Analysis/blob/main/data_collection_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install Required Libraries

In [None]:
!pip install yfinance



## Importing Libraries

In [None]:
# Data handling and visualization
import yfinance as yf
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Date handling
from datetime import datetime

In [None]:
# Get Bitcoin historical data
btc = yf.Ticker("BTC-USD")  # Bitcoin in USD
# NOTE: period="10y" is relative to the day this cell is executed, so the
# exact date range and row count shown in later outputs change between runs.
btc_hist = btc.history(period="10y")  # Fetch last 10 years of data

In [None]:
# Display first 5 rows. Ending the cell with the frame itself (instead of
# print) renders it as a single table rather than wrapped plain text, the
# same way the later preview cells in this notebook do.
btc_hist.head()

                                 Open        High         Low       Close  \
Date                                                                        
2015-02-06 00:00:00+00:00  216.923004  230.509995  216.231995  222.266006   
2015-02-07 00:00:00+00:00  222.632996  230.298996  222.606995  227.753998   
2015-02-08 00:00:00+00:00  227.692993  229.438004  221.076996  223.412003   
2015-02-09 00:00:00+00:00  223.389008  223.977005  217.018997  220.110001   
2015-02-10 00:00:00+00:00  220.281998  221.807007  215.332001  219.839005   

                             Volume  Dividends  Stock Splits  
Date                                                          
2015-02-06 00:00:00+00:00  24435300        0.0           0.0  
2015-02-07 00:00:00+00:00  21604200        0.0           0.0  
2015-02-08 00:00:00+00:00  17145200        0.0           0.0  
2015-02-09 00:00:00+00:00  27791300        0.0           0.0  
2015-02-10 00:00:00+00:00  21115100        0.0           0.0  


In [None]:
# Persist the raw download (including its Date index) so it can be reloaded
# from disk by the next cell.
btc_hist.to_csv(path_or_buf="bitcoin_prices_yfinance.csv", index=True)

In [None]:
# Read the CSV file with the Date column parsed as datetime and set as index
df = pd.read_csv("bitcoin_prices_yfinance.csv", index_col='Date', parse_dates=True)

# Display the first few rows to verify. The label is printed, and the frame
# is left as the cell's last expression so it renders as a rich table,
# consistent with the other preview cells in this notebook.
print("Initial Data:")
df.head()

Initial Data:
                                 Open        High         Low       Close  \
Date                                                                        
2015-02-06 00:00:00+00:00  216.923004  230.509995  216.231995  222.266006   
2015-02-07 00:00:00+00:00  222.632996  230.298996  222.606995  227.753998   
2015-02-08 00:00:00+00:00  227.692993  229.438004  221.076996  223.412003   
2015-02-09 00:00:00+00:00  223.389008  223.977005  217.018997  220.110001   
2015-02-10 00:00:00+00:00  220.281998  221.807007  215.332001  219.839005   

                             Volume  Dividends  Stock Splits  
Date                                                          
2015-02-06 00:00:00+00:00  24435300        0.0           0.0  
2015-02-07 00:00:00+00:00  21604200        0.0           0.0  
2015-02-08 00:00:00+00:00  17145200        0.0           0.0  
2015-02-09 00:00:00+00:00  27791300        0.0           0.0  
2015-02-10 00:00:00+00:00  21115100        0.0           0.0  


## Data Cleaning

In [None]:
# Inspect the index range, column dtypes and non-null counts of the loaded data
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3654 entries, 2015-02-06 00:00:00+00:00 to 2025-02-06 00:00:00+00:00
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Open          3654 non-null   float64
 1   High          3654 non-null   float64
 2   Low           3654 non-null   float64
 3   Close         3654 non-null   float64
 4   Volume        3654 non-null   int64  
 5   Dividends     3654 non-null   float64
 6   Stock Splits  3654 non-null   float64
dtypes: float64(6), int64(1)
memory usage: 228.4 KB


In [None]:
# Drop the 'Dividends' and 'Stock Splits' columns.
# The result is rebound to `df` (no inplace=True) and errors="ignore" is set
# so the cell is re-runnable: executing it a second time no longer raises
# KeyError because the columns are already gone.
df = df.drop(columns=["Dividends", "Stock Splits"], errors="ignore")

In [None]:
# Remove timezone info from the index, if present.
# tz_localize(None) drops the timezone while keeping the wall-clock
# timestamps; the loaded index is in UTC (+00:00), so the dates are unchanged.
df.index = df.index.tz_localize(None)

In [None]:
# Put the rows in chronological (oldest-first) order; the rolling features
# and the positional train/test split further down rely on this ordering.
df = df.sort_index(ascending=True)

In [None]:
# Ensure the index has datetime dtype. This does not change which column is
# the index (Date was already set as the index when the CSV was read); it is
# only a safeguard on the index's type.
df.index = pd.to_datetime(df.index)

In [None]:
# Preview the frame after dropping the unused columns and stripping the timezone
df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2015-02-06,216.923004,230.509995,216.231995,222.266006,24435300
2015-02-07,222.632996,230.298996,222.606995,227.753998,21604200
2015-02-08,227.692993,229.438004,221.076996,223.412003,17145200
2015-02-09,223.389008,223.977005,217.018997,220.110001,27791300
2015-02-10,220.281998,221.807007,215.332001,219.839005,21115100


In [None]:
# Confirm the remaining columns and that the index no longer carries a timezone
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3654 entries, 2015-02-06 to 2025-02-06
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Open    3654 non-null   float64
 1   High    3654 non-null   float64
 2   Low     3654 non-null   float64
 3   Close   3654 non-null   float64
 4   Volume  3654 non-null   int64  
dtypes: float64(4), int64(1)
memory usage: 171.3 KB


## Feature Engineering

- Daily Returns: Fractional change between consecutive closing prices (e.g. 0.02 means +2%).
- Moving Averages: 7-day and 30-day moving averages to smooth price fluctuations.

In [None]:
# Derive the engineered columns from the 'Close' price in a single step:
#   Daily_Return - change relative to the previous row's close (as a fraction)
#   MA7 / MA30   - trailing 7-row and 30-row rolling means
# The first rows of each column are NaN until enough history is available.
df = df.assign(
    Daily_Return=df["Close"].pct_change(),
    MA7=df["Close"].rolling(window=7).mean(),
    MA30=df["Close"].rolling(window=30).mean(),
)

In [None]:
# Preview the new feature columns; the leading rows are NaN because the
# return needs a previous row and the moving averages need a full window.
df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Daily_Return,MA7,MA30
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2015-02-06,216.923004,230.509995,216.231995,222.266006,24435300,,,
2015-02-07,222.632996,230.298996,222.606995,227.753998,21604200,0.024691,,
2015-02-08,227.692993,229.438004,221.076996,223.412003,17145200,-0.019064,,
2015-02-09,223.389008,223.977005,217.018997,220.110001,27791300,-0.01478,,
2015-02-10,220.281998,221.807007,215.332001,219.839005,21115100,-0.001231,,


In [None]:
# Count missing values per column (introduced by the return / rolling features)
df.isna().sum()

Unnamed: 0,0
Open,0
High,0
Low,0
Close,0
Volume,0
Daily_Return,1
MA7,6
MA30,29


In [None]:
# Keep only fully populated rows: any row with at least one missing value
# (the leading rows without enough rolling-window history) is removed.
# `df` itself is left untouched.
df_clean = df.dropna(axis=0, how="any")

Missing values per column:
Open             0
High             0
Low              0
Close            0
Volume           0
Daily_Return     1
MA7              6
MA30            29
dtype: int64
Shape before cleaning: (3654, 8)
Shape after dropping missing values: (3625, 8)


In [None]:
# Check missing values: every per-column count should now be zero
df_clean.isna().sum()

Unnamed: 0,0
Open,0
High,0
Low,0
Close,0
Volume,0
Daily_Return,0
MA7,0
MA30,0


## Train Test Split

In [None]:
# Number of rows in the training portion: the first 80% of the cleaned data
# (the remaining 20% becomes the test set). int() truncates toward zero.
train_size = int(0.8 * df_clean.shape[0])

In [None]:
# Positional, chronological split (no shuffling): the first `train_size`
# rows form the training set, everything after them forms the test set.
train_df_clean, test_df_clean = (
    df_clean.iloc[:train_size],
    df_clean.iloc[train_size:],
)

In [None]:
# Print information about the splits using the cleaned data.
# Because the split is positional on date-sorted rows, the test period should
# begin right after the training period ends, and the two row counts should
# add up to len(df_clean).
print("Training set period:", train_df_clean.index.min(), "to", train_df_clean.index.max())
print("Test set period:", test_df_clean.index.min(), "to", test_df_clean.index.max())
print("Training set shape:", train_df_clean.shape)
print("Test set shape:", test_df_clean.shape)

Training set period: 2015-03-07 00:00:00 to 2023-02-12 00:00:00
Test set period: 2023-02-13 00:00:00 to 2025-02-06 00:00:00
Training set shape: (2900, 8)
Test set shape: (725, 8)


## Save Preprocessed Data

In [None]:
# Save the cleaned (preprocessed) data to a new CSV file; the Date index is
# written as the first column.
df_clean.to_csv("bitcoin_prices_preprocessed.csv")

print("Preprocessed data saved as 'bitcoin_prices_preprocessed.csv'")
# End on the frame itself (not print) so it renders as one rich table,
# consistent with the other preview cells in this notebook.
df_clean.head()

Preprocessed data saved as 'bitcoin_prices_preprocessed.csv'
                  Open        High         Low       Close    Volume  \
Date                                                                   
2015-03-07  272.294006  277.854004  270.132996  276.260986  17825900   
2015-03-08  276.433014  277.858002  272.565002  274.354004  22067900   
2015-03-09  274.812012  292.700989  273.893005  289.606995  59178200   
2015-03-10  289.862000  300.044006  289.743011  291.760010  67770800   
2015-03-11  291.524994  297.390991  290.507996  296.378998  33963900   

            Daily_Return         MA7        MA30  
Date                                              
2015-03-07      0.012973  273.689715  244.523235  
2015-03-08     -0.006903  275.711430  246.259502  
2015-03-09      0.055596  277.702427  248.321268  
2015-03-10      0.007434  279.139287  250.599535  
2015-03-11      0.015831  282.465999  253.141835  


In [51]:
# Save the chronological train/test splits.
# The dates exist only in the index, so the index is written as well (as the
# 'Date' column), matching 'bitcoin_prices_preprocessed.csv'. Passing
# index=False here would silently discard every timestamp from both files.
train_df_clean.to_csv('train_data.csv')
test_df_clean.to_csv('test_data.csv')

In [53]:
# Last rows of the training split (shows where the training period ends)
train_df_clean.tail()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Daily_Return,MA7,MA30
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2023-02-08,23263.416016,23367.958984,22731.097656,22939.398438,25371367758,-0.013965,23167.500837,22059.346289
2023-02-09,22946.566406,22996.4375,21773.974609,21819.039062,32572572185,-0.04884,22931.381975,22205.104492
2023-02-10,21819.005859,21941.185547,21539.392578,21651.183594,27078406594,-0.007693,22674.505022,22328.980729
2023-02-11,21651.841797,21891.410156,21618.449219,21870.875,16356226232,0.010147,22465.794643,22429.023633
2023-02-12,21870.902344,22060.994141,21682.828125,21788.203125,17821046406,-0.00378,22299.01423,22491.644596


In [55]:
# Last rows of the test split (the most recent observations in the data)
test_df_clean.tail()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Daily_Return,MA7,MA30
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2025-02-02,100661.539062,101430.664062,96216.078125,97688.976562,63091816853,-0.029476,101801.226562,100290.303385
2025-02-03,97681.101562,102514.171875,91242.890625,101405.421875,115400897748,0.038044,101703.760045,100395.943229
2025-02-04,101398.71875,101745.617188,96208.109375,97871.820312,73002130211,-0.034846,101209.38058,100381.171875
2025-02-05,97878.007812,99113.210938,96174.828125,96615.445312,49125911241,-0.012837,100196.842634,100199.083854
2025-02-06,96547.320312,98406.382812,96547.320312,98223.695312,45894361088,0.016646,99266.612723,100242.45026
