<a href="https://colab.research.google.com/github/bonareri/Bitcoin-Prediction-Analysis/blob/main/Crypto_data_collection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Data collection

In [10]:
!pip install yfinance pandas



In [46]:
import yfinance as yf
import pandas as pd
import requests

In [49]:
# Define the coins you want to compare
coins = {
    "bitcoin": "BTC-USD",
    "ethereum": "ETH-USD",
    "binancecoin": "BNB-USD",
    "solana": "SOL-USD",
    "dogecoin": "DOGE-USD",
}

# Function to get circulating supply from CoinGecko
def get_circulating_supply(coin_id):
    url = f"https://api.coingecko.com/api/v3/coins/{coin_id}"
    response = requests.get(url)
    data = response.json()
    return data["market_data"]["circulating_supply"]

# Create an empty DataFrame to store all coin data
all_data = []

for coin, ticker in coins.items():
    print(f"Fetching data for {coin}...")

    # Get historical price data from Yahoo Finance
    crypto = yf.Ticker(ticker)
    df = crypto.history(period="max")

    # Reset index to make Date a column
    df.reset_index(inplace=True)

    # Get circulating supply from CoinGecko
    circulating_supply = get_circulating_supply(coin)

    # Calculate Market Cap
    df["Market Cap"] = df["Close"] * circulating_supply

    # Add a column for the coin name
    df["Coin"] = coin.capitalize()

    # Append to list
    all_data.append(df)

# Combine all data into one DataFrame
final_df = pd.concat(all_data, ignore_index=True)

# Save to CSV for Tableau
final_df.to_csv("crypto_comparison.csv", index=False)

print("Crypto comparison data saved to crypto_comparison.csv")

Fetching data for bitcoin...
Fetching data for ethereum...
Fetching data for binancecoin...
Fetching data for solana...
Fetching data for dogecoin...
Crypto comparison data saved to crypto_comparison.csv


In [61]:
#load data
df = pd.read_csv('/content/crypto_comparison.csv')

In [63]:
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,Market Cap,Coin
0,2014-09-17 00:00:00+00:00,465.864014,468.174011,452.421997,457.334015,21056800,0.0,0.0,9064983000.0,Bitcoin
1,2014-09-18 00:00:00+00:00,456.859985,456.859985,413.104004,424.440002,34483200,0.0,0.0,8412979000.0,Bitcoin
2,2014-09-19 00:00:00+00:00,424.102997,427.834991,384.532013,394.79599,37919700,0.0,0.0,7825394000.0,Bitcoin
3,2014-09-20 00:00:00+00:00,394.673004,423.29599,389.882996,408.903992,36863600,0.0,0.0,8105034000.0,Bitcoin
4,2014-09-21 00:00:00+00:00,408.084991,412.425995,393.181,398.821014,26580100,0.0,0.0,7905176000.0,Bitcoin


In [64]:
df.tail()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,Market Cap,Coin
13506,2025-02-04 00:00:00+00:00,0.285016,0.288467,0.256292,0.264144,4306969827,0.0,0.0,39073020000.0,Dogecoin
13507,2025-02-05 00:00:00+00:00,0.264128,0.269028,0.254372,0.256233,1818131454,0.0,0.0,37902800000.0,Dogecoin
13508,2025-02-06 00:00:00+00:00,0.256233,0.265695,0.243942,0.248084,1892595490,0.0,0.0,36697370000.0,Dogecoin
13509,2025-02-07 00:00:00+00:00,0.248079,0.261217,0.239544,0.246613,2042225149,0.0,0.0,36479780000.0,Dogecoin
13510,2025-02-08 00:00:00+00:00,0.246688,0.249611,0.24525,0.247758,1901072768,0.0,0.0,36649090000.0,Dogecoin


In [66]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13511 entries, 0 to 13510
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Date          13511 non-null  object 
 1   Open          13511 non-null  float64
 2   High          13511 non-null  float64
 3   Low           13511 non-null  float64
 4   Close         13511 non-null  float64
 5   Volume        13511 non-null  int64  
 6   Dividends     13511 non-null  float64
 7   Stock Splits  13511 non-null  float64
 8   Market Cap    13511 non-null  float64
 9   Coin          13511 non-null  object 
dtypes: float64(7), int64(1), object(2)
memory usage: 1.0+ MB


## Data cleaning

### Date column

In [67]:
# Convert the 'Date' column to datetime with UTC awareness.
# The parameter errors='raise' will throw an error if any value cannot be converted.
df['Date'] = pd.to_datetime(df['Date'], utc=True, errors='raise')

# Check the dtype to verify conversion
print("After conversion, Date dtype:", df['Date'].dtype)

After conversion, Date dtype: datetime64[ns, UTC]


In [68]:
# Remove timezone information, making the datetime values timezone-naive.
df['Date'] = df['Date'].dt.tz_localize(None)

In [69]:
# having the Date as the index allows for easy slicing, filtering, and applying time-based functions.
df.set_index('Date', inplace=True)

In [73]:
#Sorting the data chronologically is essential to preserve the temporal order
df.sort_index(inplace=True)

In [70]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 13511 entries, 2014-09-17 to 2025-02-08
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Open          13511 non-null  float64
 1   High          13511 non-null  float64
 2   Low           13511 non-null  float64
 3   Close         13511 non-null  float64
 4   Volume        13511 non-null  int64  
 5   Dividends     13511 non-null  float64
 6   Stock Splits  13511 non-null  float64
 7   Market Cap    13511 non-null  float64
 8   Coin          13511 non-null  object 
dtypes: float64(7), int64(1), object(1)
memory usage: 1.0+ MB


### Remove Unnecessary Columns

In [71]:
# Check if 'Dividends' and 'Stock Splits' columns contain only zeros, and drop them if so.
if (df['Dividends'] == 0).all():
    df.drop(columns=['Dividends'], inplace=True)
if (df['Stock Splits'] == 0).all():
    df.drop(columns=['Stock Splits'], inplace=True)

print("\nStep 5: Removed unnecessary columns (Dividends and Stock Splits) if they contained only zeros")
print(df.columns)


Step 5: Removed unnecessary columns (Dividends and Stock Splits) if they contained only zeros
Index(['Open', 'High', 'Low', 'Close', 'Volume', 'Market Cap', 'Coin'], dtype='object')


In [72]:
df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Market Cap,Coin
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2014-09-17,465.864014,468.174011,452.421997,457.334015,21056800,9064983000.0,Bitcoin
2014-09-18,456.859985,456.859985,413.104004,424.440002,34483200,8412979000.0,Bitcoin
2014-09-19,424.102997,427.834991,384.532013,394.79599,37919700,7825394000.0,Bitcoin
2014-09-20,394.673004,423.29599,389.882996,408.903992,36863600,8105034000.0,Bitcoin
2014-09-21,408.084991,412.425995,393.181,398.821014,26580100,7905176000.0,Bitcoin


## Feature Engineering

### Date/Time Decomposition

In [74]:
df['Year']    = df.index.year
df['Month']   = df.index.month
df['Day']     = df.index.day
df['Quarter'] = df.index.quarter
df['Weekday'] = df.index.dayofweek

- **Extracting Temporal Components:** Breaking the date into components (year, month, day, quarter, weekday) to analyze seasonal trends and patterns.
For instance, certain months or days of the week tend to have higher volatility or trading volume.
- **Facilitating Aggregation:** These components make it easier to aggregate or segment the data for further analysis (e.g., average returns per quarter).

### Compute Price Returns

In [75]:
df['Daily_Return'] = df['Close'].pct_change()
df['Log_Return'] = np.log(df['Close'] / df['Close'].shift(1))

- **Daily Returns:** Calculating the percentage change from one day to the next normalizes the data and provides a direct measure of performance on a day-to-day basis.
- **Log Returns:** Logarithmic returns are additive over time and often used in finance because they handle compounding effects better and can make statistical properties (like normality) more apparent.
- **Risk and Performance Analysis:** Both return calculations are crucial for understanding the performance, risk, and volatility of the asset over time.

### Calculate Moving Averages


In [76]:
# Calculate the 7-day and 30-day EMA for the Close price
df['EMA_7']  = df['Close'].ewm(span=7, adjust=False).mean()
df['EMA_30'] = df['Close'].ewm(span=30, adjust=False).mean()

- Smoothing Short-Term Fluctuations: Moving averages help smooth out the “noise” in the data, revealing underlying trends by averaging out short-term fluctuations.
- Trend Identification: Short-term (7-day) and long-term (30-day) moving averages can indicate momentum and potential turning points in the price series.
- Signal Generation: In many trading strategies, crossovers between different moving averages are used as buy or sell signals.

In [77]:
df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Market Cap,Coin,Year,Month,Day,Quarter,Weekday,Daily_Return,Log_Return,EMA_7,EMA_30
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2014-09-17,465.864014,468.174011,452.421997,457.334015,21056800,9064983000.0,Bitcoin,2014,9,17,3,2,,,457.334015,457.334015
2014-09-18,456.859985,456.859985,413.104004,424.440002,34483200,8412979000.0,Bitcoin,2014,9,18,3,3,-0.071926,-0.074643,449.110512,455.211821
2014-09-19,424.102997,427.834991,384.532013,394.79599,37919700,7825394000.0,Bitcoin,2014,9,19,3,4,-0.069843,-0.072402,435.531881,451.314025
2014-09-20,394.673004,423.29599,389.882996,408.903992,36863600,8105034000.0,Bitcoin,2014,9,20,3,5,0.035735,0.035111,428.874909,448.577894
2014-09-21,408.084991,412.425995,393.181,398.821014,26580100,7905176000.0,Bitcoin,2014,9,21,3,6,-0.024659,-0.024968,421.361435,445.367773


In [78]:
# check missing values
df.isnull().sum()

Unnamed: 0,0
Open,0
High,0
Low,0
Close,0
Volume,0
Market Cap,0
Coin,0
Year,0
Month,0
Day,0


### missing values only occur at the very beginning

In [79]:
# Drop all rows with any missing values
df.dropna(inplace=True)

print("Missing values dropped. DataFrame shape:", df.shape)

Missing values dropped. DataFrame shape: (13510, 16)


In [80]:
#check missing values
df.isnull().sum()

Unnamed: 0,0
Open,0
High,0
Low,0
Close,0
Volume,0
Market Cap,0
Coin,0
Year,0
Month,0
Day,0


## Save cleaned Data

In [81]:
# Save the engineered DataFrame to a CSV file
output_filename = 'crypto_analysis_engineered.csv'
df.to_csv(output_filename)
print(f"Data saved as '{output_filename}'")

Data saved as 'crypto_analysis_engineered.csv'
