In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file in it.
for root_dir, _, names_in_dir in os.walk('/kaggle/input'):
    full_paths = [os.path.join(root_dir, name) for name in names_in_dir]
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Data Collection & Preprocessing**

In [None]:
import pandas as pd

# Load the daily Ethereum price data.
# The dataset also ships 15m, 1h and 4h candle files next to this one. They are
# deliberately NOT read here: loading them one after another into the same
# variable only keeps the last file and wastes time parsing the others.
file_path = "/kaggle/input/ethereum-price-data-binance-api-2017now/eth_1d_data_2017_to_2025.csv"
eth_data = pd.read_csv(file_path)

# Preview the data
print(eth_data.head())
print(eth_data.columns)


**Clean & Prepare**

In [None]:
import pandas as pd

# Read the daily candle file a single time.
file_path = "/kaggle/input/ethereum-price-data-binance-api-2017now/eth_1d_data_2017_to_2025.csv"
eth_data = pd.read_csv(file_path)

# Column headers: trim surrounding whitespace and lowercase them so lookups are predictable.
eth_data.columns = eth_data.columns.str.strip().str.lower()
print("Normalized column names:", eth_data.columns.tolist())

# The timestamp column is mandatory; fail loudly if the CSV layout is different.
if 'open time' not in eth_data.columns:
    raise KeyError("Column 'open time' not found. Check the CSV column names.")
eth_data['open time'] = pd.to_datetime(eth_data['open time'])

# Use the timestamp as a chronologically ordered index.
eth_data = eth_data.set_index('open time').sort_index()

# The closing price is mandatory as well.
if 'close' not in eth_data.columns:
    raise KeyError("Column 'close' not found. Check the CSV column names.")

# Unparseable prices become NaN and their rows are dropped; remaining gaps in
# the other columns are forward-filled.
eth_data['close'] = pd.to_numeric(eth_data['close'], errors='coerce')
eth_data = eth_data.dropna(subset=['close']).ffill()

# Report what survived the cleaning.
print(f"\n✅ Date Range: {eth_data.index.min()} to {eth_data.index.max()}")
print("\n📈 Data Preview:")
print(eth_data[['close']].head())







**Normalize and Clean**


In [None]:
import pandas as pd

# Step 1: Load the dataset.
# The daily file is the one read here: the rest of the notebook (30-row rolling
# windows labelled "30-Day", asfreq('D'), 30-day forecasts) assumes one row per
# day. The original cell defined this path but then loaded the 15-minute file.
file_path = "/kaggle/input/ethereum-price-data-binance-api-2017now/eth_1d_data_2017_to_2025.csv"
eth_data = pd.read_csv(file_path)

# Step 2: Normalize column names by stripping extra whitespace and lowering case
eth_data.columns = eth_data.columns.str.strip().str.lower()

# Print column names to confirm
print("Normalized column names:", eth_data.columns.tolist())

# Fail with a clear message if the CSV layout is not the expected one.
for required_column in ('open time', 'close'):
    if required_column not in eth_data.columns:
        raise KeyError(f"Column '{required_column}' not found. Check the CSV column names.")

# Step 3: Convert datetime
eth_data['open time'] = pd.to_datetime(eth_data['open time'])

# Step 4: Set datetime as index and sort chronologically
eth_data = eth_data.set_index('open time').sort_index()

# Step 5: Ensure 'close' is numeric (bad values become NaN and are dropped),
# then forward-fill remaining gaps in the other columns.
eth_data['close'] = pd.to_numeric(eth_data['close'], errors='coerce')
eth_data = eth_data.dropna(subset=['close']).ffill()

# Preview
print("\nCleaned Data Preview:")
print(eth_data[['close']].head())



**Exploratory Data Analysis (EDA)**

In [None]:
import matplotlib.pyplot as plt

# Series reused by the figures below.
close_prices = eth_data['close']
window_30 = close_prices.rolling(30)

# Figure 1: closing price over time.
plt.figure(figsize=(14, 6))
plt.plot(close_prices, color='blue', label='ETH Closing Price')
plt.title('Ethereum (ETH) Daily Closing Price')
plt.xlabel('Date')
plt.ylabel('Price (USDT)')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

# Figure 2: 30-row rolling mean and standard deviation on shared axes.
rolling_mean = window_30.mean()
rolling_std = window_30.std()
rolling_mean.plot(label='30-Day MA', figsize=(14, 6))
rolling_std.plot(label='30-Day STD')
plt.title("Rolling Mean and Std Dev")
plt.legend()
plt.grid()
plt.show()

# Figure 3: traded volume over time.
volume = eth_data['volume']
volume.plot(figsize=(14, 4), title="Ethereum Daily Trading Volume", color='orange')
plt.grid()
plt.show()

# Descriptive statistics for price and volume.
summary_stats = eth_data[['close', 'volume']].describe()
print(summary_stats)


**Stationarity Testing**

In [None]:
from statsmodels.tsa.stattools import adfuller

# Augmented Dickey-Fuller test on the raw closing price.
adf_result = adfuller(eth_data['close'])

# The fields reported below are picked out of the result tuple by position.
adf_statistic = adf_result[0]
adf_p_value = adf_result[1]
adf_critical_values = adf_result[4]

print("\nADF Test Results:")
print("ADF Statistic:", adf_statistic)
print("p-value:", adf_p_value)
print("Critical Values:", adf_critical_values)

# Reading the result: a p-value above 0.05 means the series should be differenced.


**Apply Differencing if Needed**

In [None]:
# First difference of the closing price (day-over-day change).
eth_data['close_diff'] = eth_data['close'].diff()

# diff() leaves NaN in the first row. Drop only rows where the differenced
# value is missing: a bare dropna() would also discard every row that has a NaN
# in ANY other column, silently shrinking the price history.
eth_data = eth_data.dropna(subset=['close_diff'])

# Re-run ADF on differenced series
adf_diff = adfuller(eth_data['close_diff'])
print("\nDifferenced ADF Test:")
print("ADF Statistic:", adf_diff[0])
print("p-value:", adf_diff[1])


**ARIMA Model Development**

In [None]:
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# Autocorrelation and partial autocorrelation of the differenced series,
# each drawn on its own figure with 40 lags.
differenced = eth_data['close_diff']
for draw_correlogram in (plot_acf, plot_pacf):
    draw_correlogram(differenced, lags=40)
plt.show()


**Train ARIMA**

In [None]:
from statsmodels.tsa.arima.model import ARIMA

# Give the datetime index an explicit daily frequency BEFORE fitting, so the
# model is built on an index with a known frequency. The original cell did this
# only after the fit (and three times over), which is too late to affect the model.
# 'D' is set explicitly instead of going through pd.infer_freq(), which returns
# None when the index is irregular and is then not a usable frequency.
# asfreq inserts NaN rows for any calendar day missing from the data.
# NOTE(review): these NaN rows are passed to ARIMA as-is on the assumption that
# statsmodels treats them as missing observations — confirm.
eth_data = eth_data.asfreq('D')  # 'D' = Daily, 'H' = Hourly, '15min' = 15 minutes, etc.

# Fit ARIMA model — start with (p=2, d=1, q=2) as a baseline
model = ARIMA(eth_data['close'], order=(2, 1, 2))
model_fit = model.fit()

# Summary
print(model_fit.summary())


**Model Evaluation**

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
from statsmodels.tsa.arima.model import ARIMA
import numpy as np
import pandas as pd

# Hold-out evaluation.
# The original cell forecast the 30 days AFTER the end of the data and scored
# them against the LAST 30 observed days, i.e. two different time windows, so
# RMSE/MAPE did not measure forecast accuracy. Here the last `forecast_steps`
# observations are withheld, a model with the same order as `model_fit` is fit
# on the remainder, and its forecast is scored against the withheld values.
forecast_steps = 30

close_series = eth_data['close']
if len(close_series) <= forecast_steps:
    raise ValueError(
        f"Need more than {forecast_steps} observations to hold out a test window; "
        f"got {len(close_series)}."
    )

train = close_series.iloc[:-forecast_steps]
actual = close_series.iloc[-forecast_steps:]

# Separate fit for evaluation only; `model_fit` (trained on all data) is left
# untouched for the final forecast.
holdout_fit = ARIMA(train, order=model_fit.model.order).fit()

# Re-label the forecast with the hold-out dates so it lines up with `actual`
# whether or not the training index carries a frequency.
pred = pd.Series(
    np.asarray(holdout_fit.forecast(steps=forecast_steps)),
    index=actual.index,
    name='predicted',
)

# Make sure both are the same length
print(f"Actual length: {len(actual)}, Predicted length: {len(pred)}")

# Plot actual vs predicted with custom colors
plt.figure(figsize=(12, 4))
plt.plot(close_series, label='Actual Price', color='dodgerblue', linewidth=2)            # Blue for actual
plt.plot(pred, label='Predicted Price', color='darkorange', linestyle='--', linewidth=2)  # Orange dashed for prediction

plt.legend()
plt.title("Actual vs ARIMA Predicted Ethereum (ETH) Price")
plt.xlabel("Date")
plt.ylabel("Price (USDT)")
plt.grid(True)
plt.tight_layout()
plt.show()

# Side-by-side table of held-out actuals and forecasts, keyed by date
comparison_df = pd.DataFrame(
    {'actual': actual.to_numpy(), 'predicted': pred.to_numpy()},
    index=actual.index,
)

# Drop rows with NaN (e.g. calendar days with no observed price)
comparison_df = comparison_df.dropna()

# Check if we have enough data left
if comparison_df.empty:
    raise ValueError("comparison_df is empty after dropping NaNs. Check your forecast and actual alignment.")

# Calculate metrics (sklearn's MAPE is a fraction: 0.05 means 5%)
rmse = np.sqrt(mean_squared_error(comparison_df['actual'], comparison_df['predicted']))
mape = mean_absolute_percentage_error(comparison_df['actual'], comparison_df['predicted'])

print("✅ RMSE:", rmse)
print("✅ MAPE:", mape)



In [None]:
# Project the fitted model 30 steps past the end of the data and collect the
# confidence bounds together with the point forecast in one frame.
forecast = model_fit.get_forecast(steps=30)
forecast_df = forecast.conf_int()
forecast_df['forecast'] = forecast.predicted_mean

# Re-label the rows with the 30 calendar days that follow the last observation.
first_future_day = eth_data.index[-1] + pd.Timedelta(days=1)
forecast_df.index = pd.date_range(start=first_future_day, periods=30)

# History, point forecast and shaded confidence band on one chart.
lower_bound = forecast_df['lower close']
upper_bound = forecast_df['upper close']

plt.figure(figsize=(10, 4))
plt.plot(eth_data['close'], label='Historical')
plt.plot(forecast_df['forecast'], label='Forecast', color='Brown')
plt.fill_between(forecast_df.index, lower_bound, upper_bound, color='lightgreen', alpha=0.5)
plt.title("30-Day Ethereum Price Forecast with ARIMA")
plt.legend()
plt.grid()
plt.show()
