## Standard 5-fold cross-validation (K-Fold, k = 5)

In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import KFold, cross_val_predict
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

# Load the inflation and bitcoin data sets, join them on date, set up 5-fold
# cross-validation for a linear model (price ~ monthly inflation + volume),
# and draw a correlation heatmap of the modelled columns.
inflation_data_path = 'datasets/expanded2_usa_inflation_rate.csv'
bitcoin_data_path = 'datasets/data_btc.csv'

inflation_data = pd.read_csv(inflation_data_path)
bitcoin_data = pd.read_csv(bitcoin_data_path)

# Preprocessing: parse 'date' in both frames so the join compares datetimes.
# dayfirst=True makes ambiguous dates parse as DD/MM
# (assumes both CSVs store dates day-first — TODO confirm against the files).
inflation_data['date'] = pd.to_datetime(inflation_data['date'], dayfirst=True)
bitcoin_data['date'] = pd.to_datetime(bitcoin_data['date'], dayfirst=True)

# Merge once on 'date'; the inner join keeps only dates present in both frames.
# This single merged frame feeds both the model and the heatmap below.
merged_data = pd.merge(bitcoin_data, inflation_data, on='date', how='inner')
merged_data_final = merged_data[['date', 'price', 'total_volumes', 'Monthly Inflation Rate', 'Annual Inflation Rate']]

# Handle NaN values by filling them with the mean of their respective numeric columns.
# NOTE(review): the means are taken over all rows before the folds are split,
# and the selection includes the target column 'price' — confirm this is intended.
merged_data_final_filled = merged_data_final.fillna(merged_data_final.mean(numeric_only=True))

# Features and target variable
X = merged_data_final_filled[['Monthly Inflation Rate', 'total_volumes']]
y = merged_data_final_filled['price']

# K-Fold setup: shuffled splits with a fixed seed so the folds are reproducible.
kf = KFold(n_splits=5, random_state=60, shuffle=True)
model = LinearRegression()

# Perform cross-validation and store the prediction made for each row by the
# model of the fold in which that row was held out.
predictions = cross_val_predict(model, X, y, cv=kf)

# Dates aligned row-for-row with X and y. The column was already converted to
# datetime before the merge, so no second conversion is needed.
dates = merged_data_final_filled['date']

# Columns shown in the heatmap. `merged_data` is reused rather than re-merged:
# the inputs have not changed, so a second merge would rebuild the same frame.
# Note this rebinds `merged_data_final` to the three-column, unfilled selection.
merged_data_final = merged_data[['price', 'Monthly Inflation Rate', 'total_volumes']]

# Plotting correlation matrix heatmap
plt.figure(figsize=(8, 6))
correlation_matrix = merged_data_final.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", annot_kws={"size": 12})
plt.title('Correlation Matrix Heatmap')
plt.show()


FileNotFoundError: [Errno 2] No such file or directory: 'datasets/expanded2_usa_inflation_rate.csv'

In [None]:
# Evaluate the linear model fold by fold: refit on each training split, score
# on the held-out split, then report per-fold and average R-squared / RMSE.
r2_scores = []
rmse_scores = []

# Iterate over each fold
for train_index, test_index in kf.split(X):
    # kf.split yields positional indices, hence .iloc
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Refit on this fold's training rows and predict its held-out rows
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    r2_scores.append(r2_score(y_test, y_pred))
    # RMSE is the square root of the mean squared error
    rmse_scores.append(np.sqrt(mean_squared_error(y_test, y_pred)))

# Print the R-squared scores and RMSE for each fold (folds numbered from 1)
for fold, (r2, rmse) in enumerate(zip(r2_scores, rmse_scores), start=1):
    print(f"Fold {fold}: R-squared = {r2:.4f}, RMSE = {rmse:.4f}")

# Calculate average R-squared score and RMSE across all folds
avg_r2 = np.mean(r2_scores)
avg_rmse = np.mean(rmse_scores)
print("\nAverage R-squared across all folds:", avg_r2)
print("Average RMSE across all folds:", avg_rmse)


In [None]:
# Actual-vs-predicted scatter plots: one panel per fold (rows 1-5 of a 6x1
# grid) plus a final panel (row 6) for the `predictions` array over all rows.
def _draw_actual_vs_predicted(slot, actual, predicted, name, point_color):
    """Draw one actual-vs-predicted panel in row `slot` of the 6x1 grid.

    `name` is used both as the legend label and as the panel title.
    """
    plt.subplot(6, 1, slot)
    plt.scatter(actual, predicted, label=name, color=point_color, alpha=0.5)
    lo, hi = min(actual), max(actual)
    # Identity line: points on it are predicted exactly right
    plt.plot([lo, hi], [lo, hi], color='red', linestyle='--')
    plt.title(name)
    plt.xlabel('Actual Price')
    plt.ylabel('Predicted Price')
    plt.legend()


plt.figure(figsize=(5, 20))

# One panel per fold: refit on the training split, plot the held-out split
for i, (train_index, test_index) in enumerate(kf.split(X), 1):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    model.fit(X_train, y_train)
    fold_pred = model.predict(X_test)
    _draw_actual_vs_predicted(i, y_test, fold_pred, f'Fold {i}', 'blue')

# Last panel: every row's actual price against `predictions`
_draw_actual_vs_predicted(6, y, predictions, 'Average Predicted', 'green')

plt.tight_layout()
plt.show()

In [None]:
# Price over time: one panel per fold (rows 1-5 of a 6x1 grid) showing the
# held-out actual prices and that fold's predictions, plus a final panel
# (row 6) comparing the full actual series with `predictions`.
plt.figure(figsize=(10, 22))

for i, (train_index, test_index) in enumerate(kf.split(X), 1):
    plt.subplot(6, 1, i)
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    model.fit(X_train, y_train)
    fold_pred = model.predict(X_test)
    # kf.split yields positional indices, so select dates with .iloc as well.
    # Plain dates[test_index] is a label lookup and only lines up with X/y
    # while the index is a default 0..n-1 range.
    fold_dates = dates.iloc[test_index]
    plt.scatter(fold_dates, y_test, label='Actual', color='blue', alpha=0.7)
    plt.plot(fold_dates, fold_pred, linestyle='--', color='red', label='Predicted')
    plt.title(f'Fold {i}')
    plt.xlabel('Date')
    plt.ylabel('Price')
    plt.legend()

# Final panel: full actual series against `predictions`.
# NOTE(review): if `predictions` comes from cross_val_predict it holds one
# out-of-fold prediction per row, not an average — the label is kept as-is.
plt.subplot(6, 1, 6)
plt.plot(dates, y, label='Actual', color='blue')
plt.plot(dates, predictions, label='Average Predicted', color='red', linestyle='--')
plt.title('Average Predicted')
plt.xlabel('Date')
plt.ylabel('Price')
plt.legend()

plt.tight_layout()
plt.show()