# Gold Stock Price Prediction Project
## Complete Machine Learning Pipeline with Multiple Algorithms

**Author:** CodeAj Marketplace  
**Dataset:** Kaggle Gold Stock Data  
**Objective:** Predict gold stock prices using various ML algorithms and compare their performance

## 1. Import Required Libraries

In [None]:
# Data manipulation and analysis
# NOTE(review): `datetime` is not referenced in the visible cells of this
# notebook; it is kept in case a later/hidden cell relies on it.
import pandas as pd
import numpy as np
from datetime import datetime

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
# NOTE(review): the 'seaborn-v0_8-*' style names presumably require a recent
# Matplotlib release - confirm the minimum supported version.
plt.style.use('seaborn-v0_8-darkgrid')

# Machine Learning libraries
# NOTE(review): GridSearchCV and MinMaxScaler are imported but not used in the
# visible cells.
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Machine Learning Models
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor

# Model persistence
import joblib

# Warnings
# NOTE: this silences every warning category for the whole session, so
# deprecation and convergence warnings raised by later cells are hidden too.
import warnings
warnings.filterwarnings('ignore')

print("All libraries imported successfully!")

## 2. Load and Explore the Dataset

In [None]:
# Load the raw price history into `df`.
# Update the path to match your file location; if the history is split across
# several files, combine them with Option 2 instead.

# Option 1: Single file
df = pd.read_csv('dataset/goldstock v1.csv')

# Option 2: Multiple files (uncomment if needed)
# df1 = pd.read_csv('dataset/goldstock v1.csv')
# df2 = pd.read_csv('goldstock v2.csv')
# df2.rename(columns={'Close/Last': 'Close'}, inplace=True)
# df = pd.concat([df1, df2], ignore_index=True)

# Display basic information
print("Dataset Shape:", df.shape)
print("\nFirst 5 rows:")
print(df.head())
print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
df.info()
print("\nMissing Values:")
print(df.isnull().sum())
print("\nBasic Statistics:")
print(df.describe())

## 3. Data Preprocessing and Feature Engineering

In [None]:
# Build the modelling frame `data` from the raw frame `df`.
# Works on a copy so `df` stays untouched and this cell can be re-run safely.
data = df.copy()

# Remove the leftover index column if the CSV was saved with one
if 'Unnamed: 0' in data.columns:
    data.drop('Unnamed: 0', axis=1, inplace=True)

# Convert Date to datetime and order rows chronologically; every shift/rolling
# feature below relies on this ordering.
data['Date'] = pd.to_datetime(data['Date'])
data = data.sort_values('Date').reset_index(drop=True)

# Calendar features
data['Day'] = data['Date'].dt.day
data['Month'] = data['Date'].dt.month
data['Year'] = data['Date'].dt.year
data['DayOfWeek'] = data['Date'].dt.dayofweek
data['Quarter'] = data['Date'].dt.quarter

# Technical indicators (same-day values, used by the EDA plots)
data['Price_Range'] = data['High'] - data['Low']
data['Price_Change'] = data['Close'] - data['Open']
data['Price_Change_Pct'] = ((data['Close'] - data['Open']) / data['Open']) * 100
# A zero Open yields +/-inf here; dropna() below does not remove infinities and
# scikit-learn refuses them, so turn them into NaN first.
data['Price_Change_Pct'] = data['Price_Change_Pct'].replace([np.inf, -np.inf], np.nan)

# Moving averages (if sufficient data).
# They are computed over the PREVIOUS closes only: averaging the current close
# into a feature would leak the prediction target ('Close') into the inputs.
# Because of the one-row shift, a window of w needs more than w rows.
prior_close = data['Close'].shift(1)
if len(data) > 5:
    data['MA_5'] = prior_close.rolling(window=5).mean()
    # Fall back to the short window when there is not enough history for 10
    long_window = 10 if len(data) > 10 else 5
    data['MA_10'] = prior_close.rolling(window=long_window).mean()

# Lag features
data['Close_Lag1'] = data['Close'].shift(1)
data['Close_Lag2'] = data['Close'].shift(2)
data['Volume_Lag1'] = data['Volume'].shift(1)

# Drop rows with NaN values created by rolling/lag operations
data.dropna(inplace=True)

print("Data after feature engineering:")
print(data.head())
print("\nNew shape:", data.shape)
print("\nFeature columns:")
print(data.columns.tolist())

## 4. Exploratory Data Analysis (EDA) with Visualizations

In [None]:
# Global plotting defaults for the EDA figures
plt.rcParams['figure.figsize'] = (15, 10)
sns.set_palette("husl")

# 2x2 overview: close price, volume, daily range and a candlestick-like view
fig, axes = plt.subplots(2, 2, figsize=(16, 10))
price_ax, volume_ax = axes[0]
range_ax, candle_ax = axes[1]

# Closing price over time
price_ax.plot(data['Date'], data['Close'], color='gold', linewidth=2, marker='o')
price_ax.set_title('Gold Stock Closing Price Over Time', fontsize=14, fontweight='bold')
price_ax.set_xlabel('Date')
price_ax.set_ylabel('Close Price ($)')
price_ax.grid(True, alpha=0.3)

# Traded volume over time
volume_ax.bar(data['Date'], data['Volume'], color='skyblue', alpha=0.7)
volume_ax.set_title('Trading Volume Over Time', fontsize=14, fontweight='bold')
volume_ax.set_xlabel('Date')
volume_ax.set_ylabel('Volume')
volume_ax.grid(True, alpha=0.3)

# Daily high-low spread
range_ax.plot(data['Date'], data['Price_Range'], color='red', linewidth=2, marker='s')
range_ax.set_title('Daily Price Range (High - Low)', fontsize=14, fontweight='bold')
range_ax.set_xlabel('Date')
range_ax.set_ylabel('Price Range ($)')
range_ax.grid(True, alpha=0.3)

# Candlestick-like view: a thin wick for low..high and a thick body for
# open..close, green when the day closed above its open and red otherwise.
ohlc_rows = zip(data['Open'], data['High'], data['Low'], data['Close'])
for position, (day_open, day_high, day_low, day_close) in enumerate(ohlc_rows):
    shade = 'green' if day_close > day_open else 'red'
    candle_ax.plot([position, position], [day_low, day_high], color=shade, linewidth=1)
    candle_ax.plot([position, position], [day_open, day_close], color=shade, linewidth=4)

candle_ax.set_title('Price Action (Open-Close-High-Low)', fontsize=14, fontweight='bold')
candle_ax.set_xlabel('Data Points')
candle_ax.set_ylabel('Price ($)')
candle_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('eda_price_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

print("EDA visualizations created successfully!")

In [None]:
# Correlation heatmap over every numeric column of `data`
numeric_cols = data.select_dtypes(include=[np.number]).columns
correlation_matrix = data[numeric_cols].corr()

plt.figure(figsize=(14, 10))
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm',
            center=0, square=True, linewidths=1, cbar_kws={"shrink": 0.8})
plt.title('Feature Correlation Heatmap', fontsize=16, fontweight='bold', pad=20)
plt.tight_layout()
plt.savefig('correlation_heatmap.png', dpi=300, bbox_inches='tight')
plt.show()

print("\nTop 10 features correlated with Close price:")
# Drop the 'Close' row itself: its self-correlation is always 1.0 and would
# otherwise occupy the first slot of the "top features" list.
close_corr = correlation_matrix['Close'].drop('Close').sort_values(ascending=False)
print(close_corr.head(10))

In [None]:
# Distribution plots: 2x3 grid of histograms, a box plot, the daily change
# series and the moving-average comparison.
fig, axes = plt.subplots(2, 3, figsize=(16, 10))

# Top row: one histogram per column -> (axis, column, colour, title, x-label)
histogram_panels = [
    (axes[0, 0], 'Close', 'gold', 'Close Price Distribution', 'Price ($)'),
    (axes[0, 1], 'Volume', 'skyblue', 'Volume Distribution', 'Volume'),
    (axes[0, 2], 'Price_Change', 'green', 'Price Change Distribution', 'Price Change ($)'),
]
for hist_ax, column, colour, title, x_label in histogram_panels:
    hist_ax.hist(data[column], bins=15, color=colour, edgecolor='black', alpha=0.7)
    hist_ax.set_title(title, fontweight='bold')
    hist_ax.set_xlabel(x_label)
    hist_ax.set_ylabel('Frequency')

# Box plots.
# The tick labels are set explicitly instead of through boxplot(labels=...):
# that keyword is deprecated in newer Matplotlib releases, while
# set_xticklabels works across versions.
box_columns = ['Open', 'High', 'Low', 'Close']
axes[1, 0].boxplot([data[column] for column in box_columns])
axes[1, 0].set_xticklabels(box_columns)
axes[1, 0].set_title('Price Statistics Box Plot', fontweight='bold')
axes[1, 0].set_ylabel('Price ($)')

# Price change percentage, with a dashed zero line as reference
axes[1, 1].plot(data['Date'], data['Price_Change_Pct'], color='purple', marker='o')
axes[1, 1].axhline(y=0, color='r', linestyle='--', alpha=0.5)
axes[1, 1].set_title('Daily Price Change %', fontweight='bold')
axes[1, 1].set_xlabel('Date')
axes[1, 1].set_ylabel('Change %')
axes[1, 1].grid(True, alpha=0.3)

# Moving average comparison (the MA columns only exist with enough history)
if 'MA_5' in data.columns:
    axes[1, 2].plot(data['Date'], data['Close'], label='Close', linewidth=2)
    axes[1, 2].plot(data['Date'], data['MA_5'], label='MA 5', linewidth=2, linestyle='--')
    if 'MA_10' in data.columns:
        axes[1, 2].plot(data['Date'], data['MA_10'], label='MA 10', linewidth=2, linestyle='-.')
    axes[1, 2].set_title('Moving Averages', fontweight='bold')
    axes[1, 2].set_xlabel('Date')
    axes[1, 2].set_ylabel('Price ($)')
    axes[1, 2].legend()
    axes[1, 2].grid(True, alpha=0.3)
else:
    # Nothing to draw: hide the panel rather than leaving an empty frame
    axes[1, 2].set_visible(False)

plt.tight_layout()
plt.savefig('distribution_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

## 5. Prepare Data for Machine Learning

In [None]:
# Select features for modeling.
# 'Price_Change' (Close - Open) and 'Price_Change_Pct' are deliberately left
# out: together with 'Open' they reproduce the target exactly
# (Close = Open + Price_Change), so any model would score a meaningless
# near-perfect R² by reading the answer from its inputs.
# NOTE(review): 'Open', 'High' and 'Low' are same-day values, so this setup
# estimates the close given the day's range rather than forecasting ahead -
# confirm that this is the intended use.
feature_columns = ['Open', 'High', 'Low', 'Volume', 'Day', 'Month', 'Year',
                   'DayOfWeek', 'Quarter', 'Price_Range',
                   'Close_Lag1', 'Close_Lag2', 'Volume_Lag1']

# Add MA features if they exist
for ma_column in ('MA_5', 'MA_10'):
    if ma_column in data.columns:
        feature_columns.append(ma_column)

# Prepare X and y
X = data[feature_columns]
y = data['Close']

print("Feature matrix shape:", X.shape)
print("Target vector shape:", y.shape)
print("\nFeatures used:")
print(feature_columns)

# Split the data (80-20 split).
# `data` is sorted by Date, so shuffle=False keeps the oldest 80% for training
# and the most recent 20% for testing. A shuffled split would train on days
# that come after the test days, and with lag features that makes the test
# score optimistic.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

print("\nTraining set size:", X_train.shape)
print("Testing set size:", X_test.shape)

# Feature Scaling: fit on the training rows only, then apply to both sets
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("\nData preprocessing completed successfully!")

## 6. Train Multiple Machine Learning Models

In [None]:
# Candidate regressors, keyed by the display name used in every report below
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(alpha=1.0),
    'Lasso Regression': Lasso(alpha=0.1),
    'ElasticNet': ElasticNet(alpha=0.1, l1_ratio=0.5),
    'Decision Tree': DecisionTreeRegressor(random_state=42, max_depth=5),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42, max_depth=5),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42, max_depth=3),
    'XGBoost': XGBRegressor(n_estimators=100, random_state=42, max_depth=3),
    'AdaBoost': AdaBoostRegressor(n_estimators=100, random_state=42),
    'Support Vector Regressor': SVR(kernel='rbf', C=100, gamma=0.1),
    'K-Nearest Neighbors': KNeighborsRegressor(n_neighbors=3)
}

# One list per result column; each trained model appends one value to each
results = {
    column: []
    for column in ('Model', 'Train_R2', 'Test_R2', 'Train_RMSE', 'Test_RMSE',
                   'Train_MAE', 'Test_MAE', 'CV_Score_Mean', 'CV_Score_Std')
}

# Fitted estimators, keyed by display name
trained_models = {}

print(f"Training {len(models)} models...\n")
print("=" * 100)

# Models that are fitted on the standardized feature matrix; the remaining
# ones receive the raw features.
scaled_input_models = {
    'Support Vector Regressor', 'K-Nearest Neighbors', 'Ridge Regression',
    'Lasso Regression', 'ElasticNet', 'Linear Regression',
}

for name, model in models.items():
    print(f"\nTraining {name}...")

    # Pick the matching train/test matrices for this model
    wants_scaled = name in scaled_input_models
    X_train_use = X_train_scaled if wants_scaled else X_train
    X_test_use = X_test_scaled if wants_scaled else X_test

    # Fit, then predict on both splits
    model.fit(X_train_use, y_train)
    y_train_pred = model.predict(X_train_use)
    y_test_pred = model.predict(X_test_use)

    # Goodness-of-fit and error metrics on both splits
    train_r2 = r2_score(y_train, y_train_pred)
    test_r2 = r2_score(y_test, y_test_pred)
    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
    train_mae = mean_absolute_error(y_train, y_train_pred)
    test_mae = mean_absolute_error(y_test, y_test_pred)

    # Cross-validated R² on the training split (at most 3 folds, fewer for
    # very small datasets)
    cv_scores = cross_val_score(model, X_train_use, y_train,
                                cv=min(3, len(X_train) // 2), scoring='r2')

    # Append this model's row to the result columns
    model_row = {
        'Model': name,
        'Train_R2': train_r2,
        'Test_R2': test_r2,
        'Train_RMSE': train_rmse,
        'Test_RMSE': test_rmse,
        'Train_MAE': train_mae,
        'Test_MAE': test_mae,
        'CV_Score_Mean': cv_scores.mean(),
        'CV_Score_Std': cv_scores.std(),
    }
    for column, value in model_row.items():
        results[column].append(value)

    # Keep the fitted estimator for the later analysis cells
    trained_models[name] = model

    # Per-model report
    print(f"Train R²: {train_r2:.4f} | Test R²: {test_r2:.4f}")
    print(f"Train RMSE: {train_rmse:.4f} | Test RMSE: {test_rmse:.4f}")
    print(f"Train MAE: {train_mae:.4f} | Test MAE: {test_mae:.4f}")
    print(f"CV Score: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")
    print("-" * 100)

print("\nAll models trained successfully!")

## 7. Model Performance Comparison and Analysis

In [None]:
# Tabulate the collected metrics, best test R² first
results_df = (
    pd.DataFrame(results)
    .sort_values('Test_R2', ascending=False)
    .reset_index(drop=True)
)

separator = "=" * 100
print("\n" + separator)
print("MODEL PERFORMANCE COMPARISON")
print(separator)
print(results_df.to_string(index=False))
print(separator)

# The first row after sorting is the model with the highest test R²
top_row = results_df.iloc[0]
best_model_name = top_row['Model']
best_model = trained_models[best_model_name]

print(f"\n🏆 BEST MODEL: {best_model_name}")
print(f"   Test R² Score: {top_row['Test_R2']:.4f}")
print(f"   Test RMSE: {top_row['Test_RMSE']:.4f}")
print(f"   Test MAE: {top_row['Test_MAE']:.4f}")

# Persist the comparison table
results_df.to_csv('model_comparison_results.csv', index=False)
print("\nResults saved to 'model_comparison_results.csv'")

## 8. Comprehensive Model Comparison Visualizations

In [None]:
# 2x2 dashboard comparing every model: R², RMSE, MAE and cross-validation
fig, axes = plt.subplots(2, 2, figsize=(18, 12))
x_pos = np.arange(len(results_df))

# Panels 1-3 share one layout: the test bar with a semi-transparent train bar
# drawn on top of it.
# (axis, metric, test colour, train colour, x-label, title)
paired_panels = [
    (axes[0, 0], 'R2', 'skyblue', 'orange',
     'R² Score', 'Model R² Score Comparison'),
    (axes[0, 1], 'RMSE', 'lightcoral', 'lightgreen',
     'RMSE (Lower is Better)', 'Model RMSE Comparison'),
    (axes[1, 0], 'MAE', 'plum', 'khaki',
     'MAE (Lower is Better)', 'Model MAE Comparison'),
]
# The legend text uses 'R²' while the dataframe column uses 'R2'
legend_names = {'R2': 'R²', 'RMSE': 'RMSE', 'MAE': 'MAE'}

for panel, metric, test_colour, train_colour, x_label, title in paired_panels:
    shown_name = legend_names[metric]
    panel.barh(x_pos, results_df[f'Test_{metric}'], color=test_colour,
               label=f'Test {shown_name}')
    panel.barh(x_pos, results_df[f'Train_{metric}'], alpha=0.5, color=train_colour,
               label=f'Train {shown_name}')
    panel.set_xlabel(x_label, fontweight='bold')
    panel.set_title(title, fontsize=14, fontweight='bold')
    panel.legend()

# Panel 4: mean cross-validation score with the fold spread as error bars
cv_panel = axes[1, 1]
cv_panel.barh(x_pos, results_df['CV_Score_Mean'],
              xerr=results_df['CV_Score_Std'],
              color='mediumpurple',
              capsize=5,
              error_kw={'linewidth': 2, 'ecolor': 'red'})
cv_panel.set_xlabel('Cross-Validation R² Score', fontweight='bold')
cv_panel.set_title('Cross-Validation Performance', fontsize=14, fontweight='bold')

# Every panel lists the models on the y-axis and shows a light x-grid
for panel in axes.flat:
    panel.set_yticks(x_pos)
    panel.set_yticklabels(results_df['Model'], fontsize=9)
    panel.grid(True, alpha=0.3, axis='x')

plt.tight_layout()
plt.savefig('model_comparison_charts.png', dpi=300, bbox_inches='tight')
plt.show()

print("Model comparison visualizations created!")

In [None]:
# Grouped bar chart: train and test R² side by side for each model
fig, ax = plt.subplots(figsize=(16, 8))

x = np.arange(len(results_df))
width = 0.35
half_width = width / 2

# Train bars sit to the left of each tick, test bars to the right
bars1 = ax.bar(x - half_width, results_df['Train_R2'], width,
               label='Train R²', color='#3498db', alpha=0.8)
bars2 = ax.bar(x + half_width, results_df['Test_R2'], width,
               label='Test R²', color='#e74c3c', alpha=0.8)

ax.set_title('Train vs Test R² Score for All Models', fontsize=16, fontweight='bold', pad=20)
ax.set_xlabel('Models', fontsize=12, fontweight='bold')
ax.set_ylabel('R² Score', fontsize=12, fontweight='bold')
ax.set_xticks(x)
ax.set_xticklabels(results_df['Model'], rotation=45, ha='right')
ax.legend(fontsize=10)
ax.grid(True, alpha=0.3, axis='y')

# Write each bar's value at the top of the bar
for bar in (*bars1, *bars2):
    bar_height = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2.
    ax.text(bar_centre, bar_height, f'{bar_height:.3f}',
            ha='center', va='bottom', fontsize=8)

plt.tight_layout()
plt.savefig('train_test_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

## 9. Best Model Detailed Analysis

In [None]:
# Predict with the best model, feeding it the same kind of inputs it was
# trained on (standardized for the models listed here, raw otherwise).
best_uses_scaled = best_model_name in {
    'Support Vector Regressor', 'K-Nearest Neighbors', 'Ridge Regression',
    'Lasso Regression', 'ElasticNet', 'Linear Regression',
}
if best_uses_scaled:
    best_train_inputs, best_test_inputs = X_train_scaled, X_test_scaled
else:
    best_train_inputs, best_test_inputs = X_train, X_test
y_train_pred_best = best_model.predict(best_train_inputs)
y_test_pred_best = best_model.predict(best_test_inputs)

# Residual = actual - predicted; reused by the error-distribution cell
train_residuals = y_train - y_train_pred_best
test_residuals = y_test - y_test_pred_best

# Top row: actual vs predicted. Bottom row: residuals vs predicted.
# Left column: training split. Right column: test split.
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# (split name, actual, predicted, residuals, scatter colour, residual colour)
split_views = [
    ('Training', y_train, y_train_pred_best, train_residuals, 'blue', 'purple'),
    ('Test', y_test, y_test_pred_best, test_residuals, 'green', 'orange'),
]
for column, (split_name, actual, predicted, residuals,
             fit_colour, residual_colour) in enumerate(split_views):
    # Actual vs predicted, with the diagonal marking a perfect prediction
    fit_ax = axes[0, column]
    value_span = [actual.min(), actual.max()]
    fit_ax.scatter(actual, predicted, alpha=0.6, s=100, color=fit_colour, edgecolors='black')
    fit_ax.plot(value_span, value_span, 'r--', lw=3, label='Perfect Prediction')
    fit_ax.set_xlabel('Actual Price ($)', fontsize=12, fontweight='bold')
    fit_ax.set_ylabel('Predicted Price ($)', fontsize=12, fontweight='bold')
    fit_ax.set_title(f'{best_model_name} - {split_name} Set\nR² = {r2_score(actual, predicted):.4f}',
                     fontsize=14, fontweight='bold')
    fit_ax.legend()
    fit_ax.grid(True, alpha=0.3)

    # Residuals against the predicted value, with a zero reference line
    residual_ax = axes[1, column]
    residual_ax.scatter(predicted, residuals, alpha=0.6, s=100,
                        color=residual_colour, edgecolors='black')
    residual_ax.axhline(y=0, color='r', linestyle='--', lw=3)
    residual_ax.set_xlabel('Predicted Price ($)', fontsize=12, fontweight='bold')
    residual_ax.set_ylabel('Residuals ($)', fontsize=12, fontweight='bold')
    residual_ax.set_title(f'Residual Plot - {split_name} Set', fontsize=14, fontweight='bold')
    residual_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('best_model_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

print(f"Best model ({best_model_name}) detailed analysis completed!")

In [None]:
# Histograms of the best model's residuals, training split next to test split
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# (residuals, bin count, colour, title suffix) for the left and right panel
histogram_specs = [
    (train_residuals, 15, 'skyblue', 'Training Error Distribution'),
    (test_residuals, 10, 'lightcoral', 'Test Error Distribution'),
]
for error_ax, (residuals, bin_count, colour, title_suffix) in zip(axes, histogram_specs):
    error_ax.hist(residuals, bins=bin_count, color=colour, edgecolor='black', alpha=0.7)
    # Dashed line at zero marks a perfect prediction
    error_ax.axvline(x=0, color='red', linestyle='--', linewidth=2, label='Zero Error')
    error_ax.set_xlabel('Residuals ($)', fontsize=12, fontweight='bold')
    error_ax.set_ylabel('Frequency', fontsize=12, fontweight='bold')
    error_ax.set_title(f'{best_model_name} - {title_suffix}', fontsize=14, fontweight='bold')
    error_ax.legend()
    error_ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.savefig('error_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

## 10. Feature Importance Analysis (for tree-based models)

In [None]:
# Models that expose per-feature importances after fitting
tree_based_models = ['Decision Tree', 'Random Forest', 'Gradient Boosting', 'XGBoost', 'AdaBoost']

if best_model_name not in tree_based_models:
    print(f"\nFeature importance is not available for {best_model_name}")
    print("This analysis is only available for tree-based models.")
elif hasattr(best_model, 'feature_importances_'):
    # Pair each feature with its importance and rank them, highest first
    feature_importance = pd.DataFrame({
        'Feature': feature_columns,
        'Importance': best_model.feature_importances_
    })
    feature_importance = feature_importance.sort_values('Importance', ascending=False)

    # Horizontal bar chart of the ranked importances
    bar_positions = range(len(feature_importance))
    plt.figure(figsize=(12, 8))
    plt.barh(bar_positions, feature_importance['Importance'],
             color='teal', alpha=0.8, edgecolor='black')
    plt.yticks(bar_positions, feature_importance['Feature'])
    plt.xlabel('Importance Score', fontsize=12, fontweight='bold')
    plt.title(f'Feature Importance - {best_model_name}', fontsize=16, fontweight='bold', pad=20)
    plt.grid(True, alpha=0.3, axis='x')
    plt.tight_layout()
    plt.savefig('feature_importance.png', dpi=300, bbox_inches='tight')
    plt.show()

    print("\nTop 10 Most Important Features:")
    print(feature_importance.head(10).to_string(index=False))

## 11. Prediction Timeline Visualization

In [None]:
# Prediction timeline: actual vs predicted prices of the best model over time.
# Look up the calendar date of every train/test row through its index label.
train_indices = X_train.index
test_indices = X_test.index

train_dates = data.loc[train_indices, 'Date']
test_dates = data.loc[test_indices, 'Date']

# The series are drawn as connected lines, so each split must be in date
# order: rows that come out of a shuffled split would otherwise be joined in
# random order and the lines would zig-zag across the whole chart.
# Plain arrays are used so the columns pair up by position, not by index.
train_timeline = pd.DataFrame({
    'Date': train_dates.to_numpy(),
    'Actual': y_train.to_numpy(),
    'Predicted': np.asarray(y_train_pred_best),
}).sort_values('Date')
test_timeline = pd.DataFrame({
    'Date': test_dates.to_numpy(),
    'Actual': y_test.to_numpy(),
    'Predicted': np.asarray(y_test_pred_best),
}).sort_values('Date')

# Create visualization
plt.figure(figsize=(16, 8))

# Actual values first, then predictions on top
# (frame, column, line format, legend label, colour, marker size, alpha)
timeline_series = [
    (train_timeline, 'Actual', 'o-', 'Training Actual', 'blue', 8, 0.7),
    (test_timeline, 'Actual', 'o-', 'Testing Actual', 'green', 8, 0.7),
    (train_timeline, 'Predicted', 's--', 'Training Predicted', 'lightblue', 6, 0.9),
    (test_timeline, 'Predicted', 's--', 'Testing Predicted', 'lightgreen', 6, 0.9),
]
for frame, column, line_format, legend_label, colour, marker_size, opacity in timeline_series:
    plt.plot(frame['Date'], frame[column], line_format, label=legend_label,
             color=colour, linewidth=2, markersize=marker_size, alpha=opacity)

plt.xlabel('Date', fontsize=12, fontweight='bold')
plt.ylabel('Gold Stock Price ($)', fontsize=12, fontweight='bold')
plt.title(f'Gold Stock Price Prediction Timeline - {best_model_name}',
          fontsize=16, fontweight='bold', pad=20)
plt.legend(fontsize=11, loc='best')
plt.grid(True, alpha=0.3)
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('prediction_timeline.png', dpi=300, bbox_inches='tight')
plt.show()

print("Prediction timeline visualization created!")

## 12. Model Performance Summary Table

In [None]:
# Formatted ranking table built from the sorted results dataframe
wide_rule = "=" * 120
print("\n" + wide_rule)
print("COMPREHENSIVE MODEL PERFORMANCE SUMMARY")
print(wide_rule)

# One formatted record per model. The rank is the row label + 1, and a model
# is flagged as overfitting when train R² exceeds test R² by more than 0.1.
summary_data = [
    {
        'Rank': idx + 1,
        'Model': row['Model'],
        'Test R²': f"{row['Test_R2']:.4f}",
        'Test RMSE': f"{row['Test_RMSE']:.2f}",
        'Test MAE': f"{row['Test_MAE']:.2f}",
        'Train R²': f"{row['Train_R2']:.4f}",
        'Overfitting': 'Yes' if row['Train_R2'] - row['Test_R2'] > 0.1 else 'No',
        'CV Mean': f"{row['CV_Score_Mean']:.4f}",
        'CV Std': f"{row['CV_Score_Std']:.4f}"
    }
    for idx, row in results_df.iterrows()
]

summary_df = pd.DataFrame(summary_data)
print(summary_df.to_string(index=False))
print(wide_rule)

# Headline figures across all models
test_r2 = results_df['Test_R2']
strong_model_count = int((test_r2 > 0.8).sum())

print("\n📊 KEY INSIGHTS:")
print(f"   • Best Model: {best_model_name}")
print(f"   • Highest Test R²: {test_r2.max():.4f}")
print(f"   • Lowest Test RMSE: ${results_df['Test_RMSE'].min():.2f}")
print(f"   • Lowest Test MAE: ${results_df['Test_MAE'].min():.2f}")
print(f"   • Average Test R² across all models: {test_r2.mean():.4f}")
print(f"   • Models with R² > 0.8: {strong_model_count}")

# Persist the summary table
summary_df.to_csv('model_performance_summary.csv', index=False)
print("\n✅ Summary saved to 'model_performance_summary.csv'")

## 13. Save the Best Model

In [None]:
# Persist the best model, the fitted scaler and the feature list
model_slug = best_model_name.replace(" ", "_").lower()
model_filename = 'best_model_' + model_slug + '.pkl'
scaler_filename = 'feature_scaler.pkl'

for artifact, target_path in ((best_model, model_filename), (scaler, scaler_filename)):
    joblib.dump(artifact, target_path)

print(f"\n✅ Best model saved as: {model_filename}")
print(f"✅ Feature scaler saved as: {scaler_filename}")

# Save feature names for future use: one name per line, in training order
with open('feature_names.txt', 'w') as f:
    f.write("".join(f"{feature}\n" for feature in feature_columns))

print("✅ Feature names saved as: feature_names.txt")

# Closing report: generated files followed by suggested next steps
closing_rule = "=" * 100
closing_report = [
    "\n" + closing_rule,
    "MODEL TRAINING AND EVALUATION COMPLETE!",
    closing_rule,
    "\n📁 Generated Files:",
    f"   1. {model_filename} - Best trained model",
    f"   2. {scaler_filename} - Feature scaler",
    "   3. feature_names.txt - List of features used",
    "   4. model_comparison_results.csv - Detailed results",
    "   5. model_performance_summary.csv - Performance summary",
    "   6. Multiple PNG visualization files",
    "\n🎯 Next Steps:",
    "   • Use the saved model for predictions on new data",
    "   • Deploy the model in a Flask/Django web application",
    "   • Create an API endpoint for real-time predictions",
    closing_rule,
]
for report_line in closing_report:
    print(report_line)

## 14. Model Usage Example - Making Predictions

In [None]:
# Example: reload the saved artifacts and predict one held-out row
example_rule = "=" * 100
print("\n" + example_rule)
print("EXAMPLE: MAKING PREDICTIONS WITH THE SAVED MODEL")
print(example_rule)

# Load the saved model and scaler back from disk
loaded_model = joblib.load(model_filename)
loaded_scaler = joblib.load(scaler_filename)

# Use the first test row as the sample (kept as a one-row frame)
sample_input = X_test.iloc[0:1].copy()
actual_price = y_test.iloc[0]

print("\nSample Input Features:")
print(sample_input.T)

# Models that were trained on standardized features need the same transform
# applied to the sample before predicting; the others take the raw row.
sample_needs_scaling = best_model_name in {
    'Support Vector Regressor', 'K-Nearest Neighbors', 'Ridge Regression',
    'Lasso Regression', 'ElasticNet', 'Linear Regression',
}
model_ready_input = loaded_scaler.transform(sample_input) if sample_needs_scaling else sample_input
prediction = loaded_model.predict(model_ready_input)[0]

# Signed gap between the true and the predicted price
price_gap = actual_price - prediction

print(f"\n🎯 Prediction Results:")
print(f"   Actual Price: ${actual_price:.2f}")
print(f"   Predicted Price: ${prediction:.2f}")
print(f"   Difference: ${abs(price_gap):.2f}")
print(f"   Error Percentage: {abs(price_gap / actual_price * 100):.2f}%")
print(example_rule)

## 15. Final Summary and Recommendations

In [None]:
# Final report: banner, objectives, evaluated models, best model, generated
# figures and follow-up suggestions.
hash_rule = "#" * 120
blank_banner_row = "#" + " " * 118 + "#"

print("\n" + hash_rule)
print(blank_banner_row)
print("#" + " " * 38 + "GOLD STOCK PRICE PREDICTION PROJECT SUMMARY" + " " * 37 + "#")
print(blank_banner_row)
print(hash_rule)

objectives = [
    "   ✓ Loaded and preprocessed gold stock data from Kaggle",
    "   ✓ Performed comprehensive exploratory data analysis",
    "   ✓ Created advanced features using feature engineering",
    f"   ✓ Trained and compared {len(models)} different ML algorithms",
    "   ✓ Generated comprehensive visualizations",
    "   ✓ Identified best performing model",
    "   ✓ Saved model for production use",
]
print("\n🎯 PROJECT OBJECTIVES ACHIEVED:")
for objective in objectives:
    print(objective)

print("\n📈 MODELS EVALUATED:")
for i, model_name in enumerate(models.keys(), 1):
    print(f"   {i}. {model_name}")

# The first row of the sorted results table belongs to the best model
winner = results_df.iloc[0]
print(f"\n🏆 BEST MODEL: {best_model_name}")
print(f"   • Test R² Score: {winner['Test_R2']:.4f}")
print(f"   • Test RMSE: ${winner['Test_RMSE']:.2f}")
print(f"   • Test MAE: ${winner['Test_MAE']:.2f}")
print(f"   • Cross-Validation Score: {winner['CV_Score_Mean']:.4f} (+/- {winner['CV_Score_Std']:.4f})")

print("\n📊 VISUALIZATIONS CREATED:")
viz_files = [
    'eda_price_analysis.png',
    'correlation_heatmap.png',
    'distribution_analysis.png',
    'model_comparison_charts.png',
    'train_test_comparison.png',
    'best_model_analysis.png',
    'error_distribution.png',
    'prediction_timeline.png'
]
# The importance chart is only listed when the best model is tree-based
if best_model_name in tree_based_models:
    viz_files.append('feature_importance.png')

for i, viz in enumerate(viz_files, 1):
    print(f"   {i}. {viz}")

deployment_tips = [
    "   1. Integrate model into Flask/Django web application",
    "   2. Create REST API endpoints for predictions",
    "   3. Implement real-time data fetching from stock APIs",
    "   4. Add model monitoring and retraining pipeline",
    "   5. Deploy on cloud platform (AWS/Azure/GCP)",
    "   6. Implement user authentication and dashboard",
    "   7. Add prediction confidence intervals",
    "   8. Create mobile-responsive UI with Bootstrap 5",
]
print("\n💡 RECOMMENDATIONS FOR DEPLOYMENT:")
for tip in deployment_tips:
    print(tip)

future_ideas = [
    "   • Collect more historical data for better accuracy",
    "   • Implement LSTM/GRU for time series modeling",
    "   • Add sentiment analysis from news articles",
    "   • Include economic indicators as features",
    "   • Implement ensemble methods combining top models",
    "   • Add automated hyperparameter tuning",
    "   • Create prediction intervals and uncertainty estimates",
]
print("\n🚀 FUTURE ENHANCEMENTS:")
for idea in future_ideas:
    print(idea)

print("\n" + hash_rule)
print("#" + " " * 40 + "Thank you for using CodeAj Marketplace!" + " " * 39 + "#")
print(blank_banner_row)
print(hash_rule)
print("\n✅ Notebook execution completed successfully!\n")