In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Load the Excel workbook; all three sheets are parsed from the same handle.
file_path = '/kaggle/input/gpdatabase1/GPDatabase1.xlsx'
excel_data = pd.ExcelFile(file_path)

# Load the PackageSeason, TourPackage, and Season sheets
package_season_data = excel_data.parse('PackageSeason')
tour_package_data = excel_data.parse('TourPackage')
season_data = excel_data.parse('Season')

# Merge data for demand prediction: attach package attributes via PackageId,
# then season attributes via SeasonId (default inner joins).
demand_data = (
    package_season_data
    .merge(tour_package_data, on='PackageId')
    .merge(season_data, on='SeasonId')
)

# Keep only rows for years 2019 to 2024 (between() is inclusive on both ends)
demand_data_train = demand_data[demand_data['Year'].between(2019, 2024)]

# Select features and target variable for demand prediction
X = demand_data_train[['PackageId', 'SeasonId', 'Year', 'Cost', 'Price', 'Dmin', 'Dmax']]
y = demand_data_train['Demand']

# Split BEFORE scaling. Fitting the scaler on the full data set would let the
# mean/variance of the held-out rows leak into the training features and make
# the test scores optimistic. The same random_state gives the same row split
# as splitting the scaled matrix would.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize features using statistics learned from the training split only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Scaled copy of the full feature matrix, kept for any later code that uses
# this name. NOTE: it is scaled with training-split statistics.
X_scaled = scaler.transform(X)

# Candidate regressors and the hyperparameter grids handed to GridSearchCV.
# Each spec is (display name, unfitted estimator, parameter grid); the order
# here is the order in which the models are trained and plotted.
_SEED = 42
_candidates = [
    (
        'Gradient Boosting',
        GradientBoostingRegressor(random_state=_SEED),
        {
            'n_estimators': [100, 200, 300],
            'learning_rate': [0.01, 0.1, 0.2],
            'max_depth': [3, 5, 7],
        },
    ),
    (
        'Random Forest',
        RandomForestRegressor(random_state=_SEED),
        {
            'n_estimators': [100, 200, 300],
            'max_depth': [None, 10, 20],
            'min_samples_split': [2, 5, 10],
        },
    ),
    (
        'Linear Regression',
        LinearRegression(),
        {},  # nothing to tune; the grid search fits the default model once per fold
    ),
    (
        'Support Vector Regressor',
        SVR(),
        {
            'kernel': ['linear', 'poly', 'rbf'],
            'C': [0.1, 1, 10],
        },
    ),
    (
        'K-Nearest Neighbors',
        KNeighborsRegressor(),
        {
            'n_neighbors': [3, 5, 7],
            'weights': ['uniform', 'distance'],
        },
    ),
    (
        'Decision Tree',
        DecisionTreeRegressor(random_state=_SEED),
        {
            'max_depth': [None, 10, 20],
            'min_samples_split': [2, 5, 10],
        },
    ),
    (
        'Neural Network',
        MLPRegressor(random_state=_SEED, max_iter=1000),
        {
            'hidden_layer_sizes': [(50,), (100,), (50, 50)],
            'activation': ['tanh', 'relu'],
            'solver': ['sgd', 'adam'],
            'alpha': [0.0001, 0.001],
        },
    ),
]

# name -> {'model': estimator, 'params': grid}
models = {
    label: {'model': estimator, 'params': grid}
    for label, estimator, grid in _candidates
}

# Train and evaluate each model.
# For every candidate: run a 3-fold grid search (selected by R^2) on the
# training split, take the refitted best estimator, and score it on the
# held-out test split. Per-model metrics and predictions are collected in
# `results`, keyed by model name.
results = {}
for name, model_info in models.items():
    grid_search = GridSearchCV(
        model_info['model'],
        model_info['params'],
        cv=3,
        scoring='r2',
        n_jobs=-1,  # evaluate grid points in parallel; the scores are unchanged
    )
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_

    y_pred = best_model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    # NOTE: this "accuracy" is just R^2 expressed as a percentage. It is not a
    # classification accuracy and is negative whenever R^2 is negative.
    accuracy = r2 * 100

    results[name] = {
        'model': best_model,
        'mse': mse,
        'r2': r2,
        'accuracy': accuracy,
        'y_pred': y_pred
    }

    print(f'{name}:')
    print(f'  Mean Squared Error: {mse}')
    print(f'  R-squared: {r2}')
    print(f'  Accuracy: {accuracy:.2f}%')
    print(f'  Best Hyperparameters: {grid_search.best_params_}')
    print()

# Plotting the results: side-by-side bar charts of test-set MSE and R-squared.
# Only two panels are drawn, so a 1x2 grid is used; a 2x2 grid would leave
# the lower half of the figure blank.
model_names = list(models)  # concrete list so the bars get a stable categorical axis
plt.figure(figsize=(14, 8))

# Mean Squared Error
plt.subplot(1, 2, 1)
mse_values = [results[name]['mse'] for name in model_names]
plt.bar(model_names, mse_values)
plt.ylabel('Mean Squared Error')
plt.title('Mean Squared Error of Different Models')
# Right-align rotated labels so each one ends under its own bar
plt.xticks(rotation=45, ha='right')

# R-squared
plt.subplot(1, 2, 2)
r2_values = [results[name]['r2'] for name in model_names]
plt.bar(model_names, r2_values)
plt.ylabel('R-squared')
plt.title('R-squared of Different Models')
plt.xticks(rotation=45, ha='right')

plt.tight_layout()
plt.show()

# Plot actual vs predicted demand for each model, one scatter panel per model.
# The grid height is derived from the number of models, so adding models
# cannot overflow a fixed grid; with seven models this is the same 3x3 layout
# and 15x10 figure as a hard-coded grid.
n_cols = 3
n_rows = max(1, (len(results) + n_cols - 1) // n_cols)  # ceiling division
plt.figure(figsize=(15, 10 * n_rows / 3))

# End points of the y = x reference line; identical for every panel
demand_min, demand_max = y_test.min(), y_test.max()

for i, (name, result) in enumerate(results.items(), 1):
    plt.subplot(n_rows, n_cols, i)
    plt.scatter(y_test, result['y_pred'], alpha=0.7)
    # Dashed red diagonal marks perfect predictions
    plt.plot([demand_min, demand_max], [demand_min, demand_max], '--r')
    plt.xlabel('Actual Demand')
    plt.ylabel('Predicted Demand')
    plt.title(f'Actual vs Predicted Demand - {name}')
    plt.grid(True)

plt.tight_layout()
plt.show()