<a href="https://colab.research.google.com/github/blegried93/New-Projects/blob/main/XGBoost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os

# Set plotting style
plt.style.use("ggplot")

# Display options
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

print("Default setup loaded!")

In [None]:
from sklearn.model_selection import train_test_split as tts
from sklearn.preprocessing import LabelEncoder as le
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier as xgbC
from xgboost import XGBRegressor as xgbR




In [None]:
from sklearn.datasets import fetch_california_housing

data = fetch_california_housing()
df = pd.DataFrame(data.data, columns=data.feature_names)
df['MedHouseVal'] = data.target

df.head()


In [None]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error, r2_score # Import mean_squared_error and r2_score

# Function to train and save predictions
#Test_size is the complement of the training_size
def train_and_save_predictions(test_size):
    # Load data
    data = fetch_california_housing()
    df = pd.DataFrame(data.data, columns=data.feature_names)
    df['MedHouseVal'] = data.target

    # Split data
    X_train, X_test, y_train, y_test = tts(df.drop('MedHouseVal', axis=1), df['MedHouseVal'],
                                            test_size=test_size, random_state=42)

    # Train model
    model = xgb.XGBRegressor()
    model.fit(X_train, y_train)

    # Make predictions
    predictions = model.predict(X_test)
    comparison_df = pd.DataFrame({'Actual': y_test, 'Predicted': predictions})
    error1 = compute_error_metrics(y_test,predictions)

    # Create the 'data' directory if it doesn't exist
    output_dir = '../data'  # Directory for saving predictions
    os.makedirs(output_dir, exist_ok=True)

    # Save results
    filename = os.path.join(output_dir, f'predictions_testsize_{test_size}.csv')
    comparison_df.to_csv(filename, index=False)
    print(f"mse, rmse, r2 are {error1}")
    print(f"Predictions saved to {filename}")

if __name__ == "__main__":
    train_and_save_predictions(test_size)


In [80]:
#prints predictions from training
def print_predictions(test_size):
    filename = f'../data/predictions_testsize_{test_size}.csv'
    df = pd.read_csv(filename)
    print(f"\nPredictions for test_size={test_size}:\n")
    print(df.head())

if __name__ == "__main__":
    print_predictions(test_size)




Predictions for test_size=0.3:

    Actual  Predicted
0  0.47700   0.610555
1  0.45800   0.550802
2  5.00001   4.765771
3  2.18600   2.568251
4  2.78000   2.404391


In [None]:

def plot_results(test_size):
    filename = f'../data/predictions_testsize_{test_size}.csv'
    df = pd.read_csv(filename)

    # Plot
    plt.figure(figsize=(8, 6))
    plt.scatter(df['Actual'], df['Predicted'], alpha=0.5)
    plt.plot([df['Actual'].min(), df['Actual'].max()],
             [df['Actual'].min(), df['Actual'].max()], color='red')
    plt.xlabel('Actual Values')
    plt.ylabel('Predicted Values')
    plt.title(f'Actual vs Predicted Values (test_size={test_size})')
    plt.grid()

    # Save plot
    plt.show()
    plot_filename = f'../data/plot_testsize_{test_size}.png'
    plt.savefig(plot_filename)
    plt.close()
    print(f"Plot saved to {plot_filename}")

if __name__ == "__main__":
    plot_results(test_size)


In [93]:
def compute_error_metrics(y_true,y_pred):
  mse = mean_squared_error(y_true,y_pred)
  rmse = np.sqrt(mse)
  r2 = r2_score(y_true,y_pred)

  return mse, rmse, r2

In [None]:
import subprocess

#Driver file for the previous 3 steps.
def run_experiment(test_size):
    print(f"\nRunning experiment with test_size={test_size}...\n")
    train_and_save_predictions(test_size)
    print_predictions(test_size)
    plot_results(test_size)

if __name__ == "__main__":
    test_sizes = [0.1, 0.5, 0.9]
    for test_size in test_sizes:
        run_experiment(test_size)