In [45]:
import pandas as pd  # For data manipulation and analysis
import numpy as np  # For numerical operations
import matplotlib.pyplot as plt  # For data visualization
import seaborn as sns  # For data visualization
from sklearn.model_selection import train_test_split  # For splitting the data into training and testing sets
from sklearn.linear_model import LinearRegression  # For building the regression model
from sklearn.metrics import mean_squared_error, r2_score  # For evaluating the model
import statsmodels.api as sm  # For statistical modeling and hypothesis testing (optional)
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
import statsmodels.api as sm
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [46]:
# Locations of the processed train/test tables (file names suggest PCA-transformed
# features — presumably written by a separate preprocessing step; confirm there).
train_pca_path = '/Users/blairjdaniel/kaggle_comps/kaggle_predict_podcast/processed/train_pca.csv'
test_pca_path = '/Users/blairjdaniel/kaggle_comps/kaggle_predict_podcast/processed/test_pca.csv'

# Read both tables into memory; later cells use `train_df` for fitting/validation
# and `test_df` for the submission predictions.
train_df = pd.read_csv(train_pca_path)
test_df = pd.read_csv(test_pca_path)

In [61]:
# ---------------------------------------------------------------------------
# Fit and validate an XGBoost regressor on the training table.
# Tip: for a quick dry run, fit on train_df.sample(frac=0.1, random_state=42)
# instead of the full frame.
# ---------------------------------------------------------------------------

# Target is the listening time; every remaining column of train_df is a feature.
target_col = 'listening_time_minutes'
y = train_df[target_col]
X = train_df.drop(columns=[target_col])

# Hold out 20% of the rows for validation (seeded so the split is repeatable).
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=42,
)

# Hyperparameter grid. Each entry lists a single value, so the search below
# cross-validates exactly one configuration (see "1 candidates" in the log).
param_grid = {
    'n_estimators': [200],
    'learning_rate': [0.1],
    'max_depth': [7],
    'subsample': [1.0],
    'colsample_bytree': [0.8],
    'gamma': [5],
    'reg_alpha': [1],
    'reg_lambda': [1]
}

# Base estimator (seeded) wrapped in a 3-fold grid search scored by R².
xgb_model = XGBRegressor(random_state=42)
grid_search = GridSearchCV(
    xgb_model,
    param_grid,
    cv=3,           # 3-fold cross-validation
    scoring='r2',   # rank candidates by R²
    n_jobs=-1,      # run the CV fits on all available cores
    verbose=2,      # log one line per completed fit
)
grid_search.fit(X_train, y_train)

# Report the winning configuration and its mean cross-validated R².
print("Best Parameters:", grid_search.best_params_)
print("Best R² Score:", grid_search.best_score_)

# The estimator exposed by the search is what later cells use for prediction.
best_model = grid_search.best_estimator_

# Score the held-out validation rows.
y_pred = best_model.predict(X_val)
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)  # same units as the target (minutes)
r2 = r2_score(y_val, y_pred)

print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")

print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R² Score: {r2:.2f}")

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] END colsample_bytree=0.8, gamma=5, learning_rate=0.1, max_depth=7, n_estimators=200, reg_alpha=1, reg_lambda=1, subsample=1.0; total time=   9.5s
[CV] END colsample_bytree=0.8, gamma=5, learning_rate=0.1, max_depth=7, n_estimators=200, reg_alpha=1, reg_lambda=1, subsample=1.0; total time=   9.5s
[CV] END colsample_bytree=0.8, gamma=5, learning_rate=0.1, max_depth=7, n_estimators=200, reg_alpha=1, reg_lambda=1, subsample=1.0; total time=   9.6s
Best Parameters: {'colsample_bytree': 0.8, 'gamma': 5, 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200, 'reg_alpha': 1, 'reg_lambda': 1, 'subsample': 1.0}
Best R² Score: 0.7439595444873465
Root Mean Squared Error (RMSE): 13.40
Mean Squared Error (MSE): 179.50
R² Score: 0.75


In [64]:
# Score the test set with the tuned model and write the submission CSV.
# Relies on `best_model` and `X_train` created by the training cell.

# Load the test dataset
test_df = pd.read_csv('/Users/blairjdaniel/kaggle_comps/kaggle_predict_podcast/processed/test_pca.csv')

# Ensure the test dataset has the same features as the training dataset:
# take exactly the columns the model was fitted on, in the same order. This also
# discards the target (or any other extra column) if it is present, and fails
# early with a readable message instead of a feature-name mismatch inside predict().
feature_columns = list(X_train.columns)
missing_features = [col for col in feature_columns if col not in test_df.columns]
if missing_features:
    raise KeyError(
        f"test_pca.csv is missing features the model was trained on: {missing_features}"
    )
X_test = test_df[feature_columns]

# Predict listening_time_minutes using the trained model
test_predictions = best_model.predict(X_test)

# Create a DataFrame with 'id' and predicted 'listening_time_minutes'
# Assuming the original test.csv has an 'id' column
original_test_df = pd.read_csv('/Users/blairjdaniel/kaggle_comps/kaggle_predict_podcast/files/test.csv')

# Ids are paired with predictions purely by row position, so the two files must
# describe the same rows. NOTE(review): this only checks the row count — it assumes
# test_pca.csv preserved the row order of test.csv; confirm in the preprocessing step.
if len(original_test_df) != len(test_predictions):
    raise ValueError(
        f"Row count mismatch: test.csv has {len(original_test_df)} rows but "
        f"{len(test_predictions)} predictions were produced from test_pca.csv"
    )

output_df = pd.DataFrame({
    'id': original_test_df['id'],  # Use the 'id' column from the original test.csv
    'listening_time_minutes': test_predictions
})

# Save the predictions to a CSV file
output_df.to_csv('/Users/blairjdaniel/kaggle_comps/kaggle_predict_podcast/submissions/predictions.csv', index=False)

# Print a preview of the output
output_df.head()

Unnamed: 0,id,listening_time_minutes
0,750000,56.33868
1,750001,17.557547
2,750002,60.140491
3,750003,103.535515
4,750004,49.759602
