In [None]:
import os
import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
# Print the kernel's current working directory so relative-path assumptions can be checked.
print(Path.cwd()) 

# Scikit-learn Modules
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# XGBoost
import xgboost as xgb

# PyTorch Modules
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

/Users/abdurrahman/Documents/Data Bootcamp/Finals/Predicting_Telework_Hours/notebooks


In [None]:
print("Initial Working Directory:", Path.cwd())

# Resolve the project root. When the TELEWORK_PROJECT_ROOT environment variable
# is set (and non-empty) it overrides the machine-specific location below, so
# the notebook can run on another machine without editing this cell. When it is
# unset, the original hard-coded path is used exactly as before.
project_root = Path(
    os.environ.get("TELEWORK_PROJECT_ROOT")
    or "/Users/abdurrahman/Documents/Data Bootcamp/Finals/Predicting_Telework_Hours"
)

# Other cells load and save artifacts using bare relative filenames, so the
# working directory is switched to the project root here.
os.chdir(project_root)
print("New Working Directory:", os.getcwd())

# Set path to processed data
processed_data_path = project_root / "data" / "processed" / "merged_emp_df_encoded.csv"

# Fail early, naming the full path, if the processed CSV is missing
if not processed_data_path.exists():
    raise FileNotFoundError(f"Processed data file does not exist: {processed_data_path}")

# Load the processed data
merged_emp_df_encoded = pd.read_csv(processed_data_path)
print("Processed Data Loaded Successfully.")


Initial Working Directory: /Users/abdurrahman/Documents/Data Bootcamp/Finals/Predicting_Telework_Hours
New Working Directory: /Users/abdurrahman/Documents/Data Bootcamp/Finals/Predicting_Telework_Hours
Processed Data Loaded Successfully.


In [None]:
# Restore the persisted scaler object. The filename is relative, so it is
# resolved against the current working directory.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load artifacts produced by a trusted source.
scaler_path = Path('scaler_emp.pkl')
scaler_emp = joblib.load(scaler_path)
print("Scaler Loaded Successfully.")

Scaler Loaded Successfully.


In [None]:
# Name of the column the models are trained to predict
target_master = 'Avg_Weekly_Hours_Teleworked'

# Features are every column except the target; the target is kept as a Series
X_master = merged_emp_df_encoded.drop(columns=[target_master])
y_master = merged_emp_df_encoded[target_master]

# NOTE(review): every remaining column is used as a predictor -- confirm none
# of them is derived from the target, otherwise the scores below are inflated.

# Report the dimensions of the feature matrix and target vector
print(f"Features Shape: {X_master.shape}")
print(f"Target Shape: {y_master.shape}")

Features Shape: (74, 87)
Target Shape: (74,)


In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X_master,
    y_master,
    random_state=42,
    test_size=0.2,
)

# Report the size of each partition
print(f"Training Features Shape: {X_train.shape}")
print(f"Testing Features Shape: {X_test.shape}")
print(f"Training Target Shape: {y_train.shape}")
print(f"Testing Target Shape: {y_test.shape}")

Training Features Shape: (59, 87)
Testing Features Shape: (15, 87)
Training Target Shape: (59,)
Testing Target Shape: (15,)


In [None]:
from sklearn.ensemble import RandomForestRegressor

# Baseline model: a 100-tree random forest with a fixed seed
rf = RandomForestRegressor(random_state=42, n_estimators=100)

# Fit on the training split only
rf.fit(X_train, y_train)

# Score the held-out rows
y_pred_rf = rf.predict(X_test)
mse_rf, r2_rf = (
    mean_squared_error(y_test, y_pred_rf),
    r2_score(y_test, y_pred_rf),
)

print("Random Forest Regressor Performance:")
print(f"Mean Squared Error: {mse_rf:.4f}")
print(f"R-squared: {r2_rf:.4f}")

Random Forest Regressor Performance:
Mean Squared Error: 0.0146
R-squared: 0.9841


In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated R-squared of the baseline forest, computed on the
# training split so the test split stays untouched
cv_scores_rf = cross_val_score(
    estimator=rf,
    X=X_train,
    y=y_train,
    scoring='r2',
    cv=5,
)

print(f"Random Forest Cross-Validation R-squared Scores: {cv_scores_rf}")
print(f"Mean CV R-squared: {cv_scores_rf.mean():.4f}")
print(f"Standard Deviation of CV R-squared: {cv_scores_rf.std():.4f}")

Random Forest Cross-Validation R-squared Scores: [0.98938528 0.98866665 0.96165223 0.89102202 0.94960555]
Mean CV R-squared: 0.9561
Standard Deviation of CV R-squared: 0.0360


In [None]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter candidates for the forest: 3 * 4 * 3 * 3 * 2 = 216 combinations
param_grid_rf = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    bootstrap=[True, False],
)

# Exhaustive search scored by 5-fold R-squared, using all available cores.
# verbose=2 prints one line per fit, so expect a long log.
grid_search_rf = GridSearchCV(
    rf,
    param_grid_rf,
    scoring='r2',
    cv=5,
    n_jobs=-1,
    verbose=2,
)

# Run the search on the training split only
grid_search_rf.fit(X_train, y_train)

# Keep the estimator built from the best-scoring parameter combination
best_rf = grid_search_rf.best_estimator_

# Show which combination won
print("Best Parameters for Random Forest:", grid_search_rf.best_params_)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.2s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.2s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; tot

In [None]:
# Score the grid-search winner on the held-out test split
y_pred_best_rf = best_rf.predict(X_test)

mse_best_rf, r2_best_rf = (
    mean_squared_error(y_test, y_pred_best_rf),
    r2_score(y_test, y_pred_best_rf),
)

print("Tuned Random Forest Regressor Performance:")
print(f"Mean Squared Error: {mse_best_rf:.4f}")
print(f"R-squared: {r2_best_rf:.4f}")

Tuned Random Forest Regressor Performance:
Mean Squared Error: 0.0160
R-squared: 0.9825


In [None]:
# Persist the tuned forest; the relative filename is resolved against the
# current working directory.
joblib.dump(value=best_rf, filename='tuned_random_forest_telework_model.pkl')

print("Random Forest model saved successfully as 'tuned_random_forest_telework_model.pkl'.")

Random Forest model saved successfully as 'tuned_random_forest_telework_model.pkl'.


In [None]:
# Gradient-boosted trees with fixed hyperparameters (no search is run here)
xgb_reg = xgb.XGBRegressor(
    objective='reg:squarederror',
    random_state=42,
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
)

# Fit on the training split only
xgb_reg.fit(X_train, y_train)

# Score the held-out rows
y_pred_xgb = xgb_reg.predict(X_test)
mse_xgb, r2_xgb = (
    mean_squared_error(y_test, y_pred_xgb),
    r2_score(y_test, y_pred_xgb),
)

print("XGBoost Regressor Performance:")
print(f"Mean Squared Error: {mse_xgb:.4f}")
print(f"R-squared: {r2_xgb:.4f}")

XGBoost Regressor Performance:
Mean Squared Error: 0.0286
R-squared: 0.9688


In [None]:
# Persist the fitted XGBoost regressor itself. The object written must be the
# estimator (`xgb_reg`), not `y_pred_xgb`: the latter is only the array of
# test-set predictions, and a file containing it cannot be reloaded to predict.
# NOTE: the filename says "tuned", but this regressor was fitted with fixed
# hyperparameters; the name is kept so existing consumers of the file still work.
joblib.dump(xgb_reg, 'tuned_xgboost_telework_model.pkl')

print("XGBoost model saved successfully as 'tuned_xgboost_telework_model.pkl'.")


XGBoost model saved successfully as 'tuned_xgboost_telework_model.pkl'.


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

def to_tensor(data):
    """Convert tabular data to a float32 ``torch.Tensor``.

    Args:
        data: A pandas DataFrame/Series or a NumPy ndarray holding values that
            can be cast to float (numeric or boolean).

    Returns:
        A new ``torch.float32`` tensor with the same shape as ``data``.

    Raises:
        TypeError: If ``data`` is not a DataFrame, Series or ndarray.
        ValueError: If a value cannot be cast to float (e.g. free-text strings).
    """
    if isinstance(data, (pd.DataFrame, pd.Series)):
        # Cast before extracting the array: a frame that mixes bool columns
        # (e.g. one-hot flags) with numeric ones yields an object-dtype array
        # from `.values`, which torch.tensor cannot convert.
        array = data.astype(np.float32).to_numpy()
    elif isinstance(data, np.ndarray):
        array = data.astype(np.float32, copy=False)
    else:
        raise TypeError("Input data must be a pandas DataFrame/Series or a NumPy ndarray.")

    # torch.tensor always copies, so the result never aliases the input
    return torch.tensor(array, dtype=torch.float32)

# Build float32 tensors for both splits. Targets are reshaped into a single
# column so they match the one-value-per-row output of the network.
X_train_tensor, X_test_tensor = to_tensor(X_train), to_tensor(X_test)
y_train_tensor = to_tensor(y_train).view(-1, 1)
y_test_tensor = to_tensor(y_test).view(-1, 1)

# Pair each feature row with its target
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# Mini-batches of 32: training batches are reshuffled on every pass over the
# loader, test batches keep their original order
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)

# Define the Neural Network architecture
class TeleworkNet(nn.Module):
    """Fully connected regressor: input_dim -> 128 -> 64 -> 32 -> 1.

    A ReLU follows each of the three hidden linear layers; the final linear
    layer emits one unbounded value per input row.

    Args:
        input_dim: Number of input features per sample.
    """

    def __init__(self, input_dim):
        super().__init__()
        # Hidden layers, each paired with its own activation module
        self.fc1 = nn.Linear(input_dim, 128)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(128, 64)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(64, 32)
        self.relu3 = nn.ReLU()
        # Output layer: a single regression value, no activation
        self.fc4 = nn.Linear(32, 1)

    def forward(self, x):
        """Map a batch of feature rows to a batch of single-value predictions."""
        hidden = self.relu1(self.fc1(x))
        hidden = self.relu2(self.fc2(hidden))
        hidden = self.relu3(self.fc3(hidden))
        return self.fc4(hidden)

# Seed PyTorch's global RNG before the model is built so that the initial
# weights -- and the batch order drawn by the shuffling train_loader, which
# falls back to the global generator -- are the same on every run. 42 matches
# the random_state used for the scikit-learn and XGBoost models.
torch.manual_seed(42)

# Initialize the model with one input unit per feature column
input_dim = X_train.shape[1]
model = TeleworkNet(input_dim)

# Define the loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training parameters
num_epochs = 100

# Training loop
for epoch in range(num_epochs):
    model.train()
    epoch_losses = []
    batch_sizes = []

    for batch_X, batch_y in train_loader:
        # Zero the gradients left over from the previous step
        optimizer.zero_grad()

        # Forward pass
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        epoch_losses.append(loss.item())
        batch_sizes.append(batch_X.size(0))

    # MSELoss already averages within a batch, so weight each batch mean by its
    # row count: a plain mean over batches would over-weight a smaller final
    # batch and would not equal the per-sample MSE of the epoch.
    avg_loss = np.average(epoch_losses, weights=batch_sizes)

    # Print progress every 10 epochs
    if (epoch + 1) % 10 == 0:
        print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {avg_loss:.4f}")

# Evaluate the model on the test set (no gradient tracking needed)
model.eval()
test_losses = []
test_batch_sizes = []
with torch.no_grad():
    for batch_X, batch_y in test_loader:
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        test_losses.append(loss.item())
        test_batch_sizes.append(batch_X.size(0))

# Sample-weighted average so the figure is the true per-sample test MSE
avg_test_loss = np.average(test_losses, weights=test_batch_sizes)
print(f"Test Loss: {avg_test_loss:.4f}")


In [None]:
# Switch to inference mode before collecting predictions
model.eval()

# Run every test batch through the network once, keeping predictions and
# targets side by side as NumPy arrays
with torch.no_grad():
    batch_results = [
        (model(features).numpy(), targets.numpy())
        for features, targets in test_loader
    ]

# Flatten the per-batch column vectors into two 1-D arrays
predictions = np.array(
    [row for batch_preds, _ in batch_results for row in batch_preds]
).flatten()
actuals = np.array(
    [row for _, batch_targets in batch_results for row in batch_targets]
).flatten()

# Same metrics as reported for the tree-based models
mse_nn = mean_squared_error(actuals, predictions)
r2_nn = r2_score(actuals, predictions)

print("PyTorch Neural Network Regressor Performance:")
print(f"Mean Squared Error: {mse_nn:.4f}")
print(f"R-squared: {r2_nn:.4f}")
