In [2]:
import pandas as pd
import numpy as np

# Setting a random seed for reproducibility
np.random.seed(42)

# Generate sample scheduling data
num_entries = 1000
data = {
    'Task Type': np.random.choice(['Meeting', 'Exercise', 'Deep Work', 'Social', 'Errands'], num_entries),
    'Duration (hrs)': np.random.choice([0.5, 1, 1.5, 2, 3], num_entries),
    'Priority': np.random.choice(['High', 'Medium', 'Low'], num_entries),
    'Available Slot Start': pd.date_range(start="2023-01-01 08:00", periods=num_entries, freq='H'),
}

# Convert to DataFrame
df = pd.DataFrame(data)

# Calculate 'Available Slot End' based on start time and duration
df['Available Slot End'] = df['Available Slot Start'] + pd.to_timedelta(df['Duration (hrs)'], unit='h')

# Simulate 'Slot Suitability' based on duration and a random factor
df['Slot Suitability'] = np.random.choice(['Suitable', 'Too Short', 'Conflicting'], num_entries, p=[0.7, 0.2, 0.1])

# Determine if rescheduling is required ('Yes' for 'Too Short' and 'Conflicting' slots)
df['Rescheduling Required'] = df['Slot Suitability'].apply(lambda x: 'Yes' if x != 'Suitable' else 'No')

# Simulate 'Alternative Slot Suggested' for tasks that need rescheduling
# For simplicity, we'll add 3 hours to the original start time for the alternative slot
df['Alternative Slot Suggested'] = np.where(df['Rescheduling Required'] == 'Yes', df['Available Slot Start'] + pd.Timedelta(hours=3), pd.NaT)

# Arrange columns, moving the outputs to the last two columns
df = df[['Task Type', 'Duration (hrs)', 'Priority', 'Available Slot Start', 'Available Slot End', 'Slot Suitability', 'Rescheduling Required', 'Alternative Slot Suggested']]

# Saving the dataset to a CSV file
dataset_path = 'smart_scheduler_with_outputs.csv'
df.to_csv(dataset_path, index=False)

df.head()

Unnamed: 0,Task Type,Duration (hrs),Priority,Available Slot Start,Available Slot End,Slot Suitability,Rescheduling Required,Alternative Slot Suggested
0,Social,2.0,Low,2023-01-01 08:00:00,2023-01-01 10:00:00,Suitable,No,NaT
1,Errands,2.0,Low,2023-01-01 09:00:00,2023-01-01 11:00:00,Suitable,No,NaT
2,Deep Work,2.0,Medium,2023-01-01 10:00:00,2023-01-01 12:00:00,Suitable,No,NaT
3,Errands,2.0,High,2023-01-01 11:00:00,2023-01-01 13:00:00,Too Short,Yes,1672581600000000000
4,Errands,3.0,Medium,2023-01-01 12:00:00,2023-01-01 15:00:00,Suitable,No,NaT


In [14]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# Load the dataset
df = pd.read_csv(dataset_path)

# Preprocessing: Convert categorical variables to numeric
label_encoder = LabelEncoder()
df['Task Type'] = label_encoder.fit_transform(df['Task Type'])
df['Priority'] = label_encoder.fit_transform(df['Priority'])
df['Slot Suitability'] = label_encoder.fit_transform(df['Slot Suitability'])

# Prepare features and target
X = df.drop(['Rescheduling Required', 'Alternative Slot Suggested', 'Available Slot Start', 'Available Slot End'], axis=1)
y = label_encoder.fit_transform(df['Rescheduling Required'])

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Building the RandomForestClassifier model
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Predicting on the test set
y_pred = clf.predict(X_test)

# Evaluating the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Model accuracy: {accuracy:.2f}")

Model accuracy: 1.00


In [15]:
y_pred

array([0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1,
       0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1,
       0, 0])

In [12]:
pd.set_option('display.max_rows', df.shape[0]+1)
print(df)

     Task Type  Duration (hrs)  Priority Available Slot Start  \
0            4             2.0         1  2023-01-01 08:00:00   
1            1             2.0         1  2023-01-01 09:00:00   
2            0             2.0         2  2023-01-01 10:00:00   
3            1             2.0         0  2023-01-01 11:00:00   
4            1             3.0         2  2023-01-01 12:00:00   
5            2             3.0         0  2023-01-01 13:00:00   
6            0             1.0         0  2023-01-01 14:00:00   
7            0             0.5         0  2023-01-01 15:00:00   
8            0             2.0         2  2023-01-01 16:00:00   
9            1             1.0         0  2023-01-01 17:00:00   
10           4             0.5         2  2023-01-01 18:00:00   
11           0             1.5         1  2023-01-01 19:00:00   
12           1             3.0         1  2023-01-01 20:00:00   
13           2             0.5         0  2023-01-01 21:00:00   
14           4           

In [13]:
start_times = pd.date_range(start="2023-01-01 08:00", end="2023-01-01 18:00", freq='H').time

# Apply adjustments to 'Alternative Slot Suggested'
df['Alternative Slot Suggested'] = df.apply(lambda row: 
    (pd.Timestamp(row['Available Slot Start']).replace(hour=np.random.choice(start_times).hour, minute=0) + pd.Timedelta(hours=3))
    if row['Rescheduling Required'] == 'Yes' else pd.NaT, axis=1)

# Check for overflow and adjust to the next day if necessary
df['Alternative Slot Suggested'] = df['Alternative Slot Suggested'].apply(
    lambda x: x if pd.isnull(x) or x.hour <= 18 else x + pd.Timedelta(days=1) - pd.Timedelta(hours=x.hour - 8)
)

# Save the adjusted dataset to a new CSV file
adjusted_dataset_path = 'smart_scheduler_adjusted_outputs.csv'
df.to_csv(adjusted_dataset_path, index=False)

df[['Available Slot Start', 'Rescheduling Required', 'Alternative Slot Suggested']].head()

Unnamed: 0,Available Slot Start,Rescheduling Required,Alternative Slot Suggested
0,2023-01-01 08:00:00,No,NaT
1,2023-01-01 09:00:00,No,NaT
2,2023-01-01 10:00:00,No,NaT
3,2023-01-01 11:00:00,Yes,2023-01-01 11:00:00
4,2023-01-01 12:00:00,No,NaT


In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

# Load the dataset from the CSV file
df = pd.read_csv('slot_suitability_regression_dataset.csv')

# Display the first few rows of the dataset
df.head()

Unnamed: 0,Task Type,Duration (hrs),Priority,Available Slot Start,Available Slot End,Slot Score
0,Social,2.0,Low,2023-01-01 08:00:00,2023-01-01 10:00:00,0.60709
1,Errands,2.0,Low,2023-01-01 09:00:00,2023-01-01 11:00:00,0.631308
2,Deep Work,2.0,Medium,2023-01-01 10:00:00,2023-01-01 12:00:00,0.591697
3,Errands,2.0,High,2023-01-01 11:00:00,2023-01-01 13:00:00,0.702634
4,Errands,3.0,Medium,2023-01-01 12:00:00,2023-01-01 15:00:00,0.237433


In [18]:
# Convert categorical variables to numeric using LabelEncoder
label_encoder = LabelEncoder()
df['Task Type'] = label_encoder.fit_transform(df['Task Type'])
df['Priority'] = label_encoder.fit_transform(df['Priority'])

# Convert 'Available Slot Start' and 'Available Slot End' to numeric by extracting hours
df['Slot Start Hour'] = pd.to_datetime(df['Available Slot Start']).dt.hour
df['Slot End Hour'] = pd.to_datetime(df['Available Slot End']).dt.hour

# Prepare the features (X) and the target variable (y)
X = df[['Task Type', 'Duration (hrs)', 'Priority', 'Slot Start Hour', 'Slot End Hour']]
y = df['Slot Score']

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [20]:
regressor = RandomForestRegressor(n_estimators=100, random_state=42)

# Fit the model on the training data
regressor.fit(X_train, y_train)

# Predict the 'Slot Score' on the test set
y_pred = regressor.predict(X_test)

In [21]:
mse = mean_squared_error(y_test, y_pred)
print(f"Model MSE: {mse:.2f}")

Model MSE: 0.10


In [22]:
print(regressor.get_params())

{'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': 1.0, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}


In [23]:
print(regressor.score(X_test, y_test))

-0.19907295891437804


In [None]:
#Choice 2 (Prefered IMO)
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

# Load the enhanced dataset
df = pd.read_csv('enhanced_slot_suitability_regression_dataset.csv')

# Prepare features and target variable
X = df.drop(['Slot Score', 'Available Slot Start'], axis=1)  # Dropping 'Available Slot Start' as it's not used directly
y = df['Slot Score']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Hyperparameter Tuning with GridSearchCV
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [3, 4],
    'learning_rate': [0.05, 0.1]
}
grid_search = GridSearchCV(estimator=GradientBoostingRegressor(random_state=42), param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# Best model
best_model = grid_search.best_estimator_

# Predictions and evaluation
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Enhanced model MSE: {mse:.2f}")


In [26]:
print(best_model.score(X_test, y_test))

-0.04106912527689888
