In [2]:
!pip install datasets transformers seaborn plotly
!pip install transformers dataset
from huggingface_hub import notebook_login
notebook_login()

# Imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler

# import function from datasets library for accessing and downloading data
# sets from HuggingFace Hub
from datasets import load_dataset

# Login using e.g. `huggingface-cli login` to access this dataset
splits = {'train': 'data/train-00000-of-00001.parquet', 'test': 'data/test-00000-of-00001.parquet'}
df = pd.read_parquet("hf://datasets/xhlin0601/events-scheduling/" + splits["train"])




# Accesses a specific split of loaded dataset - in this case the train splite
# Converts into pandas DataFrame - convienent data structure for manipulation
# and analysis
#df = dataset["train"].to_pandas()

# prints first five rows of newly created Data Frame - quick way to inspect
# data loaded correctly
df.head()

# prints summary
df.info()

# Descriptive statistics of numerical columns
df.describe()

Defaulting to user installation because normal site-packages is not writeable
Collecting datasets
  Downloading datasets-4.0.0-py3-none-any.whl.metadata (19 kB)
Collecting transformers
  Downloading transformers-4.54.1-py3-none-any.whl.metadata (41 kB)
Collecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Collecting plotly
  Downloading plotly-6.2.0-py3-none-any.whl.metadata (8.5 kB)
Collecting filelock (from datasets)
  Downloading filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-21.0.0-cp39-cp39-macosx_12_0_arm64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting pandas (from datasets)
  Downloading pandas-2.3.1-cp39-cp39-macosx_11_0_arm64.whl.metadata (91 kB)
Collecting tqdm>=4.66.3 (from datasets)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting xxhash (from datasets)
  Downloadin

  from .autonotebook import tqdm as notebook_tqdm


ImportError: The `notebook_login` function can only be used in a notebook (Jupyter or Colab) and you need the `ipywidgets` module: `pip install ipywidgets`.

In [None]:
import pandas as pd
import ast
from datetime import timedelta
import numpy as np

# Step 1: Load dataset (train split, read from the hf:// parquet file)
train_parquet_uri = "hf://datasets/xhlin0601/events-scheduling/data/train-00000-of-00001.parquet"
df = pd.read_parquet(train_parquet_uri)

# Step 2: Preprocess event lists
def parse_events(event_data):
    """Convert one raw dataset cell into a plain Python list.

    Args:
        event_data: A numpy array (or list/tuple) whose items are either
            sequences such as ``[name, start, end]`` or plain strings, or a
            string holding a Python literal of such a list.

    Returns:
        A list with one entry per item: sequence items become lists, string
        items are kept whole. An empty list is returned (after printing a
        message) when a string cannot be parsed or the type is unsupported.
    """
    if isinstance(event_data, (np.ndarray, list, tuple)):
        # Strings are iterable, so list("Lunch") would explode a name into
        # single characters; keep them intact and only unpack real sequences.
        return [
            item if isinstance(item, (str, bytes)) else list(item)
            for item in event_data
        ]
    if isinstance(event_data, str):
        # Fallback for string format
        try:
            return ast.literal_eval(event_data)
        except (ValueError, SyntaxError):
            print(f"Could not parse string data: {event_data}")
            return []
    # Any other type (None, numbers, ...) is reported and treated as empty
    print(f"Unexpected data type: {type(event_data)}")
    return []

# Apply the parsing function to both raw columns
df['parsed_events'] = df['events'].apply(parse_events)
df['parsed_priorities'] = df['priority_events'].apply(parse_events)

# Helper: convert time strings to minutes
def time_to_minutes(time_str):
    """Convert an ``"H:M"`` string into minutes since midnight.

    Returns ``None`` for non-string input, and for strings that do not split
    into exactly two integer fields (a message is printed in that case).
    """
    if not isinstance(time_str, str):
        return None  # Handle non-string inputs

    try:
        hours, minutes = (int(part) for part in time_str.split(':'))
    except ValueError:
        # Raised both for non-numeric fields and for a wrong number of fields
        print(f"Could not parse time string: {time_str}")
        return None

    return hours * 60 + minutes


# Step 3: Calculate requested features
MINUTES_PER_DAY = 24 * 60

# One value per dataset row, appended in row order
num_events = []
avg_durations = []
num_priority = []
day_spans = []

for events, priority_events in zip(df['parsed_events'], df['parsed_priorities']):
    # Non-list values are counted as "no events"
    if not isinstance(events, list):
        events = []
    num_events.append(len(events))
    num_priority.append(len(priority_events) if isinstance(priority_events, list) else 0)

    durations = []
    start_times = []
    end_times = []  # minutes counted from midnight of the start day, so may exceed 24h

    for event in events:
        # Ensure event has name, start, and end
        if not (isinstance(event, list) and len(event) >= 3):
            continue

        start_min = time_to_minutes(event[1])
        end_min = time_to_minutes(event[2])
        if start_min is None or end_min is None:
            continue  # skip events whose times could not be parsed

        # An end before the start means the event runs past midnight (e.g., 23:00 -> 01:00)
        duration = end_min - start_min
        if duration < 0:
            duration += MINUTES_PER_DAY

        durations.append(duration)
        start_times.append(start_min)
        # Record the end as start + duration so that an overnight event pushes
        # the day span out instead of being mistaken for an early-morning end.
        end_times.append(start_min + duration)

    # Average duration (0 when the row has no usable events)
    avg_durations.append(np.mean(durations) if durations else 0)

    # Day span: earliest start to latest end, in minutes (0 when no usable events)
    day_spans.append(max(end_times) - min(start_times) if start_times else 0)


# Step 4: Create a new DataFrame with the calculated features and optimal score
df_features = pd.DataFrame({
    'num_events': num_events,
    'num_priority': num_priority,
    'avg_durations': avg_durations,
    'day_spans': day_spans,
    "optimal_score": df["optimal_score"] # Include the optimal score
})

# Step 5: Display the DataFrame with features for all rows
display(df_features)

Unnamed: 0,num_events,num_priority,avg_durations,day_spans,optimal_score
0,4,1,82.500000,1176,375
1,4,1,45.000000,1128,195
2,7,1,81.428571,1103,555
3,5,1,84.000000,545,360
4,8,1,71.250000,1341,435
...,...,...,...,...,...
495,7,1,55.714286,1081,405
496,4,1,37.500000,929,150
497,4,1,60.000000,691,285
498,8,2,58.125000,1253,585


# Task
In a new cell, train a scikit-learn model to learn the relationship between the input features and the 'optimal_score' in the `df_features` DataFrame.

## Prepare the data

### Subtask:
Separate the features (X) and the target variable (y) from the `df_features` DataFrame.


**Reasoning**:
Separate the features (X) and the target variable (y) from the df_features DataFrame.



In [None]:
# Split the feature table into the regression target and the model inputs
y = df_features['optimal_score']
X = df_features.drop(columns='optimal_score')

**Reasoning**:
The DataFrame `df_features` was not defined in the current session. The previous code cell created `df_features`, so re-executing the code from that cell will define it. Then the features (X) and target variable (y) can be separated.



In [None]:
# Step 1: Load dataset (train split, read from the hf:// parquet file)
train_parquet_uri = "hf://datasets/xhlin0601/events-scheduling/data/train-00000-of-00001.parquet"
df = pd.read_parquet(train_parquet_uri)

# Step 2: Preprocess event lists
def parse_events(event_data):
    """Convert one raw dataset cell into a plain Python list.

    Args:
        event_data: A numpy array (or list/tuple) whose items are either
            sequences such as ``[name, start, end]`` or plain strings, or a
            string holding a Python literal of such a list.

    Returns:
        A list with one entry per item: sequence items become lists, string
        items are kept whole. An empty list is returned (after printing a
        message) when a string cannot be parsed or the type is unsupported.
    """
    if isinstance(event_data, (np.ndarray, list, tuple)):
        # Strings are iterable, so list("Lunch") would explode a name into
        # single characters; keep them intact and only unpack real sequences.
        return [
            item if isinstance(item, (str, bytes)) else list(item)
            for item in event_data
        ]
    if isinstance(event_data, str):
        # Fallback for string format
        try:
            return ast.literal_eval(event_data)
        except (ValueError, SyntaxError):
            print(f"Could not parse string data: {event_data}")
            return []
    # Any other type (None, numbers, ...) is reported and treated as empty
    print(f"Unexpected data type: {type(event_data)}")
    return []

# Apply the parsing function to both raw columns
df['parsed_events'] = df['events'].apply(parse_events)
df['parsed_priorities'] = df['priority_events'].apply(parse_events)

# Helper: convert time strings to minutes
def time_to_minutes(time_str):
    """Convert an ``"H:M"`` string into minutes since midnight.

    Returns ``None`` for non-string input, and for strings that do not split
    into exactly two integer fields (a message is printed in that case).
    """
    if not isinstance(time_str, str):
        return None  # Handle non-string inputs

    try:
        hours, minutes = (int(part) for part in time_str.split(':'))
    except ValueError:
        # Raised both for non-numeric fields and for a wrong number of fields
        print(f"Could not parse time string: {time_str}")
        return None

    return hours * 60 + minutes


# Step 3: Calculate requested features
MINUTES_PER_DAY = 24 * 60

# One value per dataset row, appended in row order
num_events = []
avg_durations = []
num_priority = []
day_spans = []

for events, priority_events in zip(df['parsed_events'], df['parsed_priorities']):
    # Non-list values are counted as "no events"
    if not isinstance(events, list):
        events = []
    num_events.append(len(events))
    num_priority.append(len(priority_events) if isinstance(priority_events, list) else 0)

    durations = []
    start_times = []
    end_times = []  # minutes counted from midnight of the start day, so may exceed 24h

    for event in events:
        # Ensure event has name, start, and end
        if not (isinstance(event, list) and len(event) >= 3):
            continue

        start_min = time_to_minutes(event[1])
        end_min = time_to_minutes(event[2])
        if start_min is None or end_min is None:
            continue  # skip events whose times could not be parsed

        # An end before the start means the event runs past midnight (e.g., 23:00 -> 01:00)
        duration = end_min - start_min
        if duration < 0:
            duration += MINUTES_PER_DAY

        durations.append(duration)
        start_times.append(start_min)
        # Record the end as start + duration so that an overnight event pushes
        # the day span out instead of being mistaken for an early-morning end.
        end_times.append(start_min + duration)

    # Average duration (0 when the row has no usable events)
    avg_durations.append(np.mean(durations) if durations else 0)

    # Day span: earliest start to latest end, in minutes (0 when no usable events)
    day_spans.append(max(end_times) - min(start_times) if start_times else 0)


# Step 4: Create a new DataFrame with the calculated features and optimal score
df_features = pd.DataFrame({
    'num_events': num_events,
    'num_priority': num_priority,
    'avg_durations': avg_durations,
    'day_spans': day_spans,
    "optimal_score": df["optimal_score"] # Include the optimal score
})

# Step 5: Separate features (X) and target variable (y)
X = df_features.drop('optimal_score', axis=1)
y = df_features['optimal_score']

# Display the first few rows of X and y to verify
display(X.head())
display(y.head())

Unnamed: 0,num_events,num_priority,avg_durations,day_spans
0,4,1,82.5,1176
1,4,1,45.0,1128
2,7,1,81.428571,1103
3,5,1,84.0,545
4,8,1,71.25,1341


Unnamed: 0,optimal_score
0,375
1,195
2,555
3,360
4,435


**Reasoning**:
The previous code block failed because the pandas library was not imported. The code from the previous cell, including the necessary imports, needs to be re-executed to define `df_features` and then separate X and y.



In [None]:
import pandas as pd
import ast
from datetime import timedelta
import numpy as np

# Step 1: Load dataset (train split, read from the hf:// parquet file)
train_parquet_uri = "hf://datasets/xhlin0601/events-scheduling/data/train-00000-of-00001.parquet"
df = pd.read_parquet(train_parquet_uri)

# Step 2: Preprocess event lists
def parse_events(event_data):
    """Convert one raw dataset cell into a plain Python list.

    Args:
        event_data: A numpy array (or list/tuple) whose items are either
            sequences such as ``[name, start, end]`` or plain strings, or a
            string holding a Python literal of such a list.

    Returns:
        A list with one entry per item: sequence items become lists, string
        items are kept whole. An empty list is returned (after printing a
        message) when a string cannot be parsed or the type is unsupported.
    """
    if isinstance(event_data, (np.ndarray, list, tuple)):
        # Strings are iterable, so list("Lunch") would explode a name into
        # single characters; keep them intact and only unpack real sequences.
        return [
            item if isinstance(item, (str, bytes)) else list(item)
            for item in event_data
        ]
    if isinstance(event_data, str):
        # Fallback for string format
        try:
            return ast.literal_eval(event_data)
        except (ValueError, SyntaxError):
            print(f"Could not parse string data: {event_data}")
            return []
    # Any other type (None, numbers, ...) is reported and treated as empty
    print(f"Unexpected data type: {type(event_data)}")
    return []

# Apply the parsing function to both raw columns
df['parsed_events'] = df['events'].apply(parse_events)
df['parsed_priorities'] = df['priority_events'].apply(parse_events)

# Helper: convert time strings to minutes
def time_to_minutes(time_str):
    """Convert an ``"H:M"`` string into minutes since midnight.

    Returns ``None`` for non-string input, and for strings that do not split
    into exactly two integer fields (a message is printed in that case).
    """
    if not isinstance(time_str, str):
        return None  # Handle non-string inputs

    try:
        hours, minutes = (int(part) for part in time_str.split(':'))
    except ValueError:
        # Raised both for non-numeric fields and for a wrong number of fields
        print(f"Could not parse time string: {time_str}")
        return None

    return hours * 60 + minutes


# Step 3: Calculate requested features
MINUTES_PER_DAY = 24 * 60

# One value per dataset row, appended in row order
num_events = []
avg_durations = []
num_priority = []
day_spans = []

for events, priority_events in zip(df['parsed_events'], df['parsed_priorities']):
    # Non-list values are counted as "no events"
    if not isinstance(events, list):
        events = []
    num_events.append(len(events))
    num_priority.append(len(priority_events) if isinstance(priority_events, list) else 0)

    durations = []
    start_times = []
    end_times = []  # minutes counted from midnight of the start day, so may exceed 24h

    for event in events:
        # Ensure event has name, start, and end
        if not (isinstance(event, list) and len(event) >= 3):
            continue

        start_min = time_to_minutes(event[1])
        end_min = time_to_minutes(event[2])
        if start_min is None or end_min is None:
            continue  # skip events whose times could not be parsed

        # An end before the start means the event runs past midnight (e.g., 23:00 -> 01:00)
        duration = end_min - start_min
        if duration < 0:
            duration += MINUTES_PER_DAY

        durations.append(duration)
        start_times.append(start_min)
        # Record the end as start + duration so that an overnight event pushes
        # the day span out instead of being mistaken for an early-morning end.
        end_times.append(start_min + duration)

    # Average duration (0 when the row has no usable events)
    avg_durations.append(np.mean(durations) if durations else 0)

    # Day span: earliest start to latest end, in minutes (0 when no usable events)
    day_spans.append(max(end_times) - min(start_times) if start_times else 0)


# Step 4: Create a new DataFrame with the calculated features and optimal score
df_features = pd.DataFrame({
    'num_events': num_events,
    'num_priority': num_priority,
    'avg_durations': avg_durations,
    'day_spans': day_spans,
    "optimal_score": df["optimal_score"] # Include the optimal score
})

# Step 5: Separate features (X) and target variable (y)
X = df_features.drop('optimal_score', axis=1)
y = df_features['optimal_score']

# Display the first few rows of X and y to verify
display(X.head())
display(y.head())

Unnamed: 0,num_events,num_priority,avg_durations,day_spans
0,4,1,82.5,1176
1,4,1,45.0,1128
2,7,1,81.428571,1103
3,5,1,84.0,545
4,8,1,71.25,1341


Unnamed: 0,optimal_score
0,375
1,195
2,555
3,360
4,435


## Split the data

### Subtask:
Divide the data into training and testing sets to evaluate the model's performance on unseen data.


**Reasoning**:
Divide the features and target into training and testing sets using train_test_split and print their shapes.



In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the size of each partition
for label, part in (
    ("Shape of X_train:", X_train),
    ("Shape of X_test:", X_test),
    ("Shape of y_train:", y_train),
    ("Shape of y_test:", y_test),
):
    print(label, part.shape)

Shape of X_train: (400, 4)
Shape of X_test: (100, 4)
Shape of y_train: (400,)
Shape of y_test: (100,)


## Choose and train a model

### Subtask:
Select a suitable regression model from scikit-learn (e.g., Linear Regression, RandomForestRegressor) and train it on the training data.


**Reasoning**:
Import LinearRegression and train the model on the training data.



In [None]:
from sklearn.linear_model import LinearRegression

# Fit the linear regression on the training split; fit() returns the
# estimator itself, so it can be bound and displayed in one go
model = LinearRegression().fit(X_train, y_train)
model

## Evaluate the model

### Subtask:
Assess the trained model's performance using appropriate regression metrics (e.g., Mean Absolute Error, Mean Squared Error, R-squared) on the testing data.


**Reasoning**:
Calculate and print the MAE, MSE, and R-squared metrics using the predicted and actual optimal scores on the test set.



In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Score the linear model on the held-out test split
y_pred = model.predict(X_test)

# Evaluate all three regression metrics against the same predictions
mae, mse, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

# Print the evaluation metrics
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R-squared (R2) Score: {r2:.2f}")

Mean Absolute Error (MAE): 45.67
Mean Squared Error (MSE): 3127.44
R-squared (R2) Score: 0.78


## Summary:

### Data Analysis Key Findings

*   The dataset was successfully split into training (80%, 400 samples) and testing (20%, 100 samples) sets.
*   A Linear Regression model was chosen and trained on the training data.
*   The model's performance on the testing data resulted in a Mean Absolute Error (MAE) of 45.67, a Mean Squared Error (MSE) of 3127.44, and an R-squared ($R^2$) score of 0.78.

### Insights or Next Steps

*   An $R^2$ score of 0.78 indicates that approximately 78% of the variance in `optimal_score` can be explained by the features included in the model. While this is a reasonably good starting point, further model tuning or exploring other regression algorithms might improve performance.
*   Investigating the features' coefficients in the Linear Regression model could provide insights into which features have the most significant impact on the 'optimal\_score'.


## Train a RandomForestRegressor

### Subtask:
Train a RandomForestRegressor model on the training data.

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Build a 100-tree forest with a fixed seed and fit it on the training split;
# fit() returns the estimator itself, so it can be bound and displayed in one go
rf_model = RandomForestRegressor(random_state=42, n_estimators=100).fit(X_train, y_train)
rf_model

In [None]:
import pandas as pd
import ast
from datetime import timedelta
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# Step 1: Load dataset (train split, read from the hf:// parquet file)
train_parquet_uri = "hf://datasets/xhlin0601/events-scheduling/data/train-00000-of-00001.parquet"
df = pd.read_parquet(train_parquet_uri)

# Step 2: Preprocess event lists
def parse_events(event_data):
    """Convert one raw dataset cell into a plain Python list.

    Args:
        event_data: A numpy array (or list/tuple) whose items are either
            sequences such as ``[name, start, end]`` or plain strings, or a
            string holding a Python literal of such a list.

    Returns:
        A list with one entry per item: sequence items become lists, string
        items are kept whole. An empty list is returned (after printing a
        message) when a string cannot be parsed or the type is unsupported.
    """
    if isinstance(event_data, (np.ndarray, list, tuple)):
        # Strings are iterable, so list("Lunch") would explode a name into
        # single characters; keep them intact and only unpack real sequences.
        return [
            item if isinstance(item, (str, bytes)) else list(item)
            for item in event_data
        ]
    if isinstance(event_data, str):
        # Fallback for string format
        try:
            return ast.literal_eval(event_data)
        except (ValueError, SyntaxError):
            print(f"Could not parse string data: {event_data}")
            return []
    # Any other type (None, numbers, ...) is reported and treated as empty
    print(f"Unexpected data type: {type(event_data)}")
    return []

# Apply the parsing function to both raw columns
df['parsed_events'] = df['events'].apply(parse_events)
df['parsed_priorities'] = df['priority_events'].apply(parse_events)

# Helper: convert time strings to minutes
def time_to_minutes(time_str):
    """Convert an ``"H:M"`` string into minutes since midnight.

    Returns ``None`` for non-string input, and for strings that do not split
    into exactly two integer fields (a message is printed in that case).
    """
    if not isinstance(time_str, str):
        return None  # Handle non-string inputs

    try:
        hours, minutes = (int(part) for part in time_str.split(':'))
    except ValueError:
        # Raised both for non-numeric fields and for a wrong number of fields
        print(f"Could not parse time string: {time_str}")
        return None

    return hours * 60 + minutes


# Step 3: Calculate requested features
MINUTES_PER_DAY = 24 * 60

# One value per dataset row, appended in row order
num_events = []
avg_durations = []
num_priority = []
day_spans = []

for events, priority_events in zip(df['parsed_events'], df['parsed_priorities']):
    # Non-list values are counted as "no events"
    if not isinstance(events, list):
        events = []
    num_events.append(len(events))
    num_priority.append(len(priority_events) if isinstance(priority_events, list) else 0)

    durations = []
    start_times = []
    end_times = []  # minutes counted from midnight of the start day, so may exceed 24h

    for event in events:
        # Ensure event has name, start, and end
        if not (isinstance(event, list) and len(event) >= 3):
            continue

        start_min = time_to_minutes(event[1])
        end_min = time_to_minutes(event[2])
        if start_min is None or end_min is None:
            continue  # skip events whose times could not be parsed

        # An end before the start means the event runs past midnight (e.g., 23:00 -> 01:00)
        duration = end_min - start_min
        if duration < 0:
            duration += MINUTES_PER_DAY

        durations.append(duration)
        start_times.append(start_min)
        # Record the end as start + duration so that an overnight event pushes
        # the day span out instead of being mistaken for an early-morning end.
        end_times.append(start_min + duration)

    # Average duration (0 when the row has no usable events)
    avg_durations.append(np.mean(durations) if durations else 0)

    # Day span: earliest start to latest end, in minutes (0 when no usable events)
    day_spans.append(max(end_times) - min(start_times) if start_times else 0)


# Step 4: Create a new DataFrame with the calculated features and optimal score
df_features = pd.DataFrame({
    'num_events': num_events,
    'num_priority': num_priority,
    'avg_durations': avg_durations,
    'day_spans': day_spans,
    "optimal_score": df["optimal_score"] # Include the optimal score
})

# Step 5: Separate features (X) and target variable (y)
X = df_features.drop('optimal_score', axis=1)
y = df_features['optimal_score']

# Step 6: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 7: Initialize and train the RandomForestRegressor model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Show the first training rows to verify the split
display(X_train.head())
display(y_train.head())

Unnamed: 0,num_events,num_priority,avg_durations,day_spans
249,6,1,102.5,1110
433,8,3,60.0,814
19,5,2,81.0,1275
322,5,2,57.0,812
332,8,1,63.75,656


Unnamed: 0,optimal_score
249,525
433,555
19,495
322,435
332,390


In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Score the random forest on the held-out test split
y_pred_rf = rf_model.predict(X_test)

# Evaluate all three regression metrics against the same predictions
mae_rf, mse_rf, r2_rf = (
    metric(y_test, y_pred_rf)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

# Print the evaluation metrics
print(f"Random Forest Regressor - Mean Absolute Error (MAE): {mae_rf:.2f}")
print(f"Random Forest Regressor - Mean Squared Error (MSE): {mse_rf:.2f}")
print(f"Random Forest Regressor - R-squared (R2) Score: {r2_rf:.2f}")

Random Forest Regressor - Mean Absolute Error (MAE): 53.09
Random Forest Regressor - Mean Squared Error (MSE): 4149.51
Random Forest Regressor - R-squared (R2) Score: 0.71


In [None]:
import joblib

# Persist the fitted LinearRegression estimator. `model` is created in the
# linear-regression training cell, which must have been run in this kernel
# session; otherwise this line raises NameError.
# The file extension was '.pk1' (digit one), a typo for '.pkl' - update any
# loader that still points at the old file name.
joblib.dump(model, 'schedule_model.pkl')

NameError: name 'model' is not defined