Model 1: Random Forest

model training

PREPROCESSING

In [2]:
from sklearn.preprocessing import MultiLabelBinarizer

def preprocess(df):
    df = df.copy()

    # One-hot encode employee_skills
    mlb_skills = MultiLabelBinarizer()
    skills_encoded = mlb_skills.fit_transform(df['employee_skills'])
    skills_df = pd.DataFrame(skills_encoded, columns=[f"skill_{s}" for s in mlb_skills.classes_])

    # One-hot encode employee_availability (days 0–6)
    mlb_avail = MultiLabelBinarizer()
    avail_encoded = mlb_avail.fit_transform(df['employee_availability'])
    avail_df = pd.DataFrame(avail_encoded, columns=[f"avail_day_{d}" for d in mlb_avail.classes_])

    # One-hot encode task_required_skills
    mlb_task_skills = MultiLabelBinarizer()
    task_skills_encoded = mlb_task_skills.fit_transform(df['task_required_skills'])
    task_skills_df = pd.DataFrame(task_skills_encoded, columns=[f"task_skill_{s}" for s in mlb_task_skills.classes_])

    # Encode task_priority (low, medium, high)
    df['task_priority'] = df['task_priority'].map({'low': 0, 'medium': 1, 'high': 2})

    # Combine everything
    final_df = pd.concat([
        skills_df,
        avail_df,
        task_skills_df,
        df[['task_priority', 'task_duration_days', 'task_start_day', 'rule_violated']]
    ], axis=1)

    return final_df


UPSAMPLE CLASS 1

In [19]:
from sklearn.utils import resample

# Split into majority and minority
df_majority = df[df.assignment_valid == 0]
df_minority = df[df.assignment_valid == 1]

# Upsample the minority class
df_minority_upsampled = resample(
    df_minority,
    replace=True,                      # Allow duplicates
    n_samples=len(df_majority),        # Make it same size as majority
    random_state=42
)

# Combine both
df_balanced = pd.concat([df_majority, df_minority_upsampled])

# Shuffle so they’re mixed well
df_balanced = df_balanced.sample(frac=1).reset_index(drop=True)


In [18]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder

# Step 1: Load your JSON data
df = pd.read_json(r'C:\Users\ashwi\OneDrive\Desktop\Sycamore\Hackathons\HackToFuture sjec\ML part\scheduling_dataset.json')  # Replace with your path

# Step 2: Preprocessing function
def preprocess(df):
    df = df.copy()

    # Encode skills using MultiLabelBinarizer
    mlb_skills = MultiLabelBinarizer()
    employee_skill_encoded = mlb_skills.fit_transform(df['employee_skills'])
    task_skill_encoded = mlb_skills.transform(df['task_required_skills'])

    # Encode availability (one-hot)
    mlb_avail = MultiLabelBinarizer()
    availability_encoded = mlb_avail.fit_transform(df['employee_availability'])

    # Encode priority using OneHotEncoder
    enc = OneHotEncoder(sparse_output=False)

    priority_encoded = enc.fit_transform(df[['task_priority']])

    # Stack all features into one final array
    features = pd.DataFrame(
        data = pd.concat([
            pd.DataFrame(employee_skill_encoded, columns=mlb_skills.classes_),
            pd.DataFrame(task_skill_encoded, columns=[f"task_{s}" for s in mlb_skills.classes_]),
            pd.DataFrame(availability_encoded, columns=[f"avail_{i}" for i in mlb_avail.classes_]),
            pd.DataFrame(priority_encoded, columns=enc.get_feature_names_out(['task_priority'])),

            df[['task_duration_days', 'task_start_day']]
        ], axis=1)
    )

    return features

# Step 3: Apply preprocessing
X = preprocess(df)
y = df['assignment_valid']

# Step 4: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 5: Train model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Step 6: Evaluate
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.95      1.00      0.97        95
           1       0.00      0.00      0.00         5

    accuracy                           0.95       100
   macro avg       0.47      0.50      0.49       100
weighted avg       0.90      0.95      0.93       100



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Full working version with unsampling

In [None]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder
from sklearn.utils import resample

# Step 1: Load data
with open(r'C:\Users\ashwi\OneDrive\Desktop\Sycamore\Hackathons\HackToFuture sjec\ML part\scheduling_dataset.json', 'r') as f:
    data = json.load(f)
df = pd.json_normalize(data)

# Step 2: Upsample to balance classes
df_majority = df[df.assignment_valid == 0]
df_minority = df[df.assignment_valid == 1]

df_minority_upsampled = resample(
    df_minority,
    replace=True,
    n_samples=len(df_majority),
    random_state=42
)

df = pd.concat([df_majority, df_minority_upsampled]).sample(frac=1).reset_index(drop=True)

# Step 3: Preprocessing
def preprocess(df):
    mlb_skills = MultiLabelBinarizer()
    mlb_avail = MultiLabelBinarizer()

    employee_skill_encoded = mlb_skills.fit_transform(df['employee_skills'])
    task_skill_encoded = mlb_skills.transform(df['task_required_skills'])
    availability_encoded = mlb_avail.fit_transform(df['employee_availability'])

    enc = OneHotEncoder(sparse_output=False)  # sklearn >= 1.2 uses sparse_output
    priority_encoded = enc.fit_transform(df[['task_priority']])

    features = pd.DataFrame(
        data = pd.concat([
            pd.DataFrame(employee_skill_encoded, columns=mlb_skills.classes_),
            pd.DataFrame(task_skill_encoded, columns=[f"task_{s}" for s in mlb_skills.classes_]),
            pd.DataFrame(availability_encoded, columns=[f"avail_{i}" for i in mlb_avail.classes_]),
            pd.DataFrame(priority_encoded, columns=enc.get_feature_names_out(['task_priority'])),
            df[['task_duration_days', 'task_start_day']]
        ], axis=1)
    )
    return features

X = preprocess(df)
y = df['assignment_valid']

# Step 4: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 5: Train model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Step 6: Evaluate
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       1.00      0.99      0.99        94
           1       0.99      1.00      0.99        96

    accuracy                           0.99       190
   macro avg       0.99      0.99      0.99       190
weighted avg       0.99      0.99      0.99       190



Model saving

#You're trying to save the encoders like mlb_skills, mlb_avail, and enc outside the preprocessing function, but they were defined inside it — so they’re not accessible from the outside right now.

Let’s fix that neatly and make sure you can reuse the encoders when loading the model later.

In [None]:
def preprocess(df):
    global mlb_skills, mlb_avail, enc  # <- THIS makes them accessible outside

    mlb_skills = MultiLabelBinarizer()
    employee_skill_encoded = mlb_skills.fit_transform(df['employee_skills'])
    task_skill_encoded = mlb_skills.transform(df['task_required_skills'])

    mlb_avail = MultiLabelBinarizer()
    availability_encoded = mlb_avail.fit_transform(df['employee_availability'])

    enc = OneHotEncoder(sparse_output=False)
    priority_encoded = enc.fit_transform(df[['task_priority']])

    features = pd.DataFrame(
        data = pd.concat([
            pd.DataFrame(employee_skill_encoded, columns=mlb_skills.classes_),
            pd.DataFrame(task_skill_encoded, columns=[f"task_{s}" for s in mlb_skills.classes_]),
            pd.DataFrame(availability_encoded, columns=[f"avail_{i}" for i in mlb_avail.classes_]),
            pd.DataFrame(priority_encoded, columns=enc.get_feature_names_out(['task_priority'])),
            df[['task_duration_days', 'task_start_day']]
        ], axis=1)
    )

    return features


In [24]:
X = preprocess(df)


In [25]:
import pickle

with open('smart_scheduler_model.pkl', 'wb') as f:
    pickle.dump(model, f)

with open('encoders.pkl', 'wb') as f:
    pickle.dump({
        'mlb_skills': mlb_skills,
        'mlb_avail': mlb_avail,
        'priority_enc': enc
    }, f)


In [3]:
# Improved preprocessing with additional steps
from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
import pandas as pd
import numpy as np

# Assuming df is your dataframe
# 1. Handle class imbalance with upsampling (your existing code)
df_majority = df[df.assignment_valid == 0]
df_minority = df[df.assignment_valid == 1]

# Upsample the minority class
df_minority_upsampled = resample(
    df_minority,
    replace=True,
    n_samples=len(df_majority),
    random_state=42
)

# Combine both
df_balanced = pd.concat([df_majority, df_minority_upsampled])

# Shuffle
df_balanced = df_balanced.sample(frac=1).reset_index(drop=True)

# 2. Split features and target
X = df_balanced.drop('assignment_valid', axis=1)
y = df_balanced['assignment_valid']

# 3. Identify numeric and categorical columns
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object', 'category']).columns

# 4. Create preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# 5. Create train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 6. Build pipeline with preprocessing and optimized Random Forest
rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(
        n_estimators=100,
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        max_features='sqrt',
        bootstrap=True,
        class_weight='balanced',
        random_state=42
    ))
])

# 7. Train the model
rf_pipeline.fit(X_train, y_train)

# 8. Evaluate
y_pred = rf_pipeline.predict(X_test)
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# 9. Get feature importance
feature_names = (
    numeric_features.tolist() + 
    list(rf_pipeline.named_steps['preprocessor']
        .named_transformers_['cat']
        .get_feature_names_out(categorical_features))
)
importances = rf_pipeline.named_steps['classifier'].feature_importances_
indices = np.argsort(importances)[::-1]

# Print feature ranking
print("\nFeature ranking:")
for f in range(min(20, len(feature_names))):  # Top 20 features
    print(f"{f+1}. {feature_names[indices[f]]} ({importances[indices[f]]})")

NameError: name 'df' is not defined