In [1]:
# Import necessary libraries
import ast

import joblib
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeRegressor

## Load the dataset

In [2]:
# Location of the raw schedule CSV, relative to this notebook (adjust path if necessary).
data_path = "../data/raw/project_schedule_data.csv"

# Read the raw task records into the working DataFrame used by every later cell.
df = pd.read_csv(filepath_or_buffer=data_path)

In [3]:
# Preview the first rows of the freshly loaded data to sanity-check the columns.
df.head()

Unnamed: 0,Task ID,Team Size,Resource Availability,Complexity,Priority,Risk,Environment,Dependencies,Task Duration (Days),Delay (Days),Start Date,End Date,Actual End Date
0,T0001,7,0.74,3,9,0.5,Arctic,[],42,0,23/01/2024,05/03/2024,05/03/2024
1,T0002,4,0.78,5,5,0.7,Arctic,['T0001'],59,4,05/03/2024,03/05/2024,07/05/2024
2,T0003,15,0.84,1,1,0.1,Desert,"['T0002', 'T0001']",18,0,03/05/2024,21/05/2024,21/05/2024
3,T0004,4,0.82,4,5,0.9,Onshore,"['T0002', 'T0001', 'T0003']",54,4,21/05/2024,14/07/2024,18/07/2024
4,T0005,9,0.97,2,5,0.3,Onshore,"['T0003', 'T0002']",34,0,21/05/2024,24/06/2024,24/06/2024


In [4]:
# Summarise column dtypes and non-null counts to spot missing values early.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 947 entries, 0 to 946
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Task ID                947 non-null    object 
 1   Team Size              947 non-null    int64  
 2   Resource Availability  947 non-null    float64
 3   Complexity             947 non-null    int64  
 4   Priority               947 non-null    int64  
 5   Risk                   947 non-null    float64
 6   Environment            947 non-null    object 
 7   Dependencies           947 non-null    object 
 8   Task Duration (Days)   947 non-null    int64  
 9   Delay (Days)           947 non-null    int64  
 10  Start Date             947 non-null    object 
 11  End Date               947 non-null    object 
 12  Actual End Date        947 non-null    object 
dtypes: float64(2), int64(5), object(6)
memory usage: 96.3+ KB


## 1. Data Preprocessing

### 1.1 Apply Label Encoding to Environment

In [10]:
# Capture the distinct environment labels while they are still strings, so the
# label -> integer mapping can be reported after encoding.
unique_env = pd.unique(df['Environment'])
unique_env

array(['Arctic', 'Desert', 'Onshore', 'Offshore'], dtype=object)

In [11]:
# Learn the label -> integer mapping from the Environment column, then replace
# the column with its integer codes. `le` is kept so the mapping can be inspected.
le = LabelEncoder()
le.fit(df['Environment'])
df['Environment'] = le.transform(df['Environment'])

In [14]:
# Report the integer code assigned to each original environment label,
# in the order the labels were captured in `unique_env`.
env_codes = le.transform(unique_env)
for env_name, env_code in zip(unique_env, env_codes):
    print(f"{env_name}: {env_code}")

Arctic: 0
Desert: 1
Onshore: 3
Offshore: 2


### 1.2 Add the number of Dependencies column

In [5]:
def _count_dependencies(raw_value):
    """Return how many task IDs are listed in one 'Dependencies' cell.

    Parameters
    ----------
    raw_value : str or missing
        Text of a Python list literal such as "['T0001', 'T0002']", or a
        missing value.

    Returns
    -------
    int
        Length of the parsed list; 0 when the cell is missing.

    Raises
    ------
    ValueError, SyntaxError
        If the cell text is not a valid Python literal.
    """
    if pd.isna(raw_value):
        return 0
    # ast.literal_eval only parses literals, so text coming from the CSV can
    # never execute code (unlike eval()).
    return len(ast.literal_eval(raw_value))


df['Number of Dependencies'] = df['Dependencies'].apply(_count_dependencies)

In [6]:
# Preview the data again to confirm the new 'Number of Dependencies' column.
df.head()

Unnamed: 0,Task ID,Team Size,Resource Availability,Complexity,Priority,Risk,Environment,Dependencies,Task Duration (Days),Delay (Days),Start Date,End Date,Actual End Date,Number of Dependencies
0,T0001,7,0.74,3,9,0.5,0,[],42,0,23/01/2024,05/03/2024,05/03/2024,0
1,T0002,4,0.78,5,5,0.7,0,['T0001'],59,4,05/03/2024,03/05/2024,07/05/2024,1
2,T0003,15,0.84,1,1,0.1,1,"['T0002', 'T0001']",18,0,03/05/2024,21/05/2024,21/05/2024,2
3,T0004,4,0.82,4,5,0.9,3,"['T0002', 'T0001', 'T0003']",54,4,21/05/2024,14/07/2024,18/07/2024,3
4,T0005,9,0.97,2,5,0.3,3,"['T0003', 'T0002']",34,0,21/05/2024,24/06/2024,24/06/2024,2


### 1.3 Add the total task duration column

In [7]:
# Target variable: planned duration plus the recorded delay, in days.
planned_days = df['Task Duration (Days)']
df['Total Task Duration'] = planned_days.add(df['Delay (Days)'])

## 2. Data Preparation

In [8]:
# Drop unnecessary columns for model training
# Identifier, raw dependency list and date columns are not used as model inputs.
columns_to_drop = [
    'Task ID',
    'Dependencies',
    'Start Date',
    'End Date',
    'Actual End Date',
]
df = df.drop(columns=columns_to_drop)

In [9]:
# Features and target
# Model inputs: task attributes known up front plus the engineered dependency count.
feature_columns = [
    'Team Size',
    'Resource Availability',
    'Complexity',
    'Priority',
    'Risk',
    'Environment',
    'Number of Dependencies',
]
X = df[feature_columns]

# Prediction target: total task duration (planned duration + delay).
y = df['Total Task Duration']

In [11]:
# Split the data
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
(X_train, X_test,
 y_train, y_test) = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## 3. Model Training

### Initialize models

In [30]:
# Candidate regressors keyed by display name. Insertion order is the order in
# which they are trained and reported.
models = {}
models["Linear Regression"] = LinearRegression()
models["Decision Tree"] = DecisionTreeRegressor(random_state=42)
models["Random Forest"] = RandomForestRegressor(random_state=42, max_depth=30)
models["XGBoost"] = xgb.XGBRegressor(random_state=42, objective='reg:squarederror')

### Train and evaluate each model

In [31]:
def _regression_scores(y_true, y_hat):
    """Return MAE, RMSE and R2 for one set of predictions as a dict."""
    squared_error = mean_squared_error(y_true, y_hat)
    return {
        "MAE": mean_absolute_error(y_true, y_hat),
        "RMSE": np.sqrt(squared_error),
        "R2": r2_score(y_true, y_hat),
    }


# Fit every candidate on the training split and score it on the held-out split.
results = []
for model_label, estimator in models.items():
    estimator.fit(X_train, y_train)
    test_predictions = estimator.predict(X_test)
    results.append({"Model": model_label, **_regression_scores(y_test, test_predictions)})

### Create a DataFrame for results

In [32]:
# Collect the per-model metric dicts into one table and print it under a heading.
results_df = pd.DataFrame(data=results)
print("Model Performance Comparison:", results_df, sep="\n")

Model Performance Comparison:
               Model       MAE      RMSE        R2
0  Linear Regression  5.099952  6.375595  0.856657
1      Decision Tree  7.084211  8.632863  0.737189
2      Random Forest  5.694762  6.966910  0.828835
3            XGBoost  5.936469  7.393821  0.807215


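Illustrative example of how grid search enumerates hyperparameter combinations (the grid actually searched below may be larger):

n_estimators: [50, 100]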

max_depth: [10, 20]

Number of combinations = 2 × 2 = 4

combination_1 = {n_estimators:50, max_depth:10}

combination_2 = {n_estimators:50, max_depth:20}

combination_3 = {n_estimators:100, max_depth:10}

combination_4 = {n_estimators:100, max_depth:20}

### Hyperparameter Tuning for the best model

In [39]:
# Pick the candidate with the highest test-set R2 from the comparison table.
# NOTE: the same test split is used for selection and for the final report
# below, so the final scores are an optimistic estimate.
best_model_name = results_df.loc[results_df['R2'].idxmax(), 'Model']

# One (estimator, parameter grid) pair per candidate model, so tuning works
# whichever model wins. Previously only two of the four candidates were handled,
# leaving `grid_search` undefined (or stale from an earlier run) for the others.
tuning_candidates = {
    "Linear Regression": (
        LinearRegression(),
        {
            "fit_intercept": [True, False],
        },
    ),
    "Decision Tree": (
        DecisionTreeRegressor(random_state=42),
        {
            "max_depth": [None, 10, 20, 30],
            "min_samples_split": [2, 5, 10],
        },
    ),
    "Random Forest": (
        RandomForestRegressor(random_state=42),
        {
            "n_estimators": [50, 100, 150],
            "max_depth": [None, 10, 20, 30],
            "min_samples_split": [2, 5, 10],
        },
    ),
    "XGBoost": (
        xgb.XGBRegressor(random_state=42, objective='reg:squarederror'),
        {
            "n_estimators": [50, 100, 150],
            "max_depth": [3, 6, 10],
            "learning_rate": [0.05, 0.1, 0.3],
        },
    ),
}

# Fail loudly if the comparison table contains a model with no tuning recipe.
if best_model_name not in tuning_candidates:
    raise ValueError(f"No hyperparameter grid defined for model: {best_model_name}")

base_estimator, param_grid = tuning_candidates[best_model_name]
grid_search = GridSearchCV(base_estimator, param_grid, cv=3, scoring='r2', verbose=1)

# 3-fold cross-validated search on the training split only.
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_
print(f"Best parameters for {best_model_name}: {grid_search.best_params_}")

# Evaluate the best model
y_pred_best = best_model.predict(X_test)
final_mae = mean_absolute_error(y_test, y_pred_best)
final_rmse = np.sqrt(mean_squared_error(y_test, y_pred_best))
final_r2 = r2_score(y_test, y_pred_best)

print(f"Best {best_model_name} Performance:")
print(f"MAE: {final_mae:.2f}, RMSE: {final_rmse:.2f}, R2: {final_r2:.2f}")

Fitting 3 folds for each of 2 candidates, totalling 6 fits
Best parameters for Linear Regression: {'fit_intercept': True}
Best Linear Regression Performance:
MAE: 5.10, RMSE: 6.38, R2: 0.86


In [40]:
# Save the results
# Persist the model comparison table (without the index column) next to the saved models.
results_df.to_csv(path_or_buf="../models/model_comparison.csv", index=False)

### Save the best model

In [35]:
# Destination file for the tuned estimator.
model_save_path = "../models/best_model.pkl"

# Serialise the tuned estimator with joblib, then confirm where it was written.
joblib.dump(value=best_model, filename=model_save_path)
print(f"Best model saved at {model_save_path}")

Best model saved at ../models/best_model.pkl
