In [1]:
# !pip uninstall scikit-learn imbalanced-learn --quiet
# !pip uninstall scikit-learn --yes

In [2]:
# !pip install scikit-learn

In [3]:
# !pip install imbalanced-learn

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [5]:
data = pd.read_excel("project_dataset.xlsx")

In [6]:
data.head(2)

Unnamed: 0,ID,Date,Decription,Project Type,Project Category,Coding_Language,Project Manager,Project Lead,Project Start Date,Research,Planning,Execution,UAT,Launch,No Research,No Planning,No Execution,No UAT,No Launch
0,1,2021-10-06,Flight simulator,SW,Gaming Software,Java,Emily Thompson,Sophia Taylor,2021-11-03,2021-11-16,2021-11-23,2022-03-29,2022-04-12,2022-04-17,13,7,126,14,5
1,2,2019-09-27,Finger Print Voting System,SW,WebDevlopment,Python,Benjamin Hayes,Liam Mitchell,2019-10-04,2019-10-18,2019-10-24,2019-12-29,2020-01-05,2020-01-05,14,6,66,7,0


In [7]:
test_data = data.head(1)
test_data

Unnamed: 0,ID,Date,Decription,Project Type,Project Category,Coding_Language,Project Manager,Project Lead,Project Start Date,Research,Planning,Execution,UAT,Launch,No Research,No Planning,No Execution,No UAT,No Launch
0,1,2021-10-06,Flight simulator,SW,Gaming Software,Java,Emily Thompson,Sophia Taylor,2021-11-03,2021-11-16,2021-11-23,2022-03-29,2022-04-12,2022-04-17,13,7,126,14,5


## Feature Engineering

### Dropping few columns as they might not be required for the Analysis
### Dropped Columns
##### ID
##### Date
##### Project Manager
##### Project Lead

In [8]:
# List of column names to drop
columns_to_drop = ['ID', 'Date']
data.drop(columns=columns_to_drop, inplace=True)
data.head(2)

Unnamed: 0,Decription,Project Type,Project Category,Coding_Language,Project Manager,Project Lead,Project Start Date,Research,Planning,Execution,UAT,Launch,No Research,No Planning,No Execution,No UAT,No Launch
0,Flight simulator,SW,Gaming Software,Java,Emily Thompson,Sophia Taylor,2021-11-03,2021-11-16,2021-11-23,2022-03-29,2022-04-12,2022-04-17,13,7,126,14,5
1,Finger Print Voting System,SW,WebDevlopment,Python,Benjamin Hayes,Liam Mitchell,2019-10-04,2019-10-18,2019-10-24,2019-12-29,2020-01-05,2020-01-05,14,6,66,7,0


In [9]:
# Renaming few columns 

# Dictionary with current column names as keys and new column names as values
new_column_names = {'Decription':'Description','No Research': 'Research Duration', 'No Planning': 'Planning Duration', 'No Execution': 'Execution Duration','No UAT': 'UAT Duration', 'No Launch': 'Launch Duration'}

# Rename columns
data.rename(columns=new_column_names, inplace=True)

In [10]:
data.head(2)

Unnamed: 0,Description,Project Type,Project Category,Coding_Language,Project Manager,Project Lead,Project Start Date,Research,Planning,Execution,UAT,Launch,Research Duration,Planning Duration,Execution Duration,UAT Duration,Launch Duration
0,Flight simulator,SW,Gaming Software,Java,Emily Thompson,Sophia Taylor,2021-11-03,2021-11-16,2021-11-23,2022-03-29,2022-04-12,2022-04-17,13,7,126,14,5
1,Finger Print Voting System,SW,WebDevlopment,Python,Benjamin Hayes,Liam Mitchell,2019-10-04,2019-10-18,2019-10-24,2019-12-29,2020-01-05,2020-01-05,14,6,66,7,0


### Removing all Duration columns and will calculate them after model prediction 

In [11]:
date_columns_to_drop = ['Project Start Date','Research', 'Planning', 'Execution', 'UAT', 'Launch']
data.drop(columns=date_columns_to_drop, inplace=True)
data.head(2)

Unnamed: 0,Description,Project Type,Project Category,Coding_Language,Project Manager,Project Lead,Research Duration,Planning Duration,Execution Duration,UAT Duration,Launch Duration
0,Flight simulator,SW,Gaming Software,Java,Emily Thompson,Sophia Taylor,13,7,126,14,5
1,Finger Print Voting System,SW,WebDevlopment,Python,Benjamin Hayes,Liam Mitchell,14,6,66,7,0


## Data Analysis

In [12]:
data.shape

(150, 11)

In [13]:
data.describe()

Unnamed: 0,Research Duration,Planning Duration,Execution Duration,UAT Duration,Launch Duration
count,150.0,150.0,150.0,150.0,150.0
mean,8.873333,7.106667,92.493333,10.626667,3.166667
std,5.021159,4.310354,43.368833,2.398396,2.327013
min,0.0,0.0,17.0,7.0,0.0
25%,5.0,3.0,54.5,9.0,1.0
50%,9.0,8.0,94.0,10.0,3.0
75%,13.0,11.0,128.0,13.0,5.0
max,18.0,14.0,174.0,14.0,7.0


In [14]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Description         150 non-null    object
 1   Project Type        150 non-null    object
 2   Project Category    150 non-null    object
 3   Coding_Language     150 non-null    object
 4   Project Manager     150 non-null    object
 5   Project Lead        150 non-null    object
 6   Research Duration   150 non-null    int64 
 7   Planning Duration   150 non-null    int64 
 8   Execution Duration  150 non-null    int64 
 9   UAT Duration        150 non-null    int64 
 10  Launch Duration     150 non-null    int64 
dtypes: int64(5), object(6)
memory usage: 13.0+ KB


In [15]:
# Checking for any null values
data.isnull().sum()

Description           0
Project Type          0
Project Category      0
Coding_Language       0
Project Manager       0
Project Lead          0
Research Duration     0
Planning Duration     0
Execution Duration    0
UAT Duration          0
Launch Duration       0
dtype: int64

#### We see that somefields are categorical fields and we need to convert these categorical fields of the dataset into numerical variables. I am using pd.get_dummies as ONE HOT ENCODING technique

In [16]:
from sklearn.preprocessing import LabelEncoder

# Apply one-hot encoding to columns with more than two unique values
data = pd.get_dummies(data, columns=['Project Category', 'Coding_Language'])

# Apply label encoding to 'Project Type'
label_encoder = LabelEncoder()
data['Project Type_encoded'] = label_encoder.fit_transform(data['Project Type'])
data.drop('Project Type', axis=1, inplace=True)

# Apply label encoding to 'Project Manager'
data['Project Manager_encoded'] = label_encoder.fit_transform(data['Project Manager'])
data.drop('Project Manager', axis=1, inplace=True)

# Apply label encoding to 'Project Lead'
data['Project Lead_encoded'] = label_encoder.fit_transform(data['Project Lead'])
data.drop('Project Lead', axis=1, inplace=True)

# Apply label encoding to the 'Description' column
data['Description_encoded'] = label_encoder.fit_transform(data['Description'])
data.drop('Description', axis=1, inplace=True)





In [17]:
# # # Use pd.get_dummies for One-Hot Encoding
# data = pd.get_dummies(data, columns=['Description', 'Project Type', 'Project Category', 'Coding_Language', 'Project Manager', 'Project Lead'], drop_first=True)
# # # 'drop_first=True' is used to avoid multicollinearity by dropping one of the dummy variables for each category.

In [18]:
# Checking for any null values
data.isnull().sum()

Research Duration                    0
Planning Duration                    0
Execution Duration                   0
UAT Duration                         0
Launch Duration                      0
Project Category_DTMF                0
Project Category_GPS                 0
Project Category_Gaming Software     0
Project Category_Machine Learning    0
Project Category_Microcontroller     0
Project Category_Phone App           0
Project Category_Security            0
Project Category_WebDevlopment       0
Coding_Language_C                    0
Coding_Language_C#                   0
Coding_Language_C++                  0
Coding_Language_Embedded Systems     0
Coding_Language_Go                   0
Coding_Language_Java                 0
Coding_Language_JavaScript           0
Coding_Language_Python               0
Project Type_encoded                 0
Project Manager_encoded              0
Project Lead_encoded                 0
Description_encoded                  0
dtype: int64

In [19]:
data.head(2)

Unnamed: 0,Research Duration,Planning Duration,Execution Duration,UAT Duration,Launch Duration,Project Category_DTMF,Project Category_GPS,Project Category_Gaming Software,Project Category_Machine Learning,Project Category_Microcontroller,...,Coding_Language_C++,Coding_Language_Embedded Systems,Coding_Language_Go,Coding_Language_Java,Coding_Language_JavaScript,Coding_Language_Python,Project Type_encoded,Project Manager_encoded,Project Lead_encoded,Description_encoded
0,13,7,126,14,5,False,False,True,False,False,...,False,False,False,True,False,False,1,3,19,50
1,14,6,66,7,0,False,False,False,False,False,...,False,False,False,False,False,True,1,1,10,48


In [20]:
data.shape

(150, 25)

In [21]:
X = data.drop(['Research Duration','Planning Duration','Execution Duration','UAT Duration','Launch Duration'], axis=1)
y = data[['Research Duration','Planning Duration', 'Execution Duration', 'UAT Duration', 'Launch Duration']]

In [22]:
# Step 1: Split the data into training, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [23]:
# Select numeric columns for scaling
numeric_columns = X_train.select_dtypes(include=['float64', 'int64']).columns

# Step 2: Scale the numeric features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train[numeric_columns])
X_val_scaled = scaler.transform(X_val[numeric_columns])
X_test_scaled = scaler.transform(X_test[numeric_columns])

## Logistic Regression

In [24]:

from sklearn.linear_model import LogisticRegression

# Define logistic regression models for each target variable
logistic_regression_models = {}
for target in y_train.columns:
    logistic_regression = LogisticRegression(max_iter=30)
    logistic_regression.fit(X_train_scaled, y_train[target])
    logistic_regression_models[target] = logistic_regression

# Evaluate models on validation set
for target, model in logistic_regression_models.items():
    y_val_pred = model.predict(X_val_scaled)
    accuracy = accuracy_score(y_val[target], y_val_pred)
    classification_rep = classification_report(y_val[target], y_val_pred)

    print(f"Accuracy for {target} on validation set:", accuracy)
    print(f"Classification report for {target} on validation set:\n", classification_rep)


Accuracy for Research Duration on validation set: 0.0
Classification report for Research Duration on validation set:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00       1.0
           2       0.00      0.00      0.00       2.0
           3       0.00      0.00      0.00       2.0
           4       0.00      0.00      0.00       1.0
           5       0.00      0.00      0.00       1.0
           6       0.00      0.00      0.00       2.0
           7       0.00      0.00      0.00       2.0
           8       0.00      0.00      0.00       1.0
           9       0.00      0.00      0.00       1.0
          10       0.00      0.00      0.00       3.0
          11       0.00      0.00      0.00       4.0
          13       0.00      0.00      0.00       4.0
          14       0.00      0.00      0.00       3.0
          15       0.00      0.00      0.00       2.0
          16       0.00      0.00      0.00       1.0

    accuracy    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

In [25]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

# Define KNN classifiers for each target variable
knn_classifiers = {}
for target in y_train.columns:
    # Discretize the target variable into classes (you may need to adjust the bins)
    y_train_classes = pd.qcut(y_train[target], q=3, labels=[0, 1, 2])
    
    knn_classifier = KNeighborsClassifier(n_neighbors=5)  # You can adjust the number of neighbors as needed
    knn_classifier.fit(X_train_scaled, y_train_classes)
    knn_classifiers[target] = knn_classifier

# Evaluate models on validation set
for target, model in knn_classifiers.items():
    # Discretize the target variable for the validation set
    y_val_classes = pd.qcut(y_val[target], q=3, labels=[0, 1, 2])
    
    y_val_pred_classes = model.predict(X_val_scaled)
    
    # Compute accuracy
    accuracy = accuracy_score(y_val_classes, y_val_pred_classes)
    
    # Compute classification report
    classification_rep = classification_report(y_val_classes, y_val_pred_classes)

    print(f"Accuracy for {target} on validation set:", accuracy)
    print(f"Classification report for {target} on validation set:\n", classification_rep)


Accuracy for Research Duration on validation set: 0.26666666666666666
Classification report for Research Duration on validation set:
               precision    recall  f1-score   support

           0       0.31      0.45      0.37        11
           1       0.14      0.11      0.12         9
           2       0.29      0.20      0.24        10

    accuracy                           0.27        30
   macro avg       0.25      0.26      0.24        30
weighted avg       0.25      0.27      0.25        30

Accuracy for Planning Duration on validation set: 0.26666666666666666
Classification report for Planning Duration on validation set:
               precision    recall  f1-score   support

           0       0.15      0.20      0.17        10
           1       0.36      0.33      0.35        12
           2       0.33      0.25      0.29         8

    accuracy                           0.27        30
   macro avg       0.28      0.26      0.27        30
weighted avg       0.29  

In [26]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# Define Decision Tree classifiers for each target variable
dt_classifiers = {}
for target in y_train.columns:
    # Discretize the target variable into classes (you may need to adjust the bins)
    y_train_classes = pd.qcut(y_train[target], q=3, labels=[0, 1, 2])
    
    dt_classifier = DecisionTreeClassifier(random_state=42)
    dt_classifier.fit(X_train_scaled, y_train_classes)
    dt_classifiers[target] = dt_classifier

# Evaluate models on validation set
for target, model in dt_classifiers.items():
    # Discretize the target variable for the validation set
    y_val_classes = pd.qcut(y_val[target], q=3, labels=[0, 1, 2])
    
    y_val_pred_classes = model.predict(X_val_scaled)
    
    # Compute accuracy
    accuracy = accuracy_score(y_val_classes, y_val_pred_classes)
    
    # Compute classification report
    classification_rep = classification_report(y_val_classes, y_val_pred_classes)

    print(f"Accuracy for {target} on validation set:", accuracy)
    print(f"Classification report for {target} on validation set:\n", classification_rep)


Accuracy for Research Duration on validation set: 0.3
Classification report for Research Duration on validation set:
               precision    recall  f1-score   support

           0       0.31      0.36      0.33        11
           1       0.22      0.22      0.22         9
           2       0.38      0.30      0.33        10

    accuracy                           0.30        30
   macro avg       0.30      0.30      0.30        30
weighted avg       0.30      0.30      0.30        30

Accuracy for Planning Duration on validation set: 0.23333333333333334
Classification report for Planning Duration on validation set:
               precision    recall  f1-score   support

           0       0.14      0.10      0.12        10
           1       0.31      0.42      0.36        12
           2       0.14      0.12      0.13         8

    accuracy                           0.23        30
   macro avg       0.20      0.21      0.20        30
weighted avg       0.21      0.23      0.

In [27]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Define SVM classifiers for each target variable
svm_classifiers = {}
for target in y_train.columns:
    # Discretize the target variable into classes (you may need to adjust the bins)
    y_train_classes = pd.qcut(y_train[target], q=3, labels=[0, 1, 2])
    
    svm_classifier = SVC(kernel='rbf', random_state=42)
    svm_classifier.fit(X_train_scaled, y_train_classes)
    svm_classifiers[target] = svm_classifier

# Evaluate models on validation set
for target, model in svm_classifiers.items():
    # Discretize the target variable for the validation set
    y_val_classes = pd.qcut(y_val[target], q=3, labels=[0, 1, 2])
    
    y_val_pred_classes = model.predict(X_val_scaled)
    
    # Compute accuracy
    accuracy = accuracy_score(y_val_classes, y_val_pred_classes)
    
    # Compute classification report
    classification_rep = classification_report(y_val_classes, y_val_pred_classes)

    print(f"Accuracy for {target} on validation set:", accuracy)
    print(f"Classification report for {target} on validation set:\n", classification_rep)


Accuracy for Research Duration on validation set: 0.3
Classification report for Research Duration on validation set:
               precision    recall  f1-score   support

           0       0.29      0.55      0.38        11
           1       0.38      0.33      0.35         9
           2       0.00      0.00      0.00        10

    accuracy                           0.30        30
   macro avg       0.22      0.29      0.24        30
weighted avg       0.22      0.30      0.24        30

Accuracy for Planning Duration on validation set: 0.4
Classification report for Planning Duration on validation set:
               precision    recall  f1-score   support

           0       0.36      0.40      0.38        10
           1       0.40      0.33      0.36        12
           2       0.44      0.50      0.47         8

    accuracy                           0.40        30
   macro avg       0.40      0.41      0.41        30
weighted avg       0.40      0.40      0.40        30

Ac

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [28]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Define Random Forest classifiers for each target variable
rf_classifiers = {}
for target in y_train.columns:
    # Discretize the target variable into classes (you may need to adjust the bins)
    y_train_classes = pd.qcut(y_train[target], q=3, labels=[0, 1, 2])
    
    rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_classifier.fit(X_train_scaled, y_train_classes)
    rf_classifiers[target] = rf_classifier

# Evaluate models on validation set
for target, model in rf_classifiers.items():
    # Discretize the target variable for the validation set
    y_val_classes = pd.qcut(y_val[target], q=3, labels=[0, 1, 2])
    
    y_val_pred_classes = model.predict(X_val_scaled)
    
    # Compute accuracy
    accuracy = accuracy_score(y_val_classes, y_val_pred_classes)
    
    # Compute classification report
    classification_rep = classification_report(y_val_classes, y_val_pred_classes)

    print(f"Accuracy for {target} on validation set:", accuracy)
    print(f"Classification report for {target} on validation set:\n", classification_rep)


Accuracy for Research Duration on validation set: 0.23333333333333334
Classification report for Research Duration on validation set:
               precision    recall  f1-score   support

           0       0.14      0.18      0.16        11
           1       0.29      0.22      0.25         9
           2       0.33      0.30      0.32        10

    accuracy                           0.23        30
   macro avg       0.25      0.23      0.24        30
weighted avg       0.25      0.23      0.24        30

Accuracy for Planning Duration on validation set: 0.26666666666666666
Classification report for Planning Duration on validation set:
               precision    recall  f1-score   support

           0       0.17      0.10      0.12        10
           1       0.33      0.42      0.37        12
           2       0.22      0.25      0.24         8

    accuracy                           0.27        30
   macro avg       0.24      0.26      0.24        30
weighted avg       0.25  

In [29]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, classification_report

# Define AdaBoost classifiers for each target variable
adaboost_classifiers = {}
for target in y_train.columns:
    # Discretize the target variable into classes (you may need to adjust the bins)
    y_train_classes = pd.qcut(y_train[target], q=3, labels=[0, 1, 2])
    
    adaboost_classifier = AdaBoostClassifier(n_estimators=100, random_state=42)
    adaboost_classifier.fit(X_train_scaled, y_train_classes)
    adaboost_classifiers[target] = adaboost_classifier

# Evaluate models on validation set
for target, model in adaboost_classifiers.items():
    # Discretize the target variable for the validation set
    y_val_classes = pd.qcut(y_val[target], q=3, labels=[0, 1, 2])
    
    y_val_pred_classes = model.predict(X_val_scaled)
    
    # Compute accuracy
    accuracy = accuracy_score(y_val_classes, y_val_pred_classes)
    
    # Compute classification report
    classification_rep = classification_report(y_val_classes, y_val_pred_classes)

    print(f"Accuracy for {target} on validation set:", accuracy)
    print(f"Classification report for {target} on validation set:\n", classification_rep)




Accuracy for Research Duration on validation set: 0.23333333333333334
Classification report for Research Duration on validation set:
               precision    recall  f1-score   support

           0       0.25      0.27      0.26        11
           1       0.00      0.00      0.00         9
           2       0.27      0.40      0.32        10

    accuracy                           0.23        30
   macro avg       0.17      0.22      0.19        30
weighted avg       0.18      0.23      0.20        30

Accuracy for Planning Duration on validation set: 0.26666666666666666
Classification report for Planning Duration on validation set:
               precision    recall  f1-score   support

           0       0.20      0.20      0.20        10
           1       0.30      0.25      0.27        12
           2       0.30      0.38      0.33         8

    accuracy                           0.27        30
   macro avg       0.27      0.27      0.27        30
weighted avg       0.27  

In [30]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report

# Define Gradient Boosting classifiers for each target variable
gradient_boost_classifiers = {}
for target in y_train.columns:
    # Discretize the target variable into classes (you may need to adjust the bins)
    y_train_classes = pd.qcut(y_train[target], q=3, labels=[0, 1, 2])
    
    gradient_boost_classifier = GradientBoostingClassifier(n_estimators=100, random_state=42)
    gradient_boost_classifier.fit(X_train_scaled, y_train_classes)
    gradient_boost_classifiers[target] = gradient_boost_classifier

# Evaluate models on validation set
for target, model in gradient_boost_classifiers.items():
    # Discretize the target variable for the validation set
    y_val_classes = pd.qcut(y_val[target], q=3, labels=[0, 1, 2])
    
    y_val_pred_classes = model.predict(X_val_scaled)
    
    # Compute accuracy
    accuracy = accuracy_score(y_val_classes, y_val_pred_classes)
    
    # Compute classification report
    classification_rep = classification_report(y_val_classes, y_val_pred_classes)

    print(f"Accuracy for {target} on validation set:", accuracy)
    print(f"Classification report for {target} on validation set:\n", classification_rep)


Accuracy for Research Duration on validation set: 0.3333333333333333
Classification report for Research Duration on validation set:
               precision    recall  f1-score   support

           0       0.25      0.27      0.26        11
           1       0.44      0.44      0.44         9
           2       0.33      0.30      0.32        10

    accuracy                           0.33        30
   macro avg       0.34      0.34      0.34        30
weighted avg       0.34      0.33      0.33        30

Accuracy for Planning Duration on validation set: 0.26666666666666666
Classification report for Planning Duration on validation set:
               precision    recall  f1-score   support

           0       0.17      0.10      0.12        10
           1       0.33      0.42      0.37        12
           2       0.22      0.25      0.24         8

    accuracy                           0.27        30
   macro avg       0.24      0.26      0.24        30
weighted avg       0.25   

In [31]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report

# Define XGBoost classifiers for each target variable
xgb_classifiers = {}
for target in y_train.columns:
    # Discretize the target variable into classes (you may need to adjust the bins)
    y_train_classes = pd.qcut(y_train[target], q=3, labels=[0, 1, 2])
    
    xgb_classifier = xgb.XGBClassifier(n_estimators=100, random_state=42)
    xgb_classifier.fit(X_train_scaled, y_train_classes)
    xgb_classifiers[target] = xgb_classifier

# Evaluate models on validation set
for target, model in xgb_classifiers.items():
    # Discretize the target variable for the validation set
    y_val_classes = pd.qcut(y_val[target], q=3, labels=[0, 1, 2])
    
    y_val_pred_classes = model.predict(X_val_scaled)
    
    # Compute accuracy
    accuracy = accuracy_score(y_val_classes, y_val_pred_classes)
    
    # Compute classification report
    classification_rep = classification_report(y_val_classes, y_val_pred_classes)

    print(f"Accuracy for {target} on validation set:", accuracy)
    print(f"Classification report for {target} on validation set:\n", classification_rep)


Accuracy for Research Duration on validation set: 0.26666666666666666
Classification report for Research Duration on validation set:
               precision    recall  f1-score   support

           0       0.23      0.27      0.25        11
           1       0.22      0.22      0.22         9
           2       0.38      0.30      0.33        10

    accuracy                           0.27        30
   macro avg       0.28      0.26      0.27        30
weighted avg       0.28      0.27      0.27        30

Accuracy for Planning Duration on validation set: 0.3
Classification report for Planning Duration on validation set:
               precision    recall  f1-score   support

           0       0.17      0.10      0.12        10
           1       0.38      0.50      0.43        12
           2       0.25      0.25      0.25         8

    accuracy                           0.30        30
   macro avg       0.26      0.28      0.27        30
weighted avg       0.27      0.30      0.

In [32]:
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, classification_report

# Define CatBoost classifiers for each target variable
catboost_classifiers = {}
for target in y_train.columns:
    # Discretize the target variable into classes (you may need to adjust the bins)
    y_train_classes = pd.qcut(y_train[target], q=3, labels=[0, 1, 2])
    
    catboost_classifier = CatBoostClassifier(n_estimators=100, random_state=42, verbose=False)
    catboost_classifier.fit(X_train_scaled, y_train_classes)
    catboost_classifiers[target] = catboost_classifier

# Evaluate models on validation set
for target, model in catboost_classifiers.items():
    # Discretize the target variable for the validation set
    y_val_classes = pd.qcut(y_val[target], q=3, labels=[0, 1, 2])
    
    y_val_pred_classes = model.predict(X_val_scaled)
    
    # Compute accuracy
    accuracy = accuracy_score(y_val_classes, y_val_pred_classes)
    
    # Compute classification report
    classification_rep = classification_report(y_val_classes, y_val_pred_classes)

    print(f"Accuracy for {target} on validation set:", accuracy)
    print(f"Classification report for {target} on validation set:\n", classification_rep)


Accuracy for Research Duration on validation set: 0.3
Classification report for Research Duration on validation set:
               precision    recall  f1-score   support

           0       0.21      0.27      0.24        11
           1       0.29      0.22      0.25         9
           2       0.44      0.40      0.42        10

    accuracy                           0.30        30
   macro avg       0.31      0.30      0.30        30
weighted avg       0.31      0.30      0.30        30

Accuracy for Planning Duration on validation set: 0.3
Classification report for Planning Duration on validation set:
               precision    recall  f1-score   support

           0       0.11      0.10      0.11        10
           1       0.38      0.42      0.40        12
           2       0.38      0.38      0.38         8

    accuracy                           0.30        30
   macro avg       0.29      0.30      0.29        30
weighted avg       0.29      0.30      0.30        30

Ac

In [33]:
# test_data

In [34]:
# # List of column names to drop
# test_data_columns_to_drop = ['ID', 'Date']
# test_data.drop(columns=test_data_columns_to_drop, inplace=True)
# test_data.head(2)


In [35]:
# # Renaming few columns 

# # Dictionary with current column names as keys and new column names as values
# test_data_new_column_names = {'Decription':'Description','No Research': 'Research Duration', 'No Planning': 'Planning Duration', 'No Execution': 'Execution Duration','No UAT': 'UAT Duration', 'No Launch': 'Launch Duration'}

# # Rename columns
# test_data.rename(columns=test_data_new_column_names, inplace=True)
# test_data

In [36]:
# test_data_date_columns_to_drop = ['Project Start Date','Research', 'Planning', 'Execution', 'UAT', 'Launch']
# test_data.drop(columns=test_data_date_columns_to_drop, inplace=True)
# test_data.head(2)

In [37]:
# from sklearn.preprocessing import LabelEncoder

# # Apply one-hot encoding to columns with more than two unique values
# test_data = pd.get_dummies(test_data, columns=['Project Category', 'Coding_Language'])

# # Apply label encoding to 'Project Type'
# label_encoder = LabelEncoder()
# test_data['Project Type_encoded'] = label_encoder.fit_transform(test_data['Project Type'])
# test_data.drop('Project Type', axis=1, inplace=True)

# # Apply label encoding to 'Project Manager'
# test_data['Project Manager_encoded'] = label_encoder.fit_transform(test_data['Project Manager'])
# test_data.drop('Project Manager', axis=1, inplace=True)

# # Apply label encoding to 'Project Lead'
# test_data['Project Lead_encoded'] = label_encoder.fit_transform(test_data['Project Lead'])
# test_data.drop('Project Lead', axis=1, inplace=True)

# # Apply label encoding to the 'Description' column
# test_data['Description_encoded'] = label_encoder.fit_transform(test_data['Description'])
# test_data.drop('Description', axis=1, inplace=True)

In [38]:
# # Scale the sample test data
# sample_test_scaled = scaler.transform(test_data)

In [39]:
data.shape

(150, 25)

In [40]:
X.head(1)

Unnamed: 0,Project Category_DTMF,Project Category_GPS,Project Category_Gaming Software,Project Category_Machine Learning,Project Category_Microcontroller,Project Category_Phone App,Project Category_Security,Project Category_WebDevlopment,Coding_Language_C,Coding_Language_C#,Coding_Language_C++,Coding_Language_Embedded Systems,Coding_Language_Go,Coding_Language_Java,Coding_Language_JavaScript,Coding_Language_Python,Project Type_encoded,Project Manager_encoded,Project Lead_encoded,Description_encoded
0,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,1,3,19,50


In [41]:
y.head(1)

Unnamed: 0,Research Duration,Planning Duration,Execution Duration,UAT Duration,Launch Duration
0,13,7,126,14,5


In [42]:
import pandas as pd

# Create sample test data
sample_test_data = {
    'Project Category_DTMF': [False],
    'Project Category_GPS': [False],
    'Project Category_Gaming Software': [True],
    'Project Category_Machine Learning': [False],
    'Project Category_Microcontroller': [False],
    'Project Category_Phone App': [False],
    'Project Category_Security': [False],
    'Project Category_WebDevlopment': [False],
    'Coding_Language_C': [False],
    'Coding_Language_C#': [False],
    'Coding_Language_C++': [False],
    'Coding_Language_Embedded Systems': [False],
    'Coding_Language_Go': [False],
    'Coding_Language_Java': [False],
    'Coding_Language_JavaScript': [True],
    'Coding_Language_Python': [False],
    'Project Type_encoded': [1],
    'Project Manager_encoded': [3],
    'Project Lead_encoded': [19],
    'Description_encoded': [50]
}

# Convert sample test data to DataFrame
sample_test_df = pd.DataFrame(sample_test_data)

# Use the trained CatBoost classifiers to predict y features
predicted_y = {}
for target, model in catboost_classifiers.items():
    # Predict classes using the CatBoost model
    y_pred_classes = model.predict(sample_test_df)
    
    # Store the predicted classes
    predicted_y[target] = y_pred_classes

# Display the predicted y features
for target, y_pred_classes in predicted_y.items():
    print(f"Predicted classes for {target}: {y_pred_classes}")


Predicted classes for Research Duration: [[2]]
Predicted classes for Planning Duration: [[1]]
Predicted classes for Execution Duration: [[2]]
Predicted classes for UAT Duration: [[1]]
Predicted classes for Launch Duration: [[1]]


In [43]:
import pandas as pd

# Sample test data
sample_test_data = {
    'ID': [1],
    'Date': ['06-Oct-21'],
    'Decription': ['GPS Monitor'],
    'Project Type': ['HW'],
    'Project Category': ['GPS'],
    'Coding_Language': ['Embedded Systems'],
    'Project Manager': ['Emily Thompson'],
    'Project Lead': ['Sophia Taylor'],
    'Project Start Date': ['03-Nov-21'],
    'Research': [16],
    'Planning': [23],
    'Execution': [29],
    'UAT': [12],
    'Launch': [17],
    'No Research': [13],
    'No Planning': [7],
    'No Execution': [126],
    'No UAT': [14],
    'No Launch': [5]
}

# Convert sample test data to DataFrame
sample_test_df = pd.DataFrame(sample_test_data)

# Remove unwanted columns
columns_to_drop = ['ID', 'Date', 'Project Start Date', 'Research', 'Planning', 'Execution', 'UAT', 'Launch']
sample_test_df.drop(columns=columns_to_drop, inplace=True)

# Rename columns
new_column_names = {'Decription': 'Description', 'No Research': 'Research Duration', 'No Planning': 'Planning Duration',
                    'No Execution': 'Execution Duration', 'No UAT': 'UAT Duration', 'No Launch': 'Launch Duration'}
sample_test_df.rename(columns=new_column_names, inplace=True)

# One-hot encode categorical columns
all_categories = {
    'Project Category': ['DTMF', 'GPS', 'Gaming Software', 'Machine Learning', 'Microcontroller', 'Phone App', 'Security', 'WebDevlopment'],
    'Coding_Language': ['C', 'C#', 'C++', 'Embedded Systems', 'Go', 'Java', 'JavaScript', 'Python']
}
for feature, categories in all_categories.items():
    sample_test_df[feature] = pd.Categorical(sample_test_df[feature], categories=categories)
sample_test_df_encoded = pd.get_dummies(sample_test_df, columns=all_categories.keys())

# Ensure that the sample test data has the same columns as the training data after encoding
missing_columns = set(X_train.columns) - set(sample_test_df_encoded.columns)
for column in missing_columns:
    sample_test_df_encoded[column] = 0

# Reorder columns to match the order of columns in the training data
sample_test_df_encoded = sample_test_df_encoded[X_train.columns]

# Use the trained CatBoost classifiers to predict y features
predicted_y = {}
for target, model in catboost_classifiers.items():
    # Predict classes using the CatBoost model
    y_pred_classes = model.predict(sample_test_df_encoded)
    
    # Store the predicted classes
    predicted_y[target] = y_pred_classes

# Display the predicted y features
for target, y_pred_classes in predicted_y.items():
    print(f"Predicted classes for {target}: {y_pred_classes}")


Predicted classes for Research Duration: [[2]]
Predicted classes for Planning Duration: [[2]]
Predicted classes for Execution Duration: [[0]]
Predicted classes for UAT Duration: [[2]]
Predicted classes for Launch Duration: [[0]]
