# Task Reordering Model Training
## XGBoost-based Learning to Rank Model

This notebook trains a pointwise learning-to-rank model to predict task completion order within sprints.

## 1. Import Libraries

In [1]:
!pip install pandas numpy scikit-learn xgboost joblib

Collecting xgboost
  Downloading xgboost-3.1.1-py3-none-macosx_12_0_arm64.whl.metadata (2.1 kB)
Downloading xgboost-3.1.1-py3-none-macosx_12_0_arm64.whl (2.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-3.1.1


In [27]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupShuffleSplit
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
import joblib
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

print("Libraries imported successfully!")

Libraries imported successfully!


## 2. Load Data

In [28]:
# Load all required datasets
print("Loading datasets...")

issue_df = pd.read_csv('Dataset/issue.csv')
sprint_df = pd.read_csv('Dataset/Sprint.csv')
comment_df = pd.read_csv('Dataset/Comment.csv')
issue_links_df = pd.read_csv('Dataset/Issue_Links.csv')
change_log_df = pd.read_csv('Dataset/Change_Log.csv')
# issue_components_df = pd.read_csv('Dataset/Issue_Components.csv')

print("\nColumns in sprint_df:")
print(sprint_df.columns.tolist())
print("\nColumns in issue_df:")
print(issue_df.columns.tolist())

print(f"\nIssues: {len(issue_df)}")
print(f"Sprints: {len(sprint_df)}")
print(f"Comments: {len(comment_df)}")
print(f"Issue Links: {len(issue_links_df)}")
print(f"Change Logs: {len(change_log_df)}")
# print(f"Issue Components: {len(issue_components_df)}")

Loading datasets...

Columns in sprint_df:
['Sprint_ID', 'Jira_ID', 'Name', 'State', 'Start_Date', 'End_Date', 'Activated_Date', 'Complete_Date', 'Project_ID']

Columns in issue_df:
['ID', 'Jira_ID', 'Issue_Key', 'URL', 'Title', 'Description', 'Description_Text', 'Description_Code', 'Type', 'Priority', 'Status', 'Resolution', 'Creation_Date', 'Estimation_Date', 'Resolution_Date', 'Last_Updated', 'Story_Point', 'Timespent', 'In_Progress_Minutes', 'Total_Effort_Minutes', 'Resolution_Time_Minute', 'Title_Changed_After_Estimation', 'Description_Changed_After_Estimation', 'Story_Point_Changed_After_Estimation', 'Pull_Request_URL', 'Creator_ID', 'Reporter_ID', 'Assignee_ID', 'Sprint_ID', 'Project_ID']

Issues: 200
Sprints: 100
Comments: 100
Issue Links: 100
Change Logs: 100


## 3. Data Preprocessing and Feature Engineering

### 3.1 Filter Completed Sprints and Calculate Target Variable (Task_Rank)

In [29]:
# Merge Issue and Sprint data
merged_df = issue_df.merge(sprint_df, on='Sprint_ID', how='inner')

# Filter for completed sprints with resolved issues
completed_states = ['closed', 'complete', 'Closed', 'Complete']
filtered_df = merged_df[
    (merged_df['State'].isin(completed_states)) & 
    (merged_df['Resolution_Date'].notna())
].copy()

print(f"Filtered to {len(filtered_df)} issues in completed sprints")

# Convert Resolution_Date to datetime
filtered_df['Resolution_Date'] = pd.to_datetime(filtered_df['Resolution_Date'])

# Calculate Task_Rank (target variable)
filtered_df = filtered_df.sort_values(['Sprint_ID', 'Resolution_Date'])
filtered_df['Task_Rank'] = filtered_df.groupby('Sprint_ID').cumcount() + 1

print(f"Task_Rank statistics:")
print(filtered_df['Task_Rank'].describe())

Filtered to 39 issues in completed sprints
Task_Rank statistics:
count    39.000000
mean      5.538462
std       3.093679
min       1.000000
25%       3.000000
50%       5.000000
75%       8.000000
max      12.000000
Name: Task_Rank, dtype: float64


### 3.2 Priority Encoding (Ordinal)

In [30]:
# Priority mapping
priority_map = {
    'Blocker': 5,
    'Critical': 4,
    'Major': 3,
    'Minor': 2,
    'Trivial': 1
}

filtered_df['Priority_Encoded'] = filtered_df['Priority'].map(priority_map).fillna(0)

print("Priority encoding distribution:")
print(filtered_df['Priority_Encoded'].value_counts().sort_index())

Priority encoding distribution:
Priority_Encoded
0.0    28
4.0    11
Name: count, dtype: int64


### 3.3 Task Age Calculation

In [31]:
#  Convert dates to datetime
filtered_df['Start_Date'] = pd.to_datetime(filtered_df['Start_Date'])
filtered_df['Creation_Date'] = pd.to_datetime(filtered_df['Creation_Date'])

# Calculate Task_Age in days
filtered_df['Task_Age'] = (filtered_df['Start_Date'] - filtered_df['Creation_Date']).dt.days

# Handle negative values (issues created after sprint start)
filtered_df['Task_Age'] = filtered_df['Task_Age'].clip(lower=0)

print(f"Task_Age statistics:")
print(filtered_df['Task_Age'].describe())

Task_Age statistics:
count     39.000000
mean     329.717949
std      328.436644
min        0.000000
25%        0.000000
50%      318.000000
75%      640.500000
max      971.000000
Name: Task_Age, dtype: float64


### 3.4 Count Features (Comments, Links, Changes)

In [32]:
#  Count comments per issue÷
# Note: Using 'ID' from issue.csv to match with 'Issue_ID' in other tables
num_comments = comment_df.groupby('Issue_ID').size().reset_index(name='num_comments')
filtered_df = filtered_df.merge(num_comments, left_on='ID', right_on='Issue_ID', how='left')
filtered_df['num_comments'] = filtered_df['num_comments'].fillna(0)

# Count links per issue
num_links = issue_links_df.groupby('Issue_ID').size().reset_index(name='num_links')
filtered_df = filtered_df.merge(num_links, left_on='ID', right_on='Issue_ID', how='left', suffixes=('', '_links'))
filtered_df['num_links'] = filtered_df['num_links'].fillna(0)

# Count changes per issue
num_changes = change_log_df.groupby('Issue_ID').size().reset_index(name='num_changes')
filtered_df = filtered_df.merge(num_changes, left_on='ID', right_on='Issue_ID', how='left', suffixes=('', '_changes'))
filtered_df['num_changes'] = filtered_df['num_changes'].fillna(0)

print("Count features summary:")
print(f"num_comments: {filtered_df['num_comments'].mean():.2f}")
print(f"num_links: {filtered_df['num_links'].mean():.2f}")
print(f"num_changes: {filtered_df['num_changes'].mean():.2f}")

Count features summary:
num_comments: 0.64
num_links: 0.46
num_changes: 0.56


### 3.6 Prepare Features and Handle Missing Values

In [33]:
# Fill missing Story Points and Total Effort
filtered_df['Story_Point'] = filtered_df['Story_Point'].fillna(filtered_df['Story_Point'].median())
filtered_df['Total_Effort_Minutes'] = filtered_df['Total_Effort_Minutes'].fillna(
    filtered_df['Total_Effort_Minutes'].median()
)

print("Missing values after imputation:")
print(filtered_df[['Story_Point', 'Total_Effort_Minutes', 'Task_Age']].isnull().sum())



Missing values after imputation:
Story_Point             0
Total_Effort_Minutes    0
Task_Age                0
dtype: int64


## 4. Train-Test Split (GroupKFold Strategy)

In [34]:
# Use GroupShuffleSplit to ensure sprints are not split between train and test
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(gss.split(filtered_df, groups=filtered_df['Sprint_ID']))

train_df = filtered_df.iloc[train_idx].copy()
test_df = filtered_df.iloc[test_idx].copy()

print(f"Training set: {len(train_df)} issues from {train_df['Sprint_ID'].nunique()} sprints")
print(f"Test set: {len(test_df)} issues from {test_df['Sprint_ID'].nunique()} sprints")



Training set: 27 issues from 3 sprints
Test set: 12 issues from 1 sprints


## 5. Target Encoding (Assignee and Component)

In [35]:
# WARNING: Simple mean target encoding on training set - potential for minor leakage
# For production, use cross-validation based encoding (e.g., category_encoders.TargetEncoder)

# Assignee target encoding
assignee_means = train_df.groupby('Assignee_ID')['Task_Rank'].mean().to_dict()
global_mean_rank = train_df['Task_Rank'].mean()

train_df['Assignee_Encoded'] = train_df['Assignee_ID'].map(assignee_means).fillna(global_mean_rank)
test_df['Assignee_Encoded'] = test_df['Assignee_ID'].map(assignee_means).fillna(global_mean_rank)

print("Target encoding completed")
print(f"Unique assignees: {len(assignee_means)}")
print(f"Global mean rank: {global_mean_rank:.2f}")

Target encoding completed
Unique assignees: 22
Global mean rank: 5.11


## 6. Prepare Feature Matrix

In [36]:
# Define feature columns (without Component_Encoded)
feature_columns = [
    'Story_Point',
    'Total_Effort_Minutes',
    'Task_Age',
    'num_comments',
    'num_links',
    'num_changes',
    'Priority_Encoded',
    'Assignee_Encoded'
]

X_train = train_df[feature_columns]
y_train = train_df['Task_Rank']

X_test = test_df[feature_columns]
y_test = test_df['Task_Rank']

print(f"Feature matrix shape: {X_train.shape}")
print(f"\nFeatures: {feature_columns}")
print(f"\nFeature statistics:")
print(X_train.describe())


Feature matrix shape: (27, 8)

Features: ['Story_Point', 'Total_Effort_Minutes', 'Task_Age', 'num_comments', 'num_links', 'num_changes', 'Priority_Encoded', 'Assignee_Encoded']

Feature statistics:
       Story_Point  Total_Effort_Minutes    Task_Age  num_comments  num_links  \
count    27.000000             27.000000   27.000000     27.000000  27.000000   
mean      6.229630           4168.933333  419.666667      0.740741   0.407407   
std       3.719679           2655.867288  345.360347      1.129758   0.636049   
min       1.200000            359.260000    0.000000      0.000000   0.000000   
25%       3.050000           2192.650000   44.000000      0.000000   0.000000   
50%       6.300000           3426.190000  385.000000      0.000000   0.000000   
75%       8.300000           5948.010000  761.500000      1.000000   1.000000   
max      12.700000           9856.800000  971.000000      4.000000   2.000000   

       num_changes  Priority_Encoded  Assignee_Encoded  
count    27.000

## 7. Train XGBoost Model

In [37]:
# Initialize and train XGBoost Regressor
print("Training XGBoost model...")

model = XGBRegressor(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=6,
    objective='reg:squarederror',
    random_state=42,
    n_jobs=-1
)

model.fit(X_train, y_train)

print("Model training completed!")


Training XGBoost model...
Model training completed!


## 8. Model Evaluation

### 8.1 RMSE Calculation

In [38]:

# Make predictions
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

# Calculate RMSE
train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))

print(f"Training RMSE: {train_rmse:.4f}")
print(f"Test RMSE: {test_rmse:.4f}")


Training RMSE: 0.0008
Test RMSE: 5.3548


### 8.2 NDCG@5 Calculation

In [39]:
def dcg_at_k(relevance_scores, k):
    """Calculate DCG@K"""
    relevance_scores = np.array(relevance_scores)[:k]
    if relevance_scores.size:
        return np.sum(relevance_scores / np.log2(np.arange(2, relevance_scores.size + 2)))
    return 0.0

def ndcg_at_k(y_true, y_pred, k=5):
    """Calculate NDCG@K for a single sprint"""
    # Sort by predicted scores (ascending - lower rank is better)
    order = np.argsort(y_pred)
    y_true_sorted = np.array(y_true)[order]
    
    # Relevance: 1/Task_Rank (earlier tasks are more relevant)
    relevance = 1.0 / y_true_sorted
    
    # Ideal relevance (sort by true ranks)
    ideal_order = np.argsort(y_true)
    y_true_ideal = np.array(y_true)[ideal_order]
    ideal_relevance = 1.0 / y_true_ideal
    
    dcg = dcg_at_k(relevance, k)
    idcg = dcg_at_k(ideal_relevance, k)
    
    return dcg / idcg if idcg > 0 else 0.0

def mean_ndcg_at_k(test_df, predictions, k=5):
    """Calculate Mean NDCG@K across all sprints"""
    test_df = test_df.copy()
    test_df['predictions'] = predictions
    
    ndcg_scores = []
    for sprint_id, group in test_df.groupby('Sprint_ID'):
        if len(group) >= 2:  # Need at least 2 items to rank
            ndcg = ndcg_at_k(group['Task_Rank'].values, group['predictions'].values, k)
            ndcg_scores.append(ndcg)
    
    return np.mean(ndcg_scores) if ndcg_scores else 0.0

# Calculate Mean NDCG@5
mean_ndcg = mean_ndcg_at_k(test_df, y_pred_test, k=5)
print(f"\nMean NDCG@5 on Test Set: {mean_ndcg:.4f}")


Mean NDCG@5 on Test Set: 0.5136


### 8.3 Feature Importance

In [40]:
# Display feature importance
feature_importance = pd.DataFrame({
    'feature': feature_columns,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)

print("\nFeature Importance:")
print(feature_importance)


Feature Importance:
                feature    importance
7      Assignee_Encoded  4.570603e-01
5           num_changes  3.864226e-01
0           Story_Point  1.225198e-01
2              Task_Age  2.849213e-02
1  Total_Effort_Minutes  5.315904e-03
3          num_comments  1.885932e-04
4             num_links  4.694041e-07
6      Priority_Encoded  0.000000e+00


### 8.4 Sample Predictions Analysis

In [41]:
#  Analyze predictions on a sample sprint
sample_sprint = test_df.groupby('Sprint_ID').size().idxmax()  # Get sprint with most issues
sample_sprint_df = test_df[test_df['Sprint_ID'] == sample_sprint].copy()
sample_sprint_df['Predicted_Rank'] = model.predict(sample_sprint_df[feature_columns])

print(f"\nSample Sprint Analysis: {sample_sprint}")
print(f"Number of issues: {len(sample_sprint_df)}")
print("\nTrue vs Predicted Ranks:")
comparison = sample_sprint_df[['Issue_Key', 'Task_Rank', 'Predicted_Rank', 'Priority', 'Story_Point']].sort_values('Task_Rank')
print(comparison.head(10))



Sample Sprint Analysis: 5
Number of issues: 12

True vs Predicted Ranks:
    Issue_Key  Task_Rank  Predicted_Rank  Priority  Story_Point
10   ISSUE-53          1        3.431782  Critical          9.9
11   ISSUE-74          2        9.966634      High          6.0
12  ISSUE-149          3        3.046903  Critical         11.3
13   ISSUE-15          4        3.444269    Medium          5.6
14  ISSUE-199          5        7.754230      High          2.7
15   ISSUE-33          6        4.364379       Low          3.3
16  ISSUE-193          7        4.544004    Medium          1.2
17   ISSUE-39          8        2.873660    Medium         10.5
18   ISSUE-58          9        5.106621       Low          4.9
19  ISSUE-100         10        3.465084       Low         10.2


## 9. Save Model and Encoders

In [42]:
# Save model
joblib.dump(model, 'task_reorder_model.pkl')

# Save encoders and mappings
encoders = {
    'priority_map': priority_map,
    'assignee_means': assignee_means,
    'global_mean_rank': global_mean_rank,
    'feature_columns': feature_columns,
    'story_point_median': filtered_df['Story_Point'].median(),
    'effort_median': filtered_df['Total_Effort_Minutes'].median()
}

joblib.dump(encoders, 'task_reorder_encoders.pkl')

print("\nModel and encoders saved successfully!")
print("- task_reorder_model.pkl")
print("- task_reorder_encoders.pkl")


Model and encoders saved successfully!
- task_reorder_model.pkl
- task_reorder_encoders.pkl


## 10. Summary

In [43]:
print("="*60)
print("TRAINING SUMMARY")
print("="*60)
print(f"Total Issues Processed: {len(filtered_df)}")
print(f"Training Issues: {len(train_df)}")
print(f"Test Issues: {len(test_df)}")
print(f"\nModel Performance:")
print(f"  - Training RMSE: {train_rmse:.4f}")
print(f"  - Test RMSE: {test_rmse:.4f}")
print(f"  - Mean NDCG@5: {mean_ndcg:.4f}")
print(f"\nTop 3 Important Features:")
for idx, row in feature_importance.head(3).iterrows():
    print(f"  - {row['feature']}: {row['importance']:.4f}")
print("="*60)

TRAINING SUMMARY
Total Issues Processed: 39
Training Issues: 27
Test Issues: 12

Model Performance:
  - Training RMSE: 0.0008
  - Test RMSE: 5.3548
  - Mean NDCG@5: 0.5136

Top 3 Important Features:
  - Assignee_Encoded: 0.4571
  - num_changes: 0.3864
  - Story_Point: 0.1225
