In [1]:
# Import library
import pandas as pd

In [2]:
# Example evaluation_results table
evaluation_results = pd.DataFrame({
    "project_id": [1, 1, 2, 2, 3, 3, 4, 4],
    "model_id": [1, 2, 1, 2, 1, 3, 2, 3],
    "date": ["2024-01-02", "2024-01-02", "2024-01-03", "2024-01-03",
             "2024-01-04", "2024-01-04", "2024-01-05", "2024-01-05"],
    "task_id": [1, 1, 2, 2, 3, 3, 4, 4],
    "user_id": [1, 2, 3, 4, 5, 6, 7, 8],
    "score": ["good", "bad", "good", None, "bad", "good", "good", None]
})

In [6]:
evaluation_results

Unnamed: 0,project_id,model_id,date,task_id,user_id,score
0,1,1,2024-01-02,1,1,good
1,1,2,2024-01-02,1,2,bad
2,2,1,2024-01-03,2,3,good
3,2,2,2024-01-03,2,4,
4,3,1,2024-01-04,3,5,bad
5,3,3,2024-01-04,3,6,good
6,4,2,2024-01-05,4,7,good
7,4,3,2024-01-05,4,8,


In [3]:
# Example metadata tables
models_metadata = pd.DataFrame({
    "model_id": [1, 2, 3],
    "model_name": ["Alpha", "Beta", "Gamma"],
    "model_type": ["Type A", "Type B", "Type C"]
})

In [7]:
models_metadata

Unnamed: 0,model_id,model_name,model_type
0,1,Alpha,Type A
1,2,Beta,Type B
2,3,Gamma,Type C


In [4]:
tasks_metadata = pd.DataFrame({
    "task_id": [1, 2, 3, 4],
    "task_description": ["Classification", "Regression", "Clustering", "Forecasting"],
    "task_category": ["Machine Learning", "Statistics", "AI", "Time Series"]
})

In [8]:
tasks_metadata

Unnamed: 0,task_id,task_description,task_category
0,1,Classification,Machine Learning
1,2,Regression,Statistics
2,3,Clustering,AI
3,4,Forecasting,Time Series


In [5]:
# Calculate Good Ratios
# Filter non-null scores
filtered_results = evaluation_results.dropna(subset=["score"])
filtered_results["good_score"] = filtered_results["score"].apply(lambda x: 1 if x == "good" else 0)

# Aggregate results
model_summary = filtered_results.groupby("model_id").agg(
    total_scores=("score", "count"),
    good_scores=("good_score", "sum")
).reset_index()
model_summary["good_ratio"] = model_summary["good_scores"] / model_summary["total_scores"]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_results["good_score"] = filtered_results["score"].apply(lambda x: 1 if x == "good" else 0)


In [9]:
model_summary

Unnamed: 0,model_id,total_scores,good_scores,good_ratio
0,1,3,2,0.666667
1,2,2,1,0.5
2,3,1,1,1.0


In [10]:
# Enhanced Analysis
# Merge with metadata for enhanced analysis
enhanced_analysis = pd.merge(filtered_results, models_metadata, on="model_id")
enhanced_analysis = pd.merge(enhanced_analysis, tasks_metadata, on="task_id")

# Group by dimensions
enhanced_summary = enhanced_analysis.groupby(
    ["model_id", "model_name", "model_type", "task_category"]
).agg(
    total_scores=("score", "count"),
    good_scores=("good_score", "sum")
).reset_index()

enhanced_summary["good_ratio"] = enhanced_summary["good_scores"] / enhanced_summary["total_scores"]

In [11]:
enhanced_summary

Unnamed: 0,model_id,model_name,model_type,task_category,total_scores,good_scores,good_ratio
0,1,Alpha,Type A,AI,1,0,0.0
1,1,Alpha,Type A,Machine Learning,1,1,1.0
2,1,Alpha,Type A,Statistics,1,1,1.0
3,2,Beta,Type B,Machine Learning,1,0,0.0
4,2,Beta,Type B,Time Series,1,1,1.0
5,3,Gamma,Type C,AI,1,1,1.0
