In [8]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Open question: should this be modelled as multi-label classification?
'''Labels: correct treatment, correct phase, correct rationale.
Each label carries a different penalty value, and the goal is to predict
the correct labels and calculate a final score based on the penalties.'''


# Toy dataset: each entry pairs a free-text student answer with its binary
# label. In this data every "extraction" answer is labelled 0 and every
# "root canal treatment" answer is labelled 1.
_labelled_answers = (
    ("Tooth #15 – extraction\nPhase: Phase 0", 0),
    ("Tooth #15 – root canal treatment\nPhase: Phase 1", 1),
    ("Tooth #15 – extraction with site prevention\nPhase: Phase 1", 0),
    ("Tooth #15 – extraction\nPhase: Phase 0", 0),
    ("Tooth #15 – extraction\nPhase: Phase 0", 0),
    ("Tooth #15 – root canal treatment\nPhase: Phase 1", 1),
    ("Tooth #15 – root canal treatment\nPhase: Phase 2", 1),
)

# Split the pairs back into the two parallel lists used downstream.
student_answers = [text for text, _ in _labelled_answers]
labels = [label for _, label in _labelled_answers]

# Train and evaluate a bag-of-words text classifier on the student answers.
#
# Hold out 20% of the samples for testing; the fixed random_state keeps the
# split the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(student_answers, labels, test_size=0.2, random_state=42)
# Sanity check: the training texts and training labels must have equal length.
print(len(X_train), len(y_train))

vectorizer = TfidfVectorizer()

# Fit the TF-IDF vocabulary and weights on the training texts only, then reuse
# that fitted vectorizer to transform the held-out texts.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Logistic regression with the library's default settings.
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)


# Predict labels for the held-out answers and print per-class metrics.
y_pred = model.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))
# Accuracy as a fraction and as a percentage.
# NOTE(review): neither value is read again in the visible code.
accuracy = accuracy_score(y_test, y_pred)
correct_percentage = accuracy * 100


# Penalty-based scoring of rubric items.
#
# Each rubric item (surface, treatment, phase) that differs between the
# expected and the predicted value costs a fixed penalty; the final score is
# the maximum score minus the sum of the penalties for that answer.

# Penalty deducted per mismatched rubric item.
penalty_05 = 0.5  # surface mismatch
penalty_10 = 1.0  # treatment mismatch
penalty_20 = 2.0  # phase mismatch

# Score awarded to an answer with no penalties.
max_score = 10

# NOTE(review): the expected and predicted rubric values below are hard-coded
# placeholders; nothing here derives them from the classifier's output.
actual_surfaces = ["MO", "DO"]
actual_treatments = ["Extraction", "Extraction"]
actual_phases = ["Phase 0", "Phase 1"]

predicted_surfaces = ["MO", "DO"]
predicted_treatments = ["Extraction", "Endodontic therapy"]
predicted_phases = ["Phase 0", "Phase 0"]

# The six lists are parallel (one entry per scored answer). zip() would
# silently truncate to the shortest one, so reject mismatched lengths up front.
_rubric_columns = (
    actual_surfaces,
    actual_treatments,
    actual_phases,
    predicted_surfaces,
    predicted_treatments,
    predicted_phases,
)
if len({len(column) for column in _rubric_columns}) > 1:
    raise ValueError(
        "actual/predicted rubric lists must all have the same length"
    )

# Iterate over the rubric rows themselves rather than over the test split:
# the penalties depend only on these lists, so the number of scored rows must
# not be tied to how many samples happen to land in the test set.
total_penalties = []
for (
    actual_surface,
    actual_treatment,
    actual_phase,
    predicted_surface,
    predicted_treatment,
    predicted_phase,
) in zip(*_rubric_columns):
    # A bool multiplies as 0 or 1, so each term is either 0.0 or the penalty.
    total_penalty = (
        penalty_05 * (predicted_surface != actual_surface)
        + penalty_10 * (predicted_treatment != actual_treatment)
        + penalty_20 * (predicted_phase != actual_phase)
    )
    total_penalties.append(total_penalty)

final_scores = [max_score - penalty for penalty in total_penalties]

# Print a per-answer report: the answer text, the classifier's label, the
# expected vs. predicted rubric values, and the resulting penalty and score.
# zip() walks all the parallel sequences in lockstep and stops at the
# shortest one.
report_rows = zip(
    X_test,
    y_pred,
    actual_surfaces,
    predicted_surfaces,
    actual_treatments,
    predicted_treatments,
    actual_phases,
    predicted_phases,
    total_penalties,
    final_scores,
)
for (
    answer,
    predicted_label,
    actual_surface,
    predicted_surface,
    actual_treatment,
    predicted_treatment,
    actual_phase,
    predicted_phase,
    total_penalty,
    final_score,
) in report_rows:
    print(f"Student Answer: {answer}")
    print(f"Predicted Label: {predicted_label}")
    print(f"Actual Surface: {actual_surface}, Predicted Surface: {predicted_surface}")
    print(f"Actual Treatment: {actual_treatment}, Predicted Treatment: {predicted_treatment}")
    print(f"Actual Phase: {actual_phase}, Predicted Phase: {predicted_phase}")
    print(f"Total Penalty: {total_penalty:.2f}")
    print(f"Final Score: {final_score:.2f}\n")


5 5
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         1

    accuracy                           1.00         2
   macro avg       1.00      1.00      1.00         2
weighted avg       1.00      1.00      1.00         2

Student Answer: Tooth #15 – extraction
Phase: Phase 0
Predicted Label: 0
Actual Surface: MO, Predicted Surface: MO
Actual Treatment: Extraction, Predicted Treatment: Extraction
Actual Phase: Phase 0, Predicted Phase: Phase 0
Total Penalty: 0.00
Final Score: 10.00

Student Answer: Tooth #15 – root canal treatment
Phase: Phase 1
Predicted Label: 1
Actual Surface: DO, Predicted Surface: DO
Actual Treatment: Extraction, Predicted Treatment: Endodontic therapy
Actual Phase: Phase 1, Predicted Phase: Phase 0
Total Penalty: 3.00
Final Score: 7.00

