In [19]:
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, accuracy_score


In [20]:
# Load the PI-RADS report table and rename its key columns so they match the
# column names used by the classification table loaded below.
pirads_df = pd.read_excel("PI-RADS_bis_26_01_2024.xlsx").rename(
    columns={"Pat-ID": "ALTAPatientID", "Befunddatum": "StudyDate"}
)

# Load the per-study label table and keep only the columns used in this notebook.
# The explicit .copy() makes class_df an independent frame: a later cell assigns
# to class_df['StudyDate'], which on a plain column subset would raise pandas'
# SettingWithCopyWarning.
class_df = pd.read_csv("prostate_class_dataset_demography_final_psa_vol_20240828_feat.csv", sep=";")
class_df = class_df[["ALTAPatientID", "StudyDate", "study_orthanc_id", "csPCa"]].copy()

In [21]:
# Show the label table (one row per study: patient ID, study date, Orthanc study ID, csPCa flag).
class_df

Unnamed: 0,ALTAPatientID,StudyDate,study_orthanc_id,csPCa
0,12724,2012-04-18,fecf8784-dbb3c5d4-1142b881-b6990853-955cd99c,True
1,25525,2019-05-23,e5be4a92-0dea82f9-5239b10d-0b2268e3-db270b2f,True
2,33741,2022-07-15,7f7489ee-372560ed-ce9717de-41ed19a1-96ffea0e,True
3,20932,2017-03-25,ef8424db-4ceeda12-c978c9d2-dbf4283a-6b8b7665,True
4,21579,2017-08-31,22fbfe91-50fc8adb-32e53b0a-cd18fb53-5a1421de,True
...,...,...,...,...
2352,16651,2018-04-12,a8bdedd1-4d3a2d64-92eee687-2a472618-52fd92b1,False
2353,26542,2019-10-15,c0135203-bbccdfb1-6df31419-4fa1c906-66d79b35,False
2354,26210,2021-02-11,abc6052f-0ef77557-8a64a018-41b47b77-d3229bc6,False
2355,29904,2021-03-15,db00b468-fc6ca3c0-48f7087c-c2a19702-a75140da,False


In [22]:
# Convert both StudyDate columns to datetime so they can be subtracted later.
# `assign` builds a new frame instead of writing a column into the existing
# one; class_df was produced by a column selection, and assigning into such a
# frame in place triggers pandas' SettingWithCopyWarning.
pirads_df = pirads_df.assign(StudyDate=pd.to_datetime(pirads_df['StudyDate']))
class_df = class_df.assign(StudyDate=pd.to_datetime(class_df['StudyDate']))

In [23]:
# Pair every PI-RADS report with every labelled study of the same patient.
merged_df = pirads_df.merge(class_df, on='ALTAPatientID', suffixes=('_pirads', '_mri'))

# Distance between report date and study date, in whole days (unsigned).
merged_df['date_diff'] = (merged_df['StudyDate_pirads'] - merged_df['StudyDate_mri']).dt.days.abs()

# Discard pairs that lie more than 30 days apart.
within_window = merged_df['date_diff'].le(30)
merged_df = merged_df[within_window]

# For each patient keep the single pair with the smallest date distance
# (idxmin returns the first such row label when there is a tie).
nearest_row_labels = merged_df.groupby('ALTAPatientID')['date_diff'].idxmin()
closest_studies = merged_df.loc[nearest_row_labels]


In [24]:
# Renumber the rows 0..n-1; drop=False keeps the previous row labels
# (positions in merged_df) as an 'index' column.
pirads_df_histo = closest_studies.reset_index(drop=False)

pirads_df_histo

Unnamed: 0,index,ID,ALTAPatientID,StudyDate_pirads,Befundname,PI-RADS,StudyDate_mri,study_orthanc_id,csPCa,date_diff
0,385,1895,1075,2018-04-09,mrpro3_001075_090418_0001.docx,4,2018-04-09,71e96a45-a7ec73d8-9be36d79-bd00d430-39977599,False,0
1,414,2002,1838,2019-09-16,mrpro3_001838_160919_0001.docx,4,2019-09-16,a4eaeb07-bf437a45-949cc82c-1c9d901e-9c4147a3,True,0
2,415,2003,4045,2019-08-26,mrpro3_004045_260819_0001.docx,4,2019-08-23,7553a386-2fb62840-1381c008-b8a9064d-57d04de2,False,3
3,47,222,4141,2018-06-12,mrpro3_004141_120618_0001.docx,5,2018-06-11,6e7cef6a-0a96be40-6051df4e-5da0c78d-07658ade,True,1
4,333,1585,4164,2019-11-12,mrpro4_004164_121119_0001.docx,4,2019-11-12,f1f4575b-e7c56a9e-1780ed41-c7e16ab2-255ddbe9,False,0
...,...,...,...,...,...,...,...,...,...,...
540,566,2771,41086,2023-02-14,mrpro3_041086_140223_0001.docx,4,2023-02-14,46c1dc89-39036f09-929d372d-79ced5fb-cc3ed639,True,0
541,561,2747,41098,2023-02-27,mrpro4_041098_270223_0001.docx,4,2023-02-24,dfec2c03-07f6debd-9b0b9963-5c36fb34-aed51467,True,3
542,467,2228,41174,2023-02-23,mrpro3_041174_230223_0001.docx,4,2023-02-23,537953bf-ca35e071-8eaed734-6fcb4f09-a0038462,True,0
543,510,2432,41197,2023-03-06,mrpro3_041197_060323_0001.docx,4,2023-03-04,17cad4bd-f228aa20-6f6d36ca-cd31d0b7-c7b345b1,True,2


In [25]:
# pirads_df_histo = pd.merge(class_df, pirads_df, on=['ALTAPatientID', 'StudyDate'], how='right')

In [26]:
# Keep only rows that have a csPCa label. The explicit .copy() yields an
# independent frame, so adding columns to it afterwards does not raise
# pandas' SettingWithCopyWarning.
pirads_df_histo = pirads_df_histo[pirads_df_histo["csPCa"].notna()].copy()

In [27]:
# Binarise the PI-RADS score at two cut-offs (score > 3 and score > 4).
# `assign` returns a new frame rather than writing into pirads_df_histo, which
# is the result of a boolean-mask filter; in-place column writes on such a
# frame trigger SettingWithCopyWarning. The dict form is needed because the
# column names contain a hyphen.
pirads_df_histo = pirads_df_histo.assign(**{
    'PI-RADS_gt_3': pirads_df_histo['PI-RADS'] > 3,
    'PI-RADS_gt_4': pirads_df_histo['PI-RADS'] > 4,
})

In [28]:
# Sanity check: list rows without a usable PI-RADS score (expected: none).
# On a NumPy-backed numeric column `NaN > 3` evaluates to False, not NaN, so
# the derived flag alone can never reveal a missing score; the raw PI-RADS
# column is therefore checked as well.
pirads_df_histo[pirads_df_histo['PI-RADS'].isna() | pirads_df_histo['PI-RADS_gt_3'].isna()]

Unnamed: 0,index,ID,ALTAPatientID,StudyDate_pirads,Befundname,PI-RADS,StudyDate_mri,study_orthanc_id,csPCa,date_diff,PI-RADS_gt_3,PI-RADS_gt_4


In [32]:
# Persist the matched table as a semicolon-separated CSV (row index included).
# NOTE(review): this cell's execution count (32) is out of sequence with its
# neighbours; confirm the notebook still reproduces under Restart & Run All.
pirads_df_histo.to_csv(path_or_buf="prostate_test_set_pirads.csv", sep=";")

In [30]:

# Prediction: PI-RADS > 3 flag; reference: csPCa label. Both as 0/1 integer arrays.
predicted = pirads_df_histo['PI-RADS_gt_3'].to_numpy().astype(int)
actual = pirads_df_histo['csPCa'].to_numpy().astype(int)

# Precision, recall, accuracy and the confusion matrix
# (rows = reference label, columns = prediction).
precision = precision_score(y_true=actual, y_pred=predicted)
recall = recall_score(y_true=actual, y_pred=predicted)
acc = accuracy_score(y_true=actual, y_pred=predicted)
cm = confusion_matrix(y_true=actual, y_pred=predicted)

# Emit the report in one go; the leading "\n" of the last entry produces the
# blank line before the matrix heading.
report_lines = [
    f"PI-RADS>3 -> csPCa ({len(actual)} samples)",
    f"Recall: {recall:.3f}",
    f"Precision: {precision:.3f}",
    f"Acc: {acc:.3f}",
    "\nConfusion Matrix:",
]
print("\n".join(report_lines))
print(cm)

PI-RADS>3 -> csPCa (545 samples)
Recall: 0.985
Precision: 0.637
Acc: 0.651

Confusion Matrix:
[[ 30 185]
 [  5 325]]


In [31]:

# Prediction: PI-RADS > 4 flag; reference: csPCa label. Both as 0/1 integer arrays.
predicted = pirads_df_histo['PI-RADS_gt_4'].to_numpy().astype(int)
actual = pirads_df_histo['csPCa'].to_numpy().astype(int)

# Precision, recall, accuracy and the confusion matrix
# (rows = reference label, columns = prediction).
precision = precision_score(y_true=actual, y_pred=predicted)
recall = recall_score(y_true=actual, y_pred=predicted)
acc = accuracy_score(y_true=actual, y_pred=predicted)
cm = confusion_matrix(y_true=actual, y_pred=predicted)

# Emit the report in one go; the leading "\n" of the last entry produces the
# blank line before the matrix heading.
report_lines = [
    f"PI-RADS>4 -> csPCa ({len(actual)} samples)",
    f"Recall: {recall:.3f}",
    f"Precision: {precision:.3f}",
    f"Acc: {acc:.3f}",
    "\nConfusion Matrix:",
]
print("\n".join(report_lines))
print(cm)

PI-RADS>4 -> csPCa (545 samples)
Recall: 0.603
Precision: 0.783
Acc: 0.659

Confusion Matrix:
[[160  55]
 [131 199]]


- csPCa 330

- non cs 215