In [71]:
import os
import math
import random
import functions
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### IARC Image Bank Colpo

In [72]:
iarc_colpo_datapath = '/workspace/experiments/vit/datasets/IARCImageBankColpo/'
df_c = pd.read_excel(iarc_colpo_datapath+'cases_metadata.xlsx', header=1)

In [73]:
df_c = df_c.loc[:, ['Case Number', 'Histopathology']]
df_c

Unnamed: 0,Case Number,Histopathology
0,1,Not done
1,2,Not done
2,3,Not done
3,4,Not done
4,5,Not done
...,...,...
195,196,HSIL-CIN3
196,197,HSIL-CIN3
197,198,Not done
198,199,Not done


In [74]:
df_c.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Case Number     200 non-null    int64 
 1   Histopathology  200 non-null    object
dtypes: int64(1), object(1)
memory usage: 3.2+ KB


In [75]:
df_c['Histopathology'].unique()

array(['Not done', 'Normal',
       'Tuberculosis of cervix with giant cells and granulation tissue',
       'LSIL-CIN1', 'HSIL-CIN2', 'HSIL-CIN3',
       'HSIL-CIN3; LSIL-CIN1 in the anterior lip',
       'Cervical histopathology was invasive squamous cell cancer, vaginal histopathology was HSIL-VAIN 3, and vulvar histopathology was HSIL-VIN 3',
       'Microinvasive squamous cell cancer',
       'Invasive squamous cell carcinoma',
       'Microinvasive squamous cell carcinoma', 'Squamous cell carcinoma',
       'Adenocarcinoma in situ', 'Adenocarcinoma', 'Mucous polyp',
       'LSIL-HPV changes'], dtype=object)

In [76]:
investigated_labels = ['Not done', 'Normal', 'CIN1', 'CIN2', 'CIN3']

In [77]:
df_c = df_c[df_c.Histopathology.apply(lambda d: any(l in d for l in investigated_labels))]
df_c['Histopathology'].unique()

array(['Not done', 'Normal', 'LSIL-CIN1', 'HSIL-CIN2', 'HSIL-CIN3',
       'HSIL-CIN3; LSIL-CIN1 in the anterior lip'], dtype=object)

In [78]:
df_c = df_c.loc[df_c['Histopathology'] != 'HSIL-CIN3; LSIL-CIN1 in the anterior lip']

In [79]:
df_c['Histopathology'].unique()

array(['Not done', 'Normal', 'LSIL-CIN1', 'HSIL-CIN2', 'HSIL-CIN3'],
      dtype=object)

In [80]:
# normal cases
df_normal = df_c.query("Histopathology == 'Normal' or Histopathology == 'Not done'")
normal_cases_ids = df_normal['Case Number'].unique()

# abnormal cases
df_cin1 = df_c[df_c.Histopathology.apply(lambda d: any(l in d for l in ['CIN1']))]
cin1_cases_ids = df_cin1['Case Number'].unique()

df_cin2 = df_c[df_c.Histopathology.apply(lambda d: any(l in d for l in ['CIN2']))]
cin2_cases_ids = df_cin2['Case Number'].unique()

df_cin3 = df_c[df_c.Histopathology.apply(lambda d: any(l in d for l in ['CIN3']))]
cin3_cases_ids = df_cin3['Case Number'].unique()

In [81]:
df_c_images = pd.read_excel(iarc_colpo_datapath+'cases_images.xlsx')
df_c_images.head()

Unnamed: 0,Case Number,File,Sort,Type
0,1,AAAW1.jpg,1,After normal saline
1,1,AAAW3.jpg,2,After acetic acid
2,1,AAAW2.jpg,3,After acetic acid with green filter
3,1,AAAW4.jpg,4,After Lugol’s iodine
4,2,AADE0.jpg,1,After acetic acid


In [82]:
df_c_images[df_c_images['Case Number'] == 42]

Unnamed: 0,Case Number,File,Sort,Type
180,42,AADF0.jpg,1,Speculum examination
181,42,AADF1.jpg,2,After normal saline with green filter
182,42,AADF5.jpg,3,After acetic acid
183,42,AADF4.jpg,4,After acetic acid with higher magnification
184,42,AAFN5.jpg,5,After Lugol’s iodine


In [83]:
#excluding image it's not present in the 
df_c_images = df_c_images[df_c_images['File'] != 'AAFN5.jpg']

In [84]:
df_c_images[df_c_images['Case Number'] == 42]

Unnamed: 0,Case Number,File,Sort,Type
180,42,AADF0.jpg,1,Speculum examination
181,42,AADF1.jpg,2,After normal saline with green filter
182,42,AADF5.jpg,3,After acetic acid
183,42,AADF4.jpg,4,After acetic acid with higher magnification


In [85]:
df_colpo = df_c_images.loc[:, ['Case Number', 'File']]

In [86]:
directory_cases_names_colpo = functions.rename_directories_dataframe(df_colpo, 'colpo')

In [87]:
df_colpo['Case directory'] = directory_cases_names_colpo
df_colpo.head()

Unnamed: 0,Case Number,File,Case directory
0,1,AAAW1.jpg,Case_001
1,1,AAAW3.jpg,Case_001
2,1,AAAW2.jpg,Case_001
3,1,AAAW4.jpg,Case_001
4,2,AADE0.jpg,Case_002


In [88]:
diagnosis_colpo = np.zeros(len(df_colpo), dtype=int)
df_colpo['Diagnosis'] = diagnosis_colpo

In [89]:
df_colpo['Diagnosis'] = np.where(df_c_images['Case Number'].isin(normal_cases_ids), 0, df_colpo['Diagnosis'])
df_colpo['Diagnosis'] = np.where(df_c_images['Case Number'].isin(cin1_cases_ids), 1, df_colpo['Diagnosis'])
df_colpo['Diagnosis'] = np.where(df_c_images['Case Number'].isin(cin2_cases_ids), 2, df_colpo['Diagnosis'])
df_colpo['Diagnosis'] = np.where(df_c_images['Case Number'].isin(cin3_cases_ids), 3, df_colpo['Diagnosis'])

In [90]:
df_colpo['Diagnosis'].unique()

array([0, 1, 2, 3])

In [91]:
df_colpo['File'] = iarc_colpo_datapath+df_colpo['Case directory']+'/'+df_colpo['File']
df_colpo = df_colpo.drop('Case directory', axis=1)

In [92]:
df_colpo.values

array([[1,
        '/workspace/experiments/vit/datasets/IARCImageBankColpo/Case_001/AAAW1.jpg',
        0],
       [1,
        '/workspace/experiments/vit/datasets/IARCImageBankColpo/Case_001/AAAW3.jpg',
        0],
       [1,
        '/workspace/experiments/vit/datasets/IARCImageBankColpo/Case_001/AAAW2.jpg',
        0],
       ...,
       [200,
        '/workspace/experiments/vit/datasets/IARCImageBankColpo/Case_200/AAGT2.jpg',
        0],
       [200,
        '/workspace/experiments/vit/datasets/IARCImageBankColpo/Case_200/AAGT4.jpg',
        0],
       [200,
        '/workspace/experiments/vit/datasets/IARCImageBankColpo/Case_200/AAGT5.jpg',
        0]], dtype=object)

### IARC Image Bank VIA

In [93]:
iarc_via_datapath = '/workspace/experiments/vit/datasets/IARCImageBankVIA/'
df_v = pd.read_excel(iarc_via_datapath+'cases_metadata.xlsx')

In [94]:
df_v.head(5)

Unnamed: 0,CaseNumber,CaseID,SCJ,SCJ Location,Acitowhite area,Acitowhite area color,Acitowhite area margin,Acitowhite area surface,Acitowhite area location,Acitowhite area size,VIA,Eligibility for ablative treatment,Histology findings
0,1,AFC,Fully visible,On ectocervix,Present,Thin,Diffuse,Smooth,Within TZ or close to the external os (if SCJ ...,Covering more than 75% of ectocervix,Negative,Eligible for ablation,Not done
1,2,AJL,Fully visible,On ectocervix,Absent,,,,,,Negative,Eligible for ablation,Not done
2,3,AGY,Partially visible,Partly on ectocervix and partly on endocervix,Absent,,,,,,Negative,Eligible for ablation,Not done
3,4,AJE,Fully visible,On ectocervix,Absent,,,,,,Negative,Eligible for ablation,Not done
4,5,AHS,Fully visible,On ectocervix,Absent,,,,,,Negative,Eligible for ablation,Not done


In [95]:
df_v.rename(columns={'Histology findings': 'Histology_findings'}, inplace=True)

In [96]:
df_v.columns

Index(['CaseNumber', 'CaseID', 'SCJ', 'SCJ Location', 'Acitowhite area',
       'Acitowhite area color', 'Acitowhite area margin',
       'Acitowhite area surface', 'Acitowhite area location',
       'Acitowhite area size', 'VIA', 'Eligibility for ablative treatment',
       'Histology_findings'],
      dtype='object')

In [97]:
df_v.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 186 entries, 0 to 185
Data columns (total 13 columns):
 #   Column                              Non-Null Count  Dtype 
---  ------                              --------------  ----- 
 0   CaseNumber                          186 non-null    int64 
 1   CaseID                              186 non-null    object
 2   SCJ                                 186 non-null    object
 3   SCJ Location                        137 non-null    object
 4   Acitowhite area                     186 non-null    object
 5   Acitowhite area color               125 non-null    object
 6   Acitowhite area margin              125 non-null    object
 7   Acitowhite area surface             124 non-null    object
 8   Acitowhite area location            124 non-null    object
 9   Acitowhite area size                124 non-null    object
 10  VIA                                 186 non-null    object
 11  Eligibility for ablative treatment  186 non-null    object

In [98]:
df_v.describe()

Unnamed: 0,CaseNumber
count,186.0
mean,93.5
std,53.837719
min,1.0
25%,47.25
50%,93.5
75%,139.75
max,186.0


In [99]:
df_v = df_v.loc[:, ['CaseNumber', 'Histology_findings']]
df_v

Unnamed: 0,CaseNumber,Histology_findings
0,1,Not done
1,2,Not done
2,3,Not done
3,4,Not done
4,5,Not done
...,...,...
181,182,Squamous cell carcinoma
182,183,Adenocarcinoma in situ
183,184,Adenocarcinoma
184,185,Adenocarcinoma


In [100]:
df_v.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 186 entries, 0 to 185
Data columns (total 2 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   CaseNumber          186 non-null    int64 
 1   Histology_findings  186 non-null    object
dtypes: int64(1), object(1)
memory usage: 3.0+ KB


In [101]:
df_v['Histology_findings'].unique()

array(['Not done', 'Normal', 'LSIL-HPV changes', 'HSIL-CIN3', 'LSIL-CIN1',
       'HSIL-CIN2', 'Anterior lip: HSIL-CIN2; Posterior lip: LSIL-CIN1',
       'Microinvasive carcinoma',
       'Posterior lip: HSIL-CIN3; Anterior lip: normal',
       'Microinvasive squamous cell carcinoma',
       'Posterior lip: LSIL-HPV changes; Anterior lip: Normal',
       'Posterior lip: LSIL-CIN1; Anterior lip: LSIL-HPV changes',
       'Anterior lip (10 o’clock): LSIL-CIN1',
       'Posterior lip: LSIL-HPV changes', 'Squamous cell carcinoma',
       'Invasive squamous cell carcinoma', 'Adenocarcinoma in situ',
       'Adenocarcinoma'], dtype=object)

In [102]:
investigated_labels_via = ['Not done', 'Normal', 'CIN1', 'CIN2', 'CIN3']

In [103]:
df_v = df_v[df_v.Histology_findings.apply(lambda d: any(l in d for l in investigated_labels_via))]
df_v['Histology_findings'].unique()

array(['Not done', 'Normal', 'HSIL-CIN3', 'LSIL-CIN1', 'HSIL-CIN2',
       'Anterior lip: HSIL-CIN2; Posterior lip: LSIL-CIN1',
       'Posterior lip: HSIL-CIN3; Anterior lip: normal',
       'Posterior lip: LSIL-HPV changes; Anterior lip: Normal',
       'Posterior lip: LSIL-CIN1; Anterior lip: LSIL-HPV changes',
       'Anterior lip (10 o’clock): LSIL-CIN1'], dtype=object)

In [104]:
df_v = df_v.loc[df_v['Histology_findings'] != 'Anterior lip: HSIL-CIN2; Posterior lip: LSIL-CIN1']
df_v = df_v.loc[df_v['Histology_findings'] != 'Posterior lip: HSIL-CIN3; Anterior lip: normal']
df_v = df_v.loc[df_v['Histology_findings'] != 'Posterior lip: LSIL-HPV changes; Anterior lip: Normal']

In [105]:
# normal cases
df_normal_via = df_v.query("Histology_findings == 'Normal' or Histology_findings == 'Not done'")
normal_cases_ids_via = df_normal_via['CaseNumber'].unique()

# abnormal cases
df_cin1_via = df_v[df_v.Histology_findings.apply(lambda d: any(l in d for l in ['CIN1']))]
cin1_cases_ids_via = df_cin1_via['CaseNumber'].unique()

df_cin2_via = df_v[df_v.Histology_findings.apply(lambda d: any(l in d for l in ['CIN2']))]
cin2_cases_ids_via = df_cin2_via['CaseNumber'].unique()

df_cin3_via = df_v[df_v.Histology_findings.apply(lambda d: any(l in d for l in ['CIN3']))]
cin3_cases_ids_via = df_cin3_via['CaseNumber'].unique()

In [106]:
df_cin1_via

Unnamed: 0,CaseNumber,Histology_findings
107,108,LSIL-CIN1
132,133,Posterior lip: LSIL-CIN1; Anterior lip: LSIL-H...
134,135,LSIL-CIN1
135,136,LSIL-CIN1
136,137,LSIL-CIN1
137,138,Anterior lip (10 o’clock): LSIL-CIN1
138,139,LSIL-CIN1
141,142,LSIL-CIN1
142,143,LSIL-CIN1
143,144,LSIL-CIN1


In [107]:
df_images_via = pd.read_excel(iarc_via_datapath+'cases_images.xlsx')
df_images_via.head()

Unnamed: 0,CaseNumber,File,Type
0,1,AFC0.jpg,Before application of acetic acid
1,1,AFC1.jpg,After application of acetic acid
2,2,AJL0.jpg,Before application of acetic acid
3,2,AJL1.jpg,After application of acetic acid
4,3,AGY0.jpg,Before application of acetic acid


In [108]:
df_images_via.shape

(420, 3)

In [109]:
df_images_via[df_images_via['CaseNumber'] == 166]

Unnamed: 0,CaseNumber,File,Type
377,166,APW1.jpg,Before application of acetic acid
378,166,APW2.jpg,After application of acetic acid
379,166,APW3.jpg,After application of Lugol’s iodine


In [110]:
#excluding image it's not present in the 
df_images_via = df_images_via[df_images_via['CaseNumber'] != 166]

In [111]:
df_images_via.shape

(417, 3)

In [112]:
df_via = df_images_via.loc[:, ['CaseNumber', 'File']]

In [113]:
directory_cases_names_via = functions.rename_directories_dataframe(df_via, 'via')

In [114]:
df_via['Case directory'] = directory_cases_names_via
df_via.head()

Unnamed: 0,CaseNumber,File,Case directory
0,1,AFC0.jpg,Case_001
1,1,AFC1.jpg,Case_001
2,2,AJL0.jpg,Case_002
3,2,AJL1.jpg,Case_002
4,3,AGY0.jpg,Case_003


In [115]:
diagnosis_via = np.zeros(len(df_via), dtype=int)
df_via['Diagnosis'] = diagnosis_via

In [116]:
df_via['Diagnosis'] = np.where(df_images_via['CaseNumber'].isin(normal_cases_ids), 0, df_via['Diagnosis'])
df_via['Diagnosis'] = np.where(df_images_via['CaseNumber'].isin(cin1_cases_ids), 1, df_via['Diagnosis'])
df_via['Diagnosis'] = np.where(df_images_via['CaseNumber'].isin(cin2_cases_ids), 2, df_via['Diagnosis'])
df_via['Diagnosis'] = np.where(df_images_via['CaseNumber'].isin(cin3_cases_ids), 3, df_via['Diagnosis'])

In [117]:
df_via['Diagnosis'].unique()

array([0, 1, 2, 3])

In [118]:
df_via['File'] = iarc_via_datapath+df_via['Case directory']+'/'+df_via['File']
df_via = df_via.drop('Case directory', axis=1)

In [119]:
df_via.values

array([[1,
        '/workspace/experiments/vit/datasets/IARCImageBankVIA/Case_001/AFC0.jpg',
        0],
       [1,
        '/workspace/experiments/vit/datasets/IARCImageBankVIA/Case_001/AFC1.jpg',
        0],
       [2,
        '/workspace/experiments/vit/datasets/IARCImageBankVIA/Case_002/AJL0.jpg',
        0],
       ...,
       [185,
        '/workspace/experiments/vit/datasets/IARCImageBankVIA/Case_185/ADF1.jpg',
        0],
       [186,
        '/workspace/experiments/vit/datasets/IARCImageBankVIA/Case_186/ABK0.jpg',
        0],
       [186,
        '/workspace/experiments/vit/datasets/IARCImageBankVIA/Case_186/ABK1.jpg',
        0]], dtype=object)

In [120]:
df_via.describe(include=object)

Unnamed: 0,File
count,417
unique,417
top,/workspace/experiments/vit/datasets/IARCImageB...
freq,1


### Merging both dataframes

In [121]:
df_colpo

Unnamed: 0,Case Number,File,Diagnosis
0,1,/workspace/experiments/vit/datasets/IARCImageB...,0
1,1,/workspace/experiments/vit/datasets/IARCImageB...,0
2,1,/workspace/experiments/vit/datasets/IARCImageB...,0
3,1,/workspace/experiments/vit/datasets/IARCImageB...,0
4,2,/workspace/experiments/vit/datasets/IARCImageB...,0
...,...,...,...
900,199,/workspace/experiments/vit/datasets/IARCImageB...,0
901,200,/workspace/experiments/vit/datasets/IARCImageB...,0
902,200,/workspace/experiments/vit/datasets/IARCImageB...,0
903,200,/workspace/experiments/vit/datasets/IARCImageB...,0


In [122]:
df_colpo['Case Number'].unique().max()

200

In [123]:
df_via.rename(columns={'CaseNumber': 'Case Number'}, inplace=True)
df_via

Unnamed: 0,Case Number,File,Diagnosis
0,1,/workspace/experiments/vit/datasets/IARCImageB...,0
1,1,/workspace/experiments/vit/datasets/IARCImageB...,0
2,2,/workspace/experiments/vit/datasets/IARCImageB...,0
3,2,/workspace/experiments/vit/datasets/IARCImageB...,0
4,3,/workspace/experiments/vit/datasets/IARCImageB...,0
...,...,...,...
415,184,/workspace/experiments/vit/datasets/IARCImageB...,0
416,185,/workspace/experiments/vit/datasets/IARCImageB...,0
417,185,/workspace/experiments/vit/datasets/IARCImageB...,0
418,186,/workspace/experiments/vit/datasets/IARCImageB...,0


In [124]:
df_via['Case Number'] += 200

In [125]:
df_via['Case Number'].unique()

array([201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213,
       214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226,
       227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
       240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252,
       253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265,
       266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278,
       279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291,
       292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304,
       305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317,
       318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330,
       331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343,
       344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356,
       357, 358, 359, 360, 361, 362, 363, 364, 365, 367, 368, 369, 370,
       371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 38

In [126]:
df_general = pd.concat([df_colpo, df_via])
df_general = df_general.reset_index()
df_general

Unnamed: 0,index,Case Number,File,Diagnosis
0,0,1,/workspace/experiments/vit/datasets/IARCImageB...,0
1,1,1,/workspace/experiments/vit/datasets/IARCImageB...,0
2,2,1,/workspace/experiments/vit/datasets/IARCImageB...,0
3,3,1,/workspace/experiments/vit/datasets/IARCImageB...,0
4,4,2,/workspace/experiments/vit/datasets/IARCImageB...,0
...,...,...,...,...
1316,415,384,/workspace/experiments/vit/datasets/IARCImageB...,0
1317,416,385,/workspace/experiments/vit/datasets/IARCImageB...,0
1318,417,385,/workspace/experiments/vit/datasets/IARCImageB...,0
1319,418,386,/workspace/experiments/vit/datasets/IARCImageB...,0


In [127]:
df_general = df_general.loc[:, ['Case Number', 'File', 'Diagnosis']]
df_general

Unnamed: 0,Case Number,File,Diagnosis
0,1,/workspace/experiments/vit/datasets/IARCImageB...,0
1,1,/workspace/experiments/vit/datasets/IARCImageB...,0
2,1,/workspace/experiments/vit/datasets/IARCImageB...,0
3,1,/workspace/experiments/vit/datasets/IARCImageB...,0
4,2,/workspace/experiments/vit/datasets/IARCImageB...,0
...,...,...,...
1316,384,/workspace/experiments/vit/datasets/IARCImageB...,0
1317,385,/workspace/experiments/vit/datasets/IARCImageB...,0
1318,385,/workspace/experiments/vit/datasets/IARCImageB...,0
1319,386,/workspace/experiments/vit/datasets/IARCImageB...,0


#### Split dataframe

In [128]:
total_cases = df_general['Case Number'].unique()

train_cases_aux, test_cases = functions.split_two(total_cases, train_ratio=0.8, val_ratio=0.2, seed=4619)
train_cases, val_cases = functions.split_two(train_cases_aux, train_ratio=0.7, val_ratio=0.3, seed=3724)

df_train = df_general[df_general['Case Number'].isin(train_cases)]
df_val = df_general[df_general['Case Number'].isin(val_cases)]
df_test = df_general[df_general['Case Number'].isin(test_cases)]

In [129]:
#checking if its alright
cases_inter_trainval = np.intersect1d(df_train['Case Number'].unique(), df_val['Case Number'].unique())
cases_inter_valtest = np.intersect1d(df_test['Case Number'].unique(), df_val['Case Number'].unique())
cases_inter_traintest = np.intersect1d(df_train['Case Number'].unique(), df_test['Case Number'].unique())
print(cases_inter_trainval, cases_inter_valtest, cases_inter_traintest)

[] [] []


In [130]:
df_train

Unnamed: 0,Case Number,File,Diagnosis
0,1,/workspace/experiments/vit/datasets/IARCImageB...,0
1,1,/workspace/experiments/vit/datasets/IARCImageB...,0
2,1,/workspace/experiments/vit/datasets/IARCImageB...,0
3,1,/workspace/experiments/vit/datasets/IARCImageB...,0
12,4,/workspace/experiments/vit/datasets/IARCImageB...,0
...,...,...,...
1308,380,/workspace/experiments/vit/datasets/IARCImageB...,0
1309,381,/workspace/experiments/vit/datasets/IARCImageB...,0
1310,381,/workspace/experiments/vit/datasets/IARCImageB...,0
1315,384,/workspace/experiments/vit/datasets/IARCImageB...,0


In [131]:
df_train = df_train.reset_index()
df_train = df_train.loc[:, ['Case Number', 'File', 'Diagnosis']]
df_train

Unnamed: 0,Case Number,File,Diagnosis
0,1,/workspace/experiments/vit/datasets/IARCImageB...,0
1,1,/workspace/experiments/vit/datasets/IARCImageB...,0
2,1,/workspace/experiments/vit/datasets/IARCImageB...,0
3,1,/workspace/experiments/vit/datasets/IARCImageB...,0
4,4,/workspace/experiments/vit/datasets/IARCImageB...,0
...,...,...,...
745,380,/workspace/experiments/vit/datasets/IARCImageB...,0
746,381,/workspace/experiments/vit/datasets/IARCImageB...,0
747,381,/workspace/experiments/vit/datasets/IARCImageB...,0
748,384,/workspace/experiments/vit/datasets/IARCImageB...,0


In [132]:
df_val

Unnamed: 0,Case Number,File,Diagnosis
39,10,/workspace/experiments/vit/datasets/IARCImageB...,0
40,10,/workspace/experiments/vit/datasets/IARCImageB...,0
41,10,/workspace/experiments/vit/datasets/IARCImageB...,0
42,10,/workspace/experiments/vit/datasets/IARCImageB...,0
82,20,/workspace/experiments/vit/datasets/IARCImageB...,0
...,...,...,...
1302,377,/workspace/experiments/vit/datasets/IARCImageB...,0
1303,378,/workspace/experiments/vit/datasets/IARCImageB...,0
1304,378,/workspace/experiments/vit/datasets/IARCImageB...,0
1313,383,/workspace/experiments/vit/datasets/IARCImageB...,0


In [133]:
df_val = df_val.reset_index()
df_val = df_val.loc[:, ['Case Number', 'File', 'Diagnosis']]
df_val

Unnamed: 0,Case Number,File,Diagnosis
0,10,/workspace/experiments/vit/datasets/IARCImageB...,0
1,10,/workspace/experiments/vit/datasets/IARCImageB...,0
2,10,/workspace/experiments/vit/datasets/IARCImageB...,0
3,10,/workspace/experiments/vit/datasets/IARCImageB...,0
4,20,/workspace/experiments/vit/datasets/IARCImageB...,0
...,...,...,...
307,377,/workspace/experiments/vit/datasets/IARCImageB...,0
308,378,/workspace/experiments/vit/datasets/IARCImageB...,0
309,378,/workspace/experiments/vit/datasets/IARCImageB...,0
310,383,/workspace/experiments/vit/datasets/IARCImageB...,0


In [134]:
df_test

Unnamed: 0,Case Number,File,Diagnosis
4,2,/workspace/experiments/vit/datasets/IARCImageB...,0
5,2,/workspace/experiments/vit/datasets/IARCImageB...,0
6,2,/workspace/experiments/vit/datasets/IARCImageB...,0
7,2,/workspace/experiments/vit/datasets/IARCImageB...,0
8,3,/workspace/experiments/vit/datasets/IARCImageB...,0
...,...,...,...
1312,382,/workspace/experiments/vit/datasets/IARCImageB...,0
1317,385,/workspace/experiments/vit/datasets/IARCImageB...,0
1318,385,/workspace/experiments/vit/datasets/IARCImageB...,0
1319,386,/workspace/experiments/vit/datasets/IARCImageB...,0


In [135]:
df_test = df_test.reset_index()
df_test = df_test.loc[:, ['Case Number', 'File', 'Diagnosis']]
df_test

Unnamed: 0,Case Number,File,Diagnosis
0,2,/workspace/experiments/vit/datasets/IARCImageB...,0
1,2,/workspace/experiments/vit/datasets/IARCImageB...,0
2,2,/workspace/experiments/vit/datasets/IARCImageB...,0
3,2,/workspace/experiments/vit/datasets/IARCImageB...,0
4,3,/workspace/experiments/vit/datasets/IARCImageB...,0
...,...,...,...
254,382,/workspace/experiments/vit/datasets/IARCImageB...,0
255,385,/workspace/experiments/vit/datasets/IARCImageB...,0
256,385,/workspace/experiments/vit/datasets/IARCImageB...,0
257,386,/workspace/experiments/vit/datasets/IARCImageB...,0


In [136]:
prepared_data_path = '/workspace/experiments/vit/datasets/prepared_data/'

df_train.to_csv(prepared_data_path+'iarc_multiclass_general_train.csv', index=False)
df_val.to_csv(prepared_data_path+'iarc_multiclass_general_val.csv', index=False)
df_test.to_csv(prepared_data_path+'iarc_multiclass_general_test.csv', index=False)

In [137]:
df_train['Diagnosis'].unique()

array([0, 1, 2, 3])

In [138]:
df_val['Diagnosis'].unique()

array([0, 1, 2, 3])

In [139]:
df_test['Diagnosis'].unique()

array([0, 1, 2, 3])