In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, accuracy_score, f1_score, cohen_kappa_score, precision_score, recall_score

from pathlib import Path
this_path = Path().resolve()

In [2]:
data = pd.read_csv(this_path.parent / 'radiomics_features.csv')
data.columns

Index(['sample_id', 'img_name', 'label_name', 'label',
       'original_shape_Elongation', 'original_shape_Flatness',
       'original_shape_LeastAxisLength', 'original_shape_MajorAxisLength',
       'original_shape_Maximum2DDiameterColumn',
       'original_shape_Maximum2DDiameterRow',
       'original_shape_Maximum2DDiameterSlice',
       'original_shape_Maximum3DDiameter', 'original_shape_MeshVolume',
       'original_shape_MinorAxisLength', 'original_shape_Sphericity',
       'original_shape_SurfaceArea', 'original_shape_SurfaceVolumeRatio',
       'original_shape_VoxelVolume', 'original_firstorder_10Percentile',
       'original_firstorder_90Percentile', 'original_firstorder_Energy',
       'original_firstorder_Entropy', 'original_firstorder_InterquartileRange',
       'original_firstorder_Kurtosis', 'original_firstorder_Maximum',
       'original_firstorder_MeanAbsoluteDeviation', 'original_firstorder_Mean',
       'original_firstorder_Median', 'original_firstorder_Minimum',
    

In [3]:
# Load the data
for feat_type in ['shape', 'glcm', 'firstorder']:
    print(f'\n -------------{feat_type} --------- features \n')
    # feat_type = 'shape' # glcm, firstorder
    data = pd.read_csv(this_path.parent / 'radiomics_features.csv')
    X = data.drop(columns=['sample_id', 'img_name', 'label_name', 'label'])
    filter_col = [col for col in X.columns if feat_type in col]
    print(f'Number of features: {len(filter_col)}')
    X = X[filter_col]
    # binarize the labels
    data['label'] = data['label'].map({'normal': 0,
                                    'warp': 1,
                                    'sphere_water': 1,
                                    'sphere_mean': 1,})
    y = data['label']
    print(y.value_counts())
    print(f'Data shape: {X.shape, y.shape}')

    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', RandomForestClassifier(n_estimators=500,
                                              random_state=42,
                                              class_weight='balanced'))
        # ('classifier', svm.SVC(kernel='linear', C=1, random_state=42))
    ])

    # 5fold cross validation
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_validate(pipeline, X, y, cv=kf, 
                            scoring={
                                'accuracy': make_scorer(accuracy_score),
                                'f1': make_scorer(f1_score, average='weighted'),
                                'cohen_kappa': make_scorer(cohen_kappa_score),
                                'precision': make_scorer(precision_score, average='weighted'),
                                'recall': make_scorer(recall_score, average='weighted')
                            }, n_jobs=16)
    print(f'\n Metrics 5-fold crossv: \n')
    # Calculate and print the mean and standard deviation for each metric
    for metric in scores.keys():
        if metric.startswith('test_'):  # Filter out the test scores
            mean_score = np.mean(scores[metric])
            std_score = np.std(scores[metric])
            print(f'{metric[5:]}: {mean_score:.2f} (+/- {std_score:.2f})')


 -------------shape --------- features 

Number of features: 14
label
1    1638
0     546
Name: count, dtype: int64
Data shape: ((2184, 14), (2184,))

 Metrics 5-fold crossv: 

accuracy: 0.82 (+/- 0.02)
f1: 0.81 (+/- 0.03)
cohen_kappa: 0.46 (+/- 0.05)
precision: 0.81 (+/- 0.03)
recall: 0.82 (+/- 0.02)

 -------------glcm --------- features 

Number of features: 22
label
1    1638
0     546
Name: count, dtype: int64
Data shape: ((2184, 22), (2184,))

 Metrics 5-fold crossv: 

accuracy: 0.71 (+/- 0.03)
f1: 0.63 (+/- 0.05)
cohen_kappa: -0.06 (+/- 0.02)
precision: 0.57 (+/- 0.06)
recall: 0.71 (+/- 0.03)

 -------------firstorder --------- features 

Number of features: 18
label
1    1638
0     546
Name: count, dtype: int64
Data shape: ((2184, 18), (2184,))

 Metrics 5-fold crossv: 

accuracy: 0.71 (+/- 0.03)
f1: 0.63 (+/- 0.05)
cohen_kappa: -0.06 (+/- 0.02)
precision: 0.57 (+/- 0.06)
recall: 0.71 (+/- 0.03)


In [38]:
# feat_type = 'shape' # glcm, firstorder
data = pd.read_csv(this_path.parent / 'radiomics_features.csv')
X = data.drop(columns=['sample_id', 'img_name', 'label_name', 'label'])
print(f'Number of features: {len(X.columns)}')
# binarize the labels
data['label'] = data['label'].map({'normal': 0,
                                'warp': 1,
                                'sphere_water': 1,
                                'sphere_mean': 1,})
y = data['label']
print(y.value_counts())
print(f'Data shape: {X.shape, y.shape}')

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(n_estimators=500,
                                              random_state=42,
                                              class_weight='balanced'))
    # ('classifier', svm.SVC(kernel='linear', C=1, random_state=42))
])

# 5fold cross validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_validate(pipeline, X, y, cv=kf, 
                        scoring={
                            'accuracy': make_scorer(accuracy_score),
                            'f1': make_scorer(f1_score, average='weighted'),
                            'cohen_kappa': make_scorer(cohen_kappa_score),
                            'precision': make_scorer(precision_score, average='weighted'),
                            'recall': make_scorer(recall_score, average='weighted')
                        }, n_jobs=16)
print(f'\n Metrics 5-fold crossv: \n')
# Calculate and print the mean and standard deviation for each metric
for metric in scores.keys():
    if metric.startswith('test_'):  # Filter out the test scores
        mean_score = np.mean(scores[metric])
        std_score = np.std(scores[metric])
        print(f'{metric[5:]}: {mean_score:.2f} (+/- {std_score:.2f})')

Number of features: 54
label
1    1638
0     546
Name: count, dtype: int64
Data shape: ((2184, 54), (2184,))

 Metrics 5-fold crossv: 

accuracy: 0.79 (+/- 0.03)
f1: 0.76 (+/- 0.04)
cohen_kappa: 0.33 (+/- 0.07)
precision: 0.77 (+/- 0.03)
recall: 0.79 (+/- 0.03)


In [7]:
# Now let´s train the model with all the data

data = pd.read_csv(this_path.parent / 'radiomics_features.csv')
X = data.drop(columns=['sample_id', 'img_name', 'label_name', 'label'])
filter_col = [col for col in X.columns if 'shape' in col]
print(f'Number of features: {len(filter_col)}')
X = X[filter_col]
# binarize the labels
data['label'] = data['label'].map({'normal': 0,
                                'warp': 1,
                                'sphere_water': 1,
                                'sphere_mean': 1,})
y = data['label']
print(y.value_counts())
print(f'Data shape: {X.shape, y.shape}')

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(n_estimators=500,
                                            random_state=42,
                                            class_weight='balanced'))
])

pipeline.fit(X, y)
# import joblib
# joblib.dump(pipeline, 'rf_shape_model.pkl')
# print('Model saved to disk')

# and perform the prediction with the test data
data = pd.read_csv(this_path.parent / 'test_radiomics_features.csv')
X = data.drop(columns=['sample_id', 'img_name', 'label_name'])
filter_col = [col for col in X.columns if 'shape' in col]
print(f'Number of features: {len(filter_col)}')
X_test = X[filter_col]
y_test = pipeline.predict(X_test)
# save the results
results_df = data[['sample_id', 'img_name']]
results_df['pred_label'] = y_test
print(results_df['pred_label'].value_counts())
# results.to_csv(this_path.parent / 'radiomics_results/results_shape.csv', index=False)


Number of features: 14
label
1    1638
0     546
Name: count, dtype: int64
Data shape: ((2184, 14), (2184,))
Number of features: 14
pred_label
1    365
0    182
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results_df['pred_label'] = y_test


In [8]:
results_df

Unnamed: 0,sample_id,img_name,pred_label
0,sample_0538,sample_0538_crop.nii.gz,0
1,sample_0824,sample_0824_crop.nii.gz,1
2,sample_0813,sample_0813_crop.nii.gz,1
3,sample_0693,sample_0693_crop.nii.gz,1
4,sample_0615,sample_0615_crop.nii.gz,1
...,...,...,...
542,sample_0277,sample_0277_crop.nii.gz,0
543,sample_0566,sample_0566_crop.nii.gz,0
544,sample_0458,sample_0458_crop.nii.gz,1
545,sample_0765,sample_0765_crop.nii.gz,1


In [6]:
test_200 = pd.read_csv('/home/alejandrocu/OutlierDetectionChallenge2024/challenge_results/test_files_200.txt', header=None)
test_set_path = Path("/media/7tb_encrypted/od_chall/dataset/challenge_data/test")
# (test_set_path/'crop').exists()
test_set_files = list((test_set_path/'crops').glob('*crop.nii.gz'))
test_set_files = [f.name.split('_crop')[0] for f in test_set_files]
test_names = test_200[0].str.strip().to_list()
print(len(test_set_files))

547


In [10]:
i = 0
results = []
for row in results_df.iterrows():
    sample_id = row[1]['sample_id']
    # print(sample_id)
    # print(row[1]['Predicted'])
    if sample_id in test_names:
        i += 1
        results.append({'scan_id': sample_id, 'outlier': row[1]['pred_label']})
        # test_names.remove(sample_id)
print(i)
print(len(results))

200
200


In [11]:
import json
# Write results to JSON file
with open(this_path.parent / 'radiomics_results/test_results.json', 'w') as json_file:
    json.dump(results, json_file, indent=4)