# SIIM-FISABIO-RSNA COVID-19 Detection Opacity Submission
Team ML Ballers

## Resources:
- Competition Page: https://www.kaggle.com/c/siim-covid19-detection
- Dataset Info: https://arxiv.org/pdf/2006.01174.pdf
- Annotation Info: https://journals.lww.com/thoracicimaging/Fulltext/2020/11000/Review_of_Chest_Radiograph_Findings_of_COVID_19.4.aspx
- Ensembling methods: https://towardsdatascience.com/ensemble-methods-in-machine-learning-what-are-they-and-why-use-them-68ec3f9fef5f
- What everyone else is doing: https://www.kaggle.com/pvtien96/siim-cov19-efnb7-yolov5-infer
- Interesting ideas: https://www.kaggle.com/davidbroberts/lung-segmentation-without-cnn

In [None]:
import os
import numpy as np
import pandas as pd
import json
import pydicom as dicom
import matplotlib.pylab as plt
import matplotlib.patches as patches
from sklearn.model_selection import train_test_split
from PIL import Image
import time

## Getting Training Data

In [None]:
# Load the image-level training CSV and show it as the cell output.
image_level_csv = '../input/siim-covid19-detection/train_image_level.csv'
train_image_level = pd.read_csv(image_level_csv)
train_image_level

In [None]:
# Load the study-level training CSV and show it as the cell output.
study_level_csv = '../input/siim-covid19-detection/train_study_level.csv'
train_study_level = pd.read_csv(study_level_csv)
train_study_level

In [None]:
# Collapse the four indicator columns into a single text 'Class' label.
# The order is significant: if a row has more than one indicator equal to 1,
# the label assigned last is the one that remains.
label_by_indicator = {
    'Typical Appearance': 'typical',
    'Negative for Pneumonia': 'negative',
    'Indeterminate Appearance': 'indeterminate',
    'Atypical Appearance': 'atypical',
}
for indicator_column, label in label_by_indicator.items():
    is_flagged = train_study_level[indicator_column] == 1
    train_study_level.loc[is_flagged, 'Class'] = label

# Strip the last six characters of the id to obtain the key the image-level
# table is merged on.
train_study_level['StudyInstanceUID'] = train_study_level['id'].str.slice(stop=-6)

train_study_level

In [None]:
# Attach each study's 'Class' label to its image rows through the shared
# StudyInstanceUID column.
study_classes = train_study_level[['StudyInstanceUID', 'Class']]
train_image_level = pd.merge(train_image_level, study_classes, on='StudyInstanceUID')

train_image_level

In [None]:
# Keep a single image per study, then shuffle the rows.
# Sorting by 'boxes' first decides which image is kept: drop_duplicates
# retains the first row it sees for each StudyInstanceUID.
train_image_level = (
    train_image_level
    .sort_values(by=['boxes'])
    .drop_duplicates(subset=['StudyInstanceUID'])
    .sample(frac=1)
    .reset_index(drop=True)
)

train_image_level

In [None]:
# === Store image dimensions (height, width) ===
# Rows that are never matched to a readable DICOM file keep the placeholder 1.
train_image_level['image_height'] = 1
train_image_level['image_width'] = 1

training_image_directory = '../input/siim-covid19-detection/train/'

# Only ids that are still in the table need their file opened at all.
wanted_ids = set(train_image_level['id'])
# (filepath, exception) pairs, reported once the scan is finished.
unreadable_files = []

count = 0

for subdir, dirs, files in os.walk(training_image_directory):
    for filename in files:
        filepath = os.path.join(subdir, filename)

        if filepath.endswith(".dcm"):
            image_id = f'{os.path.splitext(filename)[0]}_image'

            if image_id in wanted_ids:
                try:
                    # Rows/Columns live in the header, so skip the pixel data
                    # and read the file once for both values.
                    header = dicom.dcmread(filepath, stop_before_pixels=True)
                    height, width = header.Rows, header.Columns
                except Exception as e:
                    # Best effort: remember the failure and keep scanning.
                    unreadable_files.append((filepath, e))
                else:
                    is_this_image = train_image_level['id'] == image_id
                    train_image_level.loc[is_this_image, 'image_height'] = height
                    train_image_level.loc[is_this_image, 'image_width'] = width
        count += 1
        print(f'{count} out of {len(train_image_level)}', end='\r')

if unreadable_files:
    print()
    print(f'{len(unreadable_files)} DICOM file(s) could not be read:')
    for failed_path, error in unreadable_files:
        print(f'  {failed_path}: {error}')

train_image_level

## Get Test Data

In [None]:
# Start with an empty submission table; later cells add the prediction rows.
submission_columns = ['id', 'PredictionString']
submission_df = pd.DataFrame(columns=submission_columns)

In [None]:
# Create the whole output tree in one idempotent call, so re-running the cell
# does not report "File exists" errors for folders that are already there.
os.makedirs('opacities/images/test', exist_ok=True)

In [None]:
IMAGE_SIZE = 640


def save_jpg_from_dicom(dicom_img, path, img_id):
    """Convert a DICOM image to an 8-bit JPEG of IMAGE_SIZE x IMAGE_SIZE.

    The pixel data is passed through ``apply_voi_lut``, inverted when the
    photometric interpretation is MONOCHROME1, rescaled to the 0-255 range,
    resized and written to ``{path}im{img_id}.jpg``.

    Args:
        dicom_img: Dataset as returned by ``dicom.dcmread``.
        path: Output directory prefix. It is concatenated as-is, so it must
            already end with a path separator.
        img_id: Identifier placed after the ``im`` prefix in the file name.

    Returns:
        None. A failure while writing the file is printed, not raised.
    """
    data = dicom.pixel_data_handlers.util.apply_voi_lut(dicom_img.pixel_array, dicom_img)

    # Work in float from the start so neither the inversion nor the shift
    # below can wrap around on signed integer pixel types.
    data = data.astype(np.float64)

    if dicom_img.PhotometricInterpretation == "MONOCHROME1":
        data = np.amax(data) - data

    data -= np.min(data)
    peak = np.max(data)
    if peak > 0:
        data = data / peak
    # peak == 0 means a constant image: keep it all zeros instead of
    # dividing by zero and casting NaN to uint8.
    data = (data * 255).astype(np.uint8)
    image = Image.fromarray(data)

    image = image.resize((IMAGE_SIZE, IMAGE_SIZE))

    outfile = f"{path}im{img_id}.jpg"
    try:
        image.save(outfile)
    except Exception as e:
        print(e)
        print("cannot convert")

In [None]:
test_image_directory = '../input/siim-covid19-detection/test/'
# Progress denominator: number of entries directly inside the test directory.
num_test_images = len(os.listdir(test_image_directory))
count = 0
opacities_directory = 'opacities/'
data_step = 'test'

test_image_level_list = []
# Default predictions for files that could not be read or converted.
fallback_rows = []

for subdir, dirs, files in os.walk(test_image_directory):
    for filename in files:
        filepath = subdir + os.sep + filename

        if filepath.endswith(".dcm"):
            # Derive both ids before the try block so the except branch can
            # always rely on them.
            filepath_list = filepath.split('/')
            image_id = f'{filepath_list[-1][:-4]}_image'
            study_id = f'{filepath_list[-3]}_study'

            try:
                dicom_file = dicom.dcmread(filepath)

                test_image_level_list.append([image_id, study_id,
                                              dicom_file.Rows, dicom_file.Columns])

                # Reuse the dataset that was just read instead of parsing the
                # same file a second time.
                save_jpg_from_dicom(dicom_file, f'{opacities_directory}images/{data_step}/', image_id)
            except Exception as e:
                print(f'Could not convert {filepath}: {e}')
                fallback_rows.append({'id': image_id, 'PredictionString': 'none 1 0 0 1 1'})
                fallback_rows.append({'id': study_id, 'PredictionString': 'negative 1 0 0 1 1'})

            count += 1

        print(f'{count} out of {num_test_images}', end='\r')

# DataFrame.append was removed in pandas 2.0; add all fallback rows at once.
if fallback_rows:
    submission_df = pd.concat(
        [submission_df, pd.DataFrame(fallback_rows, columns=['id', 'PredictionString'])],
        ignore_index=True,
    )

test_image_level = pd.DataFrame(test_image_level_list, columns=['image_id', 'study_id', 'image_height', 'image_width'])

In [None]:
# Count the entries directly inside the test directory.
test_directory_entries = os.listdir(test_image_directory)
len(test_directory_entries)

In [None]:
# Show the rows collected so far; at this point they can only be the default
# predictions added for test files that failed to convert.
submission_df

In [None]:
# From here on, count the DICOM files that were actually read (one table row
# each) instead of the entries of the test directory.
num_test_images = len(test_image_level)

## Localization

The weights in the command below come from a separate notebook that is not shown in this repository. Simply link your own weights below if you wish to use this code.

In [None]:
# Run the external detection script with the supplied weights on the converted
# test JPEGs (input size 640, confidence threshold 0.15).
# NOTE(review): --save-txt/--save-conf are presumably what produce the
# per-image label files that a later cell reads from runs/detect/exp/labels
# (one line per detection: class, four box values, confidence) - confirm
# against detect.py.
!python ../input/covid-detection-weights/detect.py --weights ../input/covid-detection-weights/weights.pt --img 640 --source opacities/images/test --save-txt --save-conf --conf-thres 0.15

## Classification

In [None]:
def normalize_bounding_box(box, image_shape):
    """Convert a pixel-space box to normalized (center_x, center_y, width, height).

    Args:
        box: Mapping with 'x', 'y', 'width' and 'height' in pixels, where
            'x'/'y' are the corner the width and height extend from.
        image_shape: (height, width) of the image in pixels.

    Returns:
        np.ndarray ``[center_x, center_y, width, height]``, each value
        divided by the matching image dimension.
    """
    width = int(box['width']) / image_shape[1]
    height = int(box['height']) / image_shape[0]
    # The center lies half an extent away from the corner, not a full one.
    center_x = (int(box['x']) / image_shape[1]) + width / 2
    center_y = (int(box['y']) / image_shape[0]) + height / 2

    return np.array([center_x, center_y, width, height])

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing

def get_tensors(boxes, dimensions, tensor_dims=(4, 4)):
    """Build a fixed-length feature vector from one image's annotated boxes.

    Args:
        boxes: String holding a list of box dicts (single- or double-quoted),
            or NaN/None when the image has no boxes.
        dimensions: (height, width) of the image, forwarded to
            ``normalize_bounding_box``.
        tensor_dims: (maximum number of boxes kept, values kept per box).

    Returns:
        1-D np.ndarray of length ``tensor_dims[0] * tensor_dims[1]``. Box rows
        are ordered by their first value, largest first, and missing boxes
        are zero rows.
    """
    max_boxes, values_per_box = tensor_dims

    if boxes is None or str(boxes) == 'nan':
        return np.zeros(max_boxes * values_per_box)

    # The annotations use single quotes, which json does not accept.
    parsed_boxes = json.loads(str(boxes).replace("'", '"'))
    normalized_boxes = np.array(
        [normalize_bounding_box(box, dimensions) for box in parsed_boxes[:max_boxes]]
    )

    result = np.zeros(tensor_dims)
    # An empty box list yields a 1-D empty array with no second axis to copy.
    if normalized_boxes.size:
        normalized_boxes = normalized_boxes[:, :values_per_box]
        result[:normalized_boxes.shape[0], :normalized_boxes.shape[1]] = normalized_boxes

    # Stable descending sort on the first column, so ties keep their order.
    row_order = np.argsort(-result[:, 0], kind='stable')

    return np.ravel(result[row_order])
    

# Feature matrix: one flattened box tensor per training image.
box_strings = train_image_level['boxes'].tolist()
image_heights = train_image_level['image_height'].tolist()
image_widths = train_image_level['image_width'].tolist()
X = np.array([
    get_tensors(box_string, (image_height, image_width))
    for box_string, image_height, image_width in zip(box_strings, image_heights, image_widths)
])

print(X)

# Encode the four study classes as integer targets.
le = preprocessing.LabelEncoder().fit(['atypical', 'indeterminate', 'negative', 'typical'])
y = le.transform(train_image_level['Class'].tolist())

# Fit on every training row; no hold-out evaluation is run in this cell.
rf = RandomForestClassifier().fit(X, y)

def get_tensors_clean(boxes, tensor_dims=(4, 4)):
    """Pack already-normalized boxes into a fixed-length feature vector.

    Args:
        boxes: Sequence of equal-length numeric box rows; may be empty.
        tensor_dims: (maximum number of boxes kept, values kept per box).

    Returns:
        1-D np.ndarray of length ``tensor_dims[0] * tensor_dims[1]``. Rows are
        ordered by their first value, largest first; unused rows and columns
        are zero.
    """
    max_boxes, values_per_box = tensor_dims

    result = np.zeros(tensor_dims)
    kept_boxes = np.array(boxes)[:max_boxes]

    # An empty input is a 1-D empty array with no second axis to copy from.
    if kept_boxes.size:
        # Drop surplus values per box so the copy always fits the target.
        kept_boxes = kept_boxes[:, :values_per_box]
        result[:kept_boxes.shape[0], :kept_boxes.shape[1]] = kept_boxes

    # Stable descending sort on the first column, so ties keep their order.
    row_order = np.argsort(-result[:, 0], kind='stable')

    return np.ravel(result[row_order])

def get_study_predicition(bounding_boxes):
    """Return the study-level prediction string for one image's detections.

    Args:
        bounding_boxes: Normalized boxes, as accepted by ``get_tensors_clean``.

    Returns:
        ``'<class name> <probability> 0 0 1 1'`` for the most probable class,
        with the probability rounded to three decimals.
    """
    probabilities = rf.predict_proba([get_tensors_clean(bounding_boxes)])[0]

    best_column = np.argmax(probabilities)

    # predict_proba columns follow rf.classes_, so translate the column
    # position into the encoded label before decoding it to a class name.
    encoded_label = rf.classes_[best_column]
    prediction_string = le.inverse_transform([encoded_label])[0]
    prediction_probability = round(probabilities[best_column], 3)

    return f'{prediction_string} {prediction_probability} 0 0 1 1'

In [None]:
# Adds one image-level and one study-level row per converted test image to the
# submission_df created earlier in the notebook.

test_dir = 'opacities/images/test'
detections_dir = 'runs/detect/exp/labels'
# Example label file: runs/detect/exp/labels/im7c40e04c6163_image.txt


def _detection_to_prediction(detection, image_width, image_height):
    """Format one detection as 'opacity <conf> <x_min> <y_min> <x_max> <y_max>'.

    ``detection`` is ``[class, center_x, center_y, width, height, ..., conf]``
    with the box values normalized to 0-1; the corners are returned in pixels
    of the original image.
    """
    center_x, center_y, box_width, box_height = detection[1:5]

    # x values scale with the image width, y values with the image height.
    x_min = round((center_x - (box_width / 2)) * image_width, 3)
    y_min = round((center_y - (box_height / 2)) * image_height, 3)
    x_max = round((center_x + (box_width / 2)) * image_width, 3)
    y_max = round((center_y + (box_height / 2)) * image_height, 3)

    return f'opacity {round(detection[-1], 2)} {x_min} {y_min} {x_max} {y_max}'


# Strip the 'im' prefix and '.txt' suffix; a set gives O(1) membership tests.
detections_file_names = {file[2:-4] for file in os.listdir(detections_dir)}
print(sorted(detections_file_names))

prediction_rows = []

for subdir, dirs, files in os.walk(test_dir):
    for filename in files:
        if not filename.endswith(".jpg"):
            continue

        image_id = filename[2:-4]
        row = test_image_level.loc[test_image_level['image_id'] == image_id]
        if row.empty:
            print(f'No metadata for {filename}; skipped')
            continue

        study_id = f"{row['study_id'].values[0]}"

        # === prediction ===

        opacity_predictions = []
        bounding_boxes = []

        if image_id in detections_file_names:
            image_width = row['image_width'].values[0]
            image_height = row['image_height'].values[0]

            with open(f'{detections_dir}/im{image_id}.txt', 'r') as file:
                for line in file:
                    # e.g. 0 0.932031 0.164062 0.135937 0.328125 0.100465
                    detection = [float(val) for val in line.split()]
                    if not detection:
                        continue  # blank line

                    opacity_predictions.append(
                        _detection_to_prediction(detection, image_width, image_height)
                    )
                    # All four box values, matching the features used for training.
                    bounding_boxes.append(detection[1:5])

        if bounding_boxes:
            opacity_prediction = ' '.join(opacity_predictions)
            study_prediction = get_study_predicition(bounding_boxes)
        else:
            # No label file, or one without detections.
            opacity_prediction = 'none 1 0 0 1 1'
            study_prediction = 'negative 1 0 0 1 1'

        prediction_rows.append({'id': image_id, 'PredictionString': opacity_prediction})
        prediction_rows.append({'id': study_id, 'PredictionString': study_prediction})

# DataFrame.append was removed in pandas 2.0; add all rows with one concat.
if prediction_rows:
    submission_df = pd.concat(
        [submission_df, pd.DataFrame(prediction_rows, columns=['id', 'PredictionString'])],
        ignore_index=True,
    )

## Submission

In [None]:
# Keep one row per id: after sorting by PredictionString, the first occurrence
# of each id is the one that is retained.
submission_df = (
    submission_df
    .sort_values(by=['PredictionString'])
    .drop_duplicates(subset=['id'])
)

# Split the rows by the marker contained in the id.
submission_ids = submission_df['id']
submit_studies = submission_df[submission_ids.str.contains('study')]
submit_images = submission_df[submission_ids.str.contains('image')]

In [None]:
# Load the sample submission and split it into study-level and image-level
# rows by the marker contained in the id.
sample_submission_csv = '../input/siim-covid19-detection/sample_submission.csv'
sample_submission = pd.read_csv(sample_submission_csv)

sample_is_study = sample_submission['id'].str.contains('study')
sample_is_image = sample_submission['id'].str.contains('image')
sample_studies = sample_submission[sample_is_study]
sample_images = sample_submission[sample_is_image]

In [None]:
#removing duplicates
sample_submission = sample_submission.drop_duplicates(subset=['id'])
sample_studies = sample_submission.loc[sample_submission['id'].str.contains('study')]
sample_images = sample_submission.loc[sample_submission['id'].str.contains('image')]

In [None]:
# Write the final predictions to submission.csv without the DataFrame index column.
submission_df.to_csv('submission.csv', index=False)