In [1]:
import tensorflow as tf
from glob import glob
import numpy as np
import pickle
import json
import os
import random
import matplotlib.pyplot as plt

In [2]:
# Hide every CUDA device from this process so that computation runs on the CPU only.
# NOTE(review): this is set after `import tensorflow`; presumably it still takes
# effect because no GPU has been initialised at this point -- confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

In [3]:
def save_data(output_path, mydata):
    """Serialize ``mydata`` with pickle and write it to ``output_path``.

    Args:
        output_path: Destination file path; an existing file is overwritten.
        mydata: Any picklable Python object.
    """
    with open(output_path, mode='wb') as sink:
        pickle.dump(mydata, sink)
        
def load_data(data_path):
    """Load and return the object stored in the pickle file at ``data_path``.

    Args:
        data_path: Path of a pickle file (for example one written by ``save_data``).

    Returns:
        The unpickled Python object.
    """
    # The context manager closes the handle promptly, even when unpickling fails.
    # NOTE: unpickling can execute arbitrary code -- only load files you trust.
    with open(data_path, 'rb') as f:
        return pickle.load(f)

def convert_binary_score(severe_score):
    """Binarise a severity score: 1 when it is at least 10, otherwise 0."""
    return 1 if severe_score >= 10 else 0
    
def lower_filepath(filepath):
    """Return ``filepath`` with its file extension converted to lower case.

    Only the final extension is changed; directory names and the file stem keep
    their original case.

    Args:
        filepath: Path string, e.g. ``"imgs/scan.JPG"``.

    Returns:
        str: The same path with a lower-cased extension. A path that has no
        extension is returned unchanged.
    """
    # splitext cuts at the last dot of the final path component, so additional
    # dots elsewhere in the path ("./dir", "v1.2/", "a.b.JPG") are handled;
    # unpacking filepath.split(".") into two names raised ValueError for them.
    stem, extension = os.path.splitext(filepath)
    return stem + extension.lower()
    
def shuffle_data(mydata):
    """Return the items of ``mydata`` in random order as a NumPy array.

    The input is copied into an array first, so the caller's object is not
    reordered. The permutation is drawn with the standard-library ``random``
    module and therefore follows ``random.seed``.
    """
    items = np.array(mydata)
    order = np.arange(len(items))
    random.shuffle(order)  # permute the positions in place
    shuffled = items[order]
    return shuffled
    
def _visit_is_usable(visit):
    """Return True when ``visit`` has both severity scores and both image entries."""
    try:
        scores_ok = all(not np.isnan(visit[key]) for key in ("AMDSEVRE", "AMDSEVLE"))
        images_ok = all(len(visit[key]) > 0 for key in ("RE_IMG", "LE_IMG"))
    except (KeyError, TypeError, ValueError):
        # Missing key, or a value that np.isnan()/len() cannot handle (None, a
        # string score, ...): the visit is treated as unusable. Only these
        # data-related errors are caught, so KeyboardInterrupt and SystemExit
        # still propagate.
        return False
    return scores_ok and images_ok


def filter_available_visits(amd_record_visit):
    """Keep only the visits that carry complete label and image information.

    A visit is kept when all of the following hold:
      * ``visit["AMDSEVRE"]`` and ``visit["AMDSEVLE"]`` exist and are not NaN;
      * ``visit["RE_IMG"]`` and ``visit["LE_IMG"]`` exist and are non-empty.

    Visits with a missing key or a malformed value are dropped silently.

    Args:
        amd_record_visit: Iterable of visit mappings.

    Returns:
        list: The visits that passed every check, in their original order.
    """
    return [visit for visit in amd_record_visit if _visit_is_usable(visit)]

In [4]:
def build_entire_data_list(data_dir):
    """Collect the image paths of every usable visit in a participants JSON file.

    The file is expected to hold a sequence of patient records, each with a
    ``"VISITS"`` list. Visits are screened with ``filter_available_visits``; for
    each visit that passes, its ``"RE_IMG"`` and ``"LE_IMG"`` paths are appended
    (in that order) after ``lower_filepath`` has lower-cased their extension.

    Args:
        data_dir: Path of the JSON file to read (a file path, despite the name).

    Returns:
        list: Flat list of image paths, two entries per retained visit.
    """
    # Close the file as soon as its content has been parsed.
    with open(data_dir) as json_file:
        amd_data = json.load(json_file)

    entire_data_list = []
    count_removed_patient = 0

    for idx, record in enumerate(amd_data):
        # Progress message once every 100 records (0, 100, 200, ...).
        if idx % 100 == 0:
            print("{} patients processed...".format(idx))

        filtered_visits = filter_available_visits(record["VISITS"])

        # Patients without a single usable visit contribute nothing.
        if not filtered_visits:
            count_removed_patient += 1
            continue

        for visit in filtered_visits:
            entire_data_list.append(lower_filepath(visit["RE_IMG"]))
            entire_data_list.append(lower_filepath(visit["LE_IMG"]))

    print("{} patients that have no valid visits were excluded".format(count_removed_patient))

    return entire_data_list

In [5]:
# Build the flat list of RE_IMG/LE_IMG paths for all usable visits in the participants JSON file.
# NOTE(review): absolute, user-specific path -- consider a configurable data directory.
entire_data_list = build_entire_data_list("/home/jl5307/current_research/AMD_prediction/data/AREDS_participants_amd3.json")

0 patients processed...
100 patients processed...
200 patients processed...
300 patients processed...
400 patients processed...
500 patients processed...
600 patients processed...
700 patients processed...
800 patients processed...
900 patients processed...
1000 patients processed...
1100 patients processed...
1200 patients processed...
1300 patients processed...
1400 patients processed...
1500 patients processed...
1600 patients processed...
1700 patients processed...
1800 patients processed...
1900 patients processed...
2000 patients processed...
2100 patients processed...
2200 patients processed...
2300 patients processed...
2400 patients processed...
2500 patients processed...
2600 patients processed...
2700 patients processed...
2800 patients processed...
2900 patients processed...
3000 patients processed...
3100 patients processed...
3200 patients processed...
3300 patients processed...
3400 patients processed...
3500 patients processed...
3600 patients processed...
3700 patients

In [7]:
# Persist the collected image-path list as a pickle file.
save_data("/home/jl5307/current_research/AMD_prediction/img_data/img_data_dict/entire_data_list.pkl", entire_data_list)