In [1]:
import tensorflow as tf
from glob import glob
import numpy as np
import pickle
import json
import os
import random

In [2]:
# Hide every CUDA device from this process so that TensorFlow falls back to the CPU.
# NOTE(review): this runs after `import tensorflow` in the imports cell; presumably it still
# takes effect because no GPU has been initialized yet — confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

In [3]:
def save_data(output_path, mydata):
    """Pickle ``mydata`` into the file at ``output_path``.

    The file is opened in binary write mode, so an existing file at that path
    is overwritten. Returns ``None``.
    """
    sink = open(output_path, 'wb')
    try:
        pickle.dump(mydata, sink)
    finally:
        # Always release the handle, even when pickling fails part-way.
        sink.close()
        
def load_data(data_path):
    """Load and return the object pickled in the file at ``data_path``.

    Args:
        data_path: path of a pickle file (such as one written by ``save_data``).

    Returns:
        The unpickled Python object.

    Raises:
        OSError: if the file cannot be opened.
        pickle.UnpicklingError: if the file content is not a valid pickle.
    """
    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    # The context manager closes the handle even if unpickling raises.
    with open(data_path, 'rb') as f:
        return pickle.load(f)

def convert_binary_score(severe_score):
    """Binarize a severity score: 1 when ``severe_score >= 10``, otherwise 0.

    A NaN score compares false against 10 and therefore maps to 0.
    """
    return 1 if severe_score >= 10 else 0
    
def filter_available_visits(amd_record_visit):
    """Return the visits that carry a usable label and image entry for both eyes.

    A visit is kept only when all of the following hold:
      * ``visit["AMDSEVRE"]`` and ``visit["AMDSEVLE"]`` exist and are not NaN;
      * ``visit["RE_IMG"]`` and ``visit["LE_IMG"]`` exist and have non-zero length.

    Visits with a missing key or a malformed value (anything that makes the
    checks raise) are dropped rather than aborting the scan.

    Args:
        amd_record_visit: iterable of visit mappings.

    Returns:
        list of the qualifying visits, in their original order.
    """
    filtered_visits = []

    for visit in amd_record_visit:
        try:
            labels_ok = not (np.isnan(visit["AMDSEVRE"]) or np.isnan(visit["AMDSEVLE"]))
            images_ok = len(visit["RE_IMG"]) > 0 and len(visit["LE_IMG"]) > 0
        except Exception:
            # Best-effort filter: a missing key or malformed value skips the visit.
            # `Exception` (not a bare except) so Ctrl-C / SystemExit still propagate.
            continue

        if labels_ok and images_ok:
            filtered_visits.append(visit)

    return filtered_visits

def build_eye_img_mapping_dict(data_dir, return_count=True):
    """Map every eye image to the severity label recorded at the same visit.

    The JSON file at ``data_dir`` is read as a list of records, each holding a
    ``"VISITS"`` list. For each visit the pairs ``("AMDSEVRE", "RE_IMG")`` and
    ``("AMDSEVLE", "LE_IMG")`` are handled independently: an eye is recorded
    when its score is present, not NaN and one of the 12 known score classes
    (1-12). A NaN or missing score on one eye does not discard the other eye of
    the same visit.

    Counters and mappings are only updated together, so the returned counts
    always describe exactly the eyes that were recorded (an image name seen
    more than once is counted each time but keeps only its last label).

    Args:
        data_dir: path of the JSON file (despite the name, a file, not a directory).
        return_count: when true, also return the per-class counters.

    Returns:
        ``(score_mapping, binary_mapping, binary_count, score_count)`` when
        ``return_count`` is true, otherwise ``(score_mapping, binary_mapping)``.
        ``score_mapping`` maps image name -> raw score, ``binary_mapping`` maps
        image name -> ``convert_binary_score(score)``.
    """
    # Context manager so the JSON file handle is released after parsing.
    with open(data_dir) as json_data:
        amd_data = json.load(json_data)

    eye_img_binary_mapping_dict = dict()
    eye_img_score_mapping_dict = dict()
    binary_count = {0: 0, 1: 0}
    score_count = {score: 0 for score in range(1, 13)}

    for idx, record in enumerate(amd_data):
        if idx % 100 == 0:
            # Report the number of records already handled (0, 100, 200, ...),
            # matching build_eye_img_futuretimestamp_mapping_dict.
            print("{} patients processed...".format(idx))

        for visit in record["VISITS"]:
            for score_key, img_key in (("AMDSEVRE", "RE_IMG"), ("AMDSEVLE", "LE_IMG")):
                try:
                    severe_score = visit[score_key]
                    if np.isnan(severe_score):
                        # Skip only this eye; the other eye is still examined.
                        continue
                    img_name = visit[img_key]
                    binary_score = convert_binary_score(severe_score)
                    if severe_score not in score_count:
                        # Unknown score class: record nothing for this eye.
                        continue
                    # Mappings first: if the image name is unusable as a key the
                    # assignment raises before any counter has been touched.
                    eye_img_score_mapping_dict[img_name] = severe_score
                    eye_img_binary_mapping_dict[img_name] = binary_score
                except (KeyError, TypeError, ValueError):
                    # Missing key or malformed value: best-effort, skip this eye.
                    continue
                binary_count[binary_score] += 1
                score_count[severe_score] += 1

    if return_count:
        return eye_img_score_mapping_dict, eye_img_binary_mapping_dict, binary_count, score_count
    return eye_img_score_mapping_dict, eye_img_binary_mapping_dict
    
def build_eye_img_futuretimestamp_mapping_dict(data_dir, num_timestamp, return_count=True):
    """Map every eye image to the label recorded ``num_timestamp`` usable visits later.

    The JSON file at ``data_dir`` is read as a list of records, each holding a
    ``"VISITS"`` list. Each record's visits are first reduced with
    ``filter_available_visits``; visit ``i`` of the filtered list is then paired
    with visit ``i + num_timestamp`` of the same list, so the offset counts
    filtered visits, not raw visits.

    For each eye (``AMDSEVRE``/``RE_IMG`` and ``AMDSEVLE``/``LE_IMG``) the image
    of the current visit is labelled with the score of the future visit, unless
    the eye's binary label is already 1 at the current visit and still 1 at the
    future visit ("recurrent" case), in which case that eye is left out.

    Args:
        data_dir: path of the JSON file (despite the name, a file, not a directory).
        num_timestamp: offset, in filtered visits, between image and label.
        return_count: when true, also return the per-class counters.

    Returns:
        ``(score_mapping, binary_mapping, binary_count, score_count)`` when
        ``return_count`` is true, otherwise ``(score_mapping, binary_mapping)``.

    Raises:
        KeyError: if a future score is not one of the 12 score classes (1-12).
    """
    # Context manager so the JSON file handle is released after parsing.
    with open(data_dir) as json_data:
        amd_data = json.load(json_data)

    eye_img_binary_mapping_dict = dict()
    eye_img_score_mapping_dict = dict()
    binary_count = {0: 0, 1: 0}
    score_count = {score: 0 for score in range(1, 13)}

    for idx, record in enumerate(amd_data):
        if idx % 100 == 0:
            print("{} patients processed...".format(idx))

        filtered_visits = filter_available_visits(record["VISITS"])

        for i in range(len(filtered_visits) - num_timestamp):
            this_visit = filtered_visits[i]
            future_visit = filtered_visits[i + num_timestamp]

            for img_key, score_key in (("RE_IMG", "AMDSEVRE"), ("LE_IMG", "AMDSEVLE")):
                this_binary_score = convert_binary_score(this_visit[score_key])
                future_severe_score = future_visit[score_key]
                future_binary_score = convert_binary_score(future_severe_score)

                # remove recurrent late AMD labels: positive now and still positive later
                if this_binary_score == 1 and future_binary_score == 1:
                    continue

                this_img = this_visit[img_key]
                eye_img_score_mapping_dict[this_img] = future_severe_score
                eye_img_binary_mapping_dict[this_img] = future_binary_score
                binary_count[future_binary_score] += 1
                score_count[future_severe_score] += 1

    if return_count:
        return eye_img_score_mapping_dict, eye_img_binary_mapping_dict, binary_count, score_count
    return eye_img_score_mapping_dict, eye_img_binary_mapping_dict

In [4]:
# Build the mappings that label each image with the score one filtered visit later.
# NOTE(review): `binary_count` and `score_count` are rebound by every mapping-builder cell,
# so cells that read them later see the values from whichever of those cells ran last.
eye_img_score_onetimestamp_mapping_dict, eye_img_binary_onetimestamp_mapping_dict, binary_count, score_count = build_eye_img_futuretimestamp_mapping_dict("/home/jl5307/current_research/AMD_prediction/data/AREDS_participants_amd3.json", 
    num_timestamp=1, return_count=True)

0 patients processed...
100 patients processed...
200 patients processed...
300 patients processed...
400 patients processed...
500 patients processed...
600 patients processed...
700 patients processed...
800 patients processed...
900 patients processed...
1000 patients processed...
1100 patients processed...
1200 patients processed...
1300 patients processed...
1400 patients processed...
1500 patients processed...
1600 patients processed...
1700 patients processed...
1800 patients processed...
1900 patients processed...
2000 patients processed...
2100 patients processed...
2200 patients processed...
2300 patients processed...
2400 patients processed...
2500 patients processed...
2600 patients processed...
2700 patients processed...
2800 patients processed...
2900 patients processed...
3000 patients processed...
3100 patients processed...
3200 patients processed...
3300 patients processed...
3400 patients processed...
3500 patients processed...
3600 patients processed...
3700 patients

In [6]:
# Build the mappings that label each image with the score two filtered visits later.
# NOTE(review): this rebinds `binary_count` and `score_count`, replacing the values
# produced by any mapping-builder cell that ran earlier.
eye_img_score_twotimestamp_mapping_dict, eye_img_binary_twotimestamp_mapping_dict, binary_count, score_count = build_eye_img_futuretimestamp_mapping_dict("/home/jl5307/current_research/AMD_prediction/data/AREDS_participants_amd3.json", 
    num_timestamp=2, return_count=True)

0 patients processed...
100 patients processed...
200 patients processed...
300 patients processed...
400 patients processed...
500 patients processed...
600 patients processed...
700 patients processed...
800 patients processed...
900 patients processed...
1000 patients processed...
1100 patients processed...
1200 patients processed...
1300 patients processed...
1400 patients processed...
1500 patients processed...
1600 patients processed...
1700 patients processed...
1800 patients processed...
1900 patients processed...
2000 patients processed...
2100 patients processed...
2200 patients processed...
2300 patients processed...
2400 patients processed...
2500 patients processed...
2600 patients processed...
2700 patients processed...
2800 patients processed...
2900 patients processed...
3000 patients processed...
3100 patients processed...
3200 patients processed...
3300 patients processed...
3400 patients processed...
3500 patients processed...
3600 patients processed...
3700 patients

In [5]:
# Build the same-visit mappings (image -> score recorded at that visit).
# NOTE(review): this rebinds `binary_count` and `score_count`; the class-weight cells below
# read those names, so their results depend on this cell having run last — confirm.
eye_img_score_mapping_dict, eye_img_binary_mapping_dict, binary_count, score_count = build_eye_img_mapping_dict("/home/jl5307/current_research/AMD_prediction/data/AREDS_participants_amd3.json", return_count=True)

100 patients processed...
200 patients processed...
300 patients processed...
400 patients processed...
500 patients processed...
600 patients processed...
700 patients processed...
800 patients processed...
900 patients processed...
1000 patients processed...
1100 patients processed...
1200 patients processed...
1300 patients processed...
1400 patients processed...
1500 patients processed...
1600 patients processed...
1700 patients processed...
1800 patients processed...
1900 patients processed...
2000 patients processed...
2100 patients processed...
2200 patients processed...
2300 patients processed...
2400 patients processed...
2500 patients processed...
2600 patients processed...
2700 patients processed...
2800 patients processed...
2900 patients processed...
3000 patients processed...
3100 patients processed...
3200 patients processed...
3300 patients processed...
3400 patients processed...
3500 patients processed...
3600 patients processed...
3700 patients processed...
3800 patie

In [25]:
def calculate_class_weight(class_count, rounding, mode):
    """Turn per-class sample counts into per-class weights.

    Args:
        class_count: mapping of class label -> number of samples; the weights
            are returned in the mapping's iteration order.
        rounding: number of decimals kept in the result.
        mode: ``"log"`` gives ``log10(total / count)``; any other value gives
            ``(1 - count / total) / (number_of_classes - 1)``.

    Returns:
        numpy array with one rounded weight per class.
    """
    counts = np.array(list(class_count.values()))
    total = sum(class_count.values())

    if mode == "log":
        inverse_share = total / counts
        return np.round(np.log10(inverse_share), rounding)

    share = counts / total
    return np.round((1 - share) / (len(class_count) - 1), rounding)

In [26]:
binary_count

{0: 49242, 1: 1086}

In [27]:
calculate_class_weight(binary_count, 3, mode="typical")

array([0.022, 0.978])

In [28]:
calculate_class_weight(score_count, 4, mode="log")

array([0.3814, 0.8792, 1.2485, 0.9983, 1.2027, 1.1177, 1.2185, 1.2482,
       1.7418, 2.2625, 1.8153, 3.089 ])

In [29]:
score_count

{1: 20911,
 2: 6647,
 3: 2840,
 4: 5053,
 5: 3156,
 6: 3838,
 7: 3043,
 8: 2842,
 9: 912,
 10: 275,
 11: 770,
 12: 41}

In [33]:
save_data("/home/jl5307/current_research/AMD_prediction/img_data/eye_img_score_mapping_dict.pkl", eye_img_score_mapping_dict)

In [34]:
save_data("/home/jl5307/current_research/AMD_prediction/img_data/eye_img_binary_mapping_dict.pkl", eye_img_binary_mapping_dict)

In [30]:
save_data("/home/jl5307/current_research/AMD_prediction/img_data/eye_img_score_onetimestamp_mapping_dict.pkl", eye_img_score_onetimestamp_mapping_dict)

In [31]:
save_data("/home/jl5307/current_research/AMD_prediction/img_data/eye_img_binary_onetimestamp_mapping_dict.pkl", eye_img_binary_onetimestamp_mapping_dict)

In [8]:
save_data("/home/jl5307/current_research/AMD_prediction/img_data/eye_img_binary_twotimestamp_mapping_dict.pkl", eye_img_binary_twotimestamp_mapping_dict)

In [9]:
save_data("/home/jl5307/current_research/AMD_prediction/img_data/eye_img_score_twotimestamp_mapping_dict.pkl", eye_img_score_twotimestamp_mapping_dict)

In [10]:
# build image mapping data dictionary

In [11]:
def get_filename(img_file_root_path):
    """List every entry found exactly two levels below ``img_file_root_path``.

    Each entry matched by ``<root>/*`` is globbed again with ``/*``; the
    second-level matches are returned as path strings in glob order.
    """
    return [
        img_path
        for sub_dir in glob(img_file_root_path + "/*")
        for img_path in glob(sub_dir + "/*")
    ]

def shuffle_data(mydata):
    """Return ``mydata`` as a numpy array with its first axis randomly permuted.

    The permutation is drawn with the standard-library ``random`` module, so it
    follows that module's global seed state. The input itself is not modified.
    """
    items = np.array(mydata)
    order = np.arange(len(items))
    random.shuffle(order)

    return np.take(items, order, axis=0)

def build_img_data_dict(img_file_root_path, eye_mapping_dict_path, train_size, validation_size, test_size, 
                           is_binary_feature=True):
    """Split the labelled images into train / validation / test label dictionaries.

    Image files are discovered with ``get_filename``; only files whose base name
    is a key of the mapping pickled at ``eye_mapping_dict_path`` are used. They
    are divided into a "late" group (label >= threshold) and a "non-late" group,
    each group is shuffled and split separately, and the per-split halves are
    merged and shuffled again, so both groups appear in every split in roughly
    the requested proportions.

    NOTE(review): the split is per image. Images whose names share the same
    leading token (presumably the participant ID) can land in different splits;
    confirm that this is acceptable for the evaluation.

    Args:
        img_file_root_path: root directory handed to ``get_filename``.
        eye_mapping_dict_path: pickle holding ``{image file name: label}``.
        train_size: fraction of each group placed in the training split.
        validation_size: validation weight; only its ratio to ``test_size``
            is used, applied to what remains after the training split.
        test_size: test weight, see ``validation_size``.
        is_binary_feature: true when labels are 0/1 (threshold 1), false when
            they are raw scores (threshold 10).

    Returns:
        ``{"train_set": {...}, "validation_set": {...}, "test_set": {...}}``,
        each value mapping image file name -> label.
    """
    file_list = get_filename(img_file_root_path)
    eye_feature_dict = load_data(eye_mapping_dict_path)
    print("{} img files identified from the given root path...".format(len(file_list)))

    label_threshold = 1 if is_binary_feature else 10

    non_late_amd_eyes = []
    late_amd_eyes = []
    for idx, file in enumerate(file_list):
        if idx % 1000 == 0:
            print("{} processed...".format(idx))

        file_name = file.split("/")[-1]
        if file_name not in eye_feature_dict:
            continue

        if eye_feature_dict[file_name] >= label_threshold:
            late_amd_eyes.append(file_name)
        else:
            non_late_amd_eyes.append(file_name)

    # Share of the non-training remainder that goes to the test split. With no
    # hold-out weight at all there is nothing to divide, so avoid the 0/0.
    holdout_weight = validation_size + test_size
    test_share = test_size / holdout_weight if holdout_weight else 0.0

    def _split(eyes):
        """Cut one shuffled group into (train, validation, test) slices."""
        n_train = int(np.floor(len(eyes) * train_size))
        holdout = eyes[n_train:]
        n_test = int(np.floor(len(holdout) * test_share))
        return eyes[:n_train], holdout[n_test:], holdout[:n_test]

    # Shuffle order (late, non-late, then train, validation, test) is kept fixed
    # so a seeded run stays reproducible.
    late_train, late_validation, late_test = _split(shuffle_data(late_amd_eyes))
    non_late_train, non_late_validation, non_late_test = _split(shuffle_data(non_late_amd_eyes))

    grouped = (
        ("train_set", (late_train, non_late_train)),
        ("validation_set", (late_validation, non_late_validation)),
        ("test_set", (late_test, non_late_test)),
    )

    img_data_dict = dict()
    for set_name, parts in grouped:
        merged = shuffle_data([eye for part in parts for eye in part])
        img_data_dict[set_name] = {eye: eye_feature_dict[eye] for eye in merged}

    return img_data_dict

In [12]:
def count_class(img_data_dict, class_type):
    """Count how many samples of each class every split contains.

    Used to check that each class is represented in the train, validation and
    test splits of the given data dictionary.

    Args:
        img_data_dict: mapping with ``"train_set"``, ``"validation_set"`` and
            ``"test_set"`` entries, each mapping a sample name to its label.
        class_type: ``"binary"`` for labels 0/1; any other value for the
            12 score classes 1-12.

    Returns:
        ``(train_set_count, validation_set_count, test_set_count)``; each is a
        dict of label -> count that starts at zero for every known class.

    Raises:
        KeyError: if a split is missing or a label is not a known class.
    """
    if class_type == "binary":
        class_labels = (0, 1)
    else:
        class_labels = range(1, 13)

    per_set_counts = []
    for set_name in ("train_set", "validation_set", "test_set"):
        # Every class starts at zero so the totals equal the number of samples.
        set_count = {label: 0 for label in class_labels}
        for label in img_data_dict[set_name].values():
            set_count[label] += 1
        per_set_counts.append(set_count)

    return tuple(per_set_counts)

In [12]:
# Split the same-visit binary labels 70% / 10% / 20% into train / validation / test.
# NOTE(review): the split relies on `random.shuffle` and no `random.seed(...)` call is
# visible in this notebook, so re-running this cell presumably yields a different split.
img_data_binary_dict = build_img_data_dict("/home/jl5307/current_research/AMD_prediction/img_data/img_files/",
                       "/home/jl5307/current_research/AMD_prediction/img_data/eye_img_binary_mapping_dict.pkl", 
                        train_size=0.7, validation_size=0.1, test_size=0.2, is_binary_feature=True)

87854 img files identified from the given root path...
0 processed...
1000 processed...
2000 processed...
3000 processed...
4000 processed...
5000 processed...
6000 processed...
7000 processed...
8000 processed...
9000 processed...
10000 processed...
11000 processed...
12000 processed...
13000 processed...
14000 processed...
15000 processed...
16000 processed...
17000 processed...
18000 processed...
19000 processed...
20000 processed...
21000 processed...
22000 processed...
23000 processed...
24000 processed...
25000 processed...
26000 processed...
27000 processed...
28000 processed...
29000 processed...
30000 processed...
31000 processed...
32000 processed...
33000 processed...
34000 processed...
35000 processed...
36000 processed...
37000 processed...
38000 processed...
39000 processed...
40000 processed...
41000 processed...
42000 processed...
43000 processed...
44000 processed...
45000 processed...
46000 processed...
47000 processed...
48000 processed...
49000 processed...
50000 pr

In [14]:
img_data_score_dict = build_img_data_dict("/home/jl5307/current_research/AMD_prediction/img_data/img_files/",
                       "/home/jl5307/current_research/AMD_prediction/img_data/eye_img_score_mapping_dict.pkl", 
                        train_size=0.7, validation_size=0.1, test_size=0.2, is_binary_feature=False)

87854 img files identified from the given root path...
0 processed...
1000 processed...
2000 processed...
3000 processed...
4000 processed...
5000 processed...
6000 processed...
7000 processed...
8000 processed...
9000 processed...
10000 processed...
11000 processed...
12000 processed...
13000 processed...
14000 processed...
15000 processed...
16000 processed...
17000 processed...
18000 processed...
19000 processed...
20000 processed...
21000 processed...
22000 processed...
23000 processed...
24000 processed...
25000 processed...
26000 processed...
27000 processed...
28000 processed...
29000 processed...
30000 processed...
31000 processed...
32000 processed...
33000 processed...
34000 processed...
35000 processed...
36000 processed...
37000 processed...
38000 processed...
39000 processed...
40000 processed...
41000 processed...
42000 processed...
43000 processed...
44000 processed...
45000 processed...
46000 processed...
47000 processed...
48000 processed...
49000 processed...
50000 pr

In [15]:
img_data_binary_onetimestamp_dict = build_img_data_dict("/home/jl5307/current_research/AMD_prediction/img_data/img_files/",
                       "/home/jl5307/current_research/AMD_prediction/img_data/eye_img_binary_onetimestamp_mapping_dict.pkl", 
                        train_size=0.7, validation_size=0.1, test_size=0.2, is_binary_feature=True)

87854 img files identified from the given root path...
0 processed...
1000 processed...
2000 processed...
3000 processed...
4000 processed...
5000 processed...
6000 processed...
7000 processed...
8000 processed...
9000 processed...
10000 processed...
11000 processed...
12000 processed...
13000 processed...
14000 processed...
15000 processed...
16000 processed...
17000 processed...
18000 processed...
19000 processed...
20000 processed...
21000 processed...
22000 processed...
23000 processed...
24000 processed...
25000 processed...
26000 processed...
27000 processed...
28000 processed...
29000 processed...
30000 processed...
31000 processed...
32000 processed...
33000 processed...
34000 processed...
35000 processed...
36000 processed...
37000 processed...
38000 processed...
39000 processed...
40000 processed...
41000 processed...
42000 processed...
43000 processed...
44000 processed...
45000 processed...
46000 processed...
47000 processed...
48000 processed...
49000 processed...
50000 pr

In [16]:
img_data_score_onetimestamp_dict = build_img_data_dict("/home/jl5307/current_research/AMD_prediction/img_data/img_files/",
                       "/home/jl5307/current_research/AMD_prediction/img_data/eye_img_score_onetimestamp_mapping_dict.pkl", 
                        train_size=0.7, validation_size=0.1, test_size=0.2, is_binary_feature=False)

87854 img files identified from the given root path...
0 processed...
1000 processed...
2000 processed...
3000 processed...
4000 processed...
5000 processed...
6000 processed...
7000 processed...
8000 processed...
9000 processed...
10000 processed...
11000 processed...
12000 processed...
13000 processed...
14000 processed...
15000 processed...
16000 processed...
17000 processed...
18000 processed...
19000 processed...
20000 processed...
21000 processed...
22000 processed...
23000 processed...
24000 processed...
25000 processed...
26000 processed...
27000 processed...
28000 processed...
29000 processed...
30000 processed...
31000 processed...
32000 processed...
33000 processed...
34000 processed...
35000 processed...
36000 processed...
37000 processed...
38000 processed...
39000 processed...
40000 processed...
41000 processed...
42000 processed...
43000 processed...
44000 processed...
45000 processed...
46000 processed...
47000 processed...
48000 processed...
49000 processed...
50000 pr

In [13]:
img_data_binary_twotimestamp_dict = build_img_data_dict("/home/jl5307/current_research/AMD_prediction/img_data/img_files/",
                       "/home/jl5307/current_research/AMD_prediction/img_data/eye_img_binary_twotimestamp_mapping_dict.pkl", 
                        train_size=0.7, validation_size=0.1, test_size=0.2, is_binary_feature=True)

87854 img files identified from the given root path...
0 processed...
1000 processed...
2000 processed...
3000 processed...
4000 processed...
5000 processed...
6000 processed...
7000 processed...
8000 processed...
9000 processed...
10000 processed...
11000 processed...
12000 processed...
13000 processed...
14000 processed...
15000 processed...
16000 processed...
17000 processed...
18000 processed...
19000 processed...
20000 processed...
21000 processed...
22000 processed...
23000 processed...
24000 processed...
25000 processed...
26000 processed...
27000 processed...
28000 processed...
29000 processed...
30000 processed...
31000 processed...
32000 processed...
33000 processed...
34000 processed...
35000 processed...
36000 processed...
37000 processed...
38000 processed...
39000 processed...
40000 processed...
41000 processed...
42000 processed...
43000 processed...
44000 processed...
45000 processed...
46000 processed...
47000 processed...
48000 processed...
49000 processed...
50000 pr

In [14]:
img_data_score_twotimestamp_dict = build_img_data_dict("/home/jl5307/current_research/AMD_prediction/img_data/img_files/",
                       "/home/jl5307/current_research/AMD_prediction/img_data/eye_img_score_twotimestamp_mapping_dict.pkl", 
                        train_size=0.7, validation_size=0.1, test_size=0.2, is_binary_feature=False)

87854 img files identified from the given root path...
0 processed...
1000 processed...
2000 processed...
3000 processed...
4000 processed...
5000 processed...
6000 processed...
7000 processed...
8000 processed...
9000 processed...
10000 processed...
11000 processed...
12000 processed...
13000 processed...
14000 processed...
15000 processed...
16000 processed...
17000 processed...
18000 processed...
19000 processed...
20000 processed...
21000 processed...
22000 processed...
23000 processed...
24000 processed...
25000 processed...
26000 processed...
27000 processed...
28000 processed...
29000 processed...
30000 processed...
31000 processed...
32000 processed...
33000 processed...
34000 processed...
35000 processed...
36000 processed...
37000 processed...
38000 processed...
39000 processed...
40000 processed...
41000 processed...
42000 processed...
43000 processed...
44000 processed...
45000 processed...
46000 processed...
47000 processed...
48000 processed...
49000 processed...
50000 pr

In [44]:
save_data("/home/jl5307/current_research/AMD_prediction/img_data/img_data_binary_dict.pkl", img_data_binary_dict)

In [45]:
save_data("/home/jl5307/current_research/AMD_prediction/img_data/img_data_score_dict.pkl", img_data_score_dict)

In [27]:
save_data("/home/jl5307/current_research/AMD_prediction/img_data/img_data_binary_onetimestamp_dict.pkl", img_data_binary_onetimestamp_dict)

In [28]:
save_data("/home/jl5307/current_research/AMD_prediction/img_data/img_data_score_onetimestamp_dict.pkl", img_data_score_onetimestamp_dict)

In [15]:
save_data("/home/jl5307/current_research/AMD_prediction/img_data/img_data_binary_twotimestamp_dict.pkl", img_data_binary_twotimestamp_dict)

In [16]:
save_data("/home/jl5307/current_research/AMD_prediction/img_data/img_data_score_twotimestamp_dict.pkl", img_data_score_twotimestamp_dict)

In [6]:
# get the per-pixel mean and std (for each channel), estimated from a random sample of training images

In [7]:
def parse_image(filename, resizing, scale, num_channel):
    """Read one JPEG file and return it resized, centre-cropped and cast to uint8.

    Args:
        filename: path of the JPEG file.
        resizing: target size passed to ``tf.image.resize``.
        scale: central fraction passed to ``tf.image.central_crop``.
        num_channel: number of colour channels requested from the decoder.

    Returns:
        The processed image as a ``tf.uint8`` tensor.
    """
    decoded = tf.image.decode_jpeg(tf.io.read_file(filename), channels=num_channel)
    cropped = tf.image.central_crop(tf.image.resize(decoded, resizing), scale)

    return tf.cast(cropped, tf.uint8)

def get_per_pixel_feature(img_data_dict_path, img_file_root_path, resizing, scale, num_channel, sampling_num=10000):
    """Estimate the per-pixel mean and standard deviation of the training images.

    ``sampling_num`` images are drawn at random, with replacement, from the keys
    of the ``"train_set"`` entry of the dictionary pickled at
    ``img_data_dict_path``. Each one is loaded with ``parse_image`` and the
    statistics are reduced over the sample axis only, so the results keep the
    image's height, width and channel axes.

    Args:
        img_data_dict_path: pickle with a ``"train_set"`` mapping keyed by image file name.
        img_file_root_path: root directory of the images; it is concatenated
            directly with the sub-directory name, so it must end with a separator.
        resizing: ``[height, width]`` passed to ``parse_image``.
        scale: central-crop fraction passed to ``parse_image``.
        num_channel: number of colour channels.
        sampling_num: number of images to draw.

    Returns:
        ``(pixel_mean, pixel_std)`` tensors.

    Raises:
        IndexError: if the training set is empty.
    """
    img_data_mapping_dict = load_data(img_data_dict_path)
    # assumes a square `resizing` and that the central crop yields exactly
    # int(resizing[0] * scale) pixels per side — TODO confirm for other settings
    pixel_dim = int(resizing[0] * scale)

    train_set_eye = list(img_data_mapping_dict["train_set"].keys())

    batch_img_data = np.zeros(shape=(sampling_num, pixel_dim, pixel_dim, num_channel))
    for i in range(sampling_num):
        # random.choice always picks an existing element; random.randint(0, len(x))
        # includes len(x) itself and would occasionally index past the end.
        this_eye = random.choice(train_set_eye)
        # Images are stored under a sub-directory named after the first
        # space-separated token of the file name.
        this_eye_filename = img_file_root_path + this_eye.split(" ")[0] + "/" + this_eye
        this_eye_data = parse_image(this_eye_filename, resizing=resizing, scale=scale, num_channel=num_channel)
        batch_img_data[i, :, :, :] = this_eye_data

    pixel_mean = tf.reduce_mean(batch_img_data, axis=0)
    pixel_std = tf.math.reduce_std(batch_img_data, axis=0)

    return pixel_mean, pixel_std

In [8]:
# Estimate per-pixel statistics from 10000 sampled training images of the score split:
# 3-channel images resized to 256x256 and centre-cropped with fraction 0.875.
pixel_mean, pixel_std = get_per_pixel_feature("/home/jl5307/current_research/AMD_prediction/img_data/img_data_score_dict.pkl",
                     "/home/jl5307/current_research/AMD_prediction/img_data/img_files/",
                     [256,256], 0.875, 3, sampling_num=10000)

In [9]:
pixel_feature = {"pixel_mean" : pixel_mean, "pixel_std" : pixel_std}

In [11]:
save_data("/home/jl5307/current_research/AMD_prediction/img_data/pixel_feature_224.pkl", pixel_feature)

In [3]:
load_data("/home/jl5307/current_research/AMD_prediction/img_data/img_data_score_dict.pkl")

{'train_set': {'52718 QUA F2 RE LS.jpg': 1.0,
  '57057 08 F2 RE LS.jpg': 1.0,
  '60595 18 F2 RE LS.jpg': 11.0,
  '61081 22 F2 LE LS.jpg': 3.0,
  '52919 20 F2 RE LS.jpg': 7.0,
  '51401 QUA F2 RE LS.jpg': 1.0,
  '61084 QUA F2 LE LS.jpg': 2.0,
  '52369 04 F2 RE LS.jpg': 1.0,
  '59306 08 F2 RE LS.jpg': 1.0,
  '61134 22 F2 LE LS.jpg': 1.0,
  '55643 18 F2 LE LS.jpg': 4.0,
  '54610 16 F2 RE LS.jpg': 9.0,
  '56651 16 F2 LE LS.jpg': 7.0,
  '55641 08 F2 RE LS.jpg': 1.0,
  '58621 QUA F2 LE LS.jpg': 4.0,
  '55642 22 F2 LE LS.jpg': 1.0,
  '61244 06 F2 LE LS.jpg': 1.0,
  '51608 06 F2 RE LS.jpg': 1.0,
  '57132 18 F2 RE LS.jpg': 4.0,
  '59765 16 F2 LE LS.jpg': 3.0,
  '53371 16 F2 RE LS.jpg': 2.0,
  '52510 06 F2 LE LS.jpg': 11.0,
  '58663 12 F2 RE LS.jpg': 5.0,
  '58033 10 F2 RE LS.jpg': 6.0,
  '53133 QUA F2 LE LS.jpg': 4.0,
  '55166 06 F2 LE LS.jpg': 3.0,
  '51626 QUA F2 LE LS.jpg': 1.0,
  '57243 12 F2 LE LS.jpg': 3.0,
  '56337 14 F2 RE LS.jpg': 2.0,
  '58228 04 F2 RE LS.jpg': 1.0,
  '61013 16 F2 RE L

In [None]:
### misc

In [6]:

def build_img_batch(filelist, resizing, scale=0.875):
    """Load the given images as one batch and normalize it pixel by pixel.

    Every file is loaded as a single-channel image with ``parse_image``, the
    images are stacked along a new leading axis, and each pixel position is
    standardized with the mean and standard deviation computed over the batch.

    Args:
        filelist: paths of the image files to load.
        resizing: ``[height, width]`` passed to ``parse_image``.
        scale: central-crop fraction passed to ``parse_image``. The default
            matches the fraction used by the other calls in this notebook.

    Returns:
        float32 tensor of shape ``(len(filelist), height, width, 1)``. Pixels
        whose standard deviation over the batch is zero are set to 0.

    Raises:
        ValueError: if ``filelist`` is empty.
    """
    images = [
        # Cast so the statistics are computed in floating point whichever
        # `parse_image` definition is currently bound.
        tf.cast(parse_image(img_file, resizing, scale, num_channel=1), tf.float32)
        for img_file in filelist
    ]
    if not images:
        raise ValueError("filelist is empty; cannot build an image batch")

    # Stack once along a new leading axis -> (N, H, W, 1). Reshaping a
    # channel-concatenated (H, W, N) tensor to (N, H, W) would not move the
    # axes and would mix pixels from different images.
    batch_img_data = tf.stack(images, axis=0)

    pixel_mean = tf.reduce_mean(batch_img_data, axis=0)
    pixel_std = tf.math.reduce_std(batch_img_data, axis=0)

    # normalize by pixel; divide_no_nan yields 0 where the std is 0
    return tf.math.divide_no_nan(batch_img_data - pixel_mean, pixel_std)

In [5]:
# NOTE(review): no cell visible in this notebook assigns a top-level `file_list` (a later
# cell assigns `filelist`), so this cell presumably fails on a fresh kernel — confirm.
b = build_img_batch(file_list, [256, 256])

In [6]:
b.shape

TensorShape([38, 224, 224, 1])

In [54]:
filelist = get_filename("/home/jl5307/current_research/AMD_prediction/img_data/img_files/")

In [8]:
def get_filename(img_file_root_path):
    """Collect every path found exactly two levels below ``img_file_root_path``.

    This repeats the ``get_filename`` definition from earlier in the notebook;
    running this cell rebinds the name to an equivalent function.
    """
    collected = []
    for sub_dir in glob(img_file_root_path + "/*"):
        collected.extend(glob(sub_dir + "/*"))

    return collected

def parse_image(filename, resizing, scale, num_channel=1):
    """Read one JPEG file and return it resized and centre-cropped.

    Running this cell rebinds ``parse_image``. Unlike the definition earlier in
    the notebook, this one does not cast the result to uint8 and defaults to a
    single channel.

    Args:
        filename: path of the JPEG file.
        resizing: target size passed to ``tf.image.resize``.
        scale: central fraction passed to ``tf.image.central_crop``.
        num_channel: number of colour channels requested from the decoder.
    """
    raw_bytes = tf.io.read_file(filename)
    resized = tf.image.resize(tf.image.decode_jpeg(raw_bytes, channels=num_channel), resizing)

    return tf.image.central_crop(resized, scale)

def build_img_dict(img_file_root_path, eye_feature_dict_path, resizing, scale, num_channel=1, normalize=False):
    """Load every labelled image into memory, optionally normalized per pixel.

    Image files are discovered with ``get_filename``; only files whose base name
    is a key of the mapping pickled at ``eye_feature_dict_path`` are loaded.

    When ``normalize`` is true, roughly 10% of the loaded images (at most 5000)
    are sampled at random, the per-pixel mean and standard deviation of that
    sample are computed, and every image is standardized with them using the
    same ``(x - mean) / std`` formula as ``build_img_batch``.

    Args:
        img_file_root_path: root directory handed to ``get_filename``.
        eye_feature_dict_path: pickle holding ``{image file name: label}``.
        resizing: ``[height, width]`` passed to ``parse_image``.
        scale: central-crop fraction passed to ``parse_image``.
        num_channel: number of colour channels passed to ``parse_image``.
        normalize: whether to standardize the image data per pixel.

    Returns:
        ``{file name: {"data": image tensor, "label": label}}``. With
        ``normalize`` the tensors are float32 and pixels whose sampled standard
        deviation is zero are set to 0.

    Raises:
        ValueError: if ``normalize`` is true and images were loaded but the
            random draw selected none of them for the statistics.
    """
    max_sample = 5000    # upper bound on images kept for the statistics
    sample_rate = 0.1    # probability of keeping any one image for the statistics

    file_list = get_filename(img_file_root_path)
    eye_feature_dict = load_data(eye_feature_dict_path)
    print("{} img files identified from the given root path...".format(len(file_list)))

    img_dict = dict()
    sampled_imgs = []

    for idx, file in enumerate(file_list):
        if idx % 1000 == 0:
            print("{} patients processed...".format(idx))

        file_name = file.split("/")[-1]
        if file_name not in eye_feature_dict:
            continue

        this_img_data = parse_image(file, resizing, scale, num_channel=num_channel)
        img_dict[file_name] = {"data": this_img_data, "label": eye_feature_dict[file_name]}

        # for normalization: the sample size lives outside the loop body, so the
        # cap is actually enforced and the reported count is correct
        if normalize and len(sampled_imgs) < max_sample and np.random.uniform() <= sample_rate:
            sampled_imgs.append(this_img_data)

    if normalize and img_dict:
        if not sampled_imgs:
            raise ValueError("no images were sampled; cannot compute normalization statistics")

        # Stack along a new leading axis -> (N, H, W, C); reshaping a
        # channel-concatenated tensor would mix pixels from different images.
        batch_img_data = tf.cast(tf.stack(sampled_imgs, axis=0), tf.float32)
        pixel_mean = tf.reduce_mean(batch_img_data, axis=0)
        pixel_std = tf.math.reduce_std(batch_img_data, axis=0)
        del batch_img_data
        print("normalize based on {} randomly selected images...".format(len(sampled_imgs)))

        # normalize by pixel; divide_no_nan yields 0 where the std is 0
        for entry in img_dict.values():
            centered = tf.cast(entry["data"], tf.float32) - pixel_mean
            entry["data"] = tf.math.divide_no_nan(centered, pixel_std)

    return img_dict

In [9]:
img_dict = build_img_dict("/home/jl5307/current_research/AMD_prediction/img_data/img_files/", 
              "/home/jl5307/current_research/AMD_prediction/data/eye_img_binary_dict.pkl", 
              [256, 256], 0.875, num_channel=1, normalize=True)

87854 img files identified from the given root path...
0 patients processed...
100 patients processed...
200 patients processed...
300 patients processed...
400 patients processed...
500 patients processed...
600 patients processed...
700 patients processed...
800 patients processed...
900 patients processed...
1000 patients processed...
1100 patients processed...
1200 patients processed...
1300 patients processed...
1400 patients processed...
1500 patients processed...
1600 patients processed...
1700 patients processed...
1800 patients processed...
1900 patients processed...
2000 patients processed...
2100 patients processed...
2200 patients processed...
2300 patients processed...
2400 patients processed...
2500 patients processed...
2600 patients processed...
2700 patients processed...
2800 patients processed...
2900 patients processed...
3000 patients processed...
3100 patients processed...
3200 patients processed...
3300 patients processed...
3400 patients processed...
3500 patient

KeyboardInterrupt: 