In [1]:
# Folder of a single example patient (LIDC-IDRI-0001).
# NOTE(review): no later cell visible in this notebook reads `lidc_path`; the batch
# cells build their own glob patterns -- confirm whether this cell is still needed.
lidc_path = "/home/mengdi/DataShare/LIDC/LIDC-IDRI/DOI/LIDC-IDRI-0001"

In [2]:
import os
import glob
import json
import warnings

from bs4 import BeautifulSoup
import dicom
import numpy as np
from matplotlib.path import Path
import pandas as pd

In [3]:
class PathParser:
    """Locate the annotation XML and the DICOM files of one patient folder.

    The globs used here expect ``<folder>/<sub-folder>/<sub-sub-folder>/*.dcm``
    (and ``*.xml`` at the same depth). Among the direct sub-folders of
    ``folder``, the one holding the most ``.dcm`` files is selected.

    Attributes:
        folder: the patient folder passed in.
        target_folder: the selected direct sub-folder.
        xml_path: path of one ``.xml`` file below ``target_folder``.
        dcm_paths: sorted list of ``.dcm`` paths below ``target_folder``.

    Raises:
        ValueError: ``folder`` has no entries to choose from.
        IndexError: no ``.xml`` file exists below the selected sub-folder.
        AssertionError: no ``.dcm`` file exists below the selected sub-folder.
    """

    def __init__(self, folder):
        self.folder = folder
        self.target_folder = self._select_target_sub_folder(
            glob.glob(folder + "/*"))
        self.xml_path = self._get_xml_path()
        self.dcm_paths = self._get_dcm_paths()

    def _select_target_sub_folder(self, subfolders):
        """Return the sub-folder containing the most ``.dcm`` files.

        Ties are broken by the (count, path) tuple comparison, i.e. the
        lexicographically largest path wins.
        """
        if not subfolders:
            # max() on an empty sequence raises the same exception type, but
            # without saying which patient folder was empty.
            raise ValueError("no sub-folders found under %s" % self.folder)
        dcm_counts = [len(glob.glob(p + '/*/*.dcm')) for p in subfolders]
        _, target = max(zip(dcm_counts, subfolders))
        return target

    def _get_xml_path(self):
        """Return the path of the annotation XML below ``target_folder``.

        glob returns matches in arbitrary order, so the matches are sorted to
        make the choice reproducible when several XML files are present.
        """
        xml_paths = sorted(glob.glob(self.target_folder + "/*/*.xml"))
        if not xml_paths:
            # Same exception type the former ``xml_path[0]`` produced, now
            # with enough context to find the offending folder.
            raise IndexError("no annotation xml found under %s (patient folder %s)"
                             % (self.target_folder, self.folder))
        return xml_paths[0]

    def _get_dcm_paths(self):
        """Return the sorted ``.dcm`` paths below ``target_folder``."""
        dcm_paths = sorted(glob.glob(self.target_folder + "/*/*.dcm"))
        if not dcm_paths:
            # Raised explicitly so the check also runs under ``python -O``.
            raise AssertionError(self.folder)
        return dcm_paths

In [15]:
class XMLParser:
    """Parse one annotation XML file into a JSON-serialisable dict.

    The result (``parsed_dict``) has the shape::

        {"readingSession": [
            {"large_nodules": [{"nodule_feature": [...], "roi": [...]}, ...],
             "small_nodules": [{"roi": [...]}, ...],
             "unnodles":      [{"roi": [...]}, ...]},
            ...]}

    ``unblindedReadNodule`` elements are split by whether they contain a
    ``characteristics`` child. NOTE(review): the variable names suggest this
    corresponds to nodules larger / smaller than 3 mm -- confirm against the
    annotation schema.
    """

    # Tags read from each <characteristics> element, in output order.
    _FEATURE_NAMES = ('subtlety', 'internalStructure', 'calcification', 'sphericity',
                      'margin', 'lobulation', 'spiculation', 'texture', 'malignancy')

    def __init__(self, path):
        """Read ``path``, build the soup and parse it immediately."""
        self.path = path
        with open(path) as f:
            self.soup = BeautifulSoup(f.read(), "xml")
        self.parsed_dict = self._parse()

    def pprint(self):
        """Print the prettified XML."""
        print(self.soup.prettify())

    def _parse(self):
        """Return the parsed dict with one entry per ``readingSession``."""
        # Never filled in; kept only because it has always been set on the instance.
        self.ret = {}
        sessions = self.soup.find_all('readingSession')
        return {'readingSession':
                [self._parse_unblindedReadNodule(sess.find_all('unblindedReadNodule'),
                                                 sess.find_all('nonNodule'))
                 for sess in sessions]}

    def _parse_unblindedReadNodule(self, unblindedReadNodules, nonNodules):
        """Parse the nodules and non-nodules of one reading session.

        Emits a warning when the session has no nodule with characteristics.
        """
        nodule_large_than_3mm = []
        nodule_small_than_3mm = []
        for node in unblindedReadNodules:
            # Single pass: each node is tested for <characteristics> once.
            if node.find('characteristics'):
                nodule_large_than_3mm.append(node)
            else:
                nodule_small_than_3mm.append(node)

        if not nodule_large_than_3mm:
            warnings.warn("no large nodule for " + self.path)
        return {"large_nodules": [{"nodule_feature": self._parse_malignancy(nodule),
                                   "roi": [self._parse_roi(roi) for roi in nodule.find_all('roi')]}
                                  for nodule in nodule_large_than_3mm],
                "small_nodules": [{"roi": [self._parse_roi(roi) for roi in nodule.find_all('roi')]}
                                  for nodule in nodule_small_than_3mm],
                "unnodles": [{"roi": [self._parse_unnodule_roi(unnodule)]}
                             for unnodule in nonNodules]}

    def _parse_malignancy(self, nodule):
        """Return ``[{feature: int, ...}]`` for a nodule, or ``[]`` if incomplete.

        Only the failures a missing or malformed tag can cause are treated as
        "incomplete": a tag that is absent (AttributeError on ``None``), a tag
        without text (TypeError from ``int(None)``) or non-integer text
        (ValueError). Anything else propagates instead of being swallowed.
        """
        try:
            characteristics = nodule.find('characteristics')
            return [{name: int(characteristics.find(name).string)
                     for name in self._FEATURE_NAMES}]
        except (AttributeError, TypeError, ValueError):
            # An empty feature list makes downstream code skip the nodule, so
            # say so rather than dropping it silently.
            warnings.warn("incomplete characteristics for a nodule in " + self.path)
            return []

    def _parse_roi(self, roi):
        """Return the z position and the list of (x, y) contour points of a roi."""
        imageZposition = float(roi.find('imageZposition').string)
        xCoords = [int(x.string) for x in roi.find_all('xCoord')]
        yCoords = [int(y.string) for y in roi.find_all('yCoord')]
        assert len(xCoords) == len(yCoords), self.path
        return {"imageZposition": imageZposition,
                "coords": list(zip(xCoords, yCoords))}

    def _parse_unnodule_roi(self, unnodule):
        """Return the z position and the first ``[x, y]`` point of a non-nodule.

        Unlike ``_parse_roi``, ``coords`` is a flat two-element list.
        """
        imageZposition = float(unnodule.find('imageZposition').string)
        xCoord = int(unnodule.find('xCoord').string)
        yCoord = int(unnodule.find('yCoord').string)
        return {"imageZposition": imageZposition,
                "coords": [xCoord, yCoord]}

    def dump(self, filename):
        """Write the parsed dict to ``filename`` as indented JSON."""
        with open(filename, 'w') as f:
            f.write(self.dumps())

    def dumps(self):
        """Return the parsed dict as an indented JSON string."""
        return json.dumps(self.parsed_dict, indent=4)

# create xml_json

In [17]:
# Convert each patient's annotation XML into <xml_json_path>/<patient folder name>.json.
# Patients whose JSON file already exists are skipped, so the cell can be re-run to resume.
xml_json_path = "/home/mengdi/yuxiang.ye/DN_AI_frame/workspace/lidc/xml_json"
if not os.path.isdir(xml_json_path):
    # XMLParser.dump opens the target file for writing and needs the directory to exist.
    os.makedirs(xml_json_path)
for lidc_idri in sorted(glob.glob("/home/mengdi/DataShare/LIDC/LIDC-IDRI/DOI/*")):
    if not os.path.isdir(lidc_idri):
        # A plain file has no sub-folders; PathParser would fail on it with a ValueError.
        continue
    basename = os.path.basename(lidc_idri)
    json_file = os.path.join(xml_json_path, basename + ".json")
    if os.path.exists(json_file):
        continue
    parse = PathParser(lidc_idri)
    xmlparse = XMLParser(parse.xml_path)
    xmlparse.dump(json_file)

































In [18]:
import json
# Load the parsed annotations of one patient (LIDC-IDRI-0001) for interactive inspection.
json_path = "/home/mengdi/yuxiang.ye/DN_AI_frame/workspace/lidc/xml_json/LIDC-IDRI-0001.json"
with open(json_path, 'r') as json_handle:
    json_data = json.load(json_handle)

In [93]:
def create_lidc_info(seriesuid, json_data):
    """Flatten one parsed annotation dict into a single DataFrame.

    Args:
        seriesuid: value written to the ``seriesuid`` column of every row.
        json_data: dict with a ``readingSession`` list whose items carry the
            ``unnodles``, ``small_nodules`` and ``large_nodules`` lists.

    Returns:
        DataFrame with one row per non-nodule roi (``parse_type`` "unnodule"),
        per small-nodule roi ("nodule") and per large-nodule feature entry
        ("large_nodule"), in that order. ``doctor_id`` is the position of the
        reading session. Columns that do not apply to a row hold NaN.
    """
    feature_list = ["calcification", "internalStructure", "lobulation", "malignancy",
                    "margin", "sphericity", "spiculation", "subtlety", "texture"]
    point_columns = ['doctor_id', "imageZposition", 'coordy', 'coordx']
    sessions = list(enumerate(json_data['readingSession']))

    # Non-nodules store a flat [x, y] pair.
    unnodule_rows = [
        [reader, roi['imageZposition'], roi['coords'][1], roi['coords'][0]]
        for reader, session in sessions
        for unnodule in session["unnodles"]
        for roi in unnodule['roi']
    ]

    # Small nodules store a list of points; only the first point is used.
    small_rows = [
        [reader, roi['imageZposition'], roi['coords'][0][1], roi['coords'][0][0]]
        for reader, session in sessions
        for nodule in session["small_nodules"]
        for roi in nodule['roi']
    ]

    # Large nodules keep the whole roi list plus the characteristic scores.
    large_rows = []
    for reader, session in sessions:
        for nodule in session["large_nodules"]:
            rois = nodule['roi']
            large_rows.extend(
                [reader, rois] + [feature[name] for name in feature_list]
                for feature in nodule['nodule_feature'])

    frames = [
        pd.DataFrame(unnodule_rows, columns=point_columns).assign(parse_type="unnodule"),
        pd.DataFrame(small_rows, columns=point_columns).assign(parse_type="nodule"),
        pd.DataFrame(large_rows, columns=(['doctor_id', 'rois'] + feature_list)).assign(
            parse_type="large_nodule"),
    ]
    nodule_info = pd.concat(frames).reset_index(drop=True)
    nodule_info['seriesuid'] = seriesuid
    ordered_columns = ['seriesuid', 'doctor_id', 'parse_type', "imageZposition",
                       'coordy', 'coordx', 'rois'] + feature_list
    return nodule_info[ordered_columns]

In [94]:
# Trial run of create_lidc_info on the single JSON held in `json_data`;
# the literal "name" is passed as the seriesuid.
# NOTE(review): `debug` is not read again in the cells visible here.
debug = create_lidc_info("name", json_data)

In [108]:
# Build one info table per exported JSON file; the file name without ".json"
# is used as the seriesuid.
json_path = "/home/mengdi/yuxiang.ye/DN_AI_frame/workspace/lidc/xml_json/*.json"
nodule_info_list = []
for js in sorted(glob.glob(json_path)):
    js_name = os.path.basename(js)[:-len(".json")]
    print(js_name)
    with open(js, 'r') as json_handle:
        json_data = json.load(json_handle)
    # The file can be closed before the frame is built; only json_data is needed.
    nodule_info = create_lidc_info(js_name, json_data)
    nodule_info_list.append(nodule_info)

In [110]:
# Stack the per-file frames into one table. ignore_index is not passed, so each
# frame keeps its own 0..n-1 labels and index values repeat across files.
# NOTE(review): "indo" looks like a typo for "info"; later cells use this exact name.
nodule_indo_df = pd.concat(nodule_info_list)

In [111]:
# Persist the combined table without the row index. `index` is documented as a
# bool, so pass False explicitly instead of relying on None being falsy.
nodule_indo_df.to_csv("/home/mengdi/yuxiang.ye/DN_AI_frame/workspace/lidc/LIDC_INFO.csv", index=False)

In [112]:
# Characteristic columns of the combined table. Defined here explicitly: in the
# cells above `feature_list` only exists as a local inside functions, so a bare
# reference fails with NameError on a fresh kernel.
feature_list = ["calcification", "internalStructure", "lobulation", "malignancy",
                "margin", "sphericity", "spiculation", "subtlety", "texture"]
feature_list

['calcification',
 'internalStructure',
 'lobulation',
 'malignancy',
 'margin',
 'sphericity',
 'spiculation',
 'subtlety',
 'texture']

In [114]:
# Frequency of each calcification score; rows holding NaN in this column are
# left out by value_counts.
nodule_indo_df["calcification"].value_counts()

6.0    5951
3.0     709
5.0      75
4.0      42
2.0      12
1.0       4
0.0       2
Name: calcification, dtype: int64

In [116]:
# Frequency of each internalStructure score; rows holding NaN in this column
# are left out by value_counts.
nodule_indo_df["internalStructure"].value_counts()

1.0    6756
4.0      24
2.0      11
0.0       2
5.0       1
3.0       1
Name: internalStructure, dtype: int64

In [117]:
# Frequency of each lobulation score; rows holding NaN in this column are left
# out by value_counts.
nodule_indo_df["lobulation"].value_counts()

1.0    4084
2.0    1440
3.0     705
4.0     381
5.0     183
0.0       2
Name: lobulation, dtype: int64

In [118]:
# sphericity: degree of roundness; being spherical; ball-shaped
nodule_indo_df.sphericity.value_counts()

4.0    2293
3.0    2079
5.0    1824
2.0     578
1.0      19
0.0       2
Name: sphericity, dtype: int64

In [120]:
# texture: texture; structure; essential nature
nodule_indo_df.texture.value_counts()

5.0    4977
4.0     808
1.0     479
3.0     378
2.0     151
0.0       2
Name: texture, dtype: int64