In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import json
import pydicom
import glob
import os
import random

In [2]:
train = pd.read_csv('data/train.csv')

In [3]:
test = pd.read_csv('data/test.csv')

In [4]:
train.head()

Unnamed: 0.1,Unnamed: 0,AccessionNumber,BitsAllocated,BitsStored,BodyPartExamined,Columns,ConversionType,HighBit,InstanceNumber,LossyImageCompression,...,StudyID,StudyInstanceUID,StudyTime,ViewPosition,class,x,y,width,height,Target
0,0,,8,8,CHEST,1024,WSD,7,1,1,...,,1.2.276.0.7230010.3.1.2.8323329.28530.15178744...,0.0,PA,No Lung Opacity / Not Normal,,,,,0
1,1,,8,8,CHEST,1024,WSD,7,1,1,...,,1.2.276.0.7230010.3.1.2.8323329.26024.15178744...,0.0,PA,No Lung Opacity / Not Normal,,,,,0
2,2,,8,8,CHEST,1024,WSD,7,1,1,...,,1.2.276.0.7230010.3.1.2.8323329.11252.15178743...,0.0,AP,No Lung Opacity / Not Normal,,,,,0
3,3,,8,8,CHEST,1024,WSD,7,1,1,...,,1.2.276.0.7230010.3.1.2.8323329.2293.151787429...,0.0,PA,Normal,,,,,0
4,4,,8,8,CHEST,1024,WSD,7,1,1,...,,1.2.276.0.7230010.3.1.2.8323329.6379.151787432...,0.0,AP,Lung Opacity,264.0,152.0,213.0,379.0,1


In [5]:
test.head()

Unnamed: 0.1,Unnamed: 0,AccessionNumber,BitsAllocated,BitsStored,BodyPartExamined,Columns,ConversionType,HighBit,InstanceNumber,LossyImageCompression,...,SamplesPerPixel,SeriesDescription,SeriesInstanceUID,SeriesNumber,SpecificCharacterSet,StudyDate,StudyID,StudyInstanceUID,StudyTime,ViewPosition
0,0,,8,8,CHEST,1024,WSD,7,1,1,...,1,view: AP,1.2.276.0.7230010.3.1.3.8323329.2012.151787429...,1,ISO_IR 100,19010101,,1.2.276.0.7230010.3.1.2.8323329.2012.151787429...,0.0,AP
1,1,,8,8,CHEST,1024,WSD,7,1,1,...,1,view: PA,1.2.276.0.7230010.3.1.3.8323329.21574.15178744...,1,ISO_IR 100,19010101,,1.2.276.0.7230010.3.1.2.8323329.21574.15178744...,0.0,PA
2,2,,8,8,CHEST,1024,WSD,7,1,1,...,1,view: PA,1.2.276.0.7230010.3.1.3.8323329.30005.15178744...,1,ISO_IR 100,19010101,,1.2.276.0.7230010.3.1.2.8323329.30005.15178744...,0.0,PA
3,3,,8,8,CHEST,1024,WSD,7,1,1,...,1,view: PA,1.2.276.0.7230010.3.1.3.8323329.29261.15178744...,1,ISO_IR 100,19010101,,1.2.276.0.7230010.3.1.2.8323329.29261.15178744...,0.0,PA
4,4,,8,8,CHEST,1024,WSD,7,1,1,...,1,view: PA,1.2.276.0.7230010.3.1.3.8323329.8527.151787433...,1,ISO_IR 100,19010101,,1.2.276.0.7230010.3.1.2.8323329.8527.151787433...,0.0,PA


In [6]:
def summarise_data(df):
    df_summary = df.describe(include = "all").transpose()
    
    #Overwrite the descibe unique count as it ignore integer/float types.
    df_summary['unique'] = df.T.apply(lambda x: x.nunique(), axis=1)
    #Create a missing %
    df_summary['missing'] = 1 - df_summary['count']/len(df)
    
    #get column data type
    df_summary['dtype'] = df.dtypes
    
    #get the column names and reset index
    df_summary['col'] = df_summary.index
    df_summary = df_summary.reset_index(drop = True)

    #Sort the summary table
    df_summary = df_summary.sort_values(by=['dtype', 'unique','col','missing'])
    
    col_order = ['col','dtype','unique','count','missing','top','freq','mean','std','min','25%','50%','75%','max']
#     col_order = ['col','dtype','unique','count','missing','mean','std','min','25%','50%','75%','max']
    
    return(df_summary[col_order])
    #return(df_summary)

In [7]:
summarise_data(train)

Unnamed: 0,col,dtype,unique,count,missing,top,freq,mean,std,min,25%,50%,75%,max
2,BitsAllocated,int64,1,35875,0.0,,,8.0,0.0,8.0,8.0,8.0,8.0,8.0
3,BitsStored,int64,1,35875,0.0,,,8.0,0.0,8.0,8.0,8.0,8.0,8.0
5,Columns,int64,1,35875,0.0,,,1024.0,0.0,1024.0,1024.0,1024.0,1024.0,1024.0
7,HighBit,int64,1,35875,0.0,,,7.0,0.0,7.0,7.0,7.0,7.0,7.0
8,InstanceNumber,int64,1,35875,0.0,,,1.0,0.0,1.0,1.0,1.0,1.0,1.0
9,LossyImageCompression,int64,1,35875,0.0,,,1.0,0.0,1.0,1.0,1.0,1.0,1.0
19,PixelRepresentation,int64,1,35875,0.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22,Rows,int64,1,35875,0.0,,,1024.0,0.0,1024.0,1024.0,1024.0,1024.0,1024.0
25,SamplesPerPixel,int64,1,35875,0.0,,,1.0,0.0,1.0,1.0,1.0,1.0,1.0
28,SeriesNumber,int64,1,35875,0.0,,,1.0,0.0,1.0,1.0,1.0,1.0,1.0


In [8]:
summarise_data(test)

Unnamed: 0,col,dtype,unique,count,missing,top,freq,mean,std,min,25%,50%,75%,max
2,BitsAllocated,int64,1,1000,0,,,8.0,0.0,8.0,8.0,8.0,8.0,8.0
3,BitsStored,int64,1,1000,0,,,8.0,0.0,8.0,8.0,8.0,8.0,8.0
5,Columns,int64,1,1000,0,,,1024.0,0.0,1024.0,1024.0,1024.0,1024.0,1024.0
7,HighBit,int64,1,1000,0,,,7.0,0.0,7.0,7.0,7.0,7.0,7.0
8,InstanceNumber,int64,1,1000,0,,,1.0,0.0,1.0,1.0,1.0,1.0,1.0
9,LossyImageCompression,int64,1,1000,0,,,1.0,0.0,1.0,1.0,1.0,1.0,1.0
19,PixelRepresentation,int64,1,1000,0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22,Rows,int64,1,1000,0,,,1024.0,0.0,1024.0,1024.0,1024.0,1024.0,1024.0
25,SamplesPerPixel,int64,1,1000,0,,,1.0,0.0,1.0,1.0,1.0,1.0,1.0
28,SeriesNumber,int64,1,1000,0,,,1.0,0.0,1.0,1.0,1.0,1.0,1.0


In [9]:
useless_cols = ['BitsAllocated', 'BitsStored', 'Columns', 'HighBit',
                'InstanceNumber', 'LossyImageCompression', 'PixelRepresentation',
                'Rows', 'SamplesPerPixel', 'SeriesNumber', 'StudyDate', 
                'Unnamed: 0', 'AccessionNumber', 'PatientBirthDate', 'PatientOrientation',
                'ReferringPhysicianName', 'StudyID', 'StudyTime', 'BodyPartExamined',
                'ConversionType', 'LossyImageCompressionMethod', 'Modality', 'PhotometricInterpretation',
                'SOPClassUID', 'SpecificCharacterSet', 'PatientName', 'SOPInstanceUID',
                'SeriesInstanceUID', 'StudyInstanceUID', 'SeriesDescription']

In [10]:
train = train.drop(useless_cols, axis = 1)

In [11]:
test = test.drop(useless_cols, axis = 1)

In [12]:
train.head()

Unnamed: 0,PatientAge,patientId,PatientSex,PixelSpacing,ViewPosition,class,x,y,width,height,Target
0,51,0004cfab-14fd-4e49-80ba-63a80b6bddd6,F,"['0.14300000000000002', '0.14300000000000002']",PA,No Lung Opacity / Not Normal,,,,,0
1,48,00313ee0-9eaa-42f4-b0ab-c148ed3241cd,F,"['0.19431099999999998', '0.19431099999999998']",PA,No Lung Opacity / Not Normal,,,,,0
2,19,00322d4d-1c29-4943-afc9-b6754be640eb,M,"['0.168', '0.168']",AP,No Lung Opacity / Not Normal,,,,,0
3,28,003d8fa0-6bf1-40ed-b54c-ac657f8495c5,M,"['0.14300000000000002', '0.14300000000000002']",PA,Normal,,,,,0
4,32,00436515-870c-4b36-a041-de91049b9ab4,F,"['0.139', '0.139']",AP,Lung Opacity,264.0,152.0,213.0,379.0,1


In [13]:
test.head()

Unnamed: 0,PatientAge,patientId,PatientSex,PixelSpacing,ViewPosition
0,47,2cd2f674-1018-4f20-a1ee-cf09928696d8,M,"['0.139', '0.139']",AP
1,39,125a4350-c49f-4056-bfc1-d016c68a1159,M,"['0.168', '0.168']",PA
2,28,21b97415-1275-4483-9705-af9174fc2b4f,M,"['0.14300000000000002', '0.14300000000000002']",PA
3,56,2f06b4d6-f9e2-463a-a8f9-f4b9715b61cb,F,"['0.14300000000000002', '0.14300000000000002']",PA
4,7,219334b4-6696-4b41-90bd-0c56878ccbe6,F,"['0.139', '0.139']",PA


In [14]:
train.PatientAge.value_counts()

58     1184
56     1055
52      962
40      861
55      842
54      831
46      824
59      824
57      798
53      792
49      785
45      733
51      725
43      718
61      704
60      662
48      661
44      652
47      650
50      650
34      635
67      621
62      607
63      605
66      590
33      588
64      588
65      584
32      584
31      578
       ... 
10      132
6       117
11      117
81      108
78      106
79       90
80       67
8        60
7        50
82       46
83       39
86       35
3        33
5        30
87       30
90       29
2        29
4        19
84       19
88       18
85       17
89       10
1         5
91        4
92        2
151       1
153       1
148       1
150       1
155       1
Name: PatientAge, Length: 97, dtype: int64

In [15]:
test.PatientAge.value_counts()

58    43
55    34
56    31
49    31
63    29
54    27
53    26
51    26
57    25
59    25
66    25
52    23
50    23
47    22
64    22
33    22
48    21
43    18
65    18
60    18
32    18
39    17
70    16
46    16
45    16
30    16
40    15
28    15
29    14
44    14
      ..
20     9
31     9
16     8
21     8
71     7
15     7
72     7
13     6
17     5
75     5
73     5
76     4
37     4
9      4
7      3
80     3
12     2
3      2
74     2
77     2
81     2
82     2
85     1
14     1
78     1
10     1
8      1
5      1
4      1
91     1
Name: PatientAge, Length: 79, dtype: int64

In [16]:
train.corr()['Target']

PatientAge   -0.064771
x                  NaN
y                  NaN
width              NaN
height             NaN
Target        1.000000
Name: Target, dtype: float64

In [17]:
train_pneu_only = train[(train['class'] == 'Normal') | (train['class'] == 'Lung Opacity')]

In [18]:
train_pneu_only.head()

Unnamed: 0,PatientAge,patientId,PatientSex,PixelSpacing,ViewPosition,class,x,y,width,height,Target
3,28,003d8fa0-6bf1-40ed-b54c-ac657f8495c5,M,"['0.14300000000000002', '0.14300000000000002']",PA,Normal,,,,,0
4,32,00436515-870c-4b36-a041-de91049b9ab4,F,"['0.139', '0.139']",AP,Lung Opacity,264.0,152.0,213.0,379.0,1
5,32,00436515-870c-4b36-a041-de91049b9ab4,F,"['0.139', '0.139']",AP,Lung Opacity,562.0,152.0,256.0,453.0,1
6,32,00436515-870c-4b36-a041-de91049b9ab4,F,"['0.139', '0.139']",AP,Lung Opacity,264.0,152.0,213.0,379.0,1
7,32,00436515-870c-4b36-a041-de91049b9ab4,F,"['0.139', '0.139']",AP,Lung Opacity,562.0,152.0,256.0,453.0,1


In [19]:
summarise_data(train_pneu_only)

Unnamed: 0,col,dtype,unique,count,missing,top,freq,mean,std,min,25%,50%,75%,max
10,Target,int64,2,24375,0.0,,,0.650256,0.476899,0.0,0.0,1.0,1.0,1.0
0,PatientAge,int64,95,24375,0.0,,,45.111,17.112,1.0,32.0,46.0,58.0,155.0
8,width,float64,346,15850,0.349744,,,221.858,58.8929,40.0,181.0,221.0,262.0,528.0
9,height,float64,722,15850,0.349744,,,343.392,159.224,45.0,214.0,317.0,459.0,942.0
7,y,float64,722,15850,0.349744,,,356.208,148.591,2.0,239.0,349.0,468.0,881.0
6,x,float64,738,15850,0.349744,,,396.751,204.362,2.0,207.0,338.0,594.0,817.0
2,PatientSex,object,2,24375,0.0,M,13915.0,,,,,,,
4,ViewPosition,object,2,24375,0.0,AP,14975.0,,,,,,,
5,class,object,2,24375,0.0,Lung Opacity,15850.0,,,,,,,
3,PixelSpacing,object,14,24375,0.0,"['0.168', '0.168']",9954.0,,,,,,,


In [20]:
train_pneu_only = train_pneu_only.drop('class', axis = 1)

In [21]:
train_no_pneu = train[(train['class'] == 'Normal') | (train['class'] == 'No Lung Opacity / Not Normal')]

In [22]:
train_no_pneu.head()

Unnamed: 0,PatientAge,patientId,PatientSex,PixelSpacing,ViewPosition,class,x,y,width,height,Target
0,51,0004cfab-14fd-4e49-80ba-63a80b6bddd6,F,"['0.14300000000000002', '0.14300000000000002']",PA,No Lung Opacity / Not Normal,,,,,0
1,48,00313ee0-9eaa-42f4-b0ab-c148ed3241cd,F,"['0.19431099999999998', '0.19431099999999998']",PA,No Lung Opacity / Not Normal,,,,,0
2,19,00322d4d-1c29-4943-afc9-b6754be640eb,M,"['0.168', '0.168']",AP,No Lung Opacity / Not Normal,,,,,0
3,28,003d8fa0-6bf1-40ed-b54c-ac657f8495c5,M,"['0.14300000000000002', '0.14300000000000002']",PA,Normal,,,,,0
8,54,00569f44-917d-4c86-a842-81832af98c30,M,"['0.139', '0.139']",AP,No Lung Opacity / Not Normal,,,,,0


In [23]:
summarise_data(train_no_pneu)

Unnamed: 0,col,dtype,unique,count,missing,top,freq,mean,std,min,25%,50%,75%,max
10,Target,int64,1,20025,0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,PatientAge,int64,96,20025,0,,,47.4482,16.6132,1.0,35.0,50.0,60.0,155.0
9,height,float64,0,0,1,,,,,,,,,
8,width,float64,0,0,1,,,,,,,,,
6,x,float64,0,0,1,,,,,,,,,
7,y,float64,0,0,1,,,,,,,,,
2,PatientSex,object,2,20025,0,M,11303.0,,,,,,,
4,ViewPosition,object,2,20025,0,PA,12715.0,,,,,,,
5,class,object,2,20025,0,No Lung Opacity / Not Normal,11500.0,,,,,,,
3,PixelSpacing,object,18,20025,0,"['0.14300000000000002', '0.14300000000000002']",7963.0,,,,,,,


In [24]:
train_no_pneu['class'], uniques = pd.factorize(train_no_pneu['class'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [25]:
train_no_pneu = train_no_pneu.drop('Target', axis = 1)

In [26]:
train_no_pneu = train_no_pneu.rename(columns = {'class':'Target'})

In [27]:
train_no_pneu.head()

Unnamed: 0,PatientAge,patientId,PatientSex,PixelSpacing,ViewPosition,Target,x,y,width,height
0,51,0004cfab-14fd-4e49-80ba-63a80b6bddd6,F,"['0.14300000000000002', '0.14300000000000002']",PA,0,,,,
1,48,00313ee0-9eaa-42f4-b0ab-c148ed3241cd,F,"['0.19431099999999998', '0.19431099999999998']",PA,0,,,,
2,19,00322d4d-1c29-4943-afc9-b6754be640eb,M,"['0.168', '0.168']",AP,0,,,,
3,28,003d8fa0-6bf1-40ed-b54c-ac657f8495c5,M,"['0.14300000000000002', '0.14300000000000002']",PA,1,,,,
8,54,00569f44-917d-4c86-a842-81832af98c30,M,"['0.139', '0.139']",AP,0,,,,


In [28]:
train_no_pneu = train_no_pneu.drop(['x', 'y', 'width', 'height'], axis = 1)

In [29]:
train = train.drop('PixelSpacing', axis = 1)
test = test.drop('PixelSpacing', axis = 1)
train_pneu_only = train_pneu_only.drop('PixelSpacing', axis = 1)
train_no_pneu = train_no_pneu.drop('PixelSpacing', axis = 1)

In [30]:
train.head()

Unnamed: 0,PatientAge,patientId,PatientSex,ViewPosition,class,x,y,width,height,Target
0,51,0004cfab-14fd-4e49-80ba-63a80b6bddd6,F,PA,No Lung Opacity / Not Normal,,,,,0
1,48,00313ee0-9eaa-42f4-b0ab-c148ed3241cd,F,PA,No Lung Opacity / Not Normal,,,,,0
2,19,00322d4d-1c29-4943-afc9-b6754be640eb,M,AP,No Lung Opacity / Not Normal,,,,,0
3,28,003d8fa0-6bf1-40ed-b54c-ac657f8495c5,M,PA,Normal,,,,,0
4,32,00436515-870c-4b36-a041-de91049b9ab4,F,AP,Lung Opacity,264.0,152.0,213.0,379.0,1


In [31]:
test.head()

Unnamed: 0,PatientAge,patientId,PatientSex,ViewPosition
0,47,2cd2f674-1018-4f20-a1ee-cf09928696d8,M,AP
1,39,125a4350-c49f-4056-bfc1-d016c68a1159,M,PA
2,28,21b97415-1275-4483-9705-af9174fc2b4f,M,PA
3,56,2f06b4d6-f9e2-463a-a8f9-f4b9715b61cb,F,PA
4,7,219334b4-6696-4b41-90bd-0c56878ccbe6,F,PA


In [32]:
train_pneu_only[train_pneu_only['patientId'] == 'b6862fc0-31f9-4091-b8b1-256192168a0f']

Unnamed: 0,PatientAge,patientId,PatientSex,ViewPosition,x,y,width,height,Target
24803,66,b6862fc0-31f9-4091-b8b1-256192168a0f,M,AP,182.0,192.0,293.0,565.0,1
24804,66,b6862fc0-31f9-4091-b8b1-256192168a0f,M,AP,627.0,253.0,209.0,523.0,1
24805,66,b6862fc0-31f9-4091-b8b1-256192168a0f,M,AP,182.0,192.0,293.0,565.0,1
24806,66,b6862fc0-31f9-4091-b8b1-256192168a0f,M,AP,627.0,253.0,209.0,523.0,1


In [33]:
train_pneu_only.shape

(24375, 9)

In [34]:
train_no_pneu.head()

Unnamed: 0,PatientAge,patientId,PatientSex,ViewPosition,Target
0,51,0004cfab-14fd-4e49-80ba-63a80b6bddd6,F,PA,0
1,48,00313ee0-9eaa-42f4-b0ab-c148ed3241cd,F,PA,0
2,19,00322d4d-1c29-4943-afc9-b6754be640eb,M,AP,0
3,28,003d8fa0-6bf1-40ed-b54c-ac657f8495c5,M,PA,1
8,54,00569f44-917d-4c86-a842-81832af98c30,M,AP,0


In [35]:
train_pneu_only.to_csv('train_pneu_only.csv')

In [36]:
train_no_pneu.to_csv('train_no_pneu.csv')

In [37]:
train_pneu_not_normal = train[(train['class'] == 'Lung Opacity') | (train['class'] == 'No Lung Opacity / Not Normal')]

In [38]:
train_pneu_not_normal.head()

Unnamed: 0,PatientAge,patientId,PatientSex,ViewPosition,class,x,y,width,height,Target
0,51,0004cfab-14fd-4e49-80ba-63a80b6bddd6,F,PA,No Lung Opacity / Not Normal,,,,,0
1,48,00313ee0-9eaa-42f4-b0ab-c148ed3241cd,F,PA,No Lung Opacity / Not Normal,,,,,0
2,19,00322d4d-1c29-4943-afc9-b6754be640eb,M,AP,No Lung Opacity / Not Normal,,,,,0
4,32,00436515-870c-4b36-a041-de91049b9ab4,F,AP,Lung Opacity,264.0,152.0,213.0,379.0,1
5,32,00436515-870c-4b36-a041-de91049b9ab4,F,AP,Lung Opacity,562.0,152.0,256.0,453.0,1


In [39]:
train_pneu_not_normal = train_pneu_not_normal.drop('class', axis = 1)

In [40]:
train_pneu_not_normal.to_csv('train_pneu_not_normal.csv')

In [41]:
train_pneu_not_normal.head()

Unnamed: 0,PatientAge,patientId,PatientSex,ViewPosition,x,y,width,height,Target
0,51,0004cfab-14fd-4e49-80ba-63a80b6bddd6,F,PA,,,,,0
1,48,00313ee0-9eaa-42f4-b0ab-c148ed3241cd,F,PA,,,,,0
2,19,00322d4d-1c29-4943-afc9-b6754be640eb,M,AP,,,,,0
4,32,00436515-870c-4b36-a041-de91049b9ab4,F,AP,264.0,152.0,213.0,379.0,1
5,32,00436515-870c-4b36-a041-de91049b9ab4,F,AP,562.0,152.0,256.0,453.0,1
