# Chronic Kidney Disease Exploratory Notebook

In [1]:
import pyspark
from pyspark.sql import SparkSession, Window, DataFrame
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.mllib.stat import Statistics
import sys

# Put the work directory on the import path.
# NOTE(review): hard-coded absolute path; presumably the notebook container's work dir — confirm.
sys.path.append('/home/jovyan/work')

In [2]:
# Reuse the active Spark session if there is one; otherwise start a new one.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

## Load the Dataset

In [3]:
# Read the full CSV: the first row supplies column names and Spark infers the column types.
df = spark.read.csv(
    '../data/chronic_kidney_disease_full.csv',
    header=True,
    inferSchema=True,
)

In [4]:
# Count the rows of the full dataset, then report the total.
total_record_count = df.count()
print('Total record count: {}'.format(total_record_count))

Total record count: 400


### Make a Train/Test Split

In [5]:
# Split the full dataset 70/30 using a fixed seed (42).
df_train, df_test = df.randomSplit([0.7, 0.3], seed=42)

# Export the training split as CSV with a header row, replacing any earlier export.
(
    df_train.write
    .format('csv')
    .option("header", "true")
    .mode('overwrite')
    .save('../data/chronic_kidney_disease_train')
)

# Export the test split the same way.
(
    df_test.write
    .format('csv')
    .option("header", "true")
    .mode('overwrite')
    .save('../data/chronic_kidney_disease_test')
)

# Drop the name bound to the full dataset so later cells cannot use it by accident.
del df

## Exploring the Data
### Data Definitions:
 - **age** - age
 - **bp** - blood pressure
 - **sg** - specific gravity
 - **al** - albumin
 - **su** - sugar
 - **rbc** - red blood cells
 - **pc** - pus cell
 - **pcc** - pus cell clumps
 - **ba** - bacteria
 - **bgr** - blood glucose random
 - **bu** - blood urea
 - **sc** - serum creatinine
 - **sod** - sodium
 - **pot** - potassium
 - **hemo** - hemoglobin
 - **pcv** - packed cell volume
 - **wc** - white blood cell count
 - **rc** - red blood cell count
 - **htn** - hypertension
 - **dm** - diabetes mellitus
 - **cad** - coronary artery disease
 - **appet** - appetite
 - **pe** - pedal edema
 - **ane** - anemia
 - **classification** - class (the target label: `ckd` or `notckd`)

### View the Schema

In [6]:
# Print each column's name, inferred type and nullability for the training split.
df_train.printSchema()

root
 |-- id: integer (nullable = true)
 |-- age: double (nullable = true)
 |-- bp: double (nullable = true)
 |-- sg: double (nullable = true)
 |-- al: double (nullable = true)
 |-- su: double (nullable = true)
 |-- rbc: string (nullable = true)
 |-- pc: string (nullable = true)
 |-- pcc: string (nullable = true)
 |-- ba: string (nullable = true)
 |-- bgr: double (nullable = true)
 |-- bu: double (nullable = true)
 |-- sc: double (nullable = true)
 |-- sod: double (nullable = true)
 |-- pot: double (nullable = true)
 |-- hemo: double (nullable = true)
 |-- pcv: string (nullable = true)
 |-- wc: string (nullable = true)
 |-- rc: string (nullable = true)
 |-- htn: string (nullable = true)
 |-- dm: string (nullable = true)
 |-- cad: string (nullable = true)
 |-- appet: string (nullable = true)
 |-- pe: string (nullable = true)
 |-- ane: string (nullable = true)
 |-- classification: string (nullable = true)



In [7]:
# Count the rows that landed in the training split, then report the total.
train_record_count = df_train.count()
print('Train record count: {}'.format(train_record_count))

Train record count: 299


### View Sample Rows

In [8]:
# Display the first 5 rows of the training split.
df_train.show(5)

+---+----+----+-----+---+---+------+--------+----------+----------+-----+----+---+-----+----+----+---+----+----+---+---+---+-----+---+---+--------------+
| id| age|  bp|   sg| al| su|   rbc|      pc|       pcc|        ba|  bgr|  bu| sc|  sod| pot|hemo|pcv|  wc|  rc|htn| dm|cad|appet| pe|ane|classification|
+---+----+----+-----+---+---+------+--------+----------+----------+-----+----+---+-----+----+----+---+----+----+---+---+---+-----+---+---+--------------+
|  0|48.0|80.0| 1.02|1.0|0.0|  null|  normal|notpresent|notpresent|121.0|36.0|1.2| null|null|15.4| 44|7800| 5.2|yes|yes| no| good| no| no|           ckd|
|  1| 7.0|50.0| 1.02|4.0|0.0|  null|  normal|notpresent|notpresent| null|18.0|0.8| null|null|11.3| 38|6000|null| no| no| no| good| no| no|           ckd|
|  3|48.0|70.0|1.005|4.0|0.0|normal|abnormal|   present|notpresent|117.0|56.0|3.8|111.0| 2.5|11.2| 32|6700| 3.9|yes| no| no| poor|yes|yes|           ckd|
|  4|51.0|80.0| 1.01|2.0|0.0|normal|  normal|notpresent|notpresent|106.0|26.

## Target Column

In [9]:
# Tally how often each distinct target label occurs in the training split.
freq_table = (
    df_train
    .select(F.col('classification').cast('string'))
    .groupBy('classification')
    .count()
    .toPandas()
)
freq_table

Unnamed: 0,classification,count
0,notckd,114
1,ckd,183
2,ckd\t,2


So, this is a binary classification problem with target values of `notckd` (not chronic kidney disease) and `ckd` (chronic kidney disease). **Note:** There is an extraneous tab (`\t`) on at least one of the target values (the `ckd\t` rows in the table above). We will have to fix that up.

## Continuous Variables
### Summary Statistics

In [10]:
# Columns summarised in this section.
continuous_columns = [
    'age', 'bp', 'sg', 'al', 'su', 'bgr',
    'bu', 'sc', 'sod', 'pot', 'hemo',
]

# count / mean / stddev / min / max for each column, shown as a pandas frame.
df_train.select(*continuous_columns).describe().toPandas()

Unnamed: 0,summary,age,bp,sg,al,su,bgr,bu,sc,sod,pot,hemo
0,count,295.0,289.0,260.0,260.0,257.0,268.0,284.0,287.0,238.0,238.0,263.0
1,mean,51.77627118644068,76.9204152249135,1.0176346153846174,1.0269230769230768,0.4435797665369649,150.75746268656715,60.00070422535212,3.0092334494773567,138.03781512605042,4.692857142857143,12.471863117870726
2,stddev,16.65733417948455,14.013779125651489,0.005752383578997,1.3539753259380998,1.0851121532883172,82.71550527569536,53.49978434813889,4.573547394964013,7.061711003346931,3.63387592654839,2.974658737930449
3,min,2.0,50.0,1.005,0.0,0.0,22.0,1.5,0.4,111.0,2.5,3.1
4,max,90.0,180.0,1.025,5.0,5.0,490.0,391.0,48.1,163.0,47.0,17.8


### Null Counts

In [11]:
# (label, column) pairs, in the order the counts are reported.
continuous_null_labels = [
    ('age', 'age'),
    ('blood pressure', 'bp'),
    ('specific gravity', 'sg'),
    ('albumin', 'al'),
    ('sugar', 'su'),
    ('blood glucose random', 'bgr'),
    ('blood urea', 'bu'),
    ('serum creatinine', 'sc'),
    ('sodium', 'sod'),
    ('potassium', 'pot'),
    ('hemoglobin', 'hemo'),
]

# Count the nulls of every column in a single aggregation instead of running
# one filter-and-count job per column. `when` without `otherwise` yields null
# for non-matching rows, and `count` skips nulls, so each aggregate is the
# number of null rows in that column (0 when there are none).
null_counts_row = df_train.agg(*[
    F.count(F.when(F.col(column).isNull(), 1)).alias(column)
    for _, column in continuous_null_labels
]).first()

# Report the counts in the same wording and order as before.
for label, column in continuous_null_labels:
    print('Nulls in {}: {}'.format(label, null_counts_row[column]))

Nulls in age: 4
Nulls in blood pressure: 10
Nulls in specific gravity: 39
Nulls in albumin: 39
Nulls in sugar: 42
Nulls in blood glucose random: 31
Nulls in blood urea: 15
Nulls in serum creatinine: 12
Nulls in sodium: 61
Nulls in potassium: 61
Nulls in hemoglobin: 36


## Categorical Variables

In [12]:
# Tally each distinct red-blood-cell value (nulls form their own group) in the training split.
freq_table_rbc = (
    df_train
    .select(F.col('rbc').cast('string'))
    .groupBy('rbc')
    .count()
    .toPandas()
)
print('Red blood cells:\n {}'.format(freq_table_rbc))

Red blood cells:
         rbc  count
0      None    111
1    normal    153
2  abnormal     35


In [13]:
# Tally each distinct pus-cell value (nulls form their own group) in the training split.
freq_table_pc = (
    df_train
    .select(F.col('pc').cast('string'))
    .groupBy('pc')
    .count()
    .toPandas()
)
print('Pus cells:\n {}'.format(freq_table_pc))

Pus cells:
          pc  count
0      None     52
1    normal    185
2  abnormal     62


In [14]:
# Tally each distinct pus-cell-clumps value (nulls form their own group) in the training split.
freq_table_pcc = (
    df_train
    .select(F.col('pcc').cast('string'))
    .groupBy('pcc')
    .count()
    .toPandas()
)
print('Pus cell clumps:\n {}'.format(freq_table_pcc))

Pus cell clumps:
           pcc  count
0     present     30
1  notpresent    266
2        None      3


In [15]:
# Tally each distinct bacteria value (nulls form their own group) in the training split.
freq_table_ba = (
    df_train
    .select(F.col('ba').cast('string'))
    .groupBy('ba')
    .count()
    .toPandas()
)
print('Bacteria:\n {}'.format(freq_table_ba))

Bacteria:
            ba  count
0     present     16
1  notpresent    280
2        None      3


In [16]:
# Tally each distinct hypertension value (nulls form their own group) in the training split.
freq_table_htn = (
    df_train
    .select(F.col('htn').cast('string'))
    .groupBy('htn')
    .count()
    .toPandas()
)
print('Hypertension:\n {}'.format(freq_table_htn))

Hypertension:
     htn  count
0  None      1
1    no    184
2   yes    114


In [17]:
# Tally each distinct diabetes-mellitus value (nulls form their own group) in the training split.
freq_table_dm = (
    df_train
    .select(F.col('dm').cast('string'))
    .groupBy('dm')
    .count()
    .toPandas()
)
print('Diabetes mellitus:\n {}'.format(freq_table_dm))

Diabetes mellitus:
       dm  count
0   None      1
1   \tno      3
2  \tyes      2
3     no    193
4    yes    100


In [18]:
# Tally each distinct coronary-artery-disease value (nulls form their own group) in the training split.
freq_table_cad = (
    df_train
    .select(F.col('cad').cast('string'))
    .groupBy('cad')
    .count()
    .toPandas()
)
print('Coronary artery disease:\n {}'.format(freq_table_cad))

Coronary artery disease:
     cad  count
0  None      1
1  \tno      1
2    no    271
3   yes     26


In [19]:
# Tally each distinct appetite value in the training split.
freq_table_appet = (
    df_train
    .select(F.col('appet').cast('string'))
    .groupBy('appet')
    .count()
    .toPandas()
)
print('Appetite:\n {}'.format(freq_table_appet))

Appetite:
   appet  count
0  poor     56
1  good    243


In [20]:
# Tally each distinct pedal-edema value in the training split.
freq_table_pe = (
    df_train
    .select(F.col('pe').cast('string'))
    .groupBy('pe')
    .count()
    .toPandas()
)
print('Pedal edema:\n {}'.format(freq_table_pe))

Pedal edema:
     pe  count
0   no    241
1  yes     58


In [21]:
# Tally each distinct anemia value in the training split.
freq_table_ane = (
    df_train
    .select(F.col('ane').cast('string'))
    .groupBy('ane')
    .count()
    .toPandas()
)
print('Anemia:\n {}'.format(freq_table_ane))

Anemia:
    ane  count
0   no    251
1  yes     48
