# Chapter 05. Introducing MLlib
 - MLlib stands for Machine Learning Library.
 - In this chapter, you will learn how to do the following:
  - Prepare the data for modeling with MLlib
  - Perform statistical testing
  - Predict survival chances of infants using logistic regression
  - Select the most predictive features and train a random forest model

## Loading and transforming the data
 - Read the data and convert it to a DataFrame.

In [50]:
import pyspark.sql.types as typ

In [51]:
# (column name, Spark SQL data type) pairs describing the births dataset.
# The schema built from this list uses the pairs in exactly this order,
# so the order here presumably has to match the column order of the CSV
# file -- confirm against the data before reordering anything.
labels = [
    # Target column of the chapter (the value we want to predict).
    ('INFANT_ALIVE_AT_REPORT', typ.StringType()),
    ('BIRTH_YEAR', typ.IntegerType()),
    ('BIRTH_MONTH', typ.IntegerType()),
    ('BIRTH_PLACE', typ.StringType()),
    ('MOTHER_AGE_YEARS', typ.IntegerType()),
    ('MOTHER_RACE_6CODE', typ.StringType()),
    ('MOTHER_EDUCATION', typ.StringType()),
    ('FATHER_COMBINED_AGE', typ.IntegerType()),
    ('FATHER_EDUCATION', typ.StringType()),
    ('MONTH_PRECARE_RECODE', typ.StringType()),
    # Cigarette counts; these are read as integers.
    ('CIG_BEFORE', typ.IntegerType()),
    ('CIG_1_TRI', typ.IntegerType()),
    ('CIG_2_TRI', typ.IntegerType()),
    ('CIG_3_TRI', typ.IntegerType()),
    ('MOTHER_HEIGHT_IN', typ.IntegerType()),
    ('MOTHER_BMI_RECODE', typ.IntegerType()),
    ('MOTHER_PRE_WEIGHT', typ.IntegerType()),
    ('MOTHER_DELIVERY_WEIGHT', typ.IntegerType()),
    ('MOTHER_WEIGHT_GAIN', typ.IntegerType()),
    ('DIABETES_PRE', typ.StringType()),
    ('DIABETES_GEST', typ.StringType()),
    ('HYP_TENS_PRE', typ.StringType()),
    ('HYP_TENS_GEST', typ.StringType()),
    ('PREV_BIRTH_PRETERM', typ.StringType()),
    ('NO_RISK', typ.StringType()),
    ('NO_INFECTIONS_REPORTED', typ.StringType()),
    ('LABOR_IND', typ.StringType()),
    ('LABOR_AUGM', typ.StringType()),
    ('STEROIDS', typ.StringType()),
    ('ANTIBIOTICS', typ.StringType()),
    ('ANESTHESIA', typ.StringType()),
    ('DELIV_METHOD_RECODE_COMB', typ.StringType()),
    ('ATTENDANT_BIRTH', typ.StringType()),
    ('APGAR_5', typ.IntegerType()),
    ('APGAR_5_RECODE', typ.StringType()),
    ('APGAR_10', typ.IntegerType()),
    ('APGAR_10_RECODE', typ.StringType()),
    ('INFANT_SEX', typ.StringType()),
    ('OBSTETRIC_GESTATION_WEEKS', typ.IntegerType()),
    ('INFANT_WEIGHT_GRAMS', typ.IntegerType()),
    ('INFANT_ASSIST_VENTI', typ.StringType()),
    ('INFANT_ASSIST_VENTI_6HRS', typ.StringType()),
    ('INFANT_NICU_ADMISSION', typ.StringType()),
    ('INFANT_SURFACANT', typ.StringType()),
    ('INFANT_ANTIBIOTICS', typ.StringType()),
    ('INFANT_SEIZURES', typ.StringType()),
    ('INFANT_NO_ABNORMALITIES', typ.StringType()),
    ('INFANT_ANCEPHALY', typ.StringType()),
    ('INFANT_MENINGOMYELOCELE', typ.StringType()),
    ('INFANT_LIMB_REDUCTION', typ.StringType()),
    ('INFANT_DOWN_SYNDROME', typ.StringType()),
    ('INFANT_SUSPECTED_CHROMOSOMAL_DISORDER', typ.StringType()),
    ('INFANT_NO_CONGENITAL_ANOMALIES_CHECKED', typ.StringType()),
    ('INFANT_BREASTFED', typ.StringType())
]

# Build the DataFrame schema from the (name, type) pairs in `labels`.
# The third StructField argument (nullable) is False for every field.
schema = typ.StructType(
    [
        typ.StructField(name, data_type, False)
        for name, data_type in labels
    ]
)

 - Next, we load the data.

In [52]:
# Read 'births_train.csv.gz' using the explicit schema; the first row of
# the file is treated as a header.
births = spark.read.csv(
    'births_train.csv.gz',
    schema=schema,
    header=True,
)

In [53]:
# Print only the first row to check that the file was parsed.
births.show(n=1)

+----------------------+----------+-----------+-----------+----------------+-----------------+----------------+-------------------+----------------+--------------------+----------+---------+---------+---------+----------------+-----------------+-----------------+----------------------+------------------+------------+-------------+------------+-------------+------------------+-------+----------------------+---------+----------+--------+-----------+----------+------------------------+---------------+-------+--------------+--------+---------------+----------+-------------------------+-------------------+-------------------+------------------------+---------------------+----------------+------------------+---------------+-----------------------+----------------+-----------------------+---------------------+--------------------+-------------------------------------+--------------------------------------+----------------+
|INFANT_ALIVE_AT_REPORT|BIRTH_YEAR|BIRTH_MONTH|BIRTH_PLACE|MOTHER_AGE_

 - Specify our recode dictionary.

In [54]:
# Recoding tables keyed by scheme name. Under the 'YNU' scheme, 'Y' is
# mapped to 1 while both 'N' and 'U' are mapped to 0.
recode_dictionary = {
    'YNU': {'Y': 1, 'N': 0, 'U': 0},
}

Our goal is to predict whether the 'INFANT_ALIVE_AT_REPORT' is either 1 or 0. Thus, we will drop all of the features that relate to the infant.

In [55]:
# Columns kept for modeling, grouped by theme. The resulting list keeps
# the label first, followed by the features in a fixed order.
selected_features = (
    # label
    ['INFANT_ALIVE_AT_REPORT']
    # place of birth and parents' ages
    + ['BIRTH_PLACE', 'MOTHER_AGE_YEARS', 'FATHER_COMBINED_AGE']
    # cigarette counts
    + ['CIG_BEFORE', 'CIG_1_TRI', 'CIG_2_TRI', 'CIG_3_TRI']
    # mother's height and weight
    + [
        'MOTHER_HEIGHT_IN',
        'MOTHER_PRE_WEIGHT',
        'MOTHER_DELIVERY_WEIGHT',
        'MOTHER_WEIGHT_GAIN',
    ]
    # Y/N/U coded flags
    + [
        'DIABETES_PRE',
        'DIABETES_GEST',
        'HYP_TENS_PRE',
        'HYP_TENS_GEST',
        'PREV_BIRTH_PRETERM',
    ]
)

In [56]:
# Keep only the label and the selected feature columns.
births_trimmed = births.select(*selected_features)

 - There's a small problem with how the number of cigarettes smoked by the mother was coded. 
  - 0 means the mother smoked no cigarettes before or during the pregnancy.
  - Values between 1 and 97 state the actual number of cigarettes smoked.
  - 98 indicates 98 or more, whereas 99 identifies an unknown value => we will assume the unknown is 0 and recode accordingly.
 - Next, we will specify our recoding methods.

In [57]:
import pyspark.sql.functions as func

In [58]:
def recode(col, key):
    """Return the recoded value of ``col`` under the scheme named ``key``.

    The scheme is looked up in ``recode_dictionary`` first and ``col`` is
    then looked up inside that scheme; a missing scheme or value raises
    ``KeyError``.
    """
    mapping = recode_dictionary[key]
    return mapping[col]

def correct_cig(feat):
    """Build a column expression that replaces the code 99 with 0.

    Values of column ``feat`` other than 99 are passed through unchanged;
    everything else falls through to the ``otherwise(0)`` branch.
    """
    column = func.col(feat)
    return func.when(column != 99, column).otherwise(0)

In [59]:
# Wrap `recode` as a Spark UDF whose declared return type is integer.
rec_integer = func.udf(recode, returnType=typ.IntegerType())

In [60]:
# Replace the "unknown" code 99 with 0 in each cigarette-count column.
# The chain is wrapped in parentheses instead of backslash continuations:
# the original ended its last line with a dangling backslash, which only
# parses when a blank line happens to follow it.
birth_transformed = (
    births_trimmed
    .withColumn('CIG_BEFORE', correct_cig('CIG_BEFORE'))
    .withColumn('CIG_1_TRI', correct_cig('CIG_1_TRI'))
    .withColumn('CIG_2_TRI', correct_cig('CIG_2_TRI'))
    .withColumn('CIG_3_TRI', correct_cig('CIG_3_TRI'))
)

 - The withColumn(...) method takes the name of the column as its first parameter and the transformation as the second one.

In [61]:
# Print every field of the trimmed DataFrame's schema, one per line.
for field in births_trimmed.schema:
    print(field)

StructField(INFANT_ALIVE_AT_REPORT,StringType,true)
StructField(BIRTH_PLACE,StringType,true)
StructField(MOTHER_AGE_YEARS,IntegerType,true)
StructField(FATHER_COMBINED_AGE,IntegerType,true)
StructField(CIG_BEFORE,IntegerType,true)
StructField(CIG_1_TRI,IntegerType,true)
StructField(CIG_2_TRI,IntegerType,true)
StructField(CIG_3_TRI,IntegerType,true)
StructField(MOTHER_HEIGHT_IN,IntegerType,true)
StructField(MOTHER_PRE_WEIGHT,IntegerType,true)
StructField(MOTHER_DELIVERY_WEIGHT,IntegerType,true)
StructField(MOTHER_WEIGHT_GAIN,IntegerType,true)
StructField(DIABETES_PRE,StringType,true)
StructField(DIABETES_GEST,StringType,true)
StructField(HYP_TENS_PRE,StringType,true)
StructField(HYP_TENS_GEST,StringType,true)
StructField(PREV_BIRTH_PRETERM,StringType,true)


In [62]:
# Collect (column name, data type) pairs from the trimmed schema.
cols = [
    (field.name, field.dataType)
    for field in births_trimmed.schema.fields
]
cols

[('INFANT_ALIVE_AT_REPORT', StringType),
 ('BIRTH_PLACE', StringType),
 ('MOTHER_AGE_YEARS', IntegerType),
 ('FATHER_COMBINED_AGE', IntegerType),
 ('CIG_BEFORE', IntegerType),
 ('CIG_1_TRI', IntegerType),
 ('CIG_2_TRI', IntegerType),
 ('CIG_3_TRI', IntegerType),
 ('MOTHER_HEIGHT_IN', IntegerType),
 ('MOTHER_PRE_WEIGHT', IntegerType),
 ('MOTHER_DELIVERY_WEIGHT', IntegerType),
 ('MOTHER_WEIGHT_GAIN', IntegerType),
 ('DIABETES_PRE', StringType),
 ('DIABETES_GEST', StringType),
 ('HYP_TENS_PRE', StringType),
 ('HYP_TENS_GEST', StringType),
 ('PREV_BIRTH_PRETERM', StringType)]

In [63]:
# Print each pair with its position, followed by the data type alone.
for position, pair in enumerate(cols):
    data_type = pair[1]
    print(position, pair, data_type)

0 ('INFANT_ALIVE_AT_REPORT', StringType) StringType
1 ('BIRTH_PLACE', StringType) StringType
2 ('MOTHER_AGE_YEARS', IntegerType) IntegerType
3 ('FATHER_COMBINED_AGE', IntegerType) IntegerType
4 ('CIG_BEFORE', IntegerType) IntegerType
5 ('CIG_1_TRI', IntegerType) IntegerType
6 ('CIG_2_TRI', IntegerType) IntegerType
7 ('CIG_3_TRI', IntegerType) IntegerType
8 ('MOTHER_HEIGHT_IN', IntegerType) IntegerType
9 ('MOTHER_PRE_WEIGHT', IntegerType) IntegerType
10 ('MOTHER_DELIVERY_WEIGHT', IntegerType) IntegerType
11 ('MOTHER_WEIGHT_GAIN', IntegerType) IntegerType
12 ('DIABETES_PRE', StringType) StringType
13 ('DIABETES_GEST', StringType) StringType
14 ('HYP_TENS_PRE', StringType) StringType
15 ('HYP_TENS_GEST', StringType) StringType
16 ('PREV_BIRTH_PRETERM', StringType) StringType


In [64]:
# For every string-typed column, print its distinct values as Row objects.
for name, data_type in cols:
    if data_type != typ.StringType():
        continue
    print(births.select(name).distinct().rdd.collect())

[Row(INFANT_ALIVE_AT_REPORT='Y'), Row(INFANT_ALIVE_AT_REPORT='N')]
[Row(BIRTH_PLACE='7'), Row(BIRTH_PLACE='3'), Row(BIRTH_PLACE='5'), Row(BIRTH_PLACE='6'), Row(BIRTH_PLACE='9'), Row(BIRTH_PLACE='1'), Row(BIRTH_PLACE='4'), Row(BIRTH_PLACE='2')]
[Row(DIABETES_PRE='Y'), Row(DIABETES_PRE='U'), Row(DIABETES_PRE='N')]
[Row(DIABETES_GEST='Y'), Row(DIABETES_GEST='U'), Row(DIABETES_GEST='N')]
[Row(HYP_TENS_PRE='Y'), Row(HYP_TENS_PRE='U'), Row(HYP_TENS_PRE='N')]
[Row(HYP_TENS_GEST='Y'), Row(HYP_TENS_GEST='U'), Row(HYP_TENS_GEST='N')]
[Row(PREV_BIRTH_PRETERM='Y'), Row(PREV_BIRTH_PRETERM='U'), Row(PREV_BIRTH_PRETERM='N')]


 - births.select(s[0]) : 컬럼명으로 데이터 가지고 오기.
 - distinct() : 중복되지 않는 값만 가지고 오기.
 - 해당 결과값이 DataFrame 형태로 되어 있어 RDD로 변환이 필요하다: .rdd
 - 각 Row에서 해당 Column의 값만 추출한다: .map(lambda row: row[0]).collect()

In [65]:
# For every string-typed column, print its distinct values as plain
# Python values (the first element of each Row).
for name, data_type in cols:
    if data_type != typ.StringType():
        continue
    distinct_rows = births.select(name).distinct().rdd
    print(distinct_rows.map(lambda row: row[0]).collect())

['Y', 'N']
['7', '3', '5', '6', '9', '1', '4', '2']
['Y', 'U', 'N']
['Y', 'U', 'N']
['Y', 'U', 'N']
['Y', 'U', 'N']
['Y', 'U', 'N']


In [66]:
# Names of the string-typed columns whose distinct values include 'Y'.
# The type check comes first so the distinct-values query is only run
# for string columns.
YNU_cols = [
    name
    for name, data_type in cols
    if data_type == typ.StringType()
    and 'Y' in (
        births.select(name).distinct().rdd.map(lambda row: row[0]).collect()
    )
]

In [67]:
# Display the column names collected in the previous cell.
YNU_cols

['INFANT_ALIVE_AT_REPORT',
 'DIABETES_PRE',
 'DIABETES_GEST',
 'HYP_TENS_PRE',
 'HYP_TENS_GEST',
 'PREV_BIRTH_PRETERM']

In [68]:
# Preview the recoding on one column: show the original value next to
# its recoded value ('Y' becomes 1, everything else becomes 0).
births.select(
    'INFANT_NICU_ADMISSION',
    rec_integer('INFANT_NICU_ADMISSION', func.lit('YNU'))
    .alias('INFANT_NICU_ADMISSION_RECODE'),
).take(5)


[Row(INFANT_NICU_ADMISSION='Y', INFANT_NICU_ADMISSION_RECODE=1),
 Row(INFANT_NICU_ADMISSION='Y', INFANT_NICU_ADMISSION_RECODE=1),
 Row(INFANT_NICU_ADMISSION='U', INFANT_NICU_ADMISSION_RECODE=0),
 Row(INFANT_NICU_ADMISSION='N', INFANT_NICU_ADMISSION_RECODE=0),
 Row(INFANT_NICU_ADMISSION='U', INFANT_NICU_ADMISSION_RECODE=0)]

In [69]:
# Build one select expression per column: Y/N/U columns are recoded to
# integers under their original name, all other columns pass through.
exprs_YNU = []
for column_name in births_trimmed.columns:
    if column_name in YNU_cols:
        recoded = rec_integer(column_name, func.lit('YNU'))
        exprs_YNU.append(recoded.alias(column_name))
    else:
        exprs_YNU.append(column_name)

birth_transformed = birth_transformed.select(exprs_YNU)

 - Let's check if we got it correctly.

In [70]:
# Show the first five rows of the last five Y/N/U columns.
birth_transformed.select(*YNU_cols[-5:]).show(n=5)

+------------+-------------+------------+-------------+------------------+
|DIABETES_PRE|DIABETES_GEST|HYP_TENS_PRE|HYP_TENS_GEST|PREV_BIRTH_PRETERM|
+------------+-------------+------------+-------------+------------------+
|           0|            0|           0|            0|                 0|
|           0|            0|           0|            0|                 0|
|           0|            0|           0|            0|                 0|
|           0|            0|           0|            0|                 1|
|           0|            0|           0|            0|                 0|
+------------+-------------+------------+-------------+------------------+
only showing top 5 rows



# Getting to know your data
## Descriptive statistics 
 - DataFrames expose the .describe(...) method.
 - With MLlib, we will use the .colStats(...) method.
 - It takes an RDD of data to calculate the descriptive statistics of and returns a MultivariateStatisticalSummary object that contains the following descriptive statistics:
  - count()
  - max()
  - mean()
  - min()
  - normL1() : value of the L1-Norm for the values in the column
  - normL2() : value of the L2-Norm for the values in the column
  - numNonzeros() : number of nonzero values in the column
  - variance()

In [None]:
import pyspark