# Chapter 05. Introducing MLlib
 - Mllib stands for Machine Learning Library. 
 - Learning how to do the following
  - Prepare the data for modeling with MLlib
  - Perform statistical testing
  - Predict survival chances of infants using logistic regression
  - Select the most predictable features and train a random forest model

## 5.1 Loading and transforming the data
 - read data and convert it to a DataFrame.

In [1]:
import pyspark.sql.types as typ

In [2]:
labels = [
    ('INFANT_ALIVE_AT_REPORT', typ.StringType()),
    ('BIRTH_YEAR', typ.IntegerType()),
    ('BIRTH_MONTH', typ.IntegerType()),
    ('BIRTH_PLACE', typ.StringType()),
    ('MOTHER_AGE_YEARS', typ.IntegerType()),
    ('MOTHER_RACE_6CODE', typ.StringType()),
    ('MOTHER_EDUCATION', typ.StringType()),
    ('FATHER_COMBINED_AGE', typ.IntegerType()),
    ('FATHER_EDUCATION', typ.StringType()),
    ('MONTH_PRECARE_RECODE', typ.StringType()),
    ('CIG_BEFORE', typ.IntegerType()),
    ('CIG_1_TRI', typ.IntegerType()),
    ('CIG_2_TRI', typ.IntegerType()),
    ('CIG_3_TRI', typ.IntegerType()),
    ('MOTHER_HEIGHT_IN', typ.IntegerType()),
    ('MOTHER_BMI_RECODE', typ.IntegerType()),
    ('MOTHER_PRE_WEIGHT', typ.IntegerType()),
    ('MOTHER_DELIVERY_WEIGHT', typ.IntegerType()),
    ('MOTHER_WEIGHT_GAIN', typ.IntegerType()),
    ('DIABETES_PRE', typ.StringType()),
    ('DIABETES_GEST', typ.StringType()),
    ('HYP_TENS_PRE', typ.StringType()),
    ('HYP_TENS_GEST', typ.StringType()),
    ('PREV_BIRTH_PRETERM', typ.StringType()),
    ('NO_RISK', typ.StringType()),
    ('NO_INFECTIONS_REPORTED', typ.StringType()),
    ('LABOR_IND', typ.StringType()),
    ('LABOR_AUGM', typ.StringType()),
    ('STEROIDS', typ.StringType()),
    ('ANTIBIOTICS', typ.StringType()),
    ('ANESTHESIA', typ.StringType()),
    ('DELIV_METHOD_RECODE_COMB', typ.StringType()),
    ('ATTENDANT_BIRTH', typ.StringType()),
    ('APGAR_5', typ.IntegerType()),
    ('APGAR_5_RECODE', typ.StringType()),
    ('APGAR_10', typ.IntegerType()),
    ('APGAR_10_RECODE', typ.StringType()),
    ('INFANT_SEX', typ.StringType()),
    ('OBSTETRIC_GESTATION_WEEKS', typ.IntegerType()),
    ('INFANT_WEIGHT_GRAMS', typ.IntegerType()),
    ('INFANT_ASSIST_VENTI', typ.StringType()),
    ('INFANT_ASSIST_VENTI_6HRS', typ.StringType()),
    ('INFANT_NICU_ADMISSION', typ.StringType()),
    ('INFANT_SURFACANT', typ.StringType()),
    ('INFANT_ANTIBIOTICS', typ.StringType()),
    ('INFANT_SEIZURES', typ.StringType()),
    ('INFANT_NO_ABNORMALITIES', typ.StringType()),
    ('INFANT_ANCEPHALY', typ.StringType()),
    ('INFANT_MENINGOMYELOCELE', typ.StringType()),
    ('INFANT_LIMB_REDUCTION', typ.StringType()),
    ('INFANT_DOWN_SYNDROME', typ.StringType()),
    ('INFANT_SUSPECTED_CHROMOSOMAL_DISORDER', typ.StringType()),
    ('INFANT_NO_CONGENITAL_ANOMALIES_CHECKED', typ.StringType()),
    ('INFANT_BREASTFED', typ.StringType())
]

schema = typ.StructType([
        typ.StructField(e[0], e[1], False) for e in labels
    ])

 - we load the data.

In [3]:
births = spark.read.csv('births_train.csv.gz', 
                        header=True, 
                        schema=schema)

In [4]:
births.show(1)

+----------------------+----------+-----------+-----------+----------------+-----------------+----------------+-------------------+----------------+--------------------+----------+---------+---------+---------+----------------+-----------------+-----------------+----------------------+------------------+------------+-------------+------------+-------------+------------------+-------+----------------------+---------+----------+--------+-----------+----------+------------------------+---------------+-------+--------------+--------+---------------+----------+-------------------------+-------------------+-------------------+------------------------+---------------------+----------------+------------------+---------------+-----------------------+----------------+-----------------------+---------------------+--------------------+-------------------------------------+--------------------------------------+----------------+
|INFANT_ALIVE_AT_REPORT|BIRTH_YEAR|BIRTH_MONTH|BIRTH_PLACE|MOTHER_AGE_

 - Specify our recode dictionary.

In [5]:
recode_dictionary = {
    'YNU': {
        'Y': 1,
        'N': 0,
        'U': 0
    }
}

Our goal is to predict whether the 'INFANT_ALIVE_AT_REPORT' is either 1 or 0. Thus, we will drop all of the features that relate to the infant.

In [6]:
selected_features = [
    'INFANT_ALIVE_AT_REPORT', 
    'BIRTH_PLACE', 
    'MOTHER_AGE_YEARS', 
    'FATHER_COMBINED_AGE', 
    'CIG_BEFORE', 
    'CIG_1_TRI', 
    'CIG_2_TRI', 
    'CIG_3_TRI', 
    'MOTHER_HEIGHT_IN', 
    'MOTHER_PRE_WEIGHT', 
    'MOTHER_DELIVERY_WEIGHT', 
    'MOTHER_WEIGHT_GAIN', 
    'DIABETES_PRE', 
    'DIABETES_GEST', 
    'HYP_TENS_PRE', 
    'HYP_TENS_GEST', 
    'PREV_BIRTH_PRETERM'
]

In [7]:
births_trimmed = births.select(selected_features)

 - There's a small problem with how the number of cigarettes smoked by the mother was coded. 
  - as 0 means the mother smoked no cigarettes before of during the pregnancy.
  - between 1-97 states the actual number of cigarette smoked
  - 98 indicates either 98 or more, whereas 99 identifies the unknown =>  we will assume the unknown is 0 and recode accordingly
 - Next we will specify our recoding methods.

In [8]:
import pyspark.sql.functions as func

In [9]:
def recode(col,key):
    return recode_dictionary[key][col]

def correct_cig(feat):
    return func.when(func.col(feat)!=99, func.col(feat)).otherwise(0) # 99면 0으로 출력하고 99가 아니면 그대로 출력.

In [10]:
rec_integer = func.udf(recode, typ.IntegerType())

In [11]:
birth_transformed = births_trimmed \
                .withColumn('CIG_BEFORE', correct_cig('CIG_BEFORE')) \
                .withColumn('CIG_1_TRI', correct_cig('CIG_1_TRI')) \
                .withColumn('CIG_2_TRI', correct_cig('CIG_2_TRI')) \
                .withColumn('CIG_3_TRI', correct_cig('CIG_3_TRI')) \

 - withColumn(...) Method takes the name of the Column as its first parameter and the transformation as the second one.

In [12]:
for col in births_trimmed.schema:
    print(col)

StructField(INFANT_ALIVE_AT_REPORT,StringType,true)
StructField(BIRTH_PLACE,StringType,true)
StructField(MOTHER_AGE_YEARS,IntegerType,true)
StructField(FATHER_COMBINED_AGE,IntegerType,true)
StructField(CIG_BEFORE,IntegerType,true)
StructField(CIG_1_TRI,IntegerType,true)
StructField(CIG_2_TRI,IntegerType,true)
StructField(CIG_3_TRI,IntegerType,true)
StructField(MOTHER_HEIGHT_IN,IntegerType,true)
StructField(MOTHER_PRE_WEIGHT,IntegerType,true)
StructField(MOTHER_DELIVERY_WEIGHT,IntegerType,true)
StructField(MOTHER_WEIGHT_GAIN,IntegerType,true)
StructField(DIABETES_PRE,StringType,true)
StructField(DIABETES_GEST,StringType,true)
StructField(HYP_TENS_PRE,StringType,true)
StructField(HYP_TENS_GEST,StringType,true)
StructField(PREV_BIRTH_PRETERM,StringType,true)


In [13]:
cols = [(col.name, col.dataType) for col in births_trimmed.schema]
cols

[('INFANT_ALIVE_AT_REPORT', StringType),
 ('BIRTH_PLACE', StringType),
 ('MOTHER_AGE_YEARS', IntegerType),
 ('FATHER_COMBINED_AGE', IntegerType),
 ('CIG_BEFORE', IntegerType),
 ('CIG_1_TRI', IntegerType),
 ('CIG_2_TRI', IntegerType),
 ('CIG_3_TRI', IntegerType),
 ('MOTHER_HEIGHT_IN', IntegerType),
 ('MOTHER_PRE_WEIGHT', IntegerType),
 ('MOTHER_DELIVERY_WEIGHT', IntegerType),
 ('MOTHER_WEIGHT_GAIN', IntegerType),
 ('DIABETES_PRE', StringType),
 ('DIABETES_GEST', StringType),
 ('HYP_TENS_PRE', StringType),
 ('HYP_TENS_GEST', StringType),
 ('PREV_BIRTH_PRETERM', StringType)]

In [14]:
for i, s in enumerate(cols):
    print(i,s,s[1])

0 ('INFANT_ALIVE_AT_REPORT', StringType) StringType
1 ('BIRTH_PLACE', StringType) StringType
2 ('MOTHER_AGE_YEARS', IntegerType) IntegerType
3 ('FATHER_COMBINED_AGE', IntegerType) IntegerType
4 ('CIG_BEFORE', IntegerType) IntegerType
5 ('CIG_1_TRI', IntegerType) IntegerType
6 ('CIG_2_TRI', IntegerType) IntegerType
7 ('CIG_3_TRI', IntegerType) IntegerType
8 ('MOTHER_HEIGHT_IN', IntegerType) IntegerType
9 ('MOTHER_PRE_WEIGHT', IntegerType) IntegerType
10 ('MOTHER_DELIVERY_WEIGHT', IntegerType) IntegerType
11 ('MOTHER_WEIGHT_GAIN', IntegerType) IntegerType
12 ('DIABETES_PRE', StringType) StringType
13 ('DIABETES_GEST', StringType) StringType
14 ('HYP_TENS_PRE', StringType) StringType
15 ('HYP_TENS_GEST', StringType) StringType
16 ('PREV_BIRTH_PRETERM', StringType) StringType


In [15]:
for i, s in enumerate(cols):
    if s[1] == typ.StringType():
        print(births.select(s[0]).distinct().rdd.collect()) 

[Row(INFANT_ALIVE_AT_REPORT='Y'), Row(INFANT_ALIVE_AT_REPORT='N')]
[Row(BIRTH_PLACE='7'), Row(BIRTH_PLACE='3'), Row(BIRTH_PLACE='5'), Row(BIRTH_PLACE='6'), Row(BIRTH_PLACE='9'), Row(BIRTH_PLACE='1'), Row(BIRTH_PLACE='4'), Row(BIRTH_PLACE='2')]
[Row(DIABETES_PRE='Y'), Row(DIABETES_PRE='U'), Row(DIABETES_PRE='N')]
[Row(DIABETES_GEST='Y'), Row(DIABETES_GEST='U'), Row(DIABETES_GEST='N')]
[Row(HYP_TENS_PRE='Y'), Row(HYP_TENS_PRE='U'), Row(HYP_TENS_PRE='N')]
[Row(HYP_TENS_GEST='Y'), Row(HYP_TENS_GEST='U'), Row(HYP_TENS_GEST='N')]
[Row(PREV_BIRTH_PRETERM='Y'), Row(PREV_BIRTH_PRETERM='U'), Row(PREV_BIRTH_PRETERM='N')]


 - births.select(s[0]) : 컬럼명으로 데이터 가지고오기. 
 - distinct : 중복되지 않는 값만 가지고 오기. 
 - 해당 결과값이 DataFrame형태로 되어있어 RDD로 변형이 필요하다. .RDD
 - 각 해당되는 Column들의 데이터를 추출 map(row:row[0]).collect()

In [16]:
for i, s in enumerate(cols):
    if s[1] == typ.StringType():
        print(births.select(s[0]).distinct().rdd.map(lambda row:row[0]).collect())

['Y', 'N']
['7', '3', '5', '6', '9', '1', '4', '2']
['Y', 'U', 'N']
['Y', 'U', 'N']
['Y', 'U', 'N']
['Y', 'U', 'N']
['Y', 'U', 'N']


In [17]:
YNU_cols = []
for i, s in enumerate(cols):
    if s[1] == typ.StringType():
        dis = births.select(s[0]).distinct().rdd.map(lambda row:row[0]).collect()
        if 'Y' in dis: # 출력된 값들 중 Y가 포함된다면 해당 값들을 LIST에 Column name을 Append
            YNU_cols.append(s[0])

In [18]:
YNU_cols

['INFANT_ALIVE_AT_REPORT',
 'DIABETES_PRE',
 'DIABETES_GEST',
 'HYP_TENS_PRE',
 'HYP_TENS_GEST',
 'PREV_BIRTH_PRETERM']

In [19]:
births.select([
        'INFANT_NICU_ADMISSION', 
        rec_integer(
            'INFANT_NICU_ADMISSION', func.lit('YNU')
        ) \
        .alias('INFANT_NICU_ADMISSION_RECODE')]
     ).take(5) # Y일 경우 1 나머지는 모두 0으로 변환. 


[Row(INFANT_NICU_ADMISSION='Y', INFANT_NICU_ADMISSION_RECODE=1),
 Row(INFANT_NICU_ADMISSION='Y', INFANT_NICU_ADMISSION_RECODE=1),
 Row(INFANT_NICU_ADMISSION='U', INFANT_NICU_ADMISSION_RECODE=0),
 Row(INFANT_NICU_ADMISSION='N', INFANT_NICU_ADMISSION_RECODE=0),
 Row(INFANT_NICU_ADMISSION='U', INFANT_NICU_ADMISSION_RECODE=0)]

In [20]:
exprs_YNU = [
    rec_integer(x, func.lit('YNU')). alias(x)
    if x in YNU_cols else x for x in births_trimmed.columns
]

birth_transformed = birth_transformed.select(exprs_YNU)

 - Let's Check if we got it Collectly

In [21]:
birth_transformed.select(YNU_cols[-5:]).show(5)

+------------+-------------+------------+-------------+------------------+
|DIABETES_PRE|DIABETES_GEST|HYP_TENS_PRE|HYP_TENS_GEST|PREV_BIRTH_PRETERM|
+------------+-------------+------------+-------------+------------------+
|           0|            0|           0|            0|                 0|
|           0|            0|           0|            0|                 0|
|           0|            0|           0|            0|                 0|
|           0|            0|           0|            0|                 1|
|           0|            0|           0|            0|                 0|
+------------+-------------+------------+-------------+------------------+
only showing top 5 rows



# 5.2 Getting to know your data
## 5.2.1 Descriptive statistics 
 - DataFrames exose the .describe method
 - with LMlib, we will use the .colStats(...) method 
 - RDD of data to calculate the descripticve statistics of and return a MultivariateStatisticalSummary object that contains the following descriptive statistics
  - count()
  - max()
  - mean()
  - min()
  - normL1() : value of the L1-Norm for the values in the column
  - normL2() : value of the L2-Norm for the values in the column
  - numNonZeros() : nunmber of nonzero values in the column
  - variance()

In [24]:
import pyspark.mllib.stat as st
import numpy as np 

In [25]:
numeric_cols = ['MOTHER_AGE_YEARS','FATHER_COMBINED_AGE',
                'CIG_BEFORE','CIG_1_TRI','CIG_2_TRI','CIG_3_TRI',
                'MOTHER_HEIGHT_IN','MOTHER_PRE_WEIGHT',
                'MOTHER_DELIVERY_WEIGHT','MOTHER_WEIGHT_GAIN'
               ]

In [26]:
numeric_rdd = birth_transformed.select(numeric_cols).rdd.map(lambda row: [e for e in row])

In [27]:
numeric_rdd.take(2)

[[29, 99, 0, 0, 0, 0, 99, 999, 999, 99],
 [22, 29, 0, 0, 0, 0, 65, 180, 198, 18]]

In [28]:
mllib_stats = st.Statistics.colStats(numeric_rdd)

 - 해당 숫자형 변수들의 평균과 표준편차를 출력
  - variance를 출력하여 sqrt를 씌워줬다.

In [29]:
for col, m, v in zip(numeric_cols,mllib_stats.mean(), mllib_stats.variance()):
    print('{0} : \t {1:.2f} \t {2:.2f}'.format(col, m, np.sqrt(v)))

MOTHER_AGE_YEARS : 	 28.30 	 6.08
FATHER_COMBINED_AGE : 	 44.55 	 27.55
CIG_BEFORE : 	 1.43 	 5.18
CIG_1_TRI : 	 0.91 	 3.83
CIG_2_TRI : 	 0.70 	 3.31
CIG_3_TRI : 	 0.58 	 3.11
MOTHER_HEIGHT_IN : 	 65.12 	 6.45
MOTHER_PRE_WEIGHT : 	 214.50 	 210.21
MOTHER_DELIVERY_WEIGHT : 	 223.63 	 180.01
MOTHER_WEIGHT_GAIN : 	 30.74 	 26.23


 - For the categorical variables, we will calculate the frequencies of their values

In [30]:
categorical_cols = [e for e in birth_transformed.columns if e not in numeric_cols]
categorical_cols # Categorical Columns

['INFANT_ALIVE_AT_REPORT',
 'BIRTH_PLACE',
 'DIABETES_PRE',
 'DIABETES_GEST',
 'HYP_TENS_PRE',
 'HYP_TENS_GEST',
 'PREV_BIRTH_PRETERM']

In [35]:
birth_transformed.select(categorical_cols).rdd.take(5)

[Row(INFANT_ALIVE_AT_REPORT=0, BIRTH_PLACE='1', DIABETES_PRE=0, DIABETES_GEST=0, HYP_TENS_PRE=0, HYP_TENS_GEST=0, PREV_BIRTH_PRETERM=0),
 Row(INFANT_ALIVE_AT_REPORT=0, BIRTH_PLACE='1', DIABETES_PRE=0, DIABETES_GEST=0, HYP_TENS_PRE=0, HYP_TENS_GEST=0, PREV_BIRTH_PRETERM=0),
 Row(INFANT_ALIVE_AT_REPORT=0, BIRTH_PLACE='1', DIABETES_PRE=0, DIABETES_GEST=0, HYP_TENS_PRE=0, HYP_TENS_GEST=0, PREV_BIRTH_PRETERM=0),
 Row(INFANT_ALIVE_AT_REPORT=0, BIRTH_PLACE='1', DIABETES_PRE=0, DIABETES_GEST=0, HYP_TENS_PRE=0, HYP_TENS_GEST=0, PREV_BIRTH_PRETERM=1),
 Row(INFANT_ALIVE_AT_REPORT=0, BIRTH_PLACE='1', DIABETES_PRE=0, DIABETES_GEST=0, HYP_TENS_PRE=0, HYP_TENS_GEST=0, PREV_BIRTH_PRETERM=0)]

 - 각각의 Columns에서 값을 출력해서 가지고 있는다. list형태로 출력.

In [42]:
categorical_rdd = birth_transformed.select(categorical_cols).rdd.map(lambda row: [e for e in row])

In [43]:
categorical_rdd.take(5) 

[[0, '1', 0, 0, 0, 0, 0],
 [0, '1', 0, 0, 0, 0, 0],
 [0, '1', 0, 0, 0, 0, 0],
 [0, '1', 0, 0, 0, 0, 1],
 [0, '1', 0, 0, 0, 0, 0]]

In [47]:
for i, col in enumerate(categorical_cols):
    print(i, col)

0 INFANT_ALIVE_AT_REPORT
1 BIRTH_PLACE
2 DIABETES_PRE
3 DIABETES_GEST
4 HYP_TENS_PRE
5 HYP_TENS_GEST
6 PREV_BIRTH_PRETERM


In [52]:
for i, col in enumerate(categorical_cols):
    agg = categorical_rdd.groupBy(lambda row:row[i]).map(lambda row:(row[0],len(row[1])))
    print(col,"\n",sorted(agg.collect(),key=lambda el:el[1],reverse=True))

INFANT_ALIVE_AT_REPORT 
 [(1, 23349), (0, 22080)]
BIRTH_PLACE 
 [('1', 44558), ('4', 327), ('3', 224), ('2', 136), ('7', 91), ('5', 74), ('6', 11), ('9', 8)]
DIABETES_PRE 
 [(0, 44881), (1, 548)]
DIABETES_GEST 
 [(0, 43451), (1, 1978)]
HYP_TENS_PRE 
 [(0, 44348), (1, 1081)]
HYP_TENS_GEST 
 [(0, 43302), (1, 2127)]
PREV_BIRTH_PRETERM 
 [(0, 43088), (1, 2341)]


## 5.2.2. Correlations
 - Correlation between our features

In [53]:
corrs = st.Statistics.corr(numeric_rdd)

In [87]:
for i, el in enumerate(corrs>0.5): # 각 Correlation Matrix에서 0.5이상인 필드만 선택.
    #print(i, el)
        print(i, el)
        #for j,e in enumerate(el): 
            #print(j, e)
            #if j != i and e == 1.0:
                #print(numeric_cols[j], corrs[i][j])

0 [ True False False False False False False False False False]
1 [False  True False False False False False False False False]
2 [False False  True  True  True  True False False False False]
3 [False False  True  True  True  True False False False False]
4 [False False  True  True  True  True False False False False]
5 [False False  True  True  True  True False False False False]
6 [False False False False False False  True False False False]
7 [False False False False False False False  True  True  True]
8 [False False False False False False False  True  True  True]
9 [False False False False False False False  True  True  True]


In [56]:
for i, el in enumerate(corrs > 0.5): # 각 Correlation Matrix에서 0.5이상인 필드만 선택.
    correlated = [
        (numeric_cols[j], corrs[i][j])
        for j,e in enumerate(el)
        if e == 1.0 and j != i] # 0.5 이상 필드가 있는 곳에서 해당 부분이 True인 곳.
    if len(correlated) > 0:
        for e in correlated:
            print('{0}-to-{1}: {2:.2f}'.format(numeric_cols[i], e[0], e[1]))
    

CIG_BEFORE-to-CIG_1_TRI: 0.83
CIG_BEFORE-to-CIG_2_TRI: 0.72
CIG_BEFORE-to-CIG_3_TRI: 0.62
CIG_1_TRI-to-CIG_BEFORE: 0.83
CIG_1_TRI-to-CIG_2_TRI: 0.87
CIG_1_TRI-to-CIG_3_TRI: 0.76
CIG_2_TRI-to-CIG_BEFORE: 0.72
CIG_2_TRI-to-CIG_1_TRI: 0.87
CIG_2_TRI-to-CIG_3_TRI: 0.89
CIG_3_TRI-to-CIG_BEFORE: 0.62
CIG_3_TRI-to-CIG_1_TRI: 0.76
CIG_3_TRI-to-CIG_2_TRI: 0.89
MOTHER_PRE_WEIGHT-to-MOTHER_DELIVERY_WEIGHT: 0.54
MOTHER_PRE_WEIGHT-to-MOTHER_WEIGHT_GAIN: 0.65
MOTHER_DELIVERY_WEIGHT-to-MOTHER_PRE_WEIGHT: 0.54
MOTHER_DELIVERY_WEIGHT-to-MOTHER_WEIGHT_GAIN: 0.60
MOTHER_WEIGHT_GAIN-to-MOTHER_PRE_WEIGHT: 0.65
MOTHER_WEIGHT_GAIN-to-MOTHER_DELIVERY_WEIGHT: 0.60


### Check Highly Correlated features 
 - 'CIG...' features are highly correlated, so We can drop most of highly correlated features.
 - Since we want to predict the survival chances of an infant as soon as possible
 - Keep 'MOTHER_PRE_WEIGHT' feature 

In [88]:
features_to_keep = [
    'INFANT_ALIVE_AT_REPORT', 
    'BIRTH_PLACE', 
    'MOTHER_AGE_YEARS', 
    'FATHER_COMBINED_AGE', 
    'CIG_1_TRI', 
    'MOTHER_HEIGHT_IN', 
    'MOTHER_PRE_WEIGHT', 
    'DIABETES_PRE', 
    'DIABETES_GEST', 
    'HYP_TENS_PRE', 
    'HYP_TENS_GEST', 
    'PREV_BIRTH_PRETERM'
]

birth_transformed = birth_transformed.select([e for e in features_to_keep])

In [90]:
birth_transformed.take(2)

[Row(INFANT_ALIVE_AT_REPORT=0, BIRTH_PLACE='1', MOTHER_AGE_YEARS=29, FATHER_COMBINED_AGE=99, CIG_1_TRI=0, MOTHER_HEIGHT_IN=99, MOTHER_PRE_WEIGHT=999, DIABETES_PRE=0, DIABETES_GEST=0, HYP_TENS_PRE=0, HYP_TENS_GEST=0, PREV_BIRTH_PRETERM=0),
 Row(INFANT_ALIVE_AT_REPORT=0, BIRTH_PLACE='1', MOTHER_AGE_YEARS=22, FATHER_COMBINED_AGE=29, CIG_1_TRI=0, MOTHER_HEIGHT_IN=65, MOTHER_PRE_WEIGHT=180, DIABETES_PRE=0, DIABETES_GEST=0, HYP_TENS_PRE=0, HYP_TENS_GEST=0, PREV_BIRTH_PRETERM=0)]

### Statistical testing
 - Run a Chi-square test to determine if there are significant differences for categorical variables.
  - how you can do it using the .chiSqTest(...) method of MLlib

In [91]:
import pyspark.mllib.linalg as ln

In [93]:
categorical_cols

['INFANT_ALIVE_AT_REPORT',
 'BIRTH_PLACE',
 'DIABETES_PRE',
 'DIABETES_GEST',
 'HYP_TENS_PRE',
 'HYP_TENS_GEST',
 'PREV_BIRTH_PRETERM']

 - INFANT_ALIVE_AT_REPORT 의 값은 0 또는 1 이다.
  - groupBy을 하게 된다면 0과 1로 각각 Groupby될 것이다.

In [103]:
birth_transformed.select(categorical_cols[0]).rdd.distinct().collect()

[Row(INFANT_ALIVE_AT_REPORT=0), Row(INFANT_ALIVE_AT_REPORT=1)]

In [98]:
for cat in categorical_cols[1:]:
    print(cat)
    agg = birth_transformed\
        .groupBy(categorical_cols[0])\
        .pivot(cat)\
        .count()
    print(agg.collect())

BIRTH_PLACE
[Row(INFANT_ALIVE_AT_REPORT=1, 1=22995, 2=113, 3=158, 4=39, 5=19, 6=2, 7=23, 9=None), Row(INFANT_ALIVE_AT_REPORT=0, 1=21563, 2=23, 3=66, 4=288, 5=55, 6=9, 7=68, 9=8)]
DIABETES_PRE
[Row(INFANT_ALIVE_AT_REPORT=1, 0=23178, 1=171), Row(INFANT_ALIVE_AT_REPORT=0, 0=21703, 1=377)]
DIABETES_GEST
[Row(INFANT_ALIVE_AT_REPORT=1, 0=22014, 1=1335), Row(INFANT_ALIVE_AT_REPORT=0, 0=21437, 1=643)]
HYP_TENS_PRE
[Row(INFANT_ALIVE_AT_REPORT=1, 0=22918, 1=431), Row(INFANT_ALIVE_AT_REPORT=0, 0=21430, 1=650)]
HYP_TENS_GEST
[Row(INFANT_ALIVE_AT_REPORT=1, 0=22135, 1=1214), Row(INFANT_ALIVE_AT_REPORT=0, 0=21167, 1=913)]
PREV_BIRTH_PRETERM
[Row(INFANT_ALIVE_AT_REPORT=1, 0=22685, 1=664), Row(INFANT_ALIVE_AT_REPORT=0, 0=20403, 1=1677)]


In [102]:
for cat in categorical_cols[1:]:
    
    agg = birth_transformed\
        .groupBy(categorical_cols[0])\
        .pivot(cat)\
        .count()
    print(agg.rdd.map(lambda row : (row[1:])).collect())
    # 각각 Tuple의 List로 구성된 것을 펼쳐 주기 위해서 flatMap을 사용.

[(22995, 113, 158, 39, 19, 2, 23, None), (21563, 23, 66, 288, 55, 9, 68, 8)]
[(23178, 171), (21703, 377)]
[(22014, 1335), (21437, 643)]
[(22918, 431), (21430, 650)]
[(22135, 1214), (21167, 913)]
[(22685, 664), (20403, 1677)]


In [99]:
for cat in categorical_cols[1:]:
    
    agg = birth_transformed\
        .groupBy(categorical_cols[0])\
        .pivot(cat)\
        .count()
    
    agg_rdd = agg.rdd.map(lambda row: (row[1:]))\
            .flatMap(lambda row: [0 if e == None else e for e in row])\
            .collect()
            
    print(agg_rdd)

[22995, 113, 158, 39, 19, 2, 23, 0, 21563, 23, 66, 288, 55, 9, 68, 8]
[23178, 171, 21703, 377]
[22014, 1335, 21437, 643]
[22918, 431, 21430, 650]
[22135, 1214, 21167, 913]
[22685, 664, 20403, 1677]


 - we transform them into an RDD, so we can then convert them into a matrix using the pyspark.mllib.linalg as ln
 - $.Matrices.dense(...)$ specifies the number of rows in the matrix

In [104]:
print(ln.Matrices.dense(3,2,[1,2,3,4,5,6])) # 3행 x 2열 행렬로 변환

DenseMatrix([[ 1.,  4.],
             [ 2.,  5.],
             [ 3.,  6.]])


In [105]:
for cat in categorical_cols[1:]:
    # pivot categorical variables by the 'INFANT_ALIVE_AT_REPORT' to get Count()
    agg = birth_transformed\
        .groupBy(categorical_cols[0])\
        .pivot(cat)\
        .count() 
    # pivot 을 통해서 Count()한 값들을 기준이 된 INFANT값을 제외하고 가지고온다.
    # 가지고 올때 값이 None인 부분을 0으로 변환하여 전부 펼처서 출력.
    agg_rdd = agg.rdd.map(lambda row: (row[1:]))\
            .flatMap(lambda row: [0 if e == None else e for e in row])\
            .collect()
            
    row_length = len(agg.collect()[0]) - 1 # 데이터의 크기.
    agg = ln.Matrices.dense(row_length, 2, agg_rdd)
    print(agg)
    test = st.Statistics.chiSqTest(agg)
    print(cat, round(test.pValue, 4))

DenseMatrix([[  2.29950000e+04,   2.15630000e+04],
             [  1.13000000e+02,   2.30000000e+01],
             [  1.58000000e+02,   6.60000000e+01],
             [  3.90000000e+01,   2.88000000e+02],
             [  1.90000000e+01,   5.50000000e+01],
             [  2.00000000e+00,   9.00000000e+00],
             [  2.30000000e+01,   6.80000000e+01],
             [  0.00000000e+00,   8.00000000e+00]])
BIRTH_PLACE 0.0
DenseMatrix([[ 23178.,  21703.],
             [   171.,    377.]])
DIABETES_PRE 0.0
DenseMatrix([[ 22014.,  21437.],
             [  1335.,    643.]])
DIABETES_GEST 0.0
DenseMatrix([[ 22918.,  21430.],
             [   431.,    650.]])
HYP_TENS_PRE 0.0
DenseMatrix([[ 22135.,  21167.],
             [  1214.,    913.]])
HYP_TENS_GEST 0.0
DenseMatrix([[ 22685.,  20403.],
             [   664.,   1677.]])
PREV_BIRTH_PRETERM 0.0


### Our tests reveal that all the features should be significantly different and should help us predict the chance of survival of an infant.

### 5.2.3 Creating the final dataset
 - convert our DataFrame into an RDD of LabeledPoints .
 - A LabeledPoint is a MLlib structure that is used to train the machine learning models. It consists of two attributes: label and features .

### 5.2.4 Creating an RDD of LabeledPoints
 - we first need to deal with one final obstacle: our 'BIRTH_PLACE' feature is still a string.
 - p.90

In [106]:
import pyspark.mllib.feature as ft
import pyspark.mllib.regression as reg

In [113]:
hashing = ft.HashingTF(7)
births_hashed = birth_transformed.rdd.map(lambda row: [
    list(hashing.transform(row[1]).toArray()) if col == 'BIRTH_PLACE'
    else row[i]
    for i, col in enumerate(features_to_keep)])\
     .map(lambda row: [[e] if type(e) == int else e for e in row])\
     .map(lambda row: [item for sublist in row for item in sublist])\
     .map(lambda row: reg.LabeledPoint(
         row[0],ln.Vectors.dense(row[1:])))

### Split into training and testing

In [114]:
births_train, births_test = births_hashed.randomSplit([0.6,0.4])

### Predicting infant survival
#### Logistic regression in Spark
MLLib used to provide a logistic regression model estimated using a stochastic gradient descent (SGD) algorithm. This model has been deprecated in Spark 2.0 in favor of the LogisticRegressionWithLBFGS model.

In [115]:
from pyspark.mllib.classification import LogisticRegressionWithLBFGS

In [117]:
LR_Model = LogisticRegressionWithLBFGS.train(births_train, iterations=10)

In [120]:
LR_results = (
    births_test.map(lambda row:row.label)\
    .zip(LR_Model.predict(births_test.map(lambda row: row.features)))
).map(lambda row:(row[0],row[1] * 1.0))

Let's check how well or how bad our model performed.

In [121]:
import pyspark.mllib.evaluation as ev
LR_evaluation = ev.BinaryClassificationMetrics(LR_results)

print('Area under PR: {0:.2f}' \
      .format(LR_evaluation.areaUnderPR))
print('Area under ROC: {0:.2f}' \
      .format(LR_evaluation.areaUnderROC))
LR_evaluation.unpersist()

Area under PR: 0.85
Area under ROC: 0.63


### Selecting only the most predictable features
MLLib allows us to select the most predictable features using a Chi-Square selector.

In [122]:
selector = ft.ChiSqSelector(4).fit(births_train)

topFeatures_train = (
        births_train.map(lambda row: row.label) \
        .zip(selector \
             .transform(births_train \
                        .map(lambda row: row.features)))
    ).map(lambda row: reg.LabeledPoint(row[0], row[1]))

topFeatures_test = (
        births_test.map(lambda row: row.label) \
        .zip(selector \
             .transform(births_test \
                        .map(lambda row: row.features)))
    ).map(lambda row: reg.LabeledPoint(row[0], row[1]))

### Random Forest in Spark

In [123]:
from pyspark.mllib.tree import RandomForest

In [124]:
RF_model = RandomForest \
    .trainClassifier(data=topFeatures_train, 
                     numClasses=2, 
                     categoricalFeaturesInfo={}, 
                     numTrees=6,  
                     featureSubsetStrategy='all',
                     seed=666)

In [125]:
RF_results = (
        topFeatures_test.map(lambda row: row.label) \
        .zip(RF_model \
             .predict(topFeatures_test \
                      .map(lambda row: row.features)))
    )

RF_evaluation = ev.BinaryClassificationMetrics(RF_results)

print('Area under PR: {0:.2f}' \
      .format(RF_evaluation.areaUnderPR))
print('Area under ROC: {0:.2f}' \
      .format(RF_evaluation.areaUnderROC))
RF_evaluation.unpersist()

Area under PR: 0.83
Area under ROC: 0.62


Let's see how the logistic regression would perform with reduced number of features.

In [126]:
LR_Model_2 = LogisticRegressionWithLBFGS \
    .train(topFeatures_train, iterations=10)

LR_results_2 = (
        topFeatures_test.map(lambda row: row.label) \
        .zip(LR_Model_2 \
             .predict(topFeatures_test \
                      .map(lambda row: row.features)))
    ).map(lambda row: (row[0], row[1] * 1.0))

LR_evaluation_2 = ev.BinaryClassificationMetrics(LR_results_2)

print('Area under PR: {0:.2f}' \
      .format(LR_evaluation_2.areaUnderPR))
print('Area under ROC: {0:.2f}' \
      .format(LR_evaluation_2.areaUnderROC))
LR_evaluation_2.unpersist()

Area under PR: 0.87
Area under ROC: 0.62
