In [71]:
import pandas as pd

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [2]:
filename = "bank-full.csv"
target_variable_name = "y"

In [3]:
spark = SparkSession.builder.getOrCreate()

In [4]:
df = spark.read.option('header',True).option('inferSchema', True).csv('/home/phongbui/Desktop/Applied DS with pyspark/applied-data-science-using-pyspark/Ch04/bank-full.csv', sep = ';')

In [5]:
df.show()

+---+------------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+
|age|         job| marital|education|default|balance|housing|loan|contact|day|month|duration|campaign|pdays|previous|poutcome|  y|
+---+------------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+
| 58|  management| married| tertiary|     no|   2143|    yes|  no|unknown|  5|  may|     261|       1|   -1|       0| unknown| no|
| 44|  technician|  single|secondary|     no|     29|    yes|  no|unknown|  5|  may|     151|       1|   -1|       0| unknown| no|
| 33|entrepreneur| married|secondary|     no|      2|    yes| yes|unknown|  5|  may|      76|       1|   -1|       0| unknown| no|
| 47| blue-collar| married|  unknown|     no|   1506|    yes|  no|unknown|  5|  may|      92|       1|   -1|       0| unknown| no|
| 33|     unknown|  single|  unknown|     no|      1|     no|  no|unknown|  5|  may

In [6]:
df.count()

45211

In [7]:
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- education: string (nullable = true)
 |-- default: string (nullable = true)
 |-- balance: integer (nullable = true)
 |-- housing: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- contact: string (nullable = true)
 |-- day: integer (nullable = true)
 |-- month: string (nullable = true)
 |-- duration: integer (nullable = true)
 |-- campaign: integer (nullable = true)
 |-- pdays: integer (nullable = true)
 |-- previous: integer (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- y: string (nullable = true)



In [8]:
df.describe().show()

+-------+------------------+-------+--------+---------+-------+------------------+-------+-----+--------+-----------------+-----+-----------------+------------------+------------------+------------------+--------+-----+
|summary|               age|    job| marital|education|default|           balance|housing| loan| contact|              day|month|         duration|          campaign|             pdays|          previous|poutcome|    y|
+-------+------------------+-------+--------+---------+-------+------------------+-------+-----+--------+-----------------+-----+-----------------+------------------+------------------+------------------+--------+-----+
|  count|             45211|  45211|   45211|    45211|  45211|             45211|  45211|45211|   45211|            45211|45211|            45211|             45211|             45211|             45211|   45211|45211|
|   mean| 40.93621021432837|   null|    null|     null|   null|1362.2720576850766|   null| null|    null|15.806418791886

# Cardinality Check

In [9]:
from pyspark.sql.functions import approxCountDistinct, countDistinct

"""
Note: approxCountDistinct and countDistinct can be used interchangeably.
Only difference is the computation time.
"approxCountDistinct" is useful for large datasets
"countDistinct" for small and medium datasets.
"""

'\nNote: approxCountDistinct and countDistinct can be used interchangeably.\nOnly difference is the computation time.\n"approxCountDistinct" is useful for large datasets\n"countDistinct" for small and medium datasets.\n'

In [10]:
df.select(*[approxCountDistinct(_).alias(_) for _ in df.columns]).show() # Unpack lists values with *

+---+---+-------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+
|age|job|marital|education|default|balance|housing|loan|contact|day|month|duration|campaign|pdays|previous|poutcome|  y|
+---+---+-------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+
| 76| 11|      3|        4|      2|   7375|      2|   2|      3| 32|   12|    1605|      47|  547|      42|       4|  2|
+---+---+-------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+



In [11]:
df.select(*[countDistinct(_).alias(_) for _ in df.columns]).show() # Unpack lists values with *

+---+---+-------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+
|age|job|marital|education|default|balance|housing|loan|contact|day|month|duration|campaign|pdays|previous|poutcome|  y|
+---+---+-------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+
| 77| 12|      3|        4|      2|   7168|      2|   2|      3| 31|   12|    1573|      48|  559|      41|       4|  2|
+---+---+-------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+



# Step 1: Identify Variable Types

In [12]:
df.dtypes

[('age', 'int'),
 ('job', 'string'),
 ('marital', 'string'),
 ('education', 'string'),
 ('default', 'string'),
 ('balance', 'int'),
 ('housing', 'string'),
 ('loan', 'string'),
 ('contact', 'string'),
 ('day', 'int'),
 ('month', 'string'),
 ('duration', 'int'),
 ('campaign', 'int'),
 ('pdays', 'int'),
 ('previous', 'int'),
 ('poutcome', 'string'),
 ('y', 'string')]

In [13]:
char_vars = [_[0] for _ in df.dtypes if _[1] == 'string']

In [14]:
char_vars

['job',
 'marital',
 'education',
 'default',
 'housing',
 'loan',
 'contact',
 'month',
 'poutcome',
 'y']

In [15]:
num_vars = [_[0] for _ in df.dtypes if _[1] == 'int']

In [16]:
num_vars

['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

# Step 2: Apply StringIndexer to Character Columns

In [17]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline

### Manual Way

In [18]:
indexers = [StringIndexer(inputCol = _, outputCol = _ + "_index", handleInvalid = "keep") for _ in char_vars]

In [19]:
indexers

[StringIndexer_a7c577bd0619,
 StringIndexer_6e3db372b118,
 StringIndexer_64128460bcc7,
 StringIndexer_ba29c4f066f6,
 StringIndexer_035a07185d1b,
 StringIndexer_d509a06c5214,
 StringIndexer_615d9c7a5284,
 StringIndexer_eabf526dc3ed,
 StringIndexer_5c0e34681840,
 StringIndexer_14840d80d8ce]

In [20]:
pipeline = Pipeline(stages = indexers) # Pipeline will rin StringIndexer operations sequentially (Will run the list indexers sequetially)

In [21]:
fit = pipeline.fit(df)

In [22]:
transformed_df = fit.transform(df)

In [23]:
transformed_df.show(10)

+---+------------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+---------+-------------+---------------+-------------+-------------+----------+-------------+-----------+--------------+-------+
|age|         job| marital|education|default|balance|housing|loan|contact|day|month|duration|campaign|pdays|previous|poutcome|  y|job_index|marital_index|education_index|default_index|housing_index|loan_index|contact_index|month_index|poutcome_index|y_index|
+---+------------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+---------+-------------+---------------+-------------+-------------+----------+-------------+-----------+--------------+-------+
| 58|  management| married| tertiary|     no|   2143|    yes|  no|unknown|  5|  may|     261|       1|   -1|       0| unknown| no|      1.0|          0.0|            1.0|          0.0|          0.0|       0.0|          1.0|

### Function Way

In [24]:
#converts each category column to index
def category_to_index(df, char_vars):
    
    char_df = df.select(char_vars)
    indexers = [StringIndexer(inputCol = c, outputCol = c + "_index", handleInvalid = "keep") for c in char_df.columns]
    pipeline = Pipeline(stages = indexers)
    fitting = pipeline.fit(char_df)
    df = fitting.transform(df)
    return df

In [25]:
#apply category_to_index function on our DataFrame
df = category_to_index(df, char_vars)

In [26]:
df.show(5, False)

+---+------------+-------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+---------+-------------+---------------+-------------+-------------+----------+-------------+-----------+--------------+-------+
|age|job         |marital|education|default|balance|housing|loan|contact|day|month|duration|campaign|pdays|previous|poutcome|y  |job_index|marital_index|education_index|default_index|housing_index|loan_index|contact_index|month_index|poutcome_index|y_index|
+---+------------+-------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+---------+-------------+---------------+-------------+-------------+----------+-------------+-----------+--------------+-------+
|58 |management  |married|tertiary |no     |2143   |yes    |no  |unknown|5  |may  |261     |1       |-1   |0       |unknown |no |1.0      |0.0          |1.0            |0.0          |0.0          |0.0       |1.0          |0.0 

### Renaming and Replacing Columns

In [27]:
char_vars

['job',
 'marital',
 'education',
 'default',
 'housing',
 'loan',
 'contact',
 'month',
 'poutcome',
 'y']

In [28]:
df = df.select([c for c in df.columns if c not in char_vars]) ### Replace the Categorical Columns with Number Columns (StringIndexer)

In [29]:
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- balance: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- duration: integer (nullable = true)
 |-- campaign: integer (nullable = true)
 |-- pdays: integer (nullable = true)
 |-- previous: integer (nullable = true)
 |-- job_index: double (nullable = false)
 |-- marital_index: double (nullable = false)
 |-- education_index: double (nullable = false)
 |-- default_index: double (nullable = false)
 |-- housing_index: double (nullable = false)
 |-- loan_index: double (nullable = false)
 |-- contact_index: double (nullable = false)
 |-- month_index: double (nullable = false)
 |-- poutcome_index: double (nullable = false)
 |-- y_index: double (nullable = false)



In [30]:
for _ in df.columns:
    if _.endswith('_index'):
        df = df.withColumnRenamed(_, _[0:-6])
        
df.columns

['age',
 'balance',
 'day',
 'duration',
 'campaign',
 'pdays',
 'previous',
 'job',
 'marital',
 'education',
 'default',
 'housing',
 'loan',
 'contact',
 'month',
 'poutcome',
 'y']

In [31]:
df.show(10)

+---+-------+---+--------+--------+-----+--------+----+-------+---------+-------+-------+----+-------+-----+--------+---+
|age|balance|day|duration|campaign|pdays|previous| job|marital|education|default|housing|loan|contact|month|poutcome|  y|
+---+-------+---+--------+--------+-----+--------+----+-------+---------+-------+-------+----+-------+-----+--------+---+
| 58|   2143|  5|     261|       1|   -1|       0| 1.0|    0.0|      1.0|    0.0|    0.0| 0.0|    1.0|  0.0|     0.0|0.0|
| 44|     29|  5|     151|       1|   -1|       0| 2.0|    1.0|      0.0|    0.0|    0.0| 0.0|    1.0|  0.0|     0.0|0.0|
| 33|      2|  5|      76|       1|   -1|       0| 7.0|    0.0|      0.0|    0.0|    0.0| 1.0|    1.0|  0.0|     0.0|0.0|
| 47|   1506|  5|      92|       1|   -1|       0| 0.0|    0.0|      3.0|    0.0|    0.0| 0.0|    1.0|  0.0|     0.0|0.0|
| 33|      1|  5|     198|       1|   -1|       0|11.0|    1.0|      3.0|    0.0|    1.0| 0.0|    1.0|  0.0|     0.0|0.0|
| 35|    231|  5|     13

# Step 3: Assemble Features

The last step is to assemble the individual variables into a single feature vector. This is useful because, instead of providing individual variables in the next steps, we can point to one variable. In addition, you can optionally scale the DataFrame using a StandardScaler or MinMaxScaler. This is accomplished using the following code:

### Manual Way

In [32]:
#assemble features into one vector
from pyspark.ml.feature import VectorAssembler

In [33]:
columns_list = df.columns

In [34]:
columns_list.remove('y')

In [35]:
columns_list

['age',
 'balance',
 'day',
 'duration',
 'campaign',
 'pdays',
 'previous',
 'job',
 'marital',
 'education',
 'default',
 'housing',
 'loan',
 'contact',
 'month',
 'poutcome']

In [36]:
assembler = VectorAssembler(inputCols = columns_list, outputCol = 'features')

In [37]:
transformed_df = assembler.transform(df)

In [38]:
transformed_df.show(10, False)

+---+-------+---+--------+--------+-----+--------+----+-------+---------+-------+-------+----+-------+-----+--------+---+-----------------------------------------------------------------------+
|age|balance|day|duration|campaign|pdays|previous|job |marital|education|default|housing|loan|contact|month|poutcome|y  |features                                                               |
+---+-------+---+--------+--------+-----+--------+----+-------+---------+-------+-------+----+-------+-----+--------+---+-----------------------------------------------------------------------+
|58 |2143   |5  |261     |1       |-1   |0       |1.0 |0.0    |1.0      |0.0    |0.0    |0.0 |1.0    |0.0  |0.0     |0.0|(16,[0,1,2,3,4,5,7,9,13],[58.0,2143.0,5.0,261.0,1.0,-1.0,1.0,1.0,1.0]) |
|44 |29     |5  |151     |1       |-1   |0       |2.0 |1.0    |0.0      |0.0    |0.0    |0.0 |1.0    |0.0  |0.0     |0.0|(16,[0,1,2,3,4,5,7,8,13],[44.0,29.0,5.0,151.0,1.0,-1.0,2.0,1.0,1.0])   |
|33 |2      |5  |76      |1   

In [39]:
transformed_df.select('features','y').show(10, False)

+-----------------------------------------------------------------------+---+
|features                                                               |y  |
+-----------------------------------------------------------------------+---+
|(16,[0,1,2,3,4,5,7,9,13],[58.0,2143.0,5.0,261.0,1.0,-1.0,1.0,1.0,1.0]) |0.0|
|(16,[0,1,2,3,4,5,7,8,13],[44.0,29.0,5.0,151.0,1.0,-1.0,2.0,1.0,1.0])   |0.0|
|(16,[0,1,2,3,4,5,7,12,13],[33.0,2.0,5.0,76.0,1.0,-1.0,7.0,1.0,1.0])    |0.0|
|(16,[0,1,2,3,4,5,9,13],[47.0,1506.0,5.0,92.0,1.0,-1.0,3.0,1.0])        |0.0|
|[33.0,1.0,5.0,198.0,1.0,-1.0,0.0,11.0,1.0,3.0,0.0,1.0,0.0,1.0,0.0,0.0] |0.0|
|(16,[0,1,2,3,4,5,7,9,13],[35.0,231.0,5.0,139.0,1.0,-1.0,1.0,1.0,1.0])  |0.0|
|[28.0,447.0,5.0,217.0,1.0,-1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0]|0.0|
|[42.0,2.0,5.0,380.0,1.0,-1.0,0.0,7.0,2.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0]  |0.0|
|(16,[0,1,2,3,4,5,7,9,13],[58.0,121.0,5.0,50.0,1.0,-1.0,5.0,2.0,1.0])   |0.0|
|(16,[0,1,2,3,4,5,7,8,13],[43.0,593.0,5.0,55.0,1.0,-1.0,2.0,1.0,

# Principal Component Analysis

In [40]:
from pyspark.ml.feature import PCA
from pyspark.ml.linalg import Vectors

In [41]:
pca = PCA(k = 3, inputCol = "features", outputCol = "pcaFeatures")

In [42]:
model = pca.fit(transformed_df)

In [43]:
model.transform(transformed_df).select("pcaFeatures").show(10,False)

+------------------------------------------------------------+
|pcaFeatures                                                 |
+------------------------------------------------------------+
|[-2143.4953647735806,-257.0420740676509,1.2449712753045807] |
|[-29.29226175164622,-150.92888640669173,1.0493026899277984] |
|[-2.150889773941845,-75.98191250436618,1.076065195538016]   |
|[-1506.1823305908113,-89.21547154672724,1.2683999536975388] |
|[-1.3750908349447704,-197.98356175494834,0.9892427009814176]|
|[-231.26679712386607,-138.56034919503722,1.0576960891394662]|
|[-447.4072782675638,-216.16541383839757,1.0066666548029033] |
|[-2.7123264447441384,-379.9785828164356,0.875150469836411]  |
|[-121.11144848214039,-49.756360259707854,1.1725853591922941]|
|[-593.1146061641073,-53.89364261832045,1.1817022732261329]  |
+------------------------------------------------------------+
only showing top 10 rows



In [44]:
model.pc.toArray()

array([[-3.41021399e-04,  2.79524640e-04,  2.58353293e-03],
       [-9.99998245e-01,  1.83654726e-03,  1.13892524e-04],
       [-1.22934480e-05,  9.79995613e-04,  7.79347982e-03],
       [-1.83671689e-03, -9.99996986e-01, -7.36955549e-04],
       [ 1.48468991e-05,  1.01391994e-03,  2.75121381e-03],
       [-1.13085547e-04,  7.49207153e-04, -9.99889046e-01],
       [-1.26153895e-05, -6.36100089e-06, -1.04654388e-02],
       [-1.78789640e-05, -4.11817349e-05,  5.51411389e-04],
       [ 6.41085932e-06, -5.23364803e-05, -1.45349520e-04],
       [-1.11185424e-05,  1.30366514e-05,  2.01500982e-04],
       [ 2.91665702e-06,  4.42643869e-06,  3.95562163e-05],
       [-1.12221341e-05,  1.26153926e-05,  6.17569266e-04],
       [ 1.01623400e-05,  1.50687571e-05,  8.23933054e-05],
       [-5.68377754e-07,  6.95393403e-05,  1.03951369e-03],
       [-7.60886236e-05, -1.16754927e-04, -3.24662847e-03],
       [-8.55162111e-06, -6.01853226e-05, -4.94522998e-03]])

In [45]:
model.explainedVariance

DenseVector([0.9918, 0.0071, 0.0011])

In [46]:
# The first principal component explains 99 percent of the variance in the data. 
# This is because we did not scale our DataFrame before we used PCA. 
# So, let us do one more run with StandardScaler before using PCA. You will notice a big change in the variance explained plot.

# Feature Scaling

In [47]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler

In [48]:
def scaled_assemble_vectors(df, features_list, target_variable_name):
    stages = []
    #assemble vectors
    assembler = VectorAssembler(inputCols = features_list, outputCol = 'assembled_features')
    scaler = StandardScaler(inputCol = assembler.getOutputCol(), outputCol = 'features2')
    stages = [assembler, scaler]
    #select all the columns + target + newly created 'features' column
    selectedCols = [target_variable_name, 'features2'] + features_list
    #use pipeline to process sequentially
    pipeline = Pipeline(stages = stages)
    #assembler model
    scaleAssembleModel = pipeline.fit(df)
    #apply assembler model on data
    df = scaleAssembleModel.transform(df).select(selectedCols)
    return df

In [49]:
features_list = df.columns
features_list.remove(target_variable_name) # Remove 'y'
transformed_df = scaled_assemble_vectors(df, features_list, target_variable_name)

In [50]:
transformed_df.show(2)

+---+--------------------+---+-------+---+--------+--------+-----+--------+---+-------+---------+-------+-------+----+-------+-----+--------+
|  y|           features2|age|balance|day|duration|campaign|pdays|previous|job|marital|education|default|housing|loan|contact|month|poutcome|
+---+--------------------+---+-------+---+--------+--------+-----+--------+---+-------+---------+-------+-------+----+-------+-----+--------+
|0.0|(16,[0,1,2,3,4,5,...| 58|   2143|  5|     261|       1|   -1|       0|1.0|    0.0|      1.0|    0.0|    0.0| 0.0|    1.0|  0.0|     0.0|
|0.0|(16,[0,1,2,3,4,5,...| 44|     29|  5|     151|       1|   -1|       0|2.0|    1.0|      0.0|    0.0|    0.0| 0.0|    1.0|  0.0|     0.0|
+---+--------------------+---+-------+---+--------+--------+-----+--------+---+-------+---------+-------+-------+----+-------+-----+--------+
only showing top 2 rows



# ChiSq Selector

In [51]:
from pyspark.ml.feature import ChiSqSelector
from pyspark.ml.linalg import Vectors

In [52]:
columns_list

['age',
 'balance',
 'day',
 'duration',
 'campaign',
 'pdays',
 'previous',
 'job',
 'marital',
 'education',
 'default',
 'housing',
 'loan',
 'contact',
 'month',
 'poutcome']

In [53]:
char_vars[0:-1]

['job',
 'marital',
 'education',
 'default',
 'housing',
 'loan',
 'contact',
 'month',
 'poutcome']

In [54]:
### Assemble Categorical Columns again
assembler = VectorAssembler(inputCols = char_vars[0:-1], outputCol = 'features')

In [55]:
transformed_df = assembler.transform(df)

In [56]:
selector = ChiSqSelector(numTopFeatures = 6, featuresCol = "features", outputCol = "selectedFeatures", labelCol = "y")

In [57]:
fitting = selector.fit(transformed_df)

In [58]:
fitting.getFeaturesCol()

'features'

In [59]:
result = fitting.transform(transformed_df)

In [60]:
result.show(10,False)

+---+-------+---+--------+--------+-----+--------+----+-------+---------+-------+-------+----+-------+-----+--------+---+--------------------------------------+--------------------------+
|age|balance|day|duration|campaign|pdays|previous|job |marital|education|default|housing|loan|contact|month|poutcome|y  |features                              |selectedFeatures          |
+---+-------+---+--------+--------+-----+--------+----+-------+---------+-------+-------+----+-------+-----+--------+---+--------------------------------------+--------------------------+
|58 |2143   |5  |261     |1       |-1   |0       |1.0 |0.0    |1.0      |0.0    |0.0    |0.0 |1.0    |0.0  |0.0     |0.0|(9,[0,2,6],[1.0,1.0,1.0])             |(6,[0,2,5],[1.0,1.0,1.0]) |
|44 |29     |5  |151     |1       |-1   |0       |2.0 |1.0    |0.0      |0.0    |0.0    |0.0 |1.0    |0.0  |0.0     |0.0|(9,[0,1,6],[2.0,1.0,1.0])             |(6,[0,1,5],[2.0,1.0,1.0]) |
|33 |2      |5  |76      |1       |-1   |0       |7.0 |0.0  

In [61]:
print("ChiSqSelector output with top %d features selected" % fitting.getNumTopFeatures())
print("Selected Indices: ", fitting.selectedFeatures)

ChiSqSelector output with top 6 features selected
Selected Indices:  [0, 1, 2, 4, 5, 6]


# Model-based Feature Selection

In [62]:
### Assemble Categorical Columns again
assembler = VectorAssembler(inputCols = columns_list, outputCol = 'features')

df = assembler.transform(df)

In [63]:
df.show(10,False)

+---+-------+---+--------+--------+-----+--------+----+-------+---------+-------+-------+----+-------+-----+--------+---+-----------------------------------------------------------------------+
|age|balance|day|duration|campaign|pdays|previous|job |marital|education|default|housing|loan|contact|month|poutcome|y  |features                                                               |
+---+-------+---+--------+--------+-----+--------+----+-------+---------+-------+-------+----+-------+-----+--------+---+-----------------------------------------------------------------------+
|58 |2143   |5  |261     |1       |-1   |0       |1.0 |0.0    |1.0      |0.0    |0.0    |0.0 |1.0    |0.0  |0.0     |0.0|(16,[0,1,2,3,4,5,7,9,13],[58.0,2143.0,5.0,261.0,1.0,-1.0,1.0,1.0,1.0]) |
|44 |29     |5  |151     |1       |-1   |0       |2.0 |1.0    |0.0      |0.0    |0.0    |0.0 |1.0    |0.0  |0.0     |0.0|(16,[0,1,2,3,4,5,7,8,13],[44.0,29.0,5.0,151.0,1.0,-1.0,2.0,1.0,1.0])   |
|33 |2      |5  |76      |1   

In [64]:
from pyspark.ml.classification import RandomForestClassifier

In [65]:
rf = RandomForestClassifier(featuresCol = 'features', labelCol = 'y')

In [66]:
rf_model = rf.fit(df)

In [67]:
rf_model.featureImportances

SparseVector(16, {0: 0.0172, 1: 0.0054, 2: 0.0049, 3: 0.3812, 4: 0.0048, 5: 0.0455, 6: 0.0157, 7: 0.0134, 8: 0.0033, 9: 0.0008, 10: 0.0001, 11: 0.0256, 12: 0.0004, 13: 0.0294, 14: 0.1986, 15: 0.2536})

In [68]:
rf_model.featureImportances.indices

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15],
      dtype=int32)

In [69]:
df.columns[0:-2]

['age',
 'balance',
 'day',
 'duration',
 'campaign',
 'pdays',
 'previous',
 'job',
 'marital',
 'education',
 'default',
 'housing',
 'loan',
 'contact',
 'month',
 'poutcome']

In [72]:
pd.DataFrame({'Feature': df.columns[0:-2], 'Importance': list(rf_model.featureImportances)}).sort_values("Importance", ascending=False)

Unnamed: 0,Feature,Importance
3,duration,0.381243
15,poutcome,0.253645
14,month,0.198641
5,pdays,0.045504
13,contact,0.029361
11,housing,0.025571
0,age,0.017223
6,previous,0.015664
7,job,0.013421
1,balance,0.005368


# Information Value Using Weight of Evidence Page 124

# Main Concepts in Pipelines

In [73]:
# DataFrame: A collection of different datatypes organized in a single ML dataset
# Estimator: This is used to fit a function to a DataFrame. In RandomForestClassifier, the fit function is our estimator. 
# Transformer: This is used to transform DataFrame from one to another. Sometimes, a transformer requires a function to be fit before it can be applied. In RandomForestClassifier, the predict function is a transformer.
# Pipeline: An ML workflow or sequence of steps to be executed to get to the end result.
# Parameter: The estimator and transformer use uniform API for specifying parameters.

In [74]:
from pyspark.mllib.stat import Statistics
import pandas as pd

In [75]:
correlation_type = 'pearson' # 'pearson', 'spearman'

In [76]:
#transformer function
for k, v in df.schema["features"].metadata["ml_attr"]["attrs"].items():
    features_df = pd.DataFrame(v)

In [77]:
features_df

Unnamed: 0,vals,idx,name
0,"[blue-collar, management, technician, admin., ...",7,job
1,"[married, single, divorced, __unknown]",8,marital
2,"[secondary, tertiary, primary, unknown, __unkn...",9,education
3,"[no, yes, __unknown]",10,default
4,"[yes, no, __unknown]",11,housing
5,"[no, yes, __unknown]",12,loan
6,"[cellular, unknown, telephone, __unknown]",13,contact
7,"[may, jul, aug, jun, nov, apr, feb, jan, oct, ...",14,month
8,"[unknown, failure, other, success, __unknown]",15,poutcome


In [78]:
column_names = list(features_df['name'])

In [79]:
column_names

['job',
 'marital',
 'education',
 'default',
 'housing',
 'loan',
 'contact',
 'month',
 'poutcome']

In [80]:
df.show()

+---+-------+---+--------+--------+-----+--------+----+-------+---------+-------+-------+----+-------+-----+--------+---+--------------------+
|age|balance|day|duration|campaign|pdays|previous| job|marital|education|default|housing|loan|contact|month|poutcome|  y|            features|
+---+-------+---+--------+--------+-----+--------+----+-------+---------+-------+-------+----+-------+-----+--------+---+--------------------+
| 58|   2143|  5|     261|       1|   -1|       0| 1.0|    0.0|      1.0|    0.0|    0.0| 0.0|    1.0|  0.0|     0.0|0.0|(16,[0,1,2,3,4,5,...|
| 44|     29|  5|     151|       1|   -1|       0| 2.0|    1.0|      0.0|    0.0|    0.0| 0.0|    1.0|  0.0|     0.0|0.0|(16,[0,1,2,3,4,5,...|
| 33|      2|  5|      76|       1|   -1|       0| 7.0|    0.0|      0.0|    0.0|    0.0| 1.0|    1.0|  0.0|     0.0|0.0|(16,[0,1,2,3,4,5,...|
| 47|   1506|  5|      92|       1|   -1|       0| 0.0|    0.0|      3.0|    0.0|    0.0| 0.0|    1.0|  0.0|     0.0|0.0|(16,[0,1,2,3,4,5,...|

In [81]:
df_vector = df.rdd.map(lambda x: x['features'].toArray())

In [82]:
matrix = Statistics.corr(df_vector, method = correlation_type)

In [83]:
matrix

array([[ 1.00000000e+00,  9.77827394e-02, -9.12004563e-03,
        -4.64842847e-03,  4.76031176e-03, -2.37580141e-02,
         1.28831920e-03,  7.74675608e-02, -1.26351157e-01,
         1.67296322e-01, -1.78793036e-02,  1.85513082e-01,
        -1.56552727e-02,  1.22114233e-01,  8.97166616e-02,
         1.22377857e-02],
       [ 9.77827394e-02,  1.00000000e+00,  4.50258513e-03,
         2.15603805e-02, -1.45782789e-02,  3.43532187e-03,
         1.66736367e-02,  2.04036974e-02, -2.81718574e-02,
         3.90672566e-02, -6.67450571e-02,  6.87683157e-02,
        -8.43502457e-02,  2.84401636e-03,  9.28525178e-02,
         3.72721258e-02],
       [-9.12004563e-03,  4.50258513e-03,  1.00000000e+00,
        -3.02063411e-02,  1.62490216e-01, -9.30440738e-02,
        -5.17104967e-02, -1.08739613e-02, -5.21670341e-03,
        -4.67502871e-03,  9.42389910e-03,  2.79816493e-02,
         1.13701576e-02, -6.30216620e-03, -3.80194619e-02,
        -7.26292919e-02],
       [-4.64842847e-03,  2.15603805e

In [84]:
corr_df = pd.DataFrame(matrix, columns = df.columns[0:-2], index = df.columns[0:-2])

In [85]:
corr_df

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,job,marital,education,default,housing,loan,contact,month,poutcome
age,1.0,0.097783,-0.00912,-0.004648,0.00476,-0.023758,0.001288,0.077468,-0.126351,0.167296,-0.017879,0.185513,-0.015655,0.122114,0.089717,0.012238
balance,0.097783,1.0,0.004503,0.02156,-0.014578,0.003435,0.016674,0.020404,-0.028172,0.039067,-0.066745,0.068768,-0.08435,0.002844,0.092853,0.037272
day,-0.00912,0.004503,1.0,-0.030206,0.16249,-0.093044,-0.05171,-0.010874,-0.005217,-0.004675,0.009424,0.027982,0.01137,-0.006302,-0.038019,-0.072629
duration,-0.004648,0.02156,-0.030206,1.0,-0.08457,-0.001565,0.001203,0.00441,0.018834,-0.003029,-0.010021,-0.005075,-0.012412,-0.02935,0.014097,0.023192
campaign,0.00476,-0.014578,0.16249,-0.08457,1.0,-0.088628,-0.032855,-0.02545,-0.029294,0.018643,0.016822,0.023599,0.00998,0.046971,-0.093829,-0.094982
pdays,-0.023758,0.003435,-0.093044,-0.001565,-0.088628,1.0,0.45482,-0.020621,0.020832,-0.023103,-0.029979,-0.124178,-0.022754,-0.170654,0.130504,0.709008
previous,0.001288,0.016674,-0.05171,0.001203,-0.032855,0.45482,1.0,-0.002425,0.006876,-0.007463,-0.018329,-0.037076,-0.011043,-0.091911,0.124014,0.48504
job,0.077468,0.020404,-0.010874,0.00441,-0.02545,-0.020621,-0.002425,1.0,0.050184,-0.002546,-0.003261,0.175678,-0.035331,0.011959,0.116516,0.022384
marital,-0.126351,-0.028172,-0.005217,0.018834,-0.029294,0.020832,0.006876,0.050184,1.0,-0.061597,0.018404,0.011671,-0.018526,-0.038869,0.010914,0.020126
education,0.167296,0.039067,-0.004675,-0.003029,0.018643,-0.023103,-0.007463,-0.002546,-0.061597,1.0,-0.00752,0.076023,-0.068765,0.062967,0.028758,-0.010689


In [86]:
final_corr_df = pd.DataFrame(corr_df.abs().unstack().sort_values(kind='quicksort')).reset_index()
final_corr_df.rename({'level_0': 'col1', 'level_1': 'col2', 0: 'correlation_value'}, axis=1, inplace=True)
final_corr_df = final_corr_df[final_corr_df['col1'] != final_corr_df['col2']]
final_corr_df

Unnamed: 0,col1,col2,correlation_value
0,housing,poutcome,0.000527
1,poutcome,housing,0.000527
2,default,contact,0.000961
3,contact,default,0.000961
4,duration,previous,0.001203
...,...,...,...
235,previous,pdays,0.454820
236,poutcome,previous,0.485040
237,previous,poutcome,0.485040
238,poutcome,pdays,0.709008
