In [1]:
import pandas as pd
import numpy as np
import datetime as dt

import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *#avg, count, expr
from pyspark.sql.types import *
from pyspark.ml.feature import VectorAssembler, OneHotEncoder, MinMaxScaler, StringIndexer
from pyspark.ml.regression import *
from pyspark.ml.evaluation import *
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.stat import Correlation
from pyspark.ml import Pipeline

In [2]:
# initialize
# Build (or reuse) the session through the builder: the application name is
# then part of the Spark configuration when the context starts, and re-running
# this cell reuses the live session instead of trying to create a second
# SparkContext.
spark = SparkSession.builder.appName('fpmining').getOrCreate()
# keep the context handle - the last cell of the notebook stops Spark through `sc`
sc = spark.sparkContext
# show the number of cores
# NOTE(review): getExecutorMemoryStatus() appears to be keyed by block manager,
# so this looks like an executor count rather than a core count - confirm the label.
print('%d cores'%spark._jsc.sc().getExecutorMemoryStatus().keySet().size())
# last expression of the cell: display the session summary
spark

1 cores


In [11]:
''' get the data '''
# load the data
fil = '../data/data-final.csv'

# the questionnaire items: five prefixes with ten numbered questions each,
# laid out prefix by prefix in the same order as the columns of the file
_itemCols = ['%s%d'%(trait, num) for trait in ('EXT', 'EST', 'AGR', 'CSN', 'OPN') for num in range(1, 11)]
# the columns that follow the two item blocks, in file order
_tailCols = [('dateload', TimestampType()), ('screenw', IntegerType()), ('screenh', IntegerType()),
             ('introelapse', IntegerType()), ('testelapse', FloatType()), ('endelapse', FloatType()),
             ('IPC', IntegerType()), ('country', StringType()),
             ('lat_appx_lots_of_err', FloatType()), ('long_appx_lots_of_err', FloatType())]
# integer item columns first, then one float "<item>_E" column per item, then the tail
schem = StructType([StructField(name, IntegerType()) for name in _itemCols]
                   + [StructField(name + '_E', FloatType()) for name in _itemCols]
                   + [StructField(name, dtype) for (name, dtype) in _tailCols])

# the file is tab-delimited with a header row; timestamps look like 2016-03-03 02:01:01
csvOpts = dict(header=True, delimiter='\t', timeStampFormat='yyyy-MM-dd HH:mm:ss')
bigfive = spark.read.format('csv').options(**csvOpts).schema(schem).load(fil)

# add an ID - don't actually care if it's monotonic; also filter for IP count is 1
singleIP = bigfive.filter(col('IPC') == 1)
bigfive = singleIP.select(monotonically_increasing_id().alias('id'), '*')

# talk
# `cnt` is kept at cell scope: the missing-value check divides by it
cnt = bigfive.count()
print('%d records'%cnt)
bigfive.show(truncate=False)
#bigfive.printSchema()

696845 records
+---+----+----+----+----+----+----+----+----+----+-----+----+----+----+----+----+----+----+----+----+-----+----+----+----+----+----+----+----+----+----+-----+----+----+----+----+----+----+----+----+----+-----+----+----+----+----+----+----+----+----+----+-----+--------+------+-------+------+------+-------+------+-------+-------+-------+-------+------+-------+--------+------+------+-------+------+-------+-------+-------+-------+-------+------+------+-------+-------+-------+------+-------+-------+-------+------+-------+------+------+-------+-------+------+-------+-------+------+-------+------+------+-------+-------+-------+------+-------+-------------------+-------+-------+-----------+----------+---------+---+-------+--------------------+---------------------+
|id |EXT1|EXT2|EXT3|EXT4|EXT5|EXT6|EXT7|EXT8|EXT9|EXT10|EST1|EST2|EST3|EST4|EST5|EST6|EST7|EST8|EST9|EST10|AGR1|AGR2|AGR3|AGR4|AGR5|AGR6|AGR7|AGR8|AGR9|AGR10|CSN1|CSN2|CSN3|CSN4|CSN5|CSN6|CSN7|CSN8|CSN9|CSN10|OPN1|OPN

## Data Prep

In [None]:
''' handle missing values '''
# check for missing values
# Count the nulls of every column in ONE aggregation (a single Spark job)
# rather than launching a separate filter + count job per column:
# when(...) yields 1 for a null cell and null otherwise, and count() only
# counts the non-null results.
nullRow = bigfive.select([count(when(col(colm).isNull(), 1)).alias(colm) for colm in bigfive.columns]).first()
# pair each absolute count with its share of all records; `cnt` is the record
# count computed when the data was loaded (guarded so an empty frame gives NaN
# instead of a ZeroDivisionError)
nullCounts = {colm:(nullRow[colm], nullRow[colm]/cnt if cnt else np.nan) for colm in bigfive.columns}
# one row per column, most-missing first
nullCountsDF = pd.DataFrame(nullCounts).T.reset_index(drop=False).sort_values(1, ascending=False)
nullCountsDF.columns = ['Column', 'Freq.', 'Rel. Freq.']
# attach the Spark data type of each column
nullCountsDF = nullCountsDF.merge(pd.DataFrame([[colm.name, colm.dataType] for colm in bigfive.schema], columns=['Column', 'Type']),
                                how='inner', on=['Column'])

# talk
display(nullCountsDF)

# remove
# (disabled - nothing is dropped at the moment)
#bigfive = bigfive.dropna(how='any')

# talk some more
print('%d records'%bigfive.count())

In [None]:
''' OHE the ocean_proximity var '''
# NOTE(review): `house` is not assigned in any cell visible above this one -
# confirm where it is loaded, otherwise this cell fails on a fresh kernel.
# first review the distribution
tab = house.groupBy('ocean_proximity').count().toPandas()
tab = tab.sort_values(by='ocean_proximity')
display(tab)

# encoding is a two step affair: the string column is indexed to integers
# (alphabetical order) and the integer index is then one-hot encoded
indx = StringIndexer(inputCol='ocean_proximity', outputCol='oceanProx_int', stringOrderType='alphabetAsc')
ohe = OneHotEncoder(inputCol='oceanProx_int', outputCol='oceanProx')
indexed = indx.fit(house).transform(house)
encoded = ohe.fit(indexed).transform(indexed)
# the intermediate integer index is not needed once the encoding exists
house = encoded.drop('oceanProx_int')

# make the OHE columns - last is excluded; when all are 0, it's the last
keptLevels = tab['ocean_proximity'].values[:-1]
featOHE = ['oceanProx_%s'%lvl for lvl in keptLevels]
print(featOHE)

# talk
house.show(truncate=False)

In [None]:
# prepare the response
# the estimators and the evaluator are used with their default label column,
# so the response column is renamed to 'label'
house = house.withColumnRenamed(existing='median_house_value', new='label')

In [None]:
''' prepare the features '''
# NOTE(review): `house` and `featOHE` must come from earlier cells; `house` is
# not assigned anywhere in the cells visible above - confirm where it is loaded.
# get the features
features = [c for c in house.columns if c not in (['ocean_proximity', 'label', 'id'])]

# The name list is rebuilt further down by swapping its LAST entry for the OHE
# column names. Verify that assumption up front: if 'oceanProx' sits anywhere
# else, every table labelled with `features` would be silently misaligned.
assert features and features[-1] == 'oceanProx', \
    "expected 'oceanProx' to be the last feature column, got: %s"%features

# create & scale the features vector
assr = VectorAssembler(inputCols=features, outputCol='features_raw')
scalr = MinMaxScaler(inputCol='features_raw', outputCol='features')
pipe = Pipeline(stages=[assr, scalr]).fit(house)
house = pipe.transform(house).drop('features_raw')

# now update the features list with the ocean proximity OHE columns
features = features[:-1] + featOHE
print(features)

# talk
display(house.limit(10).toPandas())
house.select('id', 'features', 'label').show(truncate=False)
# (the former bare `take(1)` ran a Spark job and discarded the result, so only
# the call whose result is printed is kept)
print('First row features = %s'%house.select('features').take(1)[0])

In [None]:
# check for multicollinearity
# high: total_bedrooms vs. households, population vs. households,
# Pearson correlations over the assembled feature vector; the result frame
# carries the whole matrix in the first cell of its first row
corr = Correlation.corr(house, column='features', method='pearson')
corrMat = corr.head()[0].toArray()
# label both axes with the feature names so the matrix is readable
corrdf = pd.DataFrame(corrMat, index=features, columns=features)
display(corrdf)

## Modeling

In [None]:
# split for cross-val
trainPerc = 0.7
randSeed = 42
# two-way split: training share first, the remainder for testing
splitWeights = [trainPerc, 1.0 - trainPerc]
tran, test = house.select('id', 'label', 'features').randomSplit(splitWeights, seed=randSeed)

# talk
# show the ids that landed in each part
for (title, part) in (('Training Cases', tran), ('Testing Cases', test)):
    print(title)
    part.select('id').show()

In [None]:
''' set up the estimators & param grids '''
# registry: model name -> [estimator, param grid, names of the tuned params,
# fitted cross validator, test RMSE]; the last two slots start out as None
models = {}

# The two estimators below are currently disabled. Cells further down look up
# the 'linear regression' and 'random forest' keys, so re-enable these before
# running them.
# # linear regression
# linreg = LinearRegression()
# params = (ParamGridBuilder().addGrid(linreg.elasticNetParam, [0.0, 0.25, 0.5, 0.75, 1.0]).build())
# paramNames = ['elasticnetparam']
# models['linear regression'] = [linreg, params, paramNames, None, None]
#
# # random forest
# ranfor = RandomForestRegressor(numTrees=20)
# params = (ParamGridBuilder().addGrid(ranfor.maxBins, [20, 40, 80, 100])\
#               .addGrid(ranfor.maxDepth, [5, 10, 30]).build())
# paramNames = ['maxbins', 'maxdepth']
# models['random forest'] = [ranfor, params, paramNames, None, None]

# gradient boosting trees
gradbst = GBTRegressor(maxIter=20)
# grid over bin count and tree depth (4 x 3 combinations)
gridBuilder = ParamGridBuilder()
gridBuilder = gridBuilder.addGrid(gradbst.maxBins, [20, 40, 80, 100])
gridBuilder = gridBuilder.addGrid(gradbst.maxDepth, [5, 10, 30])
params = gridBuilder.build()
paramNames = ['maxbins', 'maxdepth']
models['gradient boost'] = [gradbst, params, paramNames, None, None]

In [None]:
''' run the models '''
# number of cv folds
folds = 5
# define the evaluation function
evl = RegressionEvaluator(metricName='rmse')

# iterate over models; each registry entry is
# [estimator, param grid, tuned param names, fitted cross validator, test RMSE]
for (model, stuff) in models.items():
    print('Cross Validator: %s'%model)
    # execute
    # Fit on the TRAINING split only. Fitting on all of `house` would let the
    # model see the rows of `test`, making the test RMSE below optimistic.
    # The fold assignment is seeded so repeated runs give the same result.
    cv = CrossValidator(estimator=stuff[0], estimatorParamMaps=stuff[1], evaluator=evl,
                        numFolds=folds, seed=randSeed)
    fitModel = cv.fit(tran.select('features', 'label'))
    # get the best
    bestModel = fitModel.bestModel
    # evaluate performance on the held-out test set
    testRMSE = evl.evaluate(bestModel.transform(test.select('features', 'label')))
    print('\tBest Model Test RMSE = %0.3f'%testRMSE)
    # get best parameters - only report the ones that were part of the grid
    bestParams = bestModel.extractParamMap()
    for (key, val) in bestParams.items():
        for parm in stuff[2]:
            if parm in key.name.lower():
                print('\t%s = %0.2f'%(key, val))
                break
    # save stuff
    models[model][3] = fitModel
    models[model][4] = testRMSE

In [None]:
# look at linear regression coefficients
def _interceptFirst(stats, nrows):
    """Reorder per-coefficient statistics for a table whose FIRST row is the intercept.

    Spark's linear regression summary lists the intercept's statistic LAST
    when an intercept is fit. `stats` is the sequence from the summary and
    `nrows` the number of rows of the target table (intercept + coefficients).
    Returns a float array with the intercept's value first; when the summary
    holds no intercept entry the intercept slot is NaN.
    """
    stats = np.asarray(stats, dtype=float)
    if len(stats) == nrows:
        # intercept statistic is the last element -> rotate it to the front
        return np.roll(stats, 1)
    # no intercept statistic reported -> leave the intercept row empty
    return np.concatenate([[np.nan], stats])

linregEntry = models.get('linear regression')
if linregEntry is None or linregEntry[3] is None:
    # the estimator is disabled or has not been run; say so rather than
    # aborting the notebook with a KeyError
    print('linear regression has not been fit - nothing to show')
else:
    bm = linregEntry[3].bestModel
    summ = bm.summary
    summ.predictions.describe().withColumn('Diff', col('prediction') - col('label')).show(truncate=False)
    print('Best Model RMSE = %0.3f'%summ.rootMeanSquaredError)

    # get a nice model coefficients table
    coefs = pd.concat([pd.DataFrame(index=['Intercept'], data=[bm.intercept], columns=['Coefficient']),
                       pd.DataFrame(index=features, data=bm.coefficients.toArray(), columns=['Coefficient'])])
    # align the summary statistics with the intercept-first row order
    coefs['Std. Error'] = _interceptFirst(summ.coefficientStandardErrors, len(coefs))
    coefs['pValue'] = _interceptFirst(summ.pValues, len(coefs))
    # make an absolute coef column temporarily for sorting
    coefs['tmp'] = coefs['Coefficient'].abs()
    # infinite sort key keeps the intercept at the top of the table
    coefs.loc['Intercept', 'tmp'] = np.inf
    coefs = coefs.sort_values(by='tmp', ascending=False).drop(columns='tmp')

    # talk
    display(coefs)

In [None]:
# view feature importances for random forest
ranforEntry = models.get('random forest')
if ranforEntry is None or ranforEntry[3] is None:
    # the estimator is disabled or has not been run; say so rather than
    # aborting the notebook with a KeyError
    print('random forest has not been fit - nothing to show')
else:
    # one importance per entry of `features`, most important first
    imports = ranforEntry[3].bestModel.featureImportances.toArray()
    imports = pd.DataFrame(index=features, data=imports, columns=['Importance']).sort_values(by='Importance', ascending=False)
    display(imports)

In [None]:
# view feature importances for gradient boost
# slot 3 of the registry entry holds the fitted cross validator
gbtBest = models['gradient boost'][3].bestModel
importVals = gbtBest.featureImportances.toArray()
# label each importance with its feature name and rank from most to least important
imports = pd.DataFrame({'Importance': importVals}, index=features)
imports = imports.sort_values(by='Importance', ascending=False)
display(imports)

In [None]:
# stop the SparkContext held in `sc` (created in the initialization cell) now
# that the analysis is finished; Spark cells run after this need a new context
sc.stop()