In [18]:
import pandas as pd
import numpy as np
import datetime as dt
from itertools import product

import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *#avg, count, expr
from pyspark.sql.types import *
from pyspark.ml.feature import VectorAssembler, OneHotEncoder, MinMaxScaler, StringIndexer, Interaction
from pyspark.ml.regression import *
from pyspark.ml.evaluation import *
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.stat import Correlation
from pyspark.ml import Pipeline

In [2]:
# initialize
sc = pyspark.SparkContext()
spark = SparkSession(sc)
spark.sparkContext.appName = 'regressionHW'
# show the number of cores
print('%d cores'%spark._jsc.sc().getExecutorMemoryStatus().keySet().size())
spark

1 cores


In [13]:
''' get the data '''
# load the data
fil = '../../data/Concrete_Data.csv'
schem = StructType([StructField('cement', FloatType()), StructField('slag', FloatType()),
                    StructField('flyash', FloatType()), StructField('water', FloatType()),
                    StructField('superplasticizer', FloatType()), StructField('coarseaggregate', FloatType()),
                    StructField('fineaggregate', FloatType()), StructField('age', FloatType()),
                    StructField('csMPa', FloatType())])
concrete = spark.read.format('csv').options(header=True).schema(schem).load(fil)

# add an ID - don't actually care if it's monotonic
concrete = concrete.select(monotonically_increasing_id().alias('id'), '*')

# talk
cnt = concrete.count()
print('%d records'%cnt)
concrete.show(truncate=False)

1030 records
+---+------+-----+------+-----+----------------+---------------+-------------+-----+-----+
|id |cement|slag |flyash|water|superplasticizer|coarseaggregate|fineaggregate|age  |csMPa|
+---+------+-----+------+-----+----------------+---------------+-------------+-----+-----+
|0  |540.0 |0.0  |0.0   |162.0|2.5             |1040.0         |676.0        |28.0 |79.99|
|1  |540.0 |0.0  |0.0   |162.0|2.5             |1055.0         |676.0        |28.0 |61.89|
|2  |332.5 |142.5|0.0   |228.0|0.0             |932.0          |594.0        |270.0|40.27|
|3  |332.5 |142.5|0.0   |228.0|0.0             |932.0          |594.0        |365.0|41.05|
|4  |198.6 |132.4|0.0   |192.0|0.0             |978.4          |825.5        |360.0|44.3 |
|5  |266.0 |114.0|0.0   |228.0|0.0             |932.0          |670.0        |90.0 |47.03|
|6  |380.0 |95.0 |0.0   |228.0|0.0             |932.0          |594.0        |365.0|43.7 |
|7  |380.0 |95.0 |0.0   |228.0|0.0             |932.0          |594.0        

### Data Prep

In [5]:
''' handle missing values '''
# check for missing values
nullCounts = {colm:concrete.select(colm).where(col(colm).isNull()).count() for colm in concrete.columns}
nullCounts = {colm:(ncnt, ncnt/cnt) for (colm, ncnt) in nullCounts.items()}
nullCountsDF = pd.DataFrame(nullCounts).T.reset_index(drop=False).sort_values(1, ascending=False)
nullCountsDF.columns = ['Column', 'Freq.', 'Rel. Freq.']
nullCountsDF = nullCountsDF.merge(pd.DataFrame([[colm.name, colm.dataType] for colm in concrete.schema], columns=['Column', 'Type']),
                                how='inner', on=['Column'])

# talk
display(nullCountsDF)

# remove
#concrete = concrete.dropna(how='any')

# talk some more
print('%d records'%concrete.count())

Unnamed: 0,Column,Freq.,Rel. Freq.,Type
0,id,0.0,0.0,LongType
1,cement,0.0,0.0,FloatType
2,slag,0.0,0.0,FloatType
3,flyash,0.0,0.0,FloatType
4,water,0.0,0.0,FloatType
5,superplasticizer,0.0,0.0,FloatType
6,coarseaggregate,0.0,0.0,FloatType
7,fineaggregate,0.0,0.0,FloatType
8,age,0.0,0.0,FloatType
9,csMPa,0.0,0.0,FloatType


1030 records


In [14]:
# prepare the response
concrete = concrete.withColumnRenamed('csMPa', 'label')

In [15]:
''' create the squared features'''
# make the lists
features = [c for c in concrete.columns if c not in (['label', 'id'])]
# square them
for feat in features:
    concrete = concrete.withColumn(feat+'_sq', col(feat)*col(feat))
# talk
concrete.show(truncate=False)

+---+------+-----+------+-----+----------------+---------------+-------------+-----+-----+---------+---------+---------+--------+-------------------+------------------+----------------+--------+
|id |cement|slag |flyash|water|superplasticizer|coarseaggregate|fineaggregate|age  |label|cement_sq|slag_sq  |flyash_sq|water_sq|superplasticizer_sq|coarseaggregate_sq|fineaggregate_sq|age_sq  |
+---+------+-----+------+-----+----------------+---------------+-------------+-----+-----+---------+---------+---------+--------+-------------------+------------------+----------------+--------+
|0  |540.0 |0.0  |0.0   |162.0|2.5             |1040.0         |676.0        |28.0 |79.99|291600.0 |0.0      |0.0      |26244.0 |6.25               |1081600.0         |456976.0        |784.0   |
|1  |540.0 |0.0  |0.0   |162.0|2.5             |1055.0         |676.0        |28.0 |61.89|291600.0 |0.0      |0.0      |26244.0 |6.25               |1113025.0         |456976.0        |784.0   |
|2  |332.5 |142.5|0.0   |

In [22]:
''' create the interaction features '''
# iterate over all pairs
for (a, b) in product(features, features):
    if a != b:
        intr = Interaction(inputCols=[a,b], outputCol=a+'_'+b)
        concrete = intr.transform(concrete)
# talk
concrete.show(truncate=False)

+---+------+-----+------+-----+----------------+---------------+-------------+-----+-----+---------+---------+---------+--------+-------------------+------------------+----------------+--------+--------------------+-------------+-----------------+-----------------------+----------------------+--------------------+--------------------+--------------------+-----------+-----------------+---------------------+--------------------+--------------------+--------------------+-------------+-----------+------------+-----------------------+----------------------+--------------------+----------+-----------------+-----------------+------------+----------------------+---------------------+-------------------+---------+-----------------------+---------------------+-----------------------+----------------------+--------------------------------+------------------------------+--------------------+----------------------+--------------------+----------------------+---------------------+-------------------

In [23]:
''' prepare the features '''
# get the features
features = [c for c in concrete.columns if c not in (['label', 'id'])]
print(features)

# create & scale the features vector
assr = VectorAssembler(inputCols=features, outputCol='features_raw')
scalr = MinMaxScaler(inputCol='features_raw', outputCol='features')
pipe = Pipeline(stages=[assr, scalr]).fit(concrete)
concrete = pipe.transform(concrete).drop('features_raw')

# talk
display(concrete.limit(10).toPandas())
concrete.select('id', 'features', 'label').show(truncate=False)
concrete.select('features').take(1)
print('First row features = %s'%concrete.select('features').take(1)[0])

['cement', 'slag', 'flyash', 'water', 'superplasticizer', 'coarseaggregate', 'fineaggregate', 'age', 'cement_sq', 'slag_sq', 'flyash_sq', 'water_sq', 'superplasticizer_sq', 'coarseaggregate_sq', 'fineaggregate_sq', 'age_sq', 'cement_slag', 'cement_flyash', 'cement_water', 'cement_superplasticizer', 'cement_coarseaggregate', 'cement_fineaggregate', 'cement_age', 'slag_cement', 'slag_flyash', 'slag_water', 'slag_superplasticizer', 'slag_coarseaggregate', 'slag_fineaggregate', 'slag_age', 'flyash_cement', 'flyash_slag', 'flyash_water', 'flyash_superplasticizer', 'flyash_coarseaggregate', 'flyash_fineaggregate', 'flyash_age', 'water_cement', 'water_slag', 'water_flyash', 'water_superplasticizer', 'water_coarseaggregate', 'water_fineaggregate', 'water_age', 'superplasticizer_cement', 'superplasticizer_slag', 'superplasticizer_flyash', 'superplasticizer_water', 'superplasticizer_coarseaggregate', 'superplasticizer_fineaggregate', 'superplasticizer_age', 'coarseaggregate_cement', 'coarseaggre

Unnamed: 0,id,cement,slag,flyash,water,superplasticizer,coarseaggregate,fineaggregate,age,label,...,fineaggregate_coarseaggregate,fineaggregate_age,age_cement,age_slag,age_flyash,age_water,age_superplasticizer,age_coarseaggregate,age_fineaggregate,features
0,0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28.0,79.989998,...,[703040.0],[18928.0],[15120.0],[0.0],[0.0],[4536.0],[70.0],[29120.0],[18928.0],"(1.0, 0.0, 0.0, 0.3210862454322655, 0.07763974..."
1,1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28.0,61.889999,...,[713180.0],[18928.0],[15120.0],[0.0],[0.0],[4536.0],[70.0],[29540.0],[18928.0],"(1.0, 0.0, 0.0, 0.3210862454322655, 0.07763974..."
2,2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270.0,40.27,...,[553608.0],[160380.0],[89775.0],[38475.0],[0.0],[61560.0],[0.0],[251640.0],[160380.0],"(0.526255707762557, 0.39649416366168144, 0.0, ..."
3,3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365.0,41.049999,...,[553608.0],[216810.0],[121362.5],[52012.5],[0.0],[83220.0],[0.0],[340180.0],[216810.0],"(0.526255707762557, 0.39649416366168144, 0.0, ..."
4,4,198.600006,132.399994,0.0,192.0,0.0,978.400024,825.5,360.0,44.299999,...,[807669.2201538086],[297180.0],[71496.00219726562],[47663.997802734375],[0.0],[69120.0],[0.0],[352224.0087890625],[297180.0],"(0.22054795914044661, 0.3683917533249004, 0.0,..."
5,5,266.0,114.0,0.0,228.0,0.0,932.0,670.0,90.0,47.029999,...,[624440.0],[60300.0],[23940.0],[10260.0],[0.0],[20520.0],[0.0],[83880.0],[60300.0],"(0.3744292237442922, 0.31719533092934515, 0.0,..."
6,6,380.0,95.0,0.0,228.0,0.0,932.0,594.0,365.0,43.700001,...,[553608.0],[216810.0],[138700.0],[34675.0],[0.0],[83220.0],[0.0],[340180.0],[216810.0],"(0.634703196347032, 0.264329442441121, 0.0, 0...."
7,7,380.0,95.0,0.0,228.0,0.0,932.0,594.0,28.0,36.450001,...,[553608.0],[16632.0],[10640.0],[2660.0],[0.0],[6384.0],[0.0],[26096.0],[16632.0],"(0.634703196347032, 0.264329442441121, 0.0, 0...."
8,8,266.0,114.0,0.0,228.0,0.0,932.0,670.0,28.0,45.849998,...,[624440.0],[18760.0],[7448.0],[3192.0],[0.0],[6384.0],[0.0],[26096.0],[18760.0],"(0.3744292237442922, 0.31719533092934515, 0.0,..."
9,9,475.0,0.0,0.0,228.0,0.0,932.0,594.0,28.0,39.290001,...,[553608.0],[16632.0],[13300.0],[0.0],[0.0],[6384.0],[0.0],[26096.0],[16632.0],"(0.8515981735159817, 0.0, 0.0, 0.8482428078025..."


+---+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+
|id |features                          

In [24]:
# check for multicollinearity
# high: 
corr = Correlation.corr(concrete, column='features', method='pearson')
corrdf = pd.DataFrame(index=features, data=corr.collect()[0][0].toArray(), columns=features)
display(corrdf)

Unnamed: 0,cement,slag,flyash,water,superplasticizer,coarseaggregate,fineaggregate,age,cement_sq,slag_sq,...,fineaggregate_superplasticizer,fineaggregate_coarseaggregate,fineaggregate_age,age_cement,age_slag,age_flyash,age_water,age_superplasticizer,age_coarseaggregate,age_fineaggregate
cement,1.000000,-0.275216,-0.397467,-0.081587,0.092386,-0.109349,-0.222718,0.081946,0.982918,-0.263361,...,0.102353,-0.289801,0.051942,0.286319,-0.111547,-0.307658,0.081596,0.052432,0.079233,0.051942
slag,-0.275216,1.000000,-0.323580,0.107252,0.043270,-0.283999,-0.281603,-0.044246,-0.252955,0.938881,...,0.013891,-0.439001,-0.069224,-0.103530,0.463207,-0.248242,-0.029918,0.011282,-0.059093,-0.069224
flyash,-0.397467,-0.323580,1.000000,-0.256984,0.377503,-0.009961,0.079109,-0.154371,-0.377955,-0.321191,...,0.347125,0.080499,-0.141664,-0.221790,-0.215759,0.690352,-0.174483,0.232739,-0.149140,-0.141664
water,-0.081587,0.107252,-0.256984,1.000000,-0.657533,-0.182294,-0.450661,0.277618,-0.084376,0.123723,...,-0.674862,-0.499785,0.221800,0.261116,0.238286,-0.219198,0.353335,-0.461811,0.260321,0.221800
superplasticizer,0.092386,0.043270,0.377503,-0.657533,1.000000,-0.265999,0.222691,-0.192700,0.094161,-0.032298,...,0.989603,0.003560,-0.170206,-0.171058,-0.081827,0.246073,-0.234908,0.648698,-0.200512,-0.170206
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
age_flyash,-0.307658,-0.248242,0.690352,-0.219198,0.246073,0.081584,0.090414,0.097970,-0.297351,-0.239758,...,0.230602,0.147081,0.129614,-0.022555,-0.120553,1.000000,0.040675,0.484138,0.114407,0.129614
age_water,0.081596,-0.029918,-0.174483,0.353335,-0.234908,-0.024739,-0.202127,0.991072,0.070980,-0.039160,...,-0.225924,-0.186395,0.960759,0.932401,0.561480,0.040675,1.000000,0.025500,0.983190,0.960759
age_superplasticizer,0.052432,0.011282,0.232739,-0.461811,0.648698,-0.118615,0.180348,0.109230,0.051071,-0.036722,...,0.649756,0.069738,0.152273,0.095757,0.076461,0.484138,0.025500,1.000000,0.106983,0.152273
age_coarseaggregate,0.079233,-0.059093,-0.149140,0.260321,-0.200512,0.044597,-0.159743,0.997099,0.071478,-0.062502,...,-0.191356,-0.105377,0.983595,0.934666,0.517942,0.114407,0.983190,0.106983,1.000000,0.983595


## Modeling

In [None]:
# split for cross-val
trainPerc = 0.7
randSeed = 42
tran, test = concrete.select('id', 'label', 'features').randomSplit([trainPerc, 1.0 - trainPerc], seed=randSeed)

# talk
print('Training Cases')
tran.select('id').show()
print('Testing Cases')
test.select('id').show()

Training Cases


In [10]:
''' set up the estimators & param grids '''
models = {}

# linear regression
linreg = LinearRegression()
params = (ParamGridBuilder().addGrid(linreg.elasticNetParam, [0.0, 0.25, 0.5, 0.75, 1.0]).build())
paramNames = ['elasticnetparam']
models['linear regression'] = [linreg, params, paramNames, None, None]

# random forest
ranfor = RandomForestRegressor(numTrees=20)
params = (ParamGridBuilder().addGrid(ranfor.maxBins, [20, 40, 80, 100])\
              .addGrid(ranfor.maxDepth, [5, 10, 30]).build())
paramNames = ['maxbins', 'maxdepth']
models['random forest'] = [ranfor, params, paramNames, None, None]

# gradient boosting trees
gradbst = GBTRegressor(maxIter=20)
params = (ParamGridBuilder().addGrid(gradbst.maxBins, [20, 40, 80, 100])\
              .addGrid(gradbst.maxDepth, [5, 10, 30]).build())
paramNames = ['maxbins', 'maxdepth']
models['gradient boost'] = [gradbst, params, paramNames, None, None]

In [11]:
''' run the models '''
# number of cv folds
folds = 5
# define the evaulation function
evl = RegressionEvaluator(metricName='rmse')

# iterate over models
for (model, stuff) in models.items():
    print('Cross Validator: %s'%model)
    # execute
    cv = CrossValidator(estimator=stuff[0], estimatorParamMaps=stuff[1], evaluator=evl, numFolds=folds)
    fitModel = cv.fit(concrete.select('features', 'label'))
    # get the best
    bestModel = fitModel.bestModel
    # evaluate performance on the test set
    testRMSE = evl.evaluate(bestModel.transform(test.select('features', 'label')))
    print('\tBest Model Test RMSE = %0.3f'%testRMSE)    
    # get best parameters
    bestParams = bestModel.extractParamMap()
    for (key, val) in bestParams.items():
        for parm in stuff[2]:
            if parm in key.name.lower():
                print('\t%s = %0.2f'%(key, val))
                break
    # save stuff
    models[model][3] = fitModel
    models[model][4] = testRMSE

Cross Validator: linear regression
	Best Model Test RMSE = 10.580
	LinearRegression_f8d856dc40ba__elasticNetParam = 0.00
Cross Validator: random forest


KeyboardInterrupt: 

In [None]:
# look at linear regression coefficients
bm = models['linear regression'][3].bestModel
summ = bm.summary
summ.predictions.describe().withColumn('Diff', col('prediction') - col('label')).show(truncate=False)
print('Best Model RMSE = %0.3f'%summ.rootMeanSquaredError)

# get a nice model coefficients table
coefs = pd.concat([pd.DataFrame(index=['Intercept'], data=[bm.intercept], columns=['Coefficient']),
                   pd.DataFrame(index=features, data=bm.coefficients.toArray(), columns=['Coefficient'])])
coefs['Std. Error'] = bm.summary.coefficientStandardErrors
coefs['pValue'] = bm.summary.pValues
# make an absolute coef column temporarily for sorting
coefs['tmp'] = coefs['Coefficient'].abs() a
coefs.loc['Intercept', 'tmp'] = np.inf
coefs = coefs.sort_values(by='tmp', ascending=False).drop(columns='tmp')

# talk
display(coefs)

In [None]:
# view feature importances for random forest
imports = models['random forest'][3].bestModel.featureImportances.toArray()
imports = pd.DataFrame(index=features, data=imports, columns=['Importance']).sort_values(by='Importance', ascending=False, inplace=False)
display(imports)

In [None]:
# view feature importances for gradient boost
imports = models['gradient boost'][3].bestModel.featureImportances.toArray()
imports = pd.DataFrame(index=features, data=imports, columns=['Importance']).sort_values(by='Importance', ascending=False, inplace=False)
display(imports)

In [None]:
sc.stop()