In [1]:
import pandas as pd
import numpy as np
import datetime as dt

import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *#avg, count, expr
from pyspark.sql.types import *
from pyspark.ml.feature import VectorAssembler, OneHotEncoder, MinMaxScaler, StringIndexer
from pyspark.ml.regression import *
from pyspark.ml.evaluation import *
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.stat import Correlation
from pyspark.ml import Pipeline

In [2]:
# initialize
sc = pyspark.SparkContext()
spark = SparkSession(sc)
spark.sparkContext.appName = 'regression'
# show the number of cores
print('%d cores'%spark._jsc.sc().getExecutorMemoryStatus().keySet().size())
spark

1 cores


In [3]:
''' get the data '''
# load the data
fil = '../data/housing.csv'
schem = StructType([StructField('longitude', FloatType()), StructField('latitude', FloatType()),
                    StructField('housing_median_age', FloatType()), StructField('total_rooms', FloatType()),
                    StructField('total_bedrooms', FloatType()), StructField('population', FloatType()),
                    StructField('households', FloatType()), StructField('median_income', FloatType()),
                    StructField('median_house_value', FloatType()), StructField('ocean_proximity', StringType())])
house = spark.read.format('csv').options(header=True).schema(schem).load(fil)

# add an ID - don't actually care if it's monotonic
house = house.select(monotonically_increasing_id().alias('id'), '*')

# talk
cnt = house.count()
print('%d records'%cnt)
house.show(truncate=False)

20640 records
+---+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+
|id |longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|ocean_proximity|
+---+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+
|0  |-122.23  |37.88   |41.0              |880.0      |129.0         |322.0     |126.0     |8.3252       |452600.0          |NEAR BAY       |
|1  |-122.22  |37.86   |21.0              |7099.0     |1106.0        |2401.0    |1138.0    |8.3014       |358500.0          |NEAR BAY       |
|2  |-122.24  |37.85   |52.0              |1467.0     |190.0         |496.0     |177.0     |7.2574       |352100.0          |NEAR BAY       |
|3  |-122.25  |37.85   |52.0              |1274.0     |235.0         |558.0     |219.0     |5.6431       |341300.0          |NEAR BAY 

### Data Prep

In [4]:
''' handle missing values '''
# check for missing values
nullCounts = {colm:house.select(colm).where(col(colm).isNull()).count() for colm in house.columns}
nullCounts = {colm:(ncnt, ncnt/cnt) for (colm, ncnt) in nullCounts.items()}
nullCountsDF = pd.DataFrame(nullCounts).T.reset_index(drop=False).sort_values(1, ascending=False)
nullCountsDF.columns = ['Column', 'Freq.', 'Rel. Freq.']
nullCountsDF = nullCountsDF.merge(pd.DataFrame([[colm.name, colm.dataType] for colm in house.schema], columns=['Column', 'Type']),
                                how='inner', on=['Column'])

# talk
display(nullCountsDF)

# remove
house = house.dropna(how='any')

# talk some more
print('%d records'%house.count())

Unnamed: 0,Column,Freq.,Rel. Freq.,Type
0,total_bedrooms,207.0,0.010029,FloatType
1,id,0.0,0.0,LongType
2,longitude,0.0,0.0,FloatType
3,latitude,0.0,0.0,FloatType
4,housing_median_age,0.0,0.0,FloatType
5,total_rooms,0.0,0.0,FloatType
6,population,0.0,0.0,FloatType
7,households,0.0,0.0,FloatType
8,median_income,0.0,0.0,FloatType
9,median_house_value,0.0,0.0,FloatType


20433 records


In [5]:
''' OHE the ocean_proximity var '''
# first review the distribution
tab = house.groupBy('ocean_proximity').count().toPandas().sort_values(by='ocean_proximity')
display(tab)

# first need to string index ...
indx = StringIndexer(inputCol='ocean_proximity', outputCol='oceanProx_int', stringOrderType='alphabetAsc')
house = indx.fit(house).transform(house)

# ... then we can encode
ohe = OneHotEncoder(inputCol='oceanProx_int', outputCol='oceanProx')
house = ohe.fit(house).transform(house).drop('oceanProx_int')

# make the OHE columns - last is excluded; when all are 0, it's the last
featOHE = ['oceanProx_%s'%c for c in tab.ocean_proximity.values[:-1]]
print(featOHE)

# talk
house.show(truncate=False)

Unnamed: 0,ocean_proximity,count
3,<1H OCEAN,9034
4,INLAND,6496
0,ISLAND,5
2,NEAR BAY,2270
1,NEAR OCEAN,2628


['oceanProx_<1H OCEAN', 'oceanProx_INLAND', 'oceanProx_ISLAND', 'oceanProx_NEAR BAY']
+---+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+-------------+
|id |longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|ocean_proximity|oceanProx    |
+---+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+-------------+
|0  |-122.23  |37.88   |41.0              |880.0      |129.0         |322.0     |126.0     |8.3252       |452600.0          |NEAR BAY       |(4,[3],[1.0])|
|1  |-122.22  |37.86   |21.0              |7099.0     |1106.0        |2401.0    |1138.0    |8.3014       |358500.0          |NEAR BAY       |(4,[3],[1.0])|
|2  |-122.24  |37.85   |52.0              |1467.0     |190.0         |496.0     |177.0     |7.2574       |352100.0          |NEAR BAY 

In [6]:
# prepare the response
house = house.withColumnRenamed('median_house_value', 'label')

In [7]:
''' prepare the features '''
# get the features
features = [c for c in house.columns if c not in (['ocean_proximity', 'label', 'id'])]

# create & scale the features vector
assr = VectorAssembler(inputCols=features, outputCol='features_raw')
scalr = MinMaxScaler(inputCol='features_raw', outputCol='features')
pipe = Pipeline(stages=[assr, scalr]).fit(house)
house = pipe.transform(house).drop('features_raw')

# now update the features list with the ocean proximity OHE columns; this assumes
# oceanProx was the last column, which should be true
features = features[:-1] + featOHE
print(features)

# talk
display(house.limit(10).toPandas())
house.select('id', 'features', 'label').show(truncate=False)
house.select('features').take(1)
print('First row features = %s'%house.select('features').take(1)[0])

['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income', 'oceanProx_<1H OCEAN', 'oceanProx_INLAND', 'oceanProx_ISLAND', 'oceanProx_NEAR BAY']


Unnamed: 0,id,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,label,ocean_proximity,oceanProx,features
0,0,-122.230003,37.880001,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,"(0.0, 0.0, 0.0, 1.0)","[0.21115487289536256, 0.5674814281805192, 0.78..."
1,1,-122.220001,37.860001,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,"(0.0, 0.0, 0.0, 1.0)","[0.2121511015127325, 0.5653559809873215, 0.392..."
2,2,-122.239998,37.849998,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,"(0.0, 0.0, 0.0, 1.0)","[0.21015940417777704, 0.5642930546969221, 1.0,..."
3,3,-122.25,37.849998,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,"(0.0, 0.0, 0.0, 1.0)","[0.2091631755604071, 0.5642930546969221, 1.0, ..."
4,4,-122.25,37.849998,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,"(0.0, 0.0, 0.0, 1.0)","[0.2091631755604071, 0.5642930546969221, 1.0, ..."
5,5,-122.25,37.849998,52.0,919.0,213.0,413.0,193.0,4.0368,269700.0,NEAR BAY,"(0.0, 0.0, 0.0, 1.0)","[0.2091631755604071, 0.5642930546969221, 1.0, ..."
6,6,-122.25,37.84,52.0,2535.0,489.0,1094.0,514.0,3.6591,299200.0,NEAR BAY,"(0.0, 0.0, 0.0, 1.0)","[0.2091631755604071, 0.5632305337941239, 1.0, ..."
7,7,-122.25,37.84,52.0,3104.0,687.0,1157.0,647.0,3.12,241400.0,NEAR BAY,"(0.0, 0.0, 0.0, 1.0)","[0.2091631755604071, 0.5632305337941239, 1.0, ..."
8,8,-122.260002,37.84,42.0,2555.0,665.0,1206.0,595.0,2.0804,226700.0,NEAR BAY,"(0.0, 0.0, 0.0, 1.0)","[0.20816694694303714, 0.5632305337941239, 0.80..."
9,9,-122.25,37.84,52.0,3549.0,707.0,1551.0,714.0,3.6912,261100.0,NEAR BAY,"(0.0, 0.0, 0.0, 1.0)","[0.2091631755604071, 0.5632305337941239, 1.0, ..."


+---+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------+
|id |features                                                                                                                                                                         |label   |
+---+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------+
|0  |[0.21115487289536256,0.5674814281805192,0.7843137254901961,0.022330739101683705,0.019863438857852266,0.008940833543541018,0.020555829633284,0.5396684184702716,0.0,0.0,0.0,1.0]  |452600.0|
|1  |[0.2121511015127325,0.5653559809873215,0.39215686274509803,0.180502568798006,0.17147734326505276,0.06721040387903249,0.18697582634435125,0.5380270689689535,0.0,0.0,0.0,1.0]     |358500.0|
|2  |[0.21015940417777704,0.5642930

In [8]:
# check for multicollinearity
# high: total_bedrooms vs. households, population vs. households, 
corr = Correlation.corr(house, column='features', method='pearson')
corrdf = pd.DataFrame(index=features, data=corr.collect()[0][0].toArray(), columns=features)
display(corrdf)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,oceanProx_<1H OCEAN,oceanProx_INLAND,oceanProx_ISLAND,oceanProx_NEAR BAY
longitude,1.0,-0.924616,-0.109357,0.04548,0.069608,0.10027,0.056513,-0.01555,0.320831,-0.055337,0.009501,-0.474714
latitude,-0.924616,1.0,0.011899,-0.036667,-0.066983,-0.108997,-0.071774,-0.079626,-0.446928,0.351084,-0.016662,0.358785
housing_median_age,-0.109357,0.011899,1.0,-0.360628,-0.320451,-0.295787,-0.302768,-0.118278,0.045553,-0.236968,0.017105,0.256149
total_rooms,0.04548,-0.036667,-0.360628,1.0,0.93038,0.857281,0.918992,0.197882,-0.003777,0.026477,-0.007603,-0.023647
total_bedrooms,0.069608,-0.066983,-0.320451,0.93038,1.0,0.877747,0.979728,-0.007723,0.018314,-0.006463,-0.004361,-0.019873
population,0.10027,-0.108997,-0.295787,0.857281,0.877747,1.0,0.907186,0.005087,0.07345,-0.019602,-0.010451,-0.06148
households,0.056513,-0.071774,-0.302768,0.918992,0.979728,0.907186,1.0,0.013434,0.041883,-0.038265,-0.009119,-0.01128
median_income,-0.01555,-0.079626,-0.118278,0.197882,-0.007723,0.005087,0.013434,1.0,0.168715,-0.237536,-0.009281,0.056677
oceanProx_<1H OCEAN,0.320831,-0.446928,0.045553,-0.003777,0.018314,0.07345,0.041883,0.168715,1.0,-0.607778,-0.013928,-0.314721
oceanProx_INLAND,-0.055337,0.351084,-0.236968,0.026477,-0.006463,-0.019602,-0.038265,-0.237536,-0.607778,1.0,-0.010681,-0.241356


## Modeling

In [9]:
# split for cross-val
trainPerc = 0.7
randSeed = 42
tran, test = house.select('id', 'label', 'features').randomSplit([trainPerc, 1.0 - trainPerc], seed=randSeed)

# talk
print('Training Cases')
tran.select('id').show()
print('Testing Cases')
test.select('id').show()

Training Cases
+---+
| id|
+---+
|  0|
|  1|
|  3|
|  4|
|  5|
|  7|
| 10|
| 11|
| 12|
| 16|
| 17|
| 18|
| 20|
| 22|
| 25|
| 26|
| 27|
| 31|
| 33|
| 36|
+---+
only showing top 20 rows

Testing Cases
+---+
| id|
+---+
|  2|
|  6|
|  8|
|  9|
| 13|
| 14|
| 15|
| 19|
| 21|
| 23|
| 24|
| 28|
| 29|
| 30|
| 32|
| 34|
| 35|
| 39|
| 42|
| 43|
+---+
only showing top 20 rows



In [13]:
''' set up the estimators & param grids '''
models = {}

'''# linear regression
linreg = LinearRegression()
params = (ParamGridBuilder().addGrid(linreg.elasticNetParam, [0.0, 0.25, 0.5, 0.75, 1.0]).build())
paramNames = ['elasticnetparam']
models['linear regression'] = [linreg, params, paramNames, None, None]

# random forest
ranfor = RandomForestRegressor(numTrees=20)
params = (ParamGridBuilder().addGrid(ranfor.maxBins, [20, 40, 80, 100])\
              .addGrid(ranfor.maxDepth, [5, 10, 30]).build())
paramNames = ['maxbins', 'maxdepth']
models['random forest'] = [ranfor, params, paramNames, None, None]'''

# gradient boosting trees
gradbst = GBTRegressor(maxIter=20)
params = (ParamGridBuilder().addGrid(gradbst.maxBins, [20, 40, 80, 100])\
              .addGrid(gradbst.maxDepth, [5, 10, 30]).build())
paramNames = ['maxbins', 'maxdepth']
models['gradient boost'] = [gradbst, params, paramNames, None, None]

In [14]:
''' run the models '''
# number of cv folds
folds = 5
# define the evaulation function
evl = RegressionEvaluator(metricName='rmse')

# iterate over models
for (model, stuff) in models.items():
    print('Cross Validator: %s'%model)
    # execute
    cv = CrossValidator(estimator=stuff[0], estimatorParamMaps=stuff[1], evaluator=evl, numFolds=folds)
    fitModel = cv.fit(house.select('features', 'label'))
    # get the best
    bestModel = fitModel.bestModel
    # evaluate performance on the test set
    testRMSE = evl.evaluate(bestModel.transform(test.select('features', 'label')))
    print('\tBest Model Test RMSE = %0.3f'%testRMSE)    
    # get best parameters
    bestParams = bestModel.extractParamMap()
    for (key, val) in bestParams.items():
        for parm in stuff[2]:
            if parm in key.name.lower():
                print('\t%s = %0.2f'%(key, val))
                break
    # save stuff
    models[model][3] = fitModel
    models[model][4] = testRMSE

Cross Validator: gradient boost


Exception ignored in: <function JavaWrapper.__del__ at 0x7f43b31e00d0>
Traceback (most recent call last):
  File "/home/ahowe42/spark-3.1.1-bin-hadoop2.7/python/pyspark/ml/wrapper.py", line 39, in __del__
    if SparkContext._active_spark_context and self._java_obj is not None:
AttributeError: 'GBTRegressor' object has no attribute '_java_obj'


	Best Model Test RMSE = 52606.504
	GBTRegressor_ecf8280cd689__maxBins = 80.00
	GBTRegressor_ecf8280cd689__maxDepth = 5.00


In [31]:
# look at linear regression coefficients
bm = models['linear regression'][3].bestModel
summ = bm.summary
summ.predictions.describe().withColumn('Diff', col('prediction') - col('label')).show(truncate=False)
print('Best Model RMSE = %0.3f'%summ.rootMeanSquaredError)

# get a nice model coefficients table
coefs = pd.concat([pd.DataFrame(index=['Intercept'], data=[bm.intercept], columns=['Coefficient']),
                   pd.DataFrame(index=features, data=bm.coefficients.toArray(), columns=['Coefficient'])])
coefs['Std. Error'] = bm.summary.coefficientStandardErrors
coefs['pValue'] = bm.summary.pValues
# make an absolute coef column temporarily for sorting
coefs['tmp'] = coefs['Coefficient'].abs() a
coefs.loc['Intercept', 'tmp'] = np.inf
coefs = coefs.sort_values(by='tmp', ascending=False).drop(columns='tmp')

# talk
display(coefs)

+-------+------------------+------------------+----------------------+
|summary|label             |prediction        |Diff                  |
+-------+------------------+------------------+----------------------+
|count  |20433             |20433             |0.0                   |
|mean   |206864.41315519012|206864.41315518811|-2.0081643015146255E-9|
|stddev |115435.66709858322|92813.7103944438  |-22621.95670413942    |
|min    |14999.0           |-644652.2567202692|-659651.2567202692    |
|max    |500001.0          |691986.8212334897 |191985.82123348967    |
+-------+------------------+------------------+----------------------+

Best Model RMSE = 68635.106


Unnamed: 0,Coefficient,Std. Error,pValue
Intercept,260051.2,10237.292715,0.0
population,-1354699.0,45311.420373,2.829537e-11
total_bedrooms,647984.7,38395.877301,0.0
median_income,569271.7,1569.525384,0.006421018
households,301722.9,4901.136491,0.0
longitude,-269202.5,9454.244133,0.0
total_rooms,-243506.7,44260.659646,0.0
latitude,-239787.4,2238.171037,0.0
oceanProx_ISLAND,148623.8,2175.788135,0.0001550527
housing_median_age,54698.52,31118.993572,5.329071e-15


In [32]:
# view feature importances for random forest
imports = models['random forest'][3].bestModel.featureImportances.toArray()
imports = pd.DataFrame(index=features, data=imports, columns=['Importance']).sort_values(by='Importance', ascending=False, inplace=False)
display(imports)

Unnamed: 0,Importance
median_income,0.372008
oceanProx_INLAND,0.166583
longitude,0.11812
latitude,0.107858
population,0.052327
housing_median_age,0.051503
total_rooms,0.040377
total_bedrooms,0.03762
households,0.032892
oceanProx_<1H OCEAN,0.013373


In [15]:
# view feature importances for gradient boost
imports = models['gradient boost'][3].bestModel.featureImportances.toArray()
imports = pd.DataFrame(index=features, data=imports, columns=['Importance']).sort_values(by='Importance', ascending=False, inplace=False)
display(imports)

Unnamed: 0,Importance
median_income,0.265431
longitude,0.176374
latitude,0.163965
population,0.089573
total_bedrooms,0.081516
housing_median_age,0.077146
oceanProx_INLAND,0.068559
oceanProx_NEAR BAY,0.03454
total_rooms,0.022317
oceanProx_<1H OCEAN,0.012465


In [None]:
sc.stop()