In [20]:
import pandas as pd
import numpy as np
import datetime as dt

import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *#avg, count, expr
from pyspark.sql.types import *
from pyspark.ml.feature import VectorAssembler, OneHotEncoder, MinMaxScaler
from pyspark.ml.regression import *
from pyspark.ml.evaluation import *
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.stat import Correlation

In [21]:
# initialize
sc = pyspark.SparkContext()
spark = SparkSession(sc)
spark.sparkContext.appName = 'regression'
# show the number of cores
print('%d cores'%spark._jsc.sc().getExecutorMemoryStatus().keySet().size())
spark

1 cores


In [22]:
# load the data
fil = '../data/housing.csv'
schem = StructType([StructField('longitude', FloatType()), StructField('latitude', FloatType()),
                    StructField('housing_median_age', FloatType()), StructField('total_rooms', FloatType()),
                    StructField('total_bedrooms', FloatType()), StructField('population', FloatType()),
                    StructField('households', FloatType()), StructField('median_income', FloatType()),
                    StructField('median_house_value', FloatType()), StructField('ocean_proximity', StringType())])

house = spark.read.format('csv').options(header=True).schema(schem).load(fil)
cnt = house.count()
print('%d records'%cnt)
house.show(truncate=False)

20640 records
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|ocean_proximity|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+
|-122.23  |37.88   |41.0              |880.0      |129.0         |322.0     |126.0     |8.3252       |452600.0          |NEAR BAY       |
|-122.22  |37.86   |21.0              |7099.0     |1106.0        |2401.0    |1138.0    |8.3014       |358500.0          |NEAR BAY       |
|-122.24  |37.85   |52.0              |1467.0     |190.0         |496.0     |177.0     |7.2574       |352100.0          |NEAR BAY       |
|-122.25  |37.85   |52.0              |1274.0     |235.0         |558.0     |219.0     |5.6431       |341300.0          |NEAR BAY       |
|-122.25  |37.85   |

In [23]:
''' handle missing values '''
# check for missing values
nullCounts = {colm:house.select(colm).where(col(colm).isNull()).count() for colm in house.columns}
nullCounts = {colm:(ncnt, ncnt/cnt) for (colm, ncnt) in nullCounts.items()}
nullCountsDF = pd.DataFrame(nullCounts).T.reset_index(drop=False).sort_values(1, ascending=False)
nullCountsDF.columns = ['Column', 'Freq.', 'Rel. Freq.']
nullCountsDF = nullCountsDF.merge(pd.DataFrame([[colm.name, colm.dataType] for colm in house.schema], columns=['Column', 'Type']),
                                how='inner', on=['Column'])

# talk
display(nullCountsDF)

# remove
house = house.dropna(how='any')

# talk some more
print('%d records'%house.count())

Unnamed: 0,Column,Freq.,Rel. Freq.,Type
0,total_bedrooms,207.0,0.010029,FloatType
1,longitude,0.0,0.0,FloatType
2,latitude,0.0,0.0,FloatType
3,housing_median_age,0.0,0.0,FloatType
4,total_rooms,0.0,0.0,FloatType
5,population,0.0,0.0,FloatType
6,households,0.0,0.0,FloatType
7,median_income,0.0,0.0,FloatType
8,median_house_value,0.0,0.0,FloatType
9,ocean_proximity,0.0,0.0,StringType


20433 records


In [24]:
''' OHE the ocean_proximity var '''
# first review the distribution
display(house.groupBy('ocean_proximity').count().toPandas())

# first need to string index ...
indx = StringIndexer(inputCol='ocean_proximity', outputCol='oceanProx_int')
house = indx.fit(house).transform(house)

# ... then we can encode
ohe = OneHotEncoder(inputCol='oceanProx_int', outputCol='oceanProx')
house = ohe.fit(house).transform(house).drop('oceanProx_int')

# talk
house.show(truncate=False)

Unnamed: 0,ocean_proximity,count
0,ISLAND,5
1,NEAR OCEAN,2628
2,NEAR BAY,2270
3,<1H OCEAN,9034
4,INLAND,6496


+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+-------------+
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|ocean_proximity|oceanProx    |
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+-------------+
|-122.23  |37.88   |41.0              |880.0      |129.0         |322.0     |126.0     |8.3252       |452600.0          |NEAR BAY       |(4,[3],[1.0])|
|-122.22  |37.86   |21.0              |7099.0     |1106.0        |2401.0    |1138.0    |8.3014       |358500.0          |NEAR BAY       |(4,[3],[1.0])|
|-122.24  |37.85   |52.0              |1467.0     |190.0         |496.0     |177.0     |7.2574       |352100.0          |NEAR BAY       |(4,[3],[1.0])|
|-122.25  |37.85   |52.0              |1274.0     |235.0         |558.0     |219.0     |

In [25]:
# prepare the response
house = house.withColumnRenamed('median_house_value', 'label')

In [26]:
''' prepare the features '''
# get the features
features = [c for c in house.columns if c not in (['ocean_proximity', 'label'])]
print(features)

# create the features vector
assr = VectorAssembler(inputCols=features, outputCol='features')
house = assr.transform(house)

# talk
display(house.limit(10).toPandas())
house.select('features', 'label').show(truncate=False)
house.select('features').take(1)
print('First row features = %s'%house.select('features').take(1)[0])

['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income', 'oceanProx']


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,label,ocean_proximity,oceanProx,features
0,-122.230003,37.880001,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,"(0.0, 0.0, 0.0, 1.0)","[-122.2300033569336, 37.880001068115234, 41.0,..."
1,-122.220001,37.860001,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,"(0.0, 0.0, 0.0, 1.0)","[-122.22000122070312, 37.86000061035156, 21.0,..."
2,-122.239998,37.849998,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,"(0.0, 0.0, 0.0, 1.0)","[-122.23999786376953, 37.849998474121094, 52.0..."
3,-122.25,37.849998,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,"(0.0, 0.0, 0.0, 1.0)","[-122.25, 37.849998474121094, 52.0, 1274.0, 23..."
4,-122.25,37.849998,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,"(0.0, 0.0, 0.0, 1.0)","[-122.25, 37.849998474121094, 52.0, 1627.0, 28..."
5,-122.25,37.849998,52.0,919.0,213.0,413.0,193.0,4.0368,269700.0,NEAR BAY,"(0.0, 0.0, 0.0, 1.0)","[-122.25, 37.849998474121094, 52.0, 919.0, 213..."
6,-122.25,37.84,52.0,2535.0,489.0,1094.0,514.0,3.6591,299200.0,NEAR BAY,"(0.0, 0.0, 0.0, 1.0)","[-122.25, 37.84000015258789, 52.0, 2535.0, 489..."
7,-122.25,37.84,52.0,3104.0,687.0,1157.0,647.0,3.12,241400.0,NEAR BAY,"(0.0, 0.0, 0.0, 1.0)","[-122.25, 37.84000015258789, 52.0, 3104.0, 687..."
8,-122.260002,37.84,42.0,2555.0,665.0,1206.0,595.0,2.0804,226700.0,NEAR BAY,"(0.0, 0.0, 0.0, 1.0)","[-122.26000213623047, 37.84000015258789, 42.0,..."
9,-122.25,37.84,52.0,3549.0,707.0,1551.0,714.0,3.6912,261100.0,NEAR BAY,"(0.0, 0.0, 0.0, 1.0)","[-122.25, 37.84000015258789, 52.0, 3549.0, 707..."


+--------------------+--------+
|            features|   label|
+--------------------+--------+
|[-122.23000335693...|452600.0|
|[-122.22000122070...|358500.0|
|[-122.23999786376...|352100.0|
|[-122.25,37.84999...|341300.0|
|[-122.25,37.84999...|342200.0|
|[-122.25,37.84999...|269700.0|
|[-122.25,37.84000...|299200.0|
|[-122.25,37.84000...|241400.0|
|[-122.26000213623...|226700.0|
|[-122.25,37.84000...|261100.0|
|[-122.26000213623...|281500.0|
|[-122.26000213623...|241800.0|
|[-122.26000213623...|213500.0|
|[-122.26000213623...|191300.0|
|[-122.26000213623...|159200.0|
|[-122.26000213623...|140000.0|
|[-122.26999664306...|152500.0|
|[-122.26999664306...|155500.0|
|[-122.26000213623...|158700.0|
|[-122.26999664306...|162900.0|
+--------------------+--------+
only showing top 20 rows

First row features = [-122.2300033569336,37.880001068115234,41.0,880.0,129.0,322.0,126.0,8.325200080871582,0.0,0.0,0.0,1.0]


In [None]:
''' split for cross-val '''
trainPerc = 0.7
randSeed = 42
trainASD, testASD = asdML.randomSplit([trainPerc, 1.0 - trainPerc], seed=randSeed)

# talk
print('Training Cases')
trainASD.select('Case_No').show()
print('Testing Cases')
testASD.select('Case_No').show()

### Logistic Regression

In [None]:
''' try logistic regression - using predefined train/test split '''
# create objects
#auc = BinaryClassificationEvaluator(metricName='areaUnderROC')
acc = MulticlassClassificationEvaluator(metricName='accuracy')
logreg = LogisticRegression()

# train & eval
fitModel = logreg.fit(trainASD.select('features', 'label'))
trainRes = fitModel.evaluate(trainASD.select('features', 'label'))
trainAcc = acc.evaluate(trainRes.predictions)

# now evaluate test accuracy
testRes = fitModel.transform(testASD.select('features', 'label'))
testAcc = acc.evaluate(testRes)

print('Train Accuracy = %0.3f, Test Accurcy = %0.3f'%(trainAcc, testAcc))

In [None]:
''' now use the builtin cross-validator '''
#estimator
logreg = LogisticRegression()
# parameters grid
params = (ParamGridBuilder().addGrid(logreg.threshold, [0.4, 0.5, 0.6]).addGrid(logreg.elasticNetParam, [0.0, 0.25, 0.5, 0.75, 1.0]).build())
# cross-validator
cv = CrossValidator(estimator=logreg, estimatorParamMaps=params, evaluator=acc, numFolds=5)

# run
fitModel = cv.fit(trainASD.select('features', 'label'))

In [None]:
bestModel = fitModel.bestModel

# best model parameters
print('Threshold = %0.3f, Elastic Net = %0.3f'%(bestModel.getThreshold(), bestModel.getElasticNetParam()))

# show performance of best model
testAcc = acc.evaluate(bestModel.transform(testASD.select('features', 'label')))
print('Best Model Test Accuracy = %0.3f'%testAcc)

In [None]:
# only available for logistic regression
summ = bestModel.summary
summ.predictions.describe().show()
summ.objectiveHistory

print('Summary Information %s'%[am for am in dir(summ) if am[0] != '_'])

### Decision Tree

In [None]:
''' now use the builtin cross-validator '''
#estimator
estim = DecisionTreeClassifier()
# parameters grid
params = (ParamGridBuilder().addGrid(estim.maxBins, [20, 40, 80, 100])\
              .addGrid(estim.maxDepth, [5, 10, 30]).build())
# cross-validator
cv = CrossValidator(estimator=estim, estimatorParamMaps=params, evaluator=acc, numFolds=5)

# run
fitModel = cv.fit(trainASD.select('features', 'label'))

In [None]:
bestModel = fitModel.bestModel

# best model parameters
print('Max Quant Bins = %0.3f, Max Depth = %0.3f'%(bestModel.getMaxBins(), bestModel.getMaxDepth()))

# show performance of best model
testAcc = acc.evaluate(bestModel.transform(testASD.select('features', 'label')))
print('Best Model Test Accuracy = %0.3f'%testAcc)

# view feature importances
imports = bestModel.featureImportances.toArray()
for (col, imp) in zip(inputColumns, imports):
    print('%s feature importance = %0.3f'%(col, imp))

In [None]:
# what is available in the model object
print([i for i in dir(bestModel) if i[0] != '_'])

### Random Forest

In [None]:
''' now use the builtin cross-validator '''
#estimator
estim = RandomForestClassifier(numTrees=20)
# parameters grid
params = (ParamGridBuilder().addGrid(estim.maxBins, [20, 40, 80, 100])\
              .addGrid(estim.maxDepth, [5, 10, 30]).build())
# cross-validator
cv = CrossValidator(estimator=estim, estimatorParamMaps=params, evaluator=acc, numFolds=5)

# run
fitModel = cv.fit(trainASD.select('features', 'label'))

In [None]:
bestModel = fitModel.bestModel

# best model parameters
print('Max Quant Bins = %0.3f, Max Depth = %0.3f'%(bestModel.getMaxBins(), bestModel.getMaxDepth()))

# show performance of best model
testAcc = acc.evaluate(bestModel.transform(testASD.select('features', 'label')))
print('Best Model Test Accuracy = %0.3f'%testAcc)

# view feature importances
imports = bestModel.featureImportances.toArray()
for (col, imp) in zip(inputColumns, imports):
    print('%s feature importance = %0.3f'%(col, imp))

In [None]:
# what is available in the model object
print([i for i in dir(bestModel) if i[0] != '_'])

In [19]:
sc.stop()