In [None]:
import datetime as dt

import numpy as np
import pandas as pd

import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.feature import VectorAssembler, OneHotEncoder, MinMaxScaler, StringIndexer
from pyspark.ml.clustering import *
from pyspark.ml.evaluation import *
from pyspark.ml.regression import GBTRegressor, LinearRegression, RandomForestRegressor
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.stat import Correlation
from pyspark.ml import Pipeline

In [None]:
# initialize
# Go through the session builder so the application name is part of the Spark
# configuration (assigning `appName` on an already-running context does not
# rename the application) and so that re-running this cell reuses the existing
# session instead of trying to construct a second SparkContext.
spark = SparkSession.builder.appName('cluster').getOrCreate()
sc = spark.sparkContext
# show the number of cores
# defaultParallelism is the public API for this; in local mode it is the number
# of cores Spark was given.  NOTE(review): on a cluster it is the default task
# parallelism rather than a literal core count.
print('%d cores'%sc.defaultParallelism)
spark

In [None]:
''' get the data '''
# load the data
fil = '../data/credict_card_data.csv'

# Explicit schema: a string customer id, float measures, and an integer tenure.
# The order of the names below must match the column order of the CSV file.
floatCols = ['BALANCE', 'BALANCE_FREQUENCY', 'PURCHASES', 'ONEOFF_PURCHASES', 'INSTALLMENTS_PURCHASES',
             'CASH_ADVANCE', 'PURCHASES_FREQUENCY', 'ONEOFF_PURCHASES_FREQUENCY',
             'PURCHASES_INSTALLMENTS_FREQUENCY', 'CASH_ADVANCE_FREQUENCY', 'CASH_ADVANCE_TRX',
             'PURCHASES_TRX', 'CREDIT_LIMIT', 'PAYMENTS', 'MINIMUM_PAYMENTS', 'PRC_FULL_PAYMENT']
schem = StructType([StructField('CUST_ID', StringType())] +
                   [StructField(colm, FloatType()) for colm in floatCols] +
                   [StructField('TENURE', IntegerType())])
cc = spark.read.format('csv').options(header=True).schema(schem).load(fil)

# talk
cnt = cc.count()
# report the row count (an int), not the DataFrame object itself
print('%d records'%cnt)
# preview the frame that was just loaded
cc.show(truncate=False)

## Data Prep

In [None]:
''' handle missing values '''
# check for missing values
# One aggregation counts the nulls of every column in a single Spark job
# (count() ignores the nulls produced by when() for non-matching rows).
totalRows = house.count()
nullRow = house.select([count(when(col(colm).isNull(), 1)).alias(colm) for colm in house.columns]).first()

# absolute and relative null frequency per column; guard against an empty frame
nullCountsDF = pd.DataFrame([(colm, nullRow[colm], nullRow[colm]/totalRows if totalRows else 0.0)
                             for colm in house.columns],
                            columns=['Column', 'Freq.', 'Rel. Freq.'])
nullCountsDF = nullCountsDF.sort_values('Rel. Freq.', ascending=False)

# attach each column's Spark data type for context
nullCountsDF = nullCountsDF.merge(pd.DataFrame([[colm.name, colm.dataType] for colm in house.schema], columns=['Column', 'Type']),
                                how='inner', on=['Column'])

# talk
display(nullCountsDF)

# remove every row that has a null in any column
house = house.dropna(how='any')

# talk some more
print('%d records'%house.count())

In [None]:
''' OHE the ocean_proximity var '''
# distribution of the categorical levels, sorted by level name
levelCounts = house.groupBy('ocean_proximity').count()
tab = levelCounts.toPandas().sort_values(by='ocean_proximity')
display(tab)

# step 1: map each level to an integer index, assigned in ascending alphabetical order
indx = StringIndexer(inputCol='ocean_proximity', outputCol='oceanProx_int', stringOrderType='alphabetAsc')
indexed = indx.fit(house).transform(house)

# step 2: one-hot encode the index, then discard the intermediate index column
ohe = OneHotEncoder(inputCol='oceanProx_int', outputCol='oceanProx')
encoded = ohe.fit(indexed).transform(indexed)
house = encoded.drop('oceanProx_int')

# names for the encoded slots - the last level gets no slot of its own
# (it is the case where every slot is 0)
featOHE = []
for level in tab['ocean_proximity'].values[:-1]:
    featOHE.append('oceanProx_%s'%level)
print(featOHE)

# talk
house.show(truncate=False)

In [None]:
# prepare the response: later cells read the target from a column named 'label'
house = house.withColumnRenamed(existing='median_house_value', new='label')

In [None]:
''' prepare the features '''
# get the features: everything except the raw categorical, the response and the id
excluded = ('ocean_proximity', 'label', 'id')
features = [c for c in house.columns if c not in excluded]

# create & scale the features vector
assr = VectorAssembler(inputCols=features, outputCol='features_raw')
scalr = MinMaxScaler(inputCol='features_raw', outputCol='features')
pipe = Pipeline(stages=[assr, scalr]).fit(house)
house = pipe.transform(house).drop('features_raw')

# Now update the features list so it names every slot of the assembled vector:
# the single 'oceanProx' entry is replaced, at its own position, by the OHE
# slot names.  Expanding in place keeps the names aligned with the vector
# without assuming that oceanProx is the last column.
expanded = []
for c in features:
    expanded.extend(featOHE if c == 'oceanProx' else [c])
features = expanded
print(features)

# talk
display(house.limit(10).toPandas())
house.select('id', 'features', 'label').show(truncate=False)
# fetch the first row once and print it
print('First row features = %s'%house.select('features').take(1)[0])

In [None]:
# check for multicollinearity
# high: total_bedrooms vs. households, population vs. households
corr = Correlation.corr(house, 'features', 'pearson')
# the result is a one-row, one-column frame holding the correlation matrix
corrMatrix = corr.head()[0].toArray()
corrdf = pd.DataFrame(data=corrMatrix, index=features, columns=features)
display(corrdf)

## Modeling

In [None]:
# split for cross-val
trainPerc = 0.7
randSeed = 42
splitWeights = [trainPerc, 1.0 - trainPerc]
modelCols = house.select('id', 'label', 'features')
tran, test = modelCols.randomSplit(splitWeights, seed=randSeed)

# talk: preview the ids that landed in each partition
for (title, part) in (('Training Cases', tran), ('Testing Cases', test)):
    print(title)
    part.select('id').show()

In [None]:
''' set up the estimators & param grids '''
# Registry of candidate models, keyed by display name.  Each value is a list:
#   [estimator, param grid, lowercase param-name fragments to report,
#    fitted cross-validator model (filled in later), test RMSE (filled in later)]
models = {}

# The string below is disabled code (a bare string expression, never executed):
# the linear regression and random forest candidates are currently switched off.
'''# linear regression
linreg = LinearRegression()
params = (ParamGridBuilder().addGrid(linreg.elasticNetParam, [0.0, 0.25, 0.5, 0.75, 1.0]).build())
paramNames = ['elasticnetparam']
models['linear regression'] = [linreg, params, paramNames, None, None]

# random forest
ranfor = RandomForestRegressor(numTrees=20)
params = (ParamGridBuilder().addGrid(ranfor.maxBins, [20, 40, 80, 100])\
              .addGrid(ranfor.maxDepth, [5, 10, 30]).build())
paramNames = ['maxbins', 'maxdepth']
models['random forest'] = [ranfor, params, paramNames, None, None]'''

# gradient boosting trees
gradbst = GBTRegressor(maxIter=20)
gridBuilder = ParamGridBuilder()
gridBuilder = gridBuilder.addGrid(gradbst.maxBins, [20, 40, 80, 100])
gridBuilder = gridBuilder.addGrid(gradbst.maxDepth, [5, 10, 30])
params = gridBuilder.build()
paramNames = ['maxbins', 'maxdepth']
models['gradient boost'] = [gradbst, params, paramNames, None, None]

In [None]:
''' run the models '''
# number of cv folds
folds = 5
# define the evaluation function
evl = RegressionEvaluator(metricName='rmse')

# Tune on the training split only.  Fitting on the full frame would let the
# held-out test rows take part in model selection and make the test RMSE
# optimistic.
trainData = tran.select('features', 'label')
testData = test.select('features', 'label')

# iterate over models
for (model, stuff) in models.items():
    print('Cross Validator: %s'%model)
    # execute; the seed fixes the fold assignment so reruns are comparable
    cv = CrossValidator(estimator=stuff[0], estimatorParamMaps=stuff[1], evaluator=evl,
                        numFolds=folds, seed=randSeed)
    fitModel = cv.fit(trainData)
    # get the best
    bestModel = fitModel.bestModel
    # evaluate performance on the test set
    testRMSE = evl.evaluate(bestModel.transform(testData))
    print('\tBest Model Test RMSE = %0.3f'%testRMSE)
    # get best parameters: report only those whose name contains one of the
    # lowercase fragments registered for this model
    bestParams = bestModel.extractParamMap()
    for (key, val) in bestParams.items():
        if any(parm in key.name.lower() for parm in stuff[2]):
            print('\t%s = %0.2f'%(key, val))
    # save stuff: slot 3 holds the fitted cross-validator, slot 4 the test RMSE
    models[model][3] = fitModel
    models[model][4] = testRMSE

In [None]:
# look at linear regression coefficients
# The model may be disabled in the setup cell or not fitted yet; report that
# instead of failing with a KeyError / AttributeError.
linregEntry = models.get('linear regression')
linregFit = linregEntry[3] if linregEntry else None
if linregFit is None:
    print('No fitted linear regression model - enable it in the model setup cell and re-run the fit.')
else:
    bm = linregFit.bestModel
    summ = bm.summary
    summ.predictions.describe().withColumn('Diff', col('prediction') - col('label')).show(truncate=False)
    print('Best Model RMSE = %0.3f'%summ.rootMeanSquaredError)

    # get a nice model coefficients table
    # The intercept row is appended LAST here because Spark reports the
    # intercept's standard error and p-value as the last element of those
    # lists (when an intercept is fitted); this keeps the positional
    # assignments below aligned row by row.
    # NOTE(review): Spark only provides these statistics for the 'normal' solver.
    coefs = pd.concat([pd.DataFrame(index=features, data=bm.coefficients.toArray(), columns=['Coefficient']),
                       pd.DataFrame(index=['Intercept'], data=[bm.intercept], columns=['Coefficient'])])
    coefs['Std. Error'] = summ.coefficientStandardErrors
    coefs['pValue'] = summ.pValues
    # make an absolute coef column temporarily for sorting
    coefs['tmp'] = coefs['Coefficient'].abs()
    # force the intercept to the top of the sorted table
    coefs.loc['Intercept', 'tmp'] = np.inf
    coefs = coefs.sort_values(by='tmp', ascending=False).drop(columns='tmp')

    # talk
    display(coefs)

In [None]:
# view feature importances for random forest
# The model may be disabled in the setup cell or not fitted yet; report that
# instead of failing with a KeyError / AttributeError.
ranforEntry = models.get('random forest')
ranforFit = ranforEntry[3] if ranforEntry else None
if ranforFit is None:
    print('No fitted random forest model - enable it in the model setup cell and re-run the fit.')
else:
    # one importance per slot of the features vector, most important first
    imports = ranforFit.bestModel.featureImportances.toArray()
    imports = pd.DataFrame(index=features, data=imports, columns=['Importance']).sort_values(by='Importance', ascending=False)
    display(imports)

In [None]:
# view feature importances for gradient boost
# slot 3 of the registry entry holds the fitted cross-validator model
gbtBest = models['gradient boost'][3].bestModel
imports = pd.DataFrame({'Importance': gbtBest.featureImportances.toArray()}, index=features)
# most important feature first
imports = imports.sort_values(by='Importance', ascending=False)
display(imports)

In [None]:
# Stop the SparkContext held in `sc`; no Spark work can run in this notebook after this cell.
sc.stop()