In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StringType,BooleanType,DateType, IntegerType
from pyspark.sql.functions import to_timestamp, upper, col
from pyspark.sql.functions import rank
from pyspark.sql.window import Window
from pyspark.sql.functions import *
from pyspark.ml.feature import StringIndexer, OneHotEncoder, Tokenizer, StopWordsRemover, Word2Vec, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.sql.types import StringType, DoubleType 
from pyspark.ml.linalg import Vectors

from pyspark.ml.evaluation import RegressionEvaluator

from pyspark.ml.clustering import KMeans

import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Resource settings are passed to the builder. Once getOrCreate() has started
# the SparkContext, its SparkConf has been cloned, so mutating
# `sparkContext._conf` afterwards does not reconfigure the running application.
# NOTE(review): if a session already exists in this kernel, getOrCreate()
# returns it and these static settings are ignored; driver memory in particular
# only applies when no JVM has been launched yet.
# The previous 'spark.app.name' override ('Spark Updated Conf') is dropped: it
# contradicted appName() and could not rename the already-running application.
spark = (
    SparkSession.builder
    .appName('BigDataProject')
    .config('spark.executor.memory', '4g')
    .config('spark.executor.cores', '4')
    .config('spark.cores.max', '4')
    .config('spark.driver.memory', '4g')
    .getOrCreate()
)

# Kept so that any cell reading `conf` still finds a SparkConf object.
conf = spark.sparkContext.getConf()

# Render DataFrames eagerly when they are the last expression of a cell.
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

In [23]:
# Load the prepared dataset; the first row holds column names and column
# types are inferred from the data.
raw_df = spark.read.csv(
    "/user/abhiniti/project/mod_df_3.9.csv",
    header=True,
    inferSchema=True,
)

In [43]:
# Bind a working name to the loaded data; later cells reassign `df`, while
# `raw_df` keeps pointing at the DataFrame as read from the CSV.
df = raw_df

In [46]:
# Convert city, state and zip to numeric categorical
# For each column: StringIndexer maps values to an index column, then
# OneHotEncoder turns that index into a sparse vector column.
for y in ['city', 'state', 'zip5']:
    print(y)
    index_col = y + "Index"
    vector_col = y + "Vector"
    # Drop outputs left over from a previous run of this cell so it can be
    # re-executed; dropping a column that does not exist is a no-op, so the
    # first run is unaffected.
    df = df.drop(index_col, vector_col)
    indexer = StringIndexer(inputCol=y, outputCol=index_col)
    ohe = OneHotEncoder(inputCol=index_col, outputCol=vector_col)
    df = indexer.fit(df).transform(df)
    # NOTE(review): calling transform() directly relies on OneHotEncoder being
    # a Transformer (Spark 2.x); in Spark 3.x it is an Estimator and needs
    # fit() first - confirm the target Spark version.
    df = ohe.transform(df)

city
state
zip5


In [47]:
# Print the column names and types of the working DataFrame.
df.printSchema()

root
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- physical_address: string (nullable = true)
 |-- zip5: integer (nullable = true)
 |-- property_type: string (nullable = true)
 |-- rank: integer (nullable = true)
 |-- sale_date: timestamp (nullable = true)
 |-- sale_price: double (nullable = true)
 |-- num_units: integer (nullable = true)
 |-- year_built: integer (nullable = true)
 |-- num_sales: integer (nullable = true)
 |-- zip_num_schools: integer (nullable = true)
 |-- city_num_schools: integer (nullable = true)
 |-- zip_st_ratio: double (nullable = true)
 |-- city_st_ratio: double (nullable = true)
 |-- CPIHOSNS: double (nullable = true)
 |-- ZVHI: double (nullable = true)
 |-- Median_Income: double (nullable = true)
 |-- sum(count): integer (nullable = true)
 |-- avg(Hospital overall rating): double (nullable = true)
 |-- CHILDREN_CNT: integer (nullable = true)
 |-- CHILDREN_RATE: double (nullable = true)
 |-- CHRONIC DISEASE_CNT: integer (nullabl

### Modeling

In [9]:
from pyspark.ml.feature import VectorAssembler

In [48]:
# Keep only rows with a positive previous sale price, then replace nulls in
# numeric columns with 0.
lin_reg_df = df.filter(df['prev_sale_price'] > 0).na.fill(0)
# Days elapsed between the previous sale and this sale.
# (This statement used to appear twice in a row; computing it once is enough.)
lin_reg_df = lin_reg_df.withColumn('last_sale_days', datediff('sale_date', col("prev_sale_date")))
# Remove any commas from the textual form of the value, then cast to decimal(12,2).
lin_reg_df = lin_reg_df.withColumn('year_built', F.regexp_replace('year_built', ',', '').cast('decimal(12,2)'))
lin_reg_df = lin_reg_df.withColumn('sale_price', F.regexp_replace('sale_price', ',', '').cast('decimal(12,2)')).na.fill(0)
# Restrict the target to sale prices between 100,000 and 1,000,000 inclusive.
lin_reg_df = lin_reg_df.filter(col('sale_price') <= 1000000).filter(col('sale_price') >= 100000)

In [53]:
# Print the schema of the modelling DataFrame to check the column types
# produced by the casts (e.g. sale_price and year_built as decimal(12,2)).
lin_reg_df.printSchema()

root
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- physical_address: string (nullable = true)
 |-- zip5: integer (nullable = true)
 |-- property_type: string (nullable = true)
 |-- rank: integer (nullable = true)
 |-- sale_date: timestamp (nullable = true)
 |-- sale_price: decimal(12,2) (nullable = true)
 |-- num_units: integer (nullable = true)
 |-- year_built: decimal(12,2) (nullable = true)
 |-- num_sales: integer (nullable = true)
 |-- zip_num_schools: integer (nullable = true)
 |-- city_num_schools: integer (nullable = true)
 |-- zip_st_ratio: double (nullable = false)
 |-- city_st_ratio: double (nullable = false)
 |-- CPIHOSNS: double (nullable = false)
 |-- ZVHI: double (nullable = false)
 |-- Median_Income: double (nullable = false)
 |-- sum(count): integer (nullable = true)
 |-- avg(Hospital overall rating): double (nullable = false)
 |-- CHILDREN_CNT: integer (nullable = true)
 |-- CHILDREN_RATE: double (nullable = false)
 |-- CHRONIC DISEASE_C

In [54]:
# predictors = ['year_built', 'prev_sale_price', 'last_sale_days', 'CPIHOSNS', 'cityVector', 'stateVector', 'zip5Vector']
# lin_reg_df = lin_reg_df.select(predictors).printSchema()
# lin_reg_df.select(predictors).show(1,vertical=True)

In [80]:
# Predictor columns fed to the models; the one-hot city/state/zip vectors are
# not part of this list.
predictors = ['year_built', 'prev_sale_price', 'last_sale_days', 'CPIHOSNS']
vectorAssembler = VectorAssembler(outputCol='features', inputCols=predictors)
# Assemble the feature vector and keep only the model inputs: features + target.
vinput_data = vectorAssembler.transform(lin_reg_df).select('features', 'sale_price')
vinput_data.show(1)

+--------------------+----------+
|            features|sale_price|
+--------------------+----------+
|[2003.0,7303573.0...| 358000.00|
+--------------------+----------+
only showing top 1 row



In [81]:
# Train/Test Split: weights of 0.7 / 0.3 with a fixed seed.
train_df, test_df = vinput_data.randomSplit(weights=[0.7, 0.3], seed=1234)

### Gradient Boost

In [57]:
from pyspark.ml.regression import GBTRegressor

# Gradient-boosted trees regressor on the assembled features, capped at
# 5 boosting iterations.
gbt = GBTRegressor(
    maxIter=5,
    labelCol='sale_price',
    featuresCol='features',
)
gbt_model = gbt.fit(train_df)

In [58]:
# Score both splits with the fitted GBT model; transform() adds a
# `prediction` column that the evaluators below read.
gbt_predictions_train, gbt_predictions_test = (
    gbt_model.transform(train_df),
    gbt_model.transform(test_df),
)

In [59]:
## Train data
# RMSE of the GBT predictions against the actual sale price.
gbt_evaluator = RegressionEvaluator(
    labelCol="sale_price", predictionCol="prediction", metricName="rmse")
rmse = gbt_evaluator.evaluate(gbt_predictions_train)
print("Root Mean Squared Error (RMSE) on train data = %g" % rmse)

# R^2 gets its own variable so `rmse` is not overwritten with a different metric.
gbt_evaluator = RegressionEvaluator(
    labelCol="sale_price", predictionCol="prediction", metricName="r2")
r2 = gbt_evaluator.evaluate(gbt_predictions_train)
print("R Squared (R2) on train data = %g" % r2)

Root Mean Squared Error (RMSE) on train data = 130769
R Squared (R2) on train data = 0.577343


In [91]:
## Train data
# MAE of the GBT predictions; stored under its own name rather than `rmse`.
gbt_evaluator = RegressionEvaluator(
    labelCol="sale_price", predictionCol="prediction", metricName="mae")
mae = gbt_evaluator.evaluate(gbt_predictions_train)
print("Mean Absolute Error (MAE) on train data = %g" % mae)

# MSE of the GBT predictions; likewise given a name that matches the metric.
gbt_evaluator = RegressionEvaluator(
    labelCol="sale_price", predictionCol="prediction", metricName="mse")
mse = gbt_evaluator.evaluate(gbt_predictions_train)
print("Mean Squared Error (MSE) on train data = %g" % mse)

Mean Absolute Error (MAE) on train data = 92808.5
Mean Squared Error (MSE) on train data = 1.71214e+10


In [60]:
## Test data
# RMSE of the GBT predictions on the held-out split.
gbt_evaluator = RegressionEvaluator(
    labelCol="sale_price", predictionCol="prediction", metricName="rmse")
rmse = gbt_evaluator.evaluate(gbt_predictions_test)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

# R^2 gets its own variable so `rmse` is not overwritten with a different metric.
gbt_evaluator = RegressionEvaluator(
    labelCol="sale_price", predictionCol="prediction", metricName="r2")
r2 = gbt_evaluator.evaluate(gbt_predictions_test)
print("R Squared (R2) on test data = %g" % r2)

Root Mean Squared Error (RMSE) on test data = 130836
R Squared (R2) on test data = 0.578828


In [61]:
# Exploratory: list the attributes exposed by the fitted GBT model.
dir(gbt_model)

['__class__',
 '__del__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__metaclass__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_call_java',
 '_clear',
 '_copyValues',
 '_copy_params',
 '_create_from_java_class',
 '_create_params_from_java',
 '_defaultParamMap',
 '_dummy',
 '_empty_java_param_map',
 '_from_java',
 '_java_obj',
 '_make_java_param_pair',
 '_new_java_array',
 '_new_java_obj',
 '_paramMap',
 '_params',
 '_randomUID',
 '_resetUid',
 '_resolveParam',
 '_set',
 '_setDefault',
 '_shouldOwn',
 '_to_java',
 '_transfer_param_map_from_java',
 '_transfer_param_map_to_java',
 '_transfer_params_from_java',
 '_transfer_params_to_java',
 '_transform',
 'cacheNodeIds',
 'checkpointInterval',
 'copy',
 'evaluateEachIte

In [62]:
# gbt_model.featureImportances
# gbt_model.stages[-1].featureImportances

In [63]:
def ExtractFeatureImp(featureImp, dataset, featuresCol):
    """Build a table of feature names ranked by importance score.

    Parameters
    ----------
    featureImp : indexable
        Importance scores, looked up by integer feature index.
    dataset : object
        Object whose ``schema[featuresCol].metadata["ml_attr"]["attrs"]`` maps
        attribute-group names to lists of ``{"idx": ..., "name": ...}`` records.
    featuresCol : str
        Name of the assembled features column in ``dataset``.

    Returns
    -------
    pandas.DataFrame
        One row per feature attribute with an added ``score`` column, sorted
        by ``score`` in descending order.
    """
    attr_groups = dataset.schema[featuresCol].metadata["ml_attr"]["attrs"]

    # Flatten the per-group attribute lists into one list of records.
    records = []
    for group_name in attr_groups:
        records.extend(attr_groups[group_name])

    importance_table = pd.DataFrame(records)
    # Each record's `idx` is its position in the feature vector, which is also
    # the position of its score in `featureImp`.
    importance_table['score'] = importance_table['idx'].apply(
        lambda position: featureImp[position]
    )
    return importance_table.sort_values('score', ascending=False)


In [64]:
# Rank the columns packed into `features` by the fitted GBT model's
# importance scores (highest first).
ExtractFeatureImp(gbt_model.featureImportances, train_df, "features")

Unnamed: 0,idx,name,score
1,1,prev_sale_price,0.319923
3,3,CPIHOSNS,0.301491
4,4,cityVector_LOS ANGELES,0.097351
2,2,last_sale_days,0.079623
0,0,year_built,0.058031
10,10,cityVector_LANCASTER,0.030725
9,9,cityVector_PALMDALE,0.027285
7,7,cityVector_MORENO VALLEY,0.021760
17,17,cityVector_HEMET,0.021156
683,683,zip5Vector_90266,0.009718


### Linear Regression

In [82]:
# The GLR below is built without a labelCol argument, so give the training
# frame a target column called 'label'.
train_df_lin = train_df.withColumnRenamed(existing='sale_price', new='label')

In [83]:
from pyspark.ml.regression import GeneralizedLinearRegression

# Gaussian family with an identity link and no regularisation.
glr = GeneralizedLinearRegression(
    family="gaussian",
    link="identity",
    maxIter=10,
    regParam=0.0,
)
model = glr.fit(train_df_lin)

In [84]:
# summary = model.summary
# print("Coefficients: " + str(model.coefficients))
# print("P Values: " + str(summary.pValues))
# dir(model.summary)

In [85]:
# p-values from the fitted GLR's training summary.
# NOTE(review): there is one more value than there are predictors; presumably
# the last entry belongs to the intercept - confirm against the Spark docs.
model.summary.pValues

[2.903939311238446e-10, 0.0, 0.0, 0.0, 2.234500584563648e-08]

In [87]:
# Score both splits with the fitted GLR. The original (un-renamed) frames are
# used so the `sale_price` column is still present for the evaluators below.
linreg_predictions_train, linreg_predictions_test = (
    model.transform(train_df),
    model.transform(test_df),
)

In [None]:
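## with city, state, zip
## NOTE(review): the metrics printed below presumably come from an earlier run
## whose `predictors` list also included cityVector, stateVector and zip5Vector;
## the `predictors` list currently defined above does not contain them.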

In [74]:
## Train data
# RMSE of the linear-model predictions against the actual sale price.
linreg_evaluator = RegressionEvaluator(
    labelCol="sale_price", predictionCol="prediction", metricName="rmse")
rmse = linreg_evaluator.evaluate(linreg_predictions_train)
print("Root Mean Squared Error (RMSE) on train data = %g" % rmse)

# R^2 gets its own variable so `rmse` is not overwritten with a different metric.
linreg_evaluator = RegressionEvaluator(
    labelCol="sale_price", predictionCol="prediction", metricName="r2")
r2 = linreg_evaluator.evaluate(linreg_predictions_train)
print("R Squared (R2) on train data = %g" % r2)

Root Mean Squared Error (RMSE) on train data = 140034
R Squared (R2) on train data = 0.515326


In [75]:
## Test data
# RMSE of the linear-model predictions on the held-out split.
linreg_evaluator = RegressionEvaluator(
    labelCol="sale_price", predictionCol="prediction", metricName="rmse")
rmse = linreg_evaluator.evaluate(linreg_predictions_test)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

# R^2 gets its own variable so `rmse` is not overwritten with a different metric.
linreg_evaluator = RegressionEvaluator(
    labelCol="sale_price", predictionCol="prediction", metricName="r2")
r2 = linreg_evaluator.evaluate(linreg_predictions_test)
print("R Squared (R2) on test data = %g" % r2)

Root Mean Squared Error (RMSE) on test data = 140339
R Squared (R2) on test data = 0.51542


In [86]:
### without city, state, zip

In [88]:
## Train data
# RMSE of the linear-model predictions against the actual sale price.
linreg_evaluator = RegressionEvaluator(
    labelCol="sale_price", predictionCol="prediction", metricName="rmse")
rmse = linreg_evaluator.evaluate(linreg_predictions_train)
print("Root Mean Squared Error (RMSE) on train data = %g" % rmse)

# R^2 gets its own variable so `rmse` is not overwritten with a different metric.
linreg_evaluator = RegressionEvaluator(
    labelCol="sale_price", predictionCol="prediction", metricName="r2")
r2 = linreg_evaluator.evaluate(linreg_predictions_train)
print("R Squared (R2) on train data = %g" % r2)

Root Mean Squared Error (RMSE) on train data = 187996
R Squared (R2) on train data = 0.127233


In [89]:
## Test data
# RMSE of the linear-model predictions on the held-out split.
linreg_evaluator = RegressionEvaluator(
    labelCol="sale_price", predictionCol="prediction", metricName="rmse")
rmse = linreg_evaluator.evaluate(linreg_predictions_test)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

# R^2 gets its own variable so `rmse` is not overwritten with a different metric.
linreg_evaluator = RegressionEvaluator(
    labelCol="sale_price", predictionCol="prediction", metricName="r2")
r2 = linreg_evaluator.evaluate(linreg_predictions_test)
print("R Squared (R2) on test data = %g" % r2)

Root Mean Squared Error (RMSE) on test data = 187625
R Squared (R2) on test data = 0.128691
