In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StringType,BooleanType,DateType, IntegerType
from pyspark.sql.functions import to_timestamp, upper, col
from pyspark.sql.functions import rank
from pyspark.sql.window import Window
from pyspark.sql.functions import *
from pyspark.ml.feature import StringIndexer, OneHotEncoder, Tokenizer, StopWordsRemover, Word2Vec, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.sql.types import StringType, DoubleType 
from pyspark.ml.linalg import Vectors

import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
spark = SparkSession.builder.appName('BigDataProject').getOrCreate()

#change configuration settings on Spark 
conf = spark.sparkContext._conf.setAll([('spark.executor.memory', '4g'), ('spark.app.name', 'Spark Updated Conf'), ('spark.executor.cores', '4'), ('spark.cores.max', '4'), ('spark.driver.memory','4g')])
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

# SECTION 1 - Read Data

In [3]:
raw_df = spark.read.csv("/user/efischbein/data/group_project/us_housing_prices", inferSchema=True, header=True)
Public_Schools = spark.read.csv("/user/efischbein/data/group_project/Public_Schools.csv", \
    inferSchema=True, header=True)
CPI = spark.read.csv("/user/efischbein/data/group_project/CPIHOSNS.csv", inferSchema=True, header=True)
ZVHI = spark.read.csv("/user/efischbein/data/group_project/ZHVI_cln.csv", inferSchema=True, header=True)
Income = spark.read.csv("/user/efischbein/data/group_project/Census_Income", inferSchema=True, header=True)
pre_2000 = spark.read.csv("/user/efischbein/data/group_project/county_pre_2000_data_cln.csv", header=True)
hospital_ratings = spark.read.csv('/user/efischbein/data/group_project/hospital/Hospital_General_Information.csv', \
    header = True)
hospital = spark.read.csv('/user/efischbein/data/group_project/hospital/hospitals.csv', header = True)
crime = spark.read.csv('/user/efischbein/data/group_project/crime_data_w_population_and_crime_rate.csv',header = True)
CountyCrossWalk_Zillow = spark.read.csv('/user/efischbein/data/group_project/zillow_econ/CountyCrossWalk_Zillow.csv',\
    header = True)

# SECTION 2 - Data Cleanup
1. drop extra columns (seller_name, buyer_name, source_url, book, page)
2. Dates - limited from 1970-2021
3. Sale Price - not null, greater than 0
4. City, State - remove null
5. Num. Units -  must be 1
6. Property Type - remove condos, mobile homes
7. Num. Sales - must be greater than 1

In [None]:
#Data Cleanup - drop extra columns
df = raw_df.drop("seller_name", "buyer_name", "source_url", "book", "page")

#Data Cleanup - Dates: 1970 - 2021
df = df.withColumn('sale_date', F.to_date(F.unix_timestamp('sale_date', 'yyyy-MM-dd').cast('timestamp')))
df = df.filter((F.year(col("sale_date"))<=2021) & (F.year(col("sale_date"))>=1970))

#Data Cleanup - Sale Price
df = df.withColumn('sale_price', df.sale_price.cast('float'))
df = df.filter(col("sale_price").isNotNull()).filter(col("sale_price")>0)

#Data Cleanup - remove null cities & states
df = df.filter((col("city").isNotNull()) & (col("state").isNotNull()) & \
    (col("physical_address").isNotNull()) & (col("zip5").isNotNull()))

#Data Cleanup - View number of units, want to only be 1
df = df.filter(col('num_units')==1)

#Data Cleanup - filter out condos, mobile homes, rentals
df = df.where(~ col("property_type").like("%CONDO%"))
df = df.where(~ col("property_type").like("%MOBILE%HOME%"))
df = df.where(~ col("property_type").like("%RENTALS%"))

#Data Cleanup - filter dataframe based on window function to only show properties with only 1 sale
windowSpec = Window().partitionBy(['city','state', 'physical_address', 'zip5']).rangeBetween(Window.unboundedPreceding, Window.unboundedFollowing)
df = df.withColumn("num_sales", F.count(col('city')).over(windowSpec)).filter(col('num_sales') > 1)


In [5]:
df.describe()

summary,state,zip5,physical_address,city,county,property_id,property_type,sale_price,num_units,year_built,num_sales
count,4475696,4475696.0,4475696,4475696,4475696,4409835,4475696,4475696.0,4475696.0,4309742.0,4475696.0
mean,,78918.27365598557,0.0,,,3.052766666829644E16,,659333.053226582,1.0,1971.436376005803,3.7982615441263214
stddev,,25360.736490400435,0.0,,,8.301656420043518...,,9080130.91264043,0.0,26.98456629791822,2.76297146640094
min,CA,2467.0,0,ACTON,BRONX COUNTY,0000025,"""""""LO17 PLZ """"""""C...",1.0,1.0,1776.0,2.0
max,WA,98282.0,YOUNG AVENUE,ZEPHYRHILLS,Wake,U-35-28-17-0CH-00...,WELLINGTON WOODS ...,2800000000.0,1.0,2022.0,214.0


In [6]:
# row count = 4,475,696

# SECTION 3 - Join Predictive Data Features

### Public Schools

In [None]:
#Public schools - First predictor variable - raw count by zip code
num_schools_zip = Public_Schools.groupby('ZIP').count()

#join predictor to base table
df = df.join(num_schools_zip, df['zip5'] == num_schools_zip['ZIP'], 'left').\
    select(df["*"],num_schools_zip["count"]).na.fill(0).withColumnRenamed('count','zip_num_schools')

#Public schools - Second predictor variable - number of schools by city
num_schools_city = Public_Schools.groupby('city', 'state').count()

#join predictor to base table
df = df.join(num_schools_city, (df['city'] == num_schools_city['city']) & \
     (df['state'] == num_schools_city['state']), 'left').\
     select(df["*"],num_schools_city["count"]).na.fill(0).withColumnRenamed('count','city_num_schools')

#Public schools - Third predictor variable - Student Teacher Ratio by Zip
st_ratio_zip = Public_Schools.filter((col('ENROLLMENT') != -1) & (col('FT_TEACHER') != -1))
st_ratio_zip = st_ratio_zip.groupby('ZIP').agg((F.sum(st_ratio_zip.ENROLLMENT) / F.sum(st_ratio_zip.FT_TEACHER)).alias('zip_st_ratio'))
#join predictor to base table
df = df.join(st_ratio_zip, df['zip5'] == st_ratio_zip['ZIP'], 'left').\
    select(df["*"],st_ratio_zip["zip_st_ratio"])
    
#Public schools - Fourth predictor variable - Student Teacher Ratio by City
st_ratio_city = Public_Schools.filter((col('ENROLLMENT') != -1) & (col('FT_TEACHER') != -1))
st_ratio_city = st_ratio_city.groupby('city', 'state').agg((F.sum(st_ratio_city.ENROLLMENT) / F.sum(st_ratio_city.FT_TEACHER)).alias('city_st_ratio'))
#join predictor to base table
df = df.join(st_ratio_city, (df['city'] == st_ratio_city['city']) & \
    (df['state'] == st_ratio_city['state']), 'left').\
    select(df["*"],st_ratio_city["city_st_ratio"]).na.fill(0)


In [8]:
df.describe()

summary,state,zip5,physical_address,city,county,property_id,property_type,sale_price,num_units,year_built,num_sales,zip_num_schools,city_num_schools,zip_st_ratio,city_st_ratio
count,4475696,4475696.0,4475696,4475696,4475696,4409835,4475696,4475696.0,4475696.0,4309742.0,4475696.0,4475696.0,4475696.0,4475696.0,4475696.0
mean,,78918.27365598557,0.0,,,3.052766666829643...,,659333.053226582,1.0,1971.436376005803,3.7982615441263214,9.988614508223971,120.0276138057634,22.884661353843978,21.87006647716683
stddev,,25360.73649040043,0.0,,,8.301656420043518...,,9080130.912640434,0.0,26.984566297918217,2.7629714664009404,6.147428170273346,207.15393653895927,6.533804150641172,5.769448024528116
min,CA,2467.0,0,ACTON,BRONX COUNTY,0000025,"""""""LO17 PLZ """"""""C...",1.0,1.0,1776.0,2.0,0.0,0.0,0.0,0.0
max,WA,98282.0,YOUNG AVENUE,ZEPHYRHILLS,Wake,U-35-28-17-0CH-00...,WELLINGTON WOODS ...,2800000000.0,1.0,2022.0,214.0,46.0,587.0,360.0,40.125


### Housing CPI

In [None]:
# Create date columns
CPI = CPI.withColumn('month', F.month(col('DATE')))
CPI = CPI.withColumn('year', F.year(col('DATE')))
#join predictor to base table
df = df.join(CPI, (F.month(df['sale_date']) == CPI['month']) & \
    (F.year(df['sale_date']) == CPI['year']), 'left').\
    select(df["*"],CPI["CPIHOSNS"])


In [10]:
df.describe()

summary,state,zip5,physical_address,city,county,property_id,property_type,sale_price,num_units,year_built,num_sales,zip_num_schools,city_num_schools,zip_st_ratio,city_st_ratio,CPIHOSNS
count,4475696,4475696.0,4475696,4475696,4475696,4409835,4475696,4475696.0,4475696.0,4309742.0,4475696.0,4475696.0,4475696.0,4475696.0,4475696.0,4475696.0
mean,,78918.27365598557,0.0,,,3.052766666829642...,,659333.053226582,1.0,1971.436376005803,3.7982615441263214,9.988614508223971,120.0276138057634,22.884661353843978,21.870066477166844,185.14688744052464
stddev,,25360.736490400417,0.0,,,8.301656420043518...,,9080130.912640426,0.0,26.984566297918217,2.762971466400941,6.147428170273351,207.15393653895936,6.533804150641171,5.769448024528117,52.39264001611369
min,CA,2467.0,0,ACTON,BRONX COUNTY,0000025,"""""""LO17 PLZ """"""""C...",1.0,1.0,1776.0,2.0,0.0,0.0,0.0,0.0,35.1
max,WA,98282.0,YOUNG AVENUE,ZEPHYRHILLS,Wake,U-35-28-17-0CH-00...,WELLINGTON WOODS ...,2800000000.0,1.0,2022.0,214.0,46.0,587.0,360.0,40.125,287.511


### ZVHI

In [None]:
#Create date column
ZVHI = ZVHI.withColumn('Date', F.to_date(F.unix_timestamp('Date', 'MM/dd/yyyy').cast('timestamp')))

#filter out nationwide records
ZVHI = ZVHI.filter((ZVHI['RegionType'] == 'Msa') & (ZVHI['ZVHI'] > 0)) \
    .groupBy('StateName', 'Date').agg(F.mean(col('ZVHI')) \
    .alias('ZVHI')).orderBy('StateName', 'Date')

    #add month, year columns
ZVHI = ZVHI.withColumn('month', F.month(col('DATE')))
ZVHI = ZVHI.withColumn('year', F.year(col('DATE')))

#ZVHI - join on city name
df = df.join(ZVHI, \
    (F.month(df['sale_date']) == ZVHI['month']) & (F.year(df['sale_date']) == ZVHI['year']) \
    & (df['state'] == ZVHI['StateName']), 'left') \
    .na.fill(0).drop("year", "month", "Date")

In [12]:
df.describe()

summary,state,zip5,physical_address,city,county,property_id,property_type,sale_price,num_units,year_built,num_sales,zip_num_schools,city_num_schools,zip_st_ratio,city_st_ratio,CPIHOSNS,StateName,ZVHI
count,4475696,4475696.0,4475696,4475696,4475696,4409835,4475696,4475696.0,4475696.0,4309742.0,4475696.0,4475696.0,4475696.0,4475696.0,4475696.0,4475696.0,2819236,4475696.0
mean,,78918.27365598557,0.0,,,3.052766666829642...,,659333.053226582,1.0,1971.436376005803,3.7982615441263214,9.988614508223971,120.0276138057634,22.884661353843995,21.87006647716684,185.1468874405245,,191447.41764069453
stddev,,25360.736490400424,0.0,,,8.301656420043516...,,9080130.912640426,0.0,26.984566297918207,2.762971466400941,6.147428170273344,207.15393653895933,6.533804150641172,5.769448024528117,52.39264001611371,,171136.8298631848
min,CA,2467.0,0,ACTON,BRONX COUNTY,0000025,"""""""LO17 PLZ """"""""C...",1.0,1.0,1776.0,2.0,0.0,0.0,0.0,0.0,35.1,CA,0.0
max,WA,98282.0,YOUNG AVENUE,ZEPHYRHILLS,Wake,U-35-28-17-0CH-00...,WELLINGTON WOODS ...,2800000000.0,1.0,2022.0,214.0,46.0,587.0,360.0,40.125,287.511,WA,592158.2647058824


### Income

In [8]:
#limit columns
income_df = Income.select('Year', 'Geographic Area Name', 'Households!!Estimate!!Median income (dollars)')

#clean zip column
income_df = income_df.withColumn('Zip', F.substring(col('Geographic Area Name'), -5, 5))

pre_2000 = pre_2000.filter(col('Region').contains('County'))
pre_2000 = pre_2000.withColumn('County', F.upper(F.substring_index(col('Region'), ' County', 1)))

In [9]:
#Income - join on city name
df = df.join(income_df, \
             (F.year(df['sale_date']) == income_df['Year']) & (df['zip5'] == income_df['Zip']), 'left')#.select(df["*"],income_df["Geographic Area Name", "Households!!Estimate!!Median income (dollars)"])
df = df.drop("Year", "Zip")

#pre_2000 - join on county name
df = df.drop("year").join(pre_2000, \
    (F.year(df['sale_date']) == pre_2000['Year']) \
    & (F.upper(df.county) == F.upper(pre_2000['County'])), 'left')\
    .select(df["*"],pre_2000["Income"]).na.fill(0)

#create final income field combining the datasets
df = df.withColumn('Median_Income', F.when(F.year(col('sale_date')) < 2011, \
    col('Income')).otherwise(col('Households!!Estimate!!Median income (dollars)')))

#drop extra columns
df = df.drop('StateName', 'Date', 'Year', 'Geographic Area Name', \
    'Households!!Estimate!!Median income (dollars)','Zip','Region', 'County', 'Income','month')

In [15]:
df.describe()

summary,state,zip5,physical_address,city,property_id,property_type,sale_price,num_units,year_built,num_sales,zip_num_schools,city_num_schools,zip_st_ratio,city_st_ratio,CPIHOSNS,ZVHI,Median_Income
count,4475884,4475884.0,4475884,4475884,4410023,4475884,4475884.0,4475884.0,4309930.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,3804447
mean,,78916.14287523984,0.0,,3.052758767985918E16,,659313.0378253771,1.0,1971.4367920128636,3.798283646314337,9.988419717758548,120.02769977952958,22.88439441066564,21.869880044100768,185.1464236792117,191442.8396283284,44238.60159023118
stddev,,25362.334859900817,0.0,,8.301647132342966...,,9079940.797368867,0.0,26.984566023983696,2.7630195816278063,6.147420924603237,207.15002579764788,6.533843240611535,5.76940177494549,52.39194903214921,171135.30435094322,21392.846931779677
min,CA,2467.0,0,ACTON,0000025,"""""""LO17 PLZ """"""""C...",1.0,1.0,1776.0,2.0,0.0,0.0,0.0,0.0,35.1,0.0,-
max,WA,98282.0,YOUNG AVENUE,ZEPHYRHILLS,U-35-28-17-0CH-00...,WELLINGTON WOODS ...,2800000000.0,1.0,2022.0,214.0,46.0,587.0,360.0,40.125,287.511,592158.2647058824,99974


### Hospitals

In [10]:
hospital_ratings = hospital_ratings.select('Hospital Name', 'Hospital overall rating').dropna()\
    .where(hospital_ratings['Hospital overall rating'] != 'Not Available')

#RENAME THE COLUMN FOR JOIN 
hospital = hospital.withColumn('Hospital Name',hospital.NAME)
hospitals_with_ratings =hospital.join(hospital_ratings,['Hospital Name'],'left')\
    .na.fill('0',subset = ['Hospital overall rating'])

hospital_final = hospitals_with_ratings\
    .groupBy(['ZIP', 'CITY','STATE','COUNTY','TYPE','Hospital overall rating','OWNER','STATUS'])\
    .count().orderBy('count',ascending = [0])\
    .withColumn('Hospital overall rating', hospitals_with_ratings['Hospital overall rating'].cast('int'))

hos1 = hospital_final.groupBy('STATE').agg(F.sum('count'),F.avg('Hospital overall rating'))\
    .orderBy('sum(count)', ascending = [0])
hos2 = hospital_final.groupBy('STATE').pivot('TYPE')\
    .agg(F.sum('count').alias("CNT"),F.avg('Hospital overall rating').alias('RATE')).na.fill(0)
hos3 = hospital_final.groupBy('STATE').pivot('OWNER')\
    .agg(F.sum('count').alias("CNT"),F.avg('Hospital overall rating').alias('RATE')).na.fill(0)\
    .drop('REHABILITATION_CNT','REHABILITATION_RATE')

hos1 = hos1.join(hos2,['STATE'], 'left')
hos1 = hos1.join(hos3,['STATE'], 'left')
hos1 = hos1.withColumn('STATE', hos1.STATE).drop('null','NOT AVAILABLE').withColumnRenamed('STATE', 'hos_state')

In [11]:
df = df.join(hos1, df['state'] == hos1['hos_state'], 'left').na.fill(0)

In [25]:
df.describe()

summary,state,zip5,physical_address,city,property_id,property_type,sale_price,num_units,year_built,num_sales,zip_num_schools,city_num_schools,zip_st_ratio,city_st_ratio,CPIHOSNS,ZVHI,Median_Income,hos_state,sum(count),avg(Hospital overall rating),CHILDREN_CNT,CHILDREN_RATE,CHRONIC DISEASE_CNT,CHRONIC DISEASE_RATE,CRITICAL ACCESS_CNT,CRITICAL ACCESS_RATE,GENERAL ACUTE CARE_CNT,GENERAL ACUTE CARE_RATE,LONG TERM CARE_CNT,LONG TERM CARE_RATE,MILITARY_CNT,MILITARY_RATE,PSYCHIATRIC_CNT,PSYCHIATRIC_RATE,REHABILITATION_CNT,REHABILITATION_RATE,SPECIAL_CNT,SPECIAL_RATE,WOMEN_CNT,WOMEN_RATE,null_CNT,null_RATE,GOVERNMENT - DISTRICT/AUTHORITY_CNT,GOVERNMENT - DISTRICT/AUTHORITY_RATE,GOVERNMENT - FEDERAL_CNT,GOVERNMENT - FEDERAL_RATE,GOVERNMENT - LOCAL_CNT,GOVERNMENT - LOCAL_RATE,GOVERNMENT - STATE_CNT,GOVERNMENT - STATE_RATE,NON-PROFIT_CNT,NON-PROFIT_RATE,NOT AVAILABLE_CNT,NOT AVAILABLE_RATE,PROPRIETARY_CNT,PROPRIETARY_RATE
count,4475884,4475884.0,4475884,4475884,4410023,4475884,4475884.0,4475884.0,4309930.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,3804447,4475884,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0
mean,,78916.14287523984,0.0,,3.052758767986266...,,659313.0378253771,1.0,1971.4367920128636,3.798283646314337,9.988419717758548,120.02769977952958,22.884394410653798,21.869880044214035,185.1464236791977,191442.8396284907,44238.60159023118,,513.3634774717128,0.9281435403014806,9.72036764134191,0.113752277762337,0.0,0.0,0.4731668649142828,0.0254137196135086,402.013628145859,1.2037889381030664,8.54314857132133,0.0,16.49976540946995,0.0262505256371944,61.50138564806416,0.0026308586423259,13.26280104667592,0.0,0.4440582016870857,0.0200927906085144,0.9051559423792036,0.0,0.0,0.0,45.06537591233374,1.3427358904121534,16.49976540946995,0.0262505256371944,34.39919466188132,0.5741037416975374,9.984819311671169,0.0,252.87892827428053,1.056729539409112,16.11915389228139,0.7197783199638614,138.4162400097947,0.8507864485236358
stddev,,25362.334859901526,0.0,,8.301647132342844...,,9079940.797368908,0.0,26.98456602397921,2.763019581627812,6.147420924603054,207.15002579764544,6.533843240611339,5.769401774945538,52.39194903214816,171135.30435094197,21392.846931779804,,121.61247601932722,0.124418487029161,3.031644871003749,0.3175101881668521,0.0,0.0,2.8167417499602565,0.1773375985256965,103.25321379771012,0.2331144309055492,4.881975299940025,0.0,3.3310024782837653,0.0732715818846582,15.622546233698657,0.0178708066350284,3.98740238739761,0.0,1.1547926141377645,0.1403177635689833,0.3550130585349907,0.0,0.0,0.0,12.071168164781104,0.2722165028837632,3.3310024782837653,0.0732715818846582,15.22747726512238,0.298849335411029,3.642897964969944,0.0,64.95621906530774,0.2004497150518417,5.344541241584331,0.3208008271063676,37.491103939070015,0.1689565288896726
min,CA,2467.0,0,ACTON,0000025,"""""""LO17 PLZ """"""""C...",1.0,1.0,1776.0,2.0,0.0,0.0,0.0,0.0,35.1,0.0,-,CA,134.0,0.7734375,0.0,0.0,0.0,0.0,0.0,0.0,62.0,0.9830508474576272,2.0,0.0,6.0,0.0,10.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,2.0,0.0,56.0,0.8653846153846154,0.0,0.0,4.0,0.0
max,WA,98282.0,YOUNG AVENUE,ZEPHYRHILLS,U-35-28-17-0CH-00...,WELLINGTON WOODS ...,2800000000.0,1.0,2022.0,214.0,46.0,587.0,360.0,40.125,287.511,592158.2647058824,99974,WA,571.0,1.2985611510791366,11.0,1.0,0.0,0.0,40.0,1.263157894736842,453.0,1.8941798941798944,22.0,0.0,18.0,0.2307692307692307,68.0,0.125,22.0,0.0,5.0,1.0,2.0,0.0,0.0,0.0,51.0,2.074074074074074,18.0,0.2307692307692307,42.0,1.3333333333333333,26.0,0.0,285.0,1.8421052631578947,27.0,1.5,165.0,1.1823899371069182


### Crime

In [12]:
# create state column
state = crime.withColumn('ColCommasRemoved',F.split(crime.county_name,','))\
    .select('county_name',F.rtrim(F.col('ColCommasRemoved')[1]))
state = state.withColumn('state',state['rtrim(ColCommasRemoved[1])']).drop('rtrim(ColCommasRemoved[1])')
crime = crime.join(state, 'county_name','left')

# join with zillow data
CountyCrossWalk_Zillow = CountyCrossWalk_Zillow.withColumn('FIPS_ST',CountyCrossWalk_Zillow['StateFIPS'])\
    .withColumn('FIPS_CTY',CountyCrossWalk_Zillow['CountyFIPS'])
crime = crime.join(CountyCrossWalk_Zillow,['FIPS_ST','FIPS_CTY'],'left')

crime = crime.select(['CountyName','state','crime_rate_per_100000'
        ,'MURDER','RAPE','ROBBERY','AGASSLT','BURGLRY','LARCENY','MVTHEFT','ARSON','population']).dropna()
crime = crime.groupBy('state').agg(F.count('MURDER'),F.count('RAPE'),F.count('ROBBERY')
    ,F.count('AGASSLT'),F.count('BURGLRY'),F.count('MVTHEFT'),F.count('ARSON')
    ,F.sum('population')
    ,F.avg('crime_rate_per_100000'))


In [13]:
df = df.join(crime,'state','left').na.fill(0)

In [28]:
df.describe()

summary,state,zip5,physical_address,city,property_id,property_type,sale_price,num_units,year_built,num_sales,zip_num_schools,city_num_schools,zip_st_ratio,city_st_ratio,CPIHOSNS,ZVHI,Median_Income,hos_state,sum(count),avg(Hospital overall rating),CHILDREN_CNT,CHILDREN_RATE,CHRONIC DISEASE_CNT,CHRONIC DISEASE_RATE,CRITICAL ACCESS_CNT,CRITICAL ACCESS_RATE,GENERAL ACUTE CARE_CNT,GENERAL ACUTE CARE_RATE,LONG TERM CARE_CNT,LONG TERM CARE_RATE,MILITARY_CNT,MILITARY_RATE,PSYCHIATRIC_CNT,PSYCHIATRIC_RATE,REHABILITATION_CNT,REHABILITATION_RATE,SPECIAL_CNT,SPECIAL_RATE,WOMEN_CNT,WOMEN_RATE,null_CNT,null_RATE,GOVERNMENT - DISTRICT/AUTHORITY_CNT,GOVERNMENT - DISTRICT/AUTHORITY_RATE,GOVERNMENT - FEDERAL_CNT,GOVERNMENT - FEDERAL_RATE,GOVERNMENT - LOCAL_CNT,GOVERNMENT - LOCAL_RATE,GOVERNMENT - STATE_CNT,GOVERNMENT - STATE_RATE,NON-PROFIT_CNT,NON-PROFIT_RATE,NOT AVAILABLE_CNT,NOT AVAILABLE_RATE,PROPRIETARY_CNT,PROPRIETARY_RATE,count(MURDER),count(RAPE),count(ROBBERY),count(AGASSLT),count(BURGLRY),count(MVTHEFT),count(ARSON),sum(population),avg(crime_rate_per_100000)
count,4475884,4475884.0,4475884,4475884,4410023,4475884,4475884.0,4475884.0,4309930.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,3804447,4475884,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0,4475884.0
mean,,78916.14287523984,0.0,,3.052758767986070...,,659313.0378253771,1.0,1971.4367920128636,3.798283646314337,9.988419717758548,120.02769977952958,22.884394410656185,21.86988004422112,185.14642367919225,191442.83962850823,44238.60159023118,,513.3634774717128,0.9281435403014806,9.72036764134191,0.113752277762337,0.0,0.0,0.4731668649142828,0.0254137196135086,402.013628145859,1.2037889381030662,8.54314857132133,0.0,16.49976540946995,0.0262505256371944,61.50138564806416,0.0026308586423259,13.26280104667592,0.0,0.4440582016870857,0.0200927906085144,0.9051559423792036,0.0,0.0,0.0,45.06537591233374,1.3427358904121531,16.49976540946995,0.0262505256371944,34.39919466188132,0.5741037416975374,9.984819311671169,0.0,252.87892827428053,1.0567295394091123,16.11915389228139,0.7197783199638614,138.4162400097947,0.8507864485236358,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
stddev,,25362.33485990081,0.0,,8.301647132342892...,,9079940.797369111,0.0,26.984566023979863,2.763019581627796,6.147420924603444,207.15002579763728,6.5338432406116285,5.769401774945314,52.391949032147785,171135.30435094365,21392.8469317793,,121.61247601932725,0.124418487029161,3.0316448710037487,0.3175101881668521,0.0,0.0,2.8167417499602565,0.1773375985256964,103.25321379771012,0.2331144309055492,4.8819752999400245,0.0,3.3310024782837653,0.0732715818846581,15.622546233698657,0.0178708066350284,3.98740238739761,0.0,1.1547926141377645,0.1403177635689833,0.3550130585349907,0.0,0.0,0.0,12.071168164781104,0.2722165028837633,3.3310024782837653,0.0732715818846581,15.227477265122378,0.2988493354110289,3.642897964969944,0.0,64.95621906530772,0.2004497150518417,5.344541241584331,0.3208008271063677,37.491103939070015,0.1689565288896726,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
min,CA,2467.0,0,ACTON,0000025,"""""""LO17 PLZ """"""""C...",1.0,1.0,1776.0,2.0,0.0,0.0,0.0,0.0,35.1,0.0,-,CA,134.0,0.7734375,0.0,0.0,0.0,0.0,0.0,0.0,62.0,0.9830508474576272,2.0,0.0,6.0,0.0,10.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,2.0,0.0,56.0,0.8653846153846154,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,WA,98282.0,YOUNG AVENUE,ZEPHYRHILLS,U-35-28-17-0CH-00...,WELLINGTON WOODS ...,2800000000.0,1.0,2022.0,214.0,46.0,587.0,360.0,40.125,287.511,592158.2647058824,99974,WA,571.0,1.2985611510791366,11.0,1.0,0.0,0.0,40.0,1.263157894736842,453.0,1.8941798941798944,22.0,0.0,18.0,0.2307692307692307,68.0,0.125,22.0,0.0,5.0,1.0,2.0,0.0,0.0,0.0,51.0,2.074074074074074,18.0,0.2307692307692307,42.0,1.3333333333333333,26.0,0.0,285.0,1.8421052631578947,27.0,1.5,165.0,1.1823899371069182,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Create historical sales

In [14]:
#add purchase ranking to df
df = df.withColumn("rank", dense_rank().\
    over(Window.partitionBy('city','state', 'physical_address', 'zip5', 'property_type').\
    orderBy('sale_date')))

#find previous purchase
ranked = df.select('city','state', 'physical_address', 'zip5', 'property_type','sale_date', 'sale_price').withColumn("rank", dense_rank().\
    over(Window.partitionBy('city','state', 'physical_address', 'zip5', 'property_type').\
    orderBy('sale_date')))
ranked = ranked.withColumn("rank", col('rank') + 1)
ranked = ranked.withColumnRenamed('sale_date', 'prev_sale_date').withColumnRenamed('sale_price', 'prev_sale_price')

In [15]:
df = df.join(ranked,['city','state', 'physical_address', 'zip5', 'property_type', 'rank'],'left').na.fill(0)

In [31]:
df.describe()

summary,city,state,physical_address,zip5,property_type,rank,property_id,sale_price,num_units,year_built,num_sales,zip_num_schools,city_num_schools,zip_st_ratio,city_st_ratio,CPIHOSNS,ZVHI,Median_Income,hos_state,sum(count),avg(Hospital overall rating),CHILDREN_CNT,CHILDREN_RATE,CHRONIC DISEASE_CNT,CHRONIC DISEASE_RATE,CRITICAL ACCESS_CNT,CRITICAL ACCESS_RATE,GENERAL ACUTE CARE_CNT,GENERAL ACUTE CARE_RATE,LONG TERM CARE_CNT,LONG TERM CARE_RATE,MILITARY_CNT,MILITARY_RATE,PSYCHIATRIC_CNT,PSYCHIATRIC_RATE,REHABILITATION_CNT,REHABILITATION_RATE,SPECIAL_CNT,SPECIAL_RATE,WOMEN_CNT,WOMEN_RATE,null_CNT,null_RATE,GOVERNMENT - DISTRICT/AUTHORITY_CNT,GOVERNMENT - DISTRICT/AUTHORITY_RATE,GOVERNMENT - FEDERAL_CNT,GOVERNMENT - FEDERAL_RATE,GOVERNMENT - LOCAL_CNT,GOVERNMENT - LOCAL_RATE,GOVERNMENT - STATE_CNT,GOVERNMENT - STATE_RATE,NON-PROFIT_CNT,NON-PROFIT_RATE,NOT AVAILABLE_CNT,NOT AVAILABLE_RATE,PROPRIETARY_CNT,PROPRIETARY_RATE,count(MURDER),count(RAPE),count(ROBBERY),count(AGASSLT),count(BURGLRY),count(MVTHEFT),count(ARSON),sum(population),avg(crime_rate_per_100000),prev_sale_price
count,4495744,4495744,4495744,4495744.0,4495744,4495744.0,4429454,4495744.0,4495744.0,4329731.0,4495744.0,4495744.0,4495744.0,4495744.0,4495744.0,4495744.0,4495744.0,3809908,4495744,4495744.0,4495744.0,4495744.0,4495744.0,4495744.0,4495744.0,4495744.0,4495744.0,4495744.0,4495744.0,4495744.0,4495744.0,4495744.0,4495744.0,4495744.0,4495744,4495744.0,4495744.0,4495744.0,4495744.0,4495744.0,4495744.0,4495744.0,4495744.0,4495744.0,4495744.0,4495744.0,4495744.0,4495744.0,4495744.0,4495744.0,4495744.0,4495744.0,4495744.0,4495744.0,4495744.0,4495744.0,4495744.0,4495744.0,4495744.0,4495744.0,4495744.0,4495744.0,4495744.0,4495744.0,4495744.0,4495744.0,4495744.0
mean,,,0.0,78715.09374777567,,2.33411021624007,3.152302631300765...,657133.0282649546,1.0,1971.4249742074048,3.819723053625829,9.97939517908493,119.63939472532245,22.85417420154577,21.842681405718626,185.22427818843903,191218.26730470825,44242.33275819979,,512.7071018723486,0.929555430723759,9.711722242191726,0.1174897858952822,0.0,0.0,0.4738830769723543,0.0254563435504061,401.3237906784728,1.2065084902330472,8.599538140961762,0.0,16.484022221905875,0.0271130275142965,61.45890913717507,0.002633222243848...,13.298413566252885,0.0,0.4553889189420038,0.0201159140733992,0.9014338894741336,0.0,0.0,0.0,44.983007039546735,1.3458546882727174,16.484022221905875,0.0271130275142965,34.24986186935911,0.5717980369847105,9.964977320772713,0.0,252.3552566605216,1.059063090144617,16.16354000583663,0.7167772179608339,138.50643675440594,0.8520851247306207,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,428270.206611186
stddev,,,0.0,25487.820339050708,,1.6753192617850934,8.416362845689014...,9060270.952274635,0.0,26.967715035431745,2.90070302018412,6.145088202804623,206.782162377122,6.570097807498317,5.774755704851367,52.35879100252017,170867.19585717705,21383.63395214622,,121.75794339179518,0.1260024752808694,3.028755644456468,0.3220030421730513,0.0,0.0,2.821837343777682,0.177475220586361,103.55072961648402,0.2363046502974651,4.9493076698276335,0.0,3.332438910131705,0.0743083943476272,15.60597609555769,0.017879040568963018,4.020364918716661,0.0,1.1651449071351825,0.1403968249598949,0.3592807944222246,0.0,0.0,0.0,12.11003733956422,0.2757476369159617,3.332438910131705,0.0743083943476272,15.35971207823892,0.3006706473465652,3.653705749731558,0.0,65.2940637666894,0.2031716149486637,5.381216133713823,0.3235555389375625,37.48648702430493,0.170202147996976,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6861719.152129494
min,ACTON,CA,0,2467.0,"""""""LO17 PLZ """"""""C...",1.0,0000025,1.0,1.0,1776.0,2.0,0.0,0.0,0.0,0.0,35.1,0.0,-,CA,134.0,0.7734375,0.0,0.0,0.0,0.0,0.0,0.0,62.0,0.9830508474576272,2.0,0.0,6.0,0.0,10.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,2.0,0.0,56.0,0.8653846153846154,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,ZEPHYRHILLS,WA,YOUNG AVENUE,98282.0,WELLINGTON WOODS ...,106.0,U-35-28-17-0CH-00...,2800000000.0,1.0,2022.0,214.0,46.0,587.0,360.0,40.125,287.511,592158.2647058824,99974,WA,571.0,1.2985611510791366,11.0,1.0,0.0,0.0,40.0,1.263157894736842,453.0,1.8941798941798944,22.0,0.0,18.0,0.2307692307692307,68.0,0.125,22.0,0.0,5.0,1.0,2.0,0.0,0.0,0.0,51.0,2.074074074074074,18.0,0.2307692307692307,42.0,1.3333333333333333,26.0,0.0,285.0,1.8421052631578947,27.0,1.5,165.0,1.1823899371069182,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1920010880.0


### Drop unnecessary columns

In [40]:
df = df.drop("property_id", "hos_state")

# SECTION 4 - Feature Engineering

### General feature creation 
1. sale_date to sale_month, sale_day, sale_year
2. prev_sale_date to prev_sale_month, prev_sale_day, prev_sale_year
3. Median_Income to float

In [43]:
#sale_date
df = df.withColumn('sale_month', F.month(col("sale_date")))\
    .withColumn('sale_day', date_format(col("sale_date"), "d"))\
    .withColumn('sale_year', F.year(col("sale_date")))
#prev_sale_date
df = df.withColumn('prev_sale_month', F.month(col("prev_sale_date")))\
    .withColumn('prev_sale_day', date_format(col("sale_date"), "d"))\
    .withColumn('prev_sale_year', F.year(col("prev_sale_date")))
#Median_Income
df = df.withColumn('Median_Income', df['Median_Income'].cast('float'))

### city, state and zip to categorical 

In [38]:
# Convert city, state and zip to numeric categorical
for y in ['city', 'state', 'zip5']:
    print(y)
    indexer = StringIndexer(inputCol=str(y), outputCol=str(y)+"Index")
    ohe = OneHotEncoder(inputCol = str(y)+"Index", outputCol = str(y)+"Vector")
    df = indexer.fit(df).transform(df)
    df = ohe.transform(df)

KeyboardInterrupt: 

In [37]:
df.printSchema()

root
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- zip5: string (nullable = true)
 |-- property_type: string (nullable = true)
 |-- rank: integer (nullable = true)
 |-- property_id: string (nullable = true)
 |-- sale_date: date (nullable = true)
 |-- sale_price: float (nullable = false)
 |-- year_built: string (nullable = true)
 |-- num_sales: long (nullable = false)
 |-- zip_num_schools: long (nullable = true)
 |-- city_num_schools: long (nullable = true)
 |-- zip_st_ratio: double (nullable = false)
 |-- city_st_ratio: double (nullable = false)
 |-- CPIHOSNS: double (nullable = false)
 |-- ZVHI: double (nullable = false)
 |-- Median_Income: string (nullable = true)
 |-- hos_state: string (nullable = true)
 |-- sum(count): long (nullable = true)
 |-- avg(Hospital overall rating): double (nullable = false)
 |-- CHILDREN_CNT: long (nullable = true)
 |-- CHILDREN_RATE: double (nullable = false)
 |-- CRITICAL ACCESS_CNT: long (nullable = true)
 |-- CRITICAL 

### property_type clustering

In [None]:
## Create pipeline
# tokenization
tokenization = Tokenizer(inputCol="property_type", outputCol="p_words")
# stopwords
stopwords = StopWordsRemover(inputCol= "p_words", outputCol="p_filtered")
# some form of dimensionality reduction, word2vec
word2vec = Word2Vec(vectorSize=5, minCount=0, inputCol="p_filtered", outputCol="propertyVector")

pipeline = Pipeline().setStages([
    tokenization,
    stopwords,
    word2vec
])

In [None]:
# fit_pipeline = pipeline.fit(df)
df = pipeline.fit(df).transform(df)
df

In [None]:
# Kmeans
kmeans = KMeans(featuresCol='propertyVector', predictionCol='propertyClusters', k=10, seed=123, maxIter=5)
k_mod = kmeans.fit(df)
df = k_mod.transform(df)

### Feature Reduction

In [None]:
# Remove variables with low stddev
summary = df.describe().toPandas()\
    .set_index("summary").T\
    .sort_values(by="stddev")
summary['stddev'] = pd.to_numeric(summary['stddev'])
summary

In [None]:
drop = summary[summary['stddev'] == 0].index
drop

In [None]:
df = df.drop('physical_address', 'count(MURDER)', 'GOVERNMENT - STATE_RATE',
       'null_RATE', 'null_CNT', 'WOMEN_RATE', 'REHABILITATION_RATE',
       'avg(crime_rate_per_100000)', 'LONG TERM CARE_RATE',
       'CHRONIC DISEASE_RATE', 'CHRONIC DISEASE_CNT', 'count(ROBBERY)',
       'count(RAPE)', 'sum(population)', 'num_units', 'count(MVTHEFT)',
       'count(ARSON)', 'count(AGASSLT)', 'count(BURGLRY)')

# SECTION 5 - Some data viz/exploration pre model

### Feature Correlation

In [None]:
#correlation with y variable (sale_price)
y_corr = pd.DataFrame(columns=['Columns', 'Correlation Value'])
for x in df.columns:
    try:
        print(x)
        temp = pd.DataFrame([[x, df.corr("sale_price", x)]], columns=['Columns', 'Correlation Value'])
        y_corr = y_corr.append(temp)
    except:
        pass
y_corr

In [None]:
#all x variable correlation
x_corr = pd.DataFrame({"Variables" : df.columns})
for x in df.columns:
    temp2 = pd.DataFrame()
    for y in df.columns:
        print(str(x) + " and " + str(y))
        try:
            temp = pd.DataFrame([[df.corr(x,y)]], columns=[x])
#             temp = pd.DataFrame([["CORR"]], columns=[x])
            temp2 = temp2.append(temp)
        except:
            temp = pd.DataFrame([["N/A"]], columns=[x])
            temp2 = temp2.append(temp)
    x_corr = pd.concat([x_corr,temp2.reset_index().drop(columns="index")], axis=1)
x_corr

### Y variable analysis

In [None]:
print(df['sale_price'].mean())
print(df['sale_price'].min())
print(df['sale_price'].max())

In [None]:
## sale price over time
# sale price vs year_built
# sale price vs sale_year

# SECTION 6 - Modeling

In [None]:
df.drop("city", "state", "zip5", "property_type", "property_id", "sale_price").columns

In [16]:
predictors = ("prev_sale_price", "zip_st_ratio")

In [19]:
input_data = df #mod_df.sample(0.05)

vectorAssembler = VectorAssembler(inputCols = predictors, outputCol = 'features')
vinput_data = vectorAssembler.transform(input_data)
vinput_data = vinput_data.select(['features', 'sale_price'])
# vinput_data.show(3)

In [20]:
# Train/Test Split
train_df, test_df = vinput_data.randomSplit([.7,.3],seed=1234)

### Linear Regression
elasticNetParam = 1, then Lasso <br>
regParam = 1, then Ridge <br>

In [21]:
from pyspark.ml.regression import LinearRegression
lr = LinearRegression(featuresCol = 'features', labelCol='sale_price', maxIter=5, regParam=1, elasticNetParam=0)
lr_model = lr.fit(train_df)
print("Coefficients: " + str(lr_model.coefficients))
print("Intercept: " + str(lr_model.intercept))

Coefficients: [0.174983366984032,9418.7880375339]
Intercept: 364139.54091635754


In [22]:
trainingSummary = lr_model.summary
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

RMSE: 8939274.922702
r2: 0.019213


In [23]:
lr_predictions = lr_model.transform(test_df)
lr_predictions.select("prediction","sale_price","features").show(5)
from pyspark.ml.evaluation import RegressionEvaluator
lr_evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="sale_price",metricName="r2")
print("R Squared (R2) on test data = %g" % lr_evaluator.evaluate(lr_predictions))

Py4JJavaError: An error occurred while calling o798.showString.
: org.apache.spark.SparkException: Could not execute broadcast in 300 secs. You can increase the timeout for broadcasts via spark.sql.broadcastTimeout or disable broadcast join by setting spark.sql.autoBroadcastJoinThreshold to -1
	at org.apache.spark.sql.execution.exchange.BroadcastExchangeExec.doExecuteBroadcast(BroadcastExchangeExec.scala:150)
	at org.apache.spark.sql.execution.InputAdapter.doExecuteBroadcast(WholeStageCodegenExec.scala:375)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeBroadcast$1.apply(SparkPlan.scala:144)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeBroadcast$1.apply(SparkPlan.scala:140)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.SparkPlan.executeBroadcast(SparkPlan.scala:140)
	at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.prepareBroadcast(BroadcastHashJoinExec.scala:136)
	at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.codegenOuter(BroadcastHashJoinExec.scala:282)
	at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.doConsume(BroadcastHashJoinExec.scala:104)
	at org.apache.spark.sql.execution.CodegenSupport$class.consume(WholeStageCodegenExec.scala:189)
	at org.apache.spark.sql.execution.ProjectExec.consume(basicPhysicalOperators.scala:35)
	at org.apache.spark.sql.execution.ProjectExec.doConsume(basicPhysicalOperators.scala:65)
	at org.apache.spark.sql.execution.CodegenSupport$class.consume(WholeStageCodegenExec.scala:189)
	at org.apache.spark.sql.execution.FilterExec.consume(basicPhysicalOperators.scala:85)
	at org.apache.spark.sql.execution.FilterExec.doConsume(basicPhysicalOperators.scala:206)
	at org.apache.spark.sql.execution.CodegenSupport$class.consume(WholeStageCodegenExec.scala:189)
	at org.apache.spark.sql.execution.InputAdapter.consume(WholeStageCodegenExec.scala:362)
	at org.apache.spark.sql.execution.InputAdapter.doProduce(WholeStageCodegenExec.scala:391)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:90)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.CodegenSupport$class.produce(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.InputAdapter.produce(WholeStageCodegenExec.scala:362)
	at org.apache.spark.sql.execution.FilterExec.doProduce(basicPhysicalOperators.scala:125)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:90)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.CodegenSupport$class.produce(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.FilterExec.produce(basicPhysicalOperators.scala:85)
	at org.apache.spark.sql.execution.ProjectExec.doProduce(basicPhysicalOperators.scala:45)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:90)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.CodegenSupport$class.produce(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.ProjectExec.produce(basicPhysicalOperators.scala:35)
	at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.doProduce(BroadcastHashJoinExec.scala:98)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:90)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.CodegenSupport$class.produce(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.produce(BroadcastHashJoinExec.scala:40)
	at org.apache.spark.sql.execution.ProjectExec.doProduce(basicPhysicalOperators.scala:45)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:90)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.CodegenSupport$class.produce(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.ProjectExec.produce(basicPhysicalOperators.scala:35)
	at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.doProduce(BroadcastHashJoinExec.scala:98)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:90)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.CodegenSupport$class.produce(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.produce(BroadcastHashJoinExec.scala:40)
	at org.apache.spark.sql.execution.ProjectExec.doProduce(basicPhysicalOperators.scala:45)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:90)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.CodegenSupport$class.produce(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.ProjectExec.produce(basicPhysicalOperators.scala:35)
	at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.doProduce(BroadcastHashJoinExec.scala:98)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:90)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.CodegenSupport$class.produce(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.produce(BroadcastHashJoinExec.scala:40)
	at org.apache.spark.sql.execution.ProjectExec.doProduce(basicPhysicalOperators.scala:45)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:90)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.CodegenSupport$class.produce(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.ProjectExec.produce(basicPhysicalOperators.scala:35)
	at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.doProduce(BroadcastHashJoinExec.scala:98)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:90)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.CodegenSupport$class.produce(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.produce(BroadcastHashJoinExec.scala:40)
	at org.apache.spark.sql.execution.ProjectExec.doProduce(basicPhysicalOperators.scala:45)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:90)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.CodegenSupport$class.produce(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.ProjectExec.produce(basicPhysicalOperators.scala:35)
	at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.doProduce(BroadcastHashJoinExec.scala:98)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:90)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.CodegenSupport$class.produce(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.produce(BroadcastHashJoinExec.scala:40)
	at org.apache.spark.sql.execution.ProjectExec.doProduce(basicPhysicalOperators.scala:45)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:90)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.CodegenSupport$class.produce(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.ProjectExec.produce(basicPhysicalOperators.scala:35)
	at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.doProduce(BroadcastHashJoinExec.scala:98)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:90)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.CodegenSupport$class.produce(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.produce(BroadcastHashJoinExec.scala:40)
	at org.apache.spark.sql.execution.ProjectExec.doProduce(basicPhysicalOperators.scala:45)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:90)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.CodegenSupport$class.produce(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.ProjectExec.produce(basicPhysicalOperators.scala:35)
	at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.doProduce(BroadcastHashJoinExec.scala:98)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:90)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.CodegenSupport$class.produce(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.produce(BroadcastHashJoinExec.scala:40)
	at org.apache.spark.sql.execution.ProjectExec.doProduce(basicPhysicalOperators.scala:45)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:90)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.CodegenSupport$class.produce(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.ProjectExec.produce(basicPhysicalOperators.scala:35)
	at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.doProduce(BroadcastHashJoinExec.scala:98)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:90)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.CodegenSupport$class.produce(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.produce(BroadcastHashJoinExec.scala:40)
	at org.apache.spark.sql.execution.ProjectExec.doProduce(basicPhysicalOperators.scala:45)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:90)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.CodegenSupport$class.produce(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.ProjectExec.produce(basicPhysicalOperators.scala:35)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doCodeGen(WholeStageCodegenExec.scala:532)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:586)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.prepareShuffleDependency(ShuffleExchangeExec.scala:92)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec$$anonfun$doExecute$1.apply(ShuffleExchangeExec.scala:128)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec$$anonfun$doExecute$1.apply(ShuffleExchangeExec.scala:119)
	at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:52)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.doExecute(ShuffleExchangeExec.scala:119)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.InputAdapter.inputRDDs(WholeStageCodegenExec.scala:379)
	at org.apache.spark.sql.execution.SortExec.inputRDDs(SortExec.scala:121)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:615)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.joins.SortMergeJoinExec.doExecute(SortMergeJoinExec.scala:150)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.InputAdapter.inputRDDs(WholeStageCodegenExec.scala:379)
	at org.apache.spark.sql.execution.ProjectExec.inputRDDs(basicPhysicalOperators.scala:41)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:615)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.joins.SortMergeJoinExec.doExecute(SortMergeJoinExec.scala:150)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.InputAdapter.inputRDDs(WholeStageCodegenExec.scala:379)
	at org.apache.spark.sql.execution.ProjectExec.inputRDDs(basicPhysicalOperators.scala:41)
	at org.apache.spark.sql.execution.SortExec.inputRDDs(SortExec.scala:121)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:615)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.window.WindowExec.doExecute(WindowExec.scala:302)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.InputAdapter.inputRDDs(WholeStageCodegenExec.scala:379)
	at org.apache.spark.sql.execution.ProjectExec.inputRDDs(basicPhysicalOperators.scala:41)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:615)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.prepareShuffleDependency(ShuffleExchangeExec.scala:92)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec$$anonfun$doExecute$1.apply(ShuffleExchangeExec.scala:128)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec$$anonfun$doExecute$1.apply(ShuffleExchangeExec.scala:119)
	at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:52)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.doExecute(ShuffleExchangeExec.scala:119)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.InputAdapter.inputRDDs(WholeStageCodegenExec.scala:379)
	at org.apache.spark.sql.execution.SortExec.inputRDDs(SortExec.scala:121)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:615)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.joins.SortMergeJoinExec.doExecute(SortMergeJoinExec.scala:150)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.InputAdapter.inputRDDs(WholeStageCodegenExec.scala:379)
	at org.apache.spark.sql.execution.ProjectExec.inputRDDs(basicPhysicalOperators.scala:41)
	at org.apache.spark.sql.execution.ProjectExec.inputRDDs(basicPhysicalOperators.scala:41)
	at org.apache.spark.sql.execution.SortExec.inputRDDs(SortExec.scala:121)
	at org.apache.spark.sql.execution.SampleExec.inputRDDs(basicPhysicalOperators.scala:271)
	at org.apache.spark.sql.execution.ProjectExec.inputRDDs(basicPhysicalOperators.scala:41)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:615)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.SparkPlan.getByteArrayRdd(SparkPlan.scala:247)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:339)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:38)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:3383)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2544)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2544)
	at org.apache.spark.sql.Dataset$$anonfun$53.apply(Dataset.scala:3364)
	at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:78)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:73)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3363)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2544)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2758)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:254)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:291)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.util.concurrent.TimeoutException: Futures timed out after [300 seconds]
	at scala.concurrent.impl.Promise$DefaultPromise.ready(Promise.scala:223)
	at scala.concurrent.impl.Promise$DefaultPromise.result(Promise.scala:227)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:220)
	at org.apache.spark.sql.execution.exchange.BroadcastExchangeExec.doExecuteBroadcast(BroadcastExchangeExec.scala:146)
	... 313 more


### Linear Regression with GLM

In [None]:
from pyspark.ml.regression import GeneralizedLinearRegression
glr = GeneralizedLinearRegression(family="gaussian", link="identity", maxIter=10, 
regParam=0.0)
model = glr.fit(dataset)
summary = model.summary
print("Coefficient Standard Errors: " + str(summary.coefficientStandardErrors))
print("T Values: " + str(summary.tValues))
print("P Values: " + str(summary.pValues))

### Decision Tree

In [None]:
from pyspark.ml.regression import DecisionTreeRegressor
dt = DecisionTreeRegressor(featuresCol ='features', labelCol = 'sale_price', maxDepth=1)
dt_model = dt.fit(train_df)
dt_predictions = dt_model.transform(test_df)
dt_evaluator = RegressionEvaluator(
    labelCol="sale_price", predictionCol="prediction", metricName="rmse")
rmse = dt_evaluator.evaluate(dt_predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

### Gradient Boosting

In [None]:
from pyspark.ml.regression import GBTRegressor
gbt = GBTRegressor(featuresCol = 'features', labelCol = 'MV', maxIter=10)
gbt_model = gbt.fit(train_df)
gbt_predictions = gbt_model.transform(test_df)
gbt_predictions.select('prediction', 'MV', 'features').show(5)

In [None]:
gbt_evaluator = RegressionEvaluator(
    labelCol="MV", predictionCol="prediction", metricName="rmse")
rmse = gbt_evaluator.evaluate(gbt_predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)