In [3]:
# findspark.init() is called before the pyspark import so that the local
# Spark installation's Python package can be found; keep this order.
import findspark

findspark.init()

from pyspark.sql import SparkSession

# Reuse an existing session if there is one, otherwise start a new one in
# local mode ("local[*]").
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)
spark

In [7]:
# Read the "homeprices" CSV: the first line supplies the column names and
# the column types are inferred from the values.
data = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("homeprices")
)
data

DataFrame[area: int, price: int]

In [8]:
# Print the loaded rows as a text table for a quick visual check.
data.show()

+----+------+
|area| price|
+----+------+
|2600|550000|
|3000|565000|
|3200|610000|
|3600|680000|
|4000|725000|
|4500|775000|
|2200|500000|
|2300|510000|
|4300|750000|
+----+------+



In [9]:
# Print the column names, inferred types and nullability of the loaded data.
data.printSchema()

root
 |-- area: integer (nullable = true)
 |-- price: integer (nullable = true)



In [12]:
from pyspark.ml.feature import VectorAssembler

# Treat every column except the last one as a model input; the last column
# is taken to be the label.
feature_cols = data.columns[:-1]

# Pack the input columns into a single vector column named "features" and
# append that column to the original DataFrame.
vect_assembler = VectorAssembler(outputCol="features", inputCols=feature_cols)
data_w_features = vect_assembler.transform(data)


In [14]:
# Print the rows again to confirm the assembled "features" column was added.
data_w_features.show()

+----+------+--------+
|area| price|features|
+----+------+--------+
|2600|550000|[2600.0]|
|3000|565000|[3000.0]|
|3200|610000|[3200.0]|
|3600|680000|[3600.0]|
|4000|725000|[4000.0]|
|4500|775000|[4500.0]|
|2200|500000|[2200.0]|
|2300|510000|[2300.0]|
|4300|750000|[4300.0]|
+----+------+--------+



In [15]:
# Keep just the two columns the regression needs: the assembled feature
# vector and the label ("price").
model_columns = ["features", "price"]
final_data = data_w_features.select(model_columns)
final_data

Exception ignored in: <function JavaWrapper.__del__ at 0x0000000006E4F1E0>
Traceback (most recent call last):
  File "C:\spark-2.4.4-bin-hadoop2.7\python\pyspark\ml\wrapper.py", line 40, in __del__
    if SparkContext._active_spark_context and self._java_obj is not None:
AttributeError: 'VectorAssembler' object has no attribute '_java_obj'


DataFrame[features: vector, price: int]

In [27]:
from pyspark.ml.regression import LinearRegression

# Linear regression predicting "price" from the "features" vector column.
LinReg = LinearRegression(
    labelCol="price",
    featuresCol="features",
)

# Fit on every available row. `model` is reassigned further down when the
# estimator is fitted again on the training split only.
model = LinReg.fit(final_data)

In [28]:
#test_data = spark.read.csv("area",header=True,inferSchema=True)

In [31]:
# feature_col = test_data.columns
# va = VectorAssembler(inputCols=feature_col,outputCol='features')
# test_data_w_features = va.transform(test_data)
# test_data_w_features.show()

In [32]:
# pred = model.evaluate(test_data_w_features)
# pred.predictions.show()

In [33]:
# Split the rows into a training set and a held-out test set.
# The weights are per-row sampling probabilities, not exact proportions, so
# on a very small dataset the resulting sizes can be far from 70/30.
# A fixed seed keeps the split -- and therefore the fitted model and the
# predictions derived from it -- the same on every rerun over the same input.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

# Summary statistics of the training split.
train_data.describe().show()

+-------+------------------+
|summary|             price|
+-------+------------------+
|  count|                 8|
|   mean|          644375.0|
| stddev|102275.45090168441|
|    min|            500000|
|    max|            775000|
+-------+------------------+



In [34]:
# Summary statistics of the held-out test split, for comparison with the
# training split.
test_data.describe().show()

+-------+--------+
|summary|   price|
+-------+--------+
|  count|       1|
|   mean|510000.0|
| stddev|     NaN|
|    min|  510000|
|    max|  510000|
+-------+--------+



In [36]:
# Refit the estimator on the training split only; this replaces the earlier
# `model` that was fitted on all rows.
model = LinReg.fit(train_data)

In [37]:
# Evaluate the fitted model on the held-out test split.
pred = model.evaluate(test_data)
# Print the per-row results: feature vector, actual price and predicted price.
pred.predictions.show()

+--------+------+-----------------+
|features| price|       prediction|
+--------+------+-----------------+
|[2300.0]|510000|505007.9872204469|
+--------+------+-----------------+

