In [35]:
import os

import findspark

# Locate the Spark installation. An externally configured SPARK_HOME takes
# precedence so the notebook is not tied to one machine; the original local
# install path is kept as the fallback.
findspark.init(os.environ.get('SPARK_HOME', '/home/arjun/spark-2.4.0-bin-hadoop2.7'))
from pyspark.sql import SparkSession

# Reuse an existing session if one is running, otherwise start one named 'lr_housing'.
spark = SparkSession.builder.appName('lr_housing').getOrCreate()

In [36]:
from pyspark.ml.regression import LinearRegression

In [37]:
# Load the housing CSV, taking column names from the header row and letting
# Spark infer each column's type.
Wholedata = spark.read.csv(
    "USA_Housing.csv",
    header=True,
    inferSchema=True,
)

In [38]:
# Inspect the column names and inferred types of the loaded data.
Wholedata.printSchema()

root
 |-- Avg Area Income: double (nullable = true)
 |-- Avg Area House Age: double (nullable = true)
 |-- Avg Area Number of Rooms: double (nullable = true)
 |-- Avg Area Number of Bedrooms: double (nullable = true)
 |-- Area Population: double (nullable = true)
 |-- Price: double (nullable = true)



In [39]:
# Preview the leading rows of the loaded data as a table.
Wholedata.show()

+----------------+------------------+------------------------+---------------------------+----------------+----------------+
| Avg Area Income|Avg Area House Age|Avg Area Number of Rooms|Avg Area Number of Bedrooms| Area Population|           Price|
+----------------+------------------+------------------------+---------------------------+----------------+----------------+
|79545.4585743168|      5.6828613216|            7.0091881428|                       4.09|23086.8005026865|1059033.55787012|
|79248.6424548257|      6.0028998083|            6.7308210191|                       3.09|40173.0721736448|1505890.91484695|
|61287.0671786568|      5.8658898403|            8.5127274304|                       5.13|36882.1593997046|1058987.98787608|
| 63345.240046228|      7.1882360945|            5.5867286648|                       3.26|34310.2428309071|1260616.80662945|
| 59982.197225708|      5.0405545231|            7.8393877851|                       4.23|26354.1094721031| 630943.48933854|


In [40]:
# Print every field of the first row, one value per line.
first_row = Wholedata.head()
for value in first_row:
    print(value)

79545.4585743168
5.6828613216
7.0091881428
4.09
23086.8005026865
1059033.55787012


In [41]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [42]:
# DataFrame.drop returns a new DataFrame instead of modifying the existing one,
# so the result has to be bound back to the name for the drop to take effect.
# If the column is absent (the schema printed earlier has no 'Address'),
# the call leaves the data unchanged.
Wholedata = Wholedata.drop('Address')
Wholedata

DataFrame[Avg Area Income: double, Avg Area House Age: double, Avg Area Number of Rooms: double, Avg Area Number of Bedrooms: double, Area Population: double, Price: double]

In [43]:
# Fetch the first three rows as a list of Row objects.
Wholedata.head(3)

[Row(Avg Area Income=79545.4585743168, Avg Area House Age=5.6828613216, Avg Area Number of Rooms=7.0091881428, Avg Area Number of Bedrooms=4.09, Area Population=23086.8005026865, Price=1059033.55787012),
 Row(Avg Area Income=79248.6424548257, Avg Area House Age=6.0028998083, Avg Area Number of Rooms=6.7308210191, Avg Area Number of Bedrooms=3.09, Area Population=40173.0721736448, Price=1505890.91484695),
 Row(Avg Area Income=61287.0671786568, Avg Area House Age=5.8658898403, Avg Area Number of Rooms=8.5127274304, Avg Area Number of Bedrooms=5.13, Area Population=36882.1593997046, Price=1058987.98787608)]

In [44]:
# With no argument, head() returns the first row as a single Row (not a list).
Wholedata.head()

Row(Avg Area Income=79545.4585743168, Avg Area House Age=5.6828613216, Avg Area Number of Rooms=7.0091881428, Avg Area Number of Bedrooms=4.09, Area Population=23086.8005026865, Price=1059033.55787012)

In [45]:
# List the exact column names; these are the names the feature assembler must reference.
Wholedata.columns

['Avg Area Income',
 'Avg Area House Age',
 'Avg Area Number of Rooms',
 'Avg Area Number of Bedrooms',
 'Area Population',
 'Price']

In [46]:
# Combine the five numeric predictors into a single vector column "features".
# The input names must match Wholedata.columns exactly: the columns are spelled
# "Avg Area ..." with no "." after "Avg", so the dotted spellings would not
# resolve to any column of this DataFrame.
assembler=VectorAssembler(inputCols=["Avg Area Income",
 "Avg Area House Age",
 "Avg Area Number of Rooms",
 "Avg Area Number of Bedrooms",
 "Area Population"],outputCol="features")

In [47]:
# Re-check the exact column spellings before building the assembler.
Wholedata.columns

['Avg Area Income',
 'Avg Area House Age',
 'Avg Area Number of Rooms',
 'Avg Area Number of Bedrooms',
 'Area Population',
 'Price']

In [51]:
# Predictor columns to pack into the feature vector (everything except the
# 'Price' label), spelled exactly as they appear in Wholedata.columns.
feature_columns = [
    "Avg Area Income",
    "Avg Area House Age",
    "Avg Area Number of Rooms",
    "Avg Area Number of Bedrooms",
    'Area Population',
]

assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

# Append the assembled "features" vector column to the original data.
output = assembler.transform(Wholedata)

In [55]:
# Preview only the assembled feature vectors.
output.select('features').show()

+--------------------+
|            features|
+--------------------+
|[79545.4585743168...|
|[79248.6424548257...|
|[61287.0671786568...|
|[63345.240046228,...|
|[59982.197225708,...|
|[80175.7541594853...|
|[64698.4634278877...|
|[78394.3392775309...|
|[59927.6608133496...|
|[81885.9271840957...|
|[80527.4720829229...|
|[50593.6954970428...|
|[39033.8092369824...|
|[73163.6634410467...|
|[69391.3801843616...|
|[73091.8667458232...|
|[79706.9630576574...|
|[61929.0770180893...|
|[63508.19429943,5...|
|[62085.2764034049...|
+--------------------+
only showing top 20 rows



In [56]:
# Inspect one full row: the original columns plus the new 'features' vector.
output.head(1)

[Row(Avg Area Income=79545.4585743168, Avg Area House Age=5.6828613216, Avg Area Number of Rooms=7.0091881428, Avg Area Number of Bedrooms=4.09, Area Population=23086.8005026865, Price=1059033.55787012, features=DenseVector([79545.4586, 5.6829, 7.0092, 4.09, 23086.8005]))]

In [57]:
# Keep only what the regression needs: the feature vector and the 'Price' label.
model_columns = ["features", 'Price']
dataForModel = output.select(*model_columns)

In [58]:
# Split roughly 70% / 30% into training and test sets. A fixed seed is passed so
# that re-running the notebook yields the same split, and therefore comparable
# downstream metrics, instead of a different random partition every time.
SPLIT_SEED = 42
train_data,test_data = dataForModel.randomSplit([0.7,0.3], seed=SPLIT_SEED)

In [60]:
# Summary statistics of the held-out set, for context when reading the error metrics below.
test_data.describe().show()

+-------+------------------+
|summary|             Price|
+-------+------------------+
|  count|              1500|
|   mean|1217396.1772763303|
| stddev|355245.59928471537|
|    min|   31140.517620186|
|    max|  2271112.74382772|
+-------+------------------+



In [61]:
# Configure the (not yet fitted) linear regression estimator: predict 'Price'
# from the assembled 'features' vector column.
linearRegressionModel = LinearRegression(
    featuresCol='features',
    labelCol='Price',
)

In [63]:
# Fit the estimator on the training split. Despite its name, linearRegressionModel
# is the unfitted LinearRegression estimator; Model holds the fitted result.
Model = linearRegressionModel.fit(train_data)

In [64]:
# Evaluate the fitted model on the held-out test split.
test_results = Model.evaluate(test_data)

In [66]:
# Preview the per-row residuals on the test split.
test_results.residuals.show()

+-------------------+
|          residuals|
+-------------------+
| -80911.43542419776|
|  66545.82573964333|
| 176645.02433646738|
| -72528.04471125765|
| 139152.29629855906|
| 147287.14958614786|
|  367406.5299528695|
| 121194.00343202672|
|-137919.75493268215|
| 111550.31972914038|
| -77378.93676722911|
| -62212.65374314861|
| -17809.74674211361|
|  33314.82992657641|
| -33091.82721239736|
| 100488.91140213516|
| -74041.00603261899|
|  145158.1578062838|
|  5281.638665992767|
|-121956.65671704535|
+-------------------+
only showing top 20 rows



In [79]:
# Report the test-set RMSE, which is in the same units as 'Price'.
rmse = test_results.rootMeanSquaredError
print("Root Mean Squared Error: {}".format(rmse))

Root Mean Squared Error: 102946.39467024198
