<a href="https://colab.research.google.com/github/anshupandey/Machine_Learning_Training/blob/master/Apache_Spark_Linear_Regression_for_housing_prices_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installing Apache Spark

In [1]:
!apt-get update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-2.3.1/spark-2.3.1-bin-hadoop2.7.tgz
!tar xf spark-2.3.1-bin-hadoop2.7.tgz
!pip install -q findspark

0% [Working]            Hit:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
0% [Waiting for headers] [Connecting to security.ubuntu.com] [Connected to clou                                                                               Hit:2 http://archive.ubuntu.com/ubuntu bionic InRelease
                                                                               Get:3 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
                                                                               Get:4 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
0% [3 InRelease 15.6 kB/88.7 kB 18%] [Waiting for headers] [Waiting for headers0% [1 InRelease gpgv 1,581 B] [3 InRelease 15.6 kB/88.7 kB 18%] [Waiting for he                                                                               Get:5 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
0% [1 InRelease gpgv 1,5

In [2]:
# Setting up the paths: tell Spark where the JDK and the unpacked Spark
# distribution live.
import os

os.environ.update(
    JAVA_HOME="/usr/lib/jvm/java-8-openjdk-amd64",
    SPARK_HOME="/content/spark-2.3.1-bin-hadoop2.7",
)

## Starting with Spark

In [4]:
# Make the unpacked Spark distribution importable before touching pyspark.
# pyspark is never pip-installed in this notebook, so on a fresh kernel the
# import below only works once findspark has added Spark's Python libraries
# (located through the SPARK_HOME environment variable) to sys.path.
import findspark
findspark.init()

from pyspark.sql import SparkSession

# Create a Spark session, or reuse the one that is already active.
spark = SparkSession.builder.getOrCreate()

In [5]:
# Training split: the first CSV row is the header and column types are
# inferred from the data.
train_data_path = "/content/sample_data/california_housing_train.csv"
train_data = spark.read.csv(
    path=train_data_path,
    header=True,
    inferSchema=True,
)

# Test split, read with the same options.
test_data_path = "/content/sample_data/california_housing_test.csv"
test_data = spark.read.csv(
    path=test_data_path,
    header=True,
    inferSchema=True,
)

In [6]:
# Number of rows in each split, one per line: training first, then test.
print(train_data.count(), test_data.count(), sep="\n")

17000
3000


In [7]:
# Preview the first 20 training rows, truncating long cell values
# (these are the defaults of show(), spelled out).
train_data.show(n=20, truncate=True)

+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+
|  -114.31|   34.19|              15.0|     5612.0|        1283.0|    1015.0|     472.0|       1.4936|           66900.0|
|  -114.47|    34.4|              19.0|     7650.0|        1901.0|    1129.0|     463.0|         1.82|           80100.0|
|  -114.56|   33.69|              17.0|      720.0|         174.0|     333.0|     117.0|       1.6509|           85700.0|
|  -114.57|   33.64|              14.0|     1501.0|         337.0|     515.0|     226.0|       3.1917|           73400.0|
|  -114.57|   33.57|              20.0|     1454.0|         326.0|     624.0|     262.0|        1.925|           65500.0|
|  -114.58|   33.63|    

## Creating a feature assembler

In [9]:
# Pick the predictor columns by name rather than by position (columns[2:-1]),
# so the selection does not silently change if the CSV column order does.
# The two coordinate columns and the regression target are left out; the
# remaining columns keep their original order.
excluded_columns = {"longitude", "latitude", "median_house_value"}
feature_names = [name for name in train_data.columns if name not in excluded_columns]
feature_names

['housing_median_age',
 'total_rooms',
 'total_bedrooms',
 'population',
 'households',
 'median_income']

In [10]:
from pyspark.ml.feature import VectorAssembler

# Combine the selected predictor columns into one vector column named
# "Feature", which is the column the regression is trained on.
assembler = VectorAssembler(
    inputCols=feature_names,
    outputCol="Feature",
)

# Add the vector column to the training data and preview five rows.
train_data2 = assembler.transform(train_data)
train_data2.show(n=5)

+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+--------------------+
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|             Feature|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+--------------------+
|  -114.31|   34.19|              15.0|     5612.0|        1283.0|    1015.0|     472.0|       1.4936|           66900.0|[15.0,5612.0,1283...|
|  -114.47|    34.4|              19.0|     7650.0|        1901.0|    1129.0|     463.0|         1.82|           80100.0|[19.0,7650.0,1901...|
|  -114.56|   33.69|              17.0|      720.0|         174.0|     333.0|     117.0|       1.6509|           85700.0|[17.0,720.0,174.0...|
|  -114.57|   33.64|              14.0|     1501.0|         337.0|     515.0|     226.0|       3.1917|           73400.0|[14.0,1501.0,337....|

## Training a Machine Learning algorithm

In [11]:
from pyspark.ml.regression import LinearRegression

# Linear regression that reads its inputs from the assembled "Feature"
# vector and uses median_house_value as the target.
algo = LinearRegression(
    labelCol="median_house_value",
    featuresCol="Feature",
)

# Fit on the assembled training data.
model = algo.fit(train_data2)

In [12]:
# Run the test split through the same assembler used for training so it
# gains the "Feature" column, then score the fitted model on it.
test_data2 = assembler.transform(test_data)
evaluation_summary = model.evaluate(dataset=test_data2)

In [13]:
# R² from the summary produced by evaluating the fitted model on the test split.
evaluation_summary.r2

0.545883534674609

In [14]:
# Mean squared error from the same test-split evaluation summary.
evaluation_summary.meanSquaredError

5808966246.710222