In [2]:
# Mount Google Drive into the Colab runtime's filesystem.
from google.colab import drive

MOUNT_POINT = '/content.gdrive'
drive.mount(MOUNT_POINT)

Mounted at /content.gdrive


In [3]:
!pip install pyspark

Collecting pyspark
[?25l  Downloading https://files.pythonhosted.org/packages/f0/26/198fc8c0b98580f617cb03cb298c6056587b8f0447e20fa40c5b634ced77/pyspark-3.0.1.tar.gz (204.2MB)
[K     |████████████████████████████████| 204.2MB 60kB/s 
[?25hCollecting py4j==0.10.9
[?25l  Downloading https://files.pythonhosted.org/packages/9e/b6/6a4fb90cd235dc8e265a6a2067f2a2c99f0d91787f06aca4bcf7c23f3f80/py4j-0.10.9-py2.py3-none-any.whl (198kB)
[K     |████████████████████████████████| 204kB 34.1MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.0.1-py2.py3-none-any.whl size=204612243 sha256=fdb55f51df9fc84da765f7ab9adfcb790303d48f0bccb74465716d08fdd6dc34
  Stored in directory: /root/.cache/pip/wheels/5e/bd/07/031766ca628adec8435bb40f0bd83bb676ce65ff4007f8e73f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.0.1


In [5]:
# Make the "Colab Notebooks" folder on Google Drive importable.
# The first cell mounts Drive at /content.gdrive (not /content/drive), so the
# folder has to be addressed under that mount point.
import sys
nb_path = "/content.gdrive/My Drive/Colab Notebooks"
# Guard against duplicates so re-running the cell leaves sys.path unchanged.
# NOTE(review): index 5 is kept from the original; confirm whether packages in
# this folder should instead take precedence (index 0) or simply be appended.
if nb_path not in sys.path:
  sys.path.insert(5, nb_path)

# Optional one-time step: uncomment the line below on the first run to install
# pyspark into the nb_path folder, then comment it out again.
#!pip install pyspark --target="{nb_path}"
import pyspark

In [6]:
# Start a Spark session named 'lin_reg', or reuse one that already exists.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName('lin_reg')
    .getOrCreate()
)

In [7]:
#import Linear Regression from spark's MLlib
from pyspark.ml.regression import LinearRegression

In [8]:
# Read the CSV: the first row is the header and column types are inferred.
df = spark.read.csv(
    '/content/Linear_regression_dataset.csv',
    header=True,
    inferSchema=True,
)

In [9]:
# Sanity-check the shape of the loaded data as a (rows, columns) tuple.
row_count = df.count()
column_count = len(df.columns)
print((row_count, column_count))

(1232, 6)


In [10]:
# Explore the data: print every column's name, inferred type and nullability.
df.printSchema()

root
 |-- var_1: integer (nullable = true)
 |-- var_2: integer (nullable = true)
 |-- var_3: integer (nullable = true)
 |-- var_4: double (nullable = true)
 |-- var_5: double (nullable = true)
 |-- output: double (nullable = true)



In [11]:
# View summary statistics (count, mean, stddev, min, max) for every column.
# show(5, False): display up to 5 rows without truncating long cell values.
df.describe().show(5,False)

+-------+-----------------+-----------------+------------------+--------------------+--------------------+-------------------+
|summary|var_1            |var_2            |var_3             |var_4               |var_5               |output             |
+-------+-----------------+-----------------+------------------+--------------------+--------------------+-------------------+
|count  |1232             |1232             |1232              |1232                |1232                |1232               |
|mean   |715.0819805194806|715.0819805194806|80.90422077922078 |0.3263311688311693  |0.25927272727272715 |0.39734172077922014|
|stddev |91.5342940441652 |93.07993263118064|11.458139049993724|0.015012772334166148|0.012907228928000298|0.03326689862173776|
|min    |463              |472              |40                |0.277               |0.214               |0.301              |
|max    |1009             |1103             |116               |0.373               |0.294               |0.491

In [12]:
#import corr function from pyspark functions
from pyspark.sql.functions import corr

In [13]:
# Check how strongly the two inputs var_1 and var_2 are correlated.
var_1_var_2_corr = corr('var_1', 'var_2')
df.select(var_1_var_2_corr).show()

+------------------+
|corr(var_1, var_2)|
+------------------+
|0.3801386662491575|
+------------------+



In [14]:
#import vectorassembler to create dense vectors
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import VectorAssembler

In [15]:
# List the column names to pick the input columns for the feature vector
# (every column except the target, 'output').
df.columns


['var_1', 'var_2', 'var_3', 'var_4', 'var_5', 'output']

In [16]:
# Configure an assembler that packs the five input variables into a single
# vector column named 'features'.
input_columns = ['var_1', 'var_2', 'var_3', 'var_4', 'var_5']
vec_assmebler = VectorAssembler(inputCols=input_columns, outputCol='features')

In [17]:
# Apply the assembler to df; the result keeps the original columns and adds
# the assembled 'features' vector column.
features_df=vec_assmebler.transform(df)

In [18]:
# Confirm that the 'features' vector column is now part of the schema.
features_df.printSchema()

root
 |-- var_1: integer (nullable = true)
 |-- var_2: integer (nullable = true)
 |-- var_3: integer (nullable = true)
 |-- var_4: double (nullable = true)
 |-- var_5: double (nullable = true)
 |-- output: double (nullable = true)
 |-- features: vector (nullable = true)



In [19]:
# Preview the first 5 assembled feature vectors without truncating them.
features_df.select('features').show(5,False)

+------------------------------+
|features                      |
+------------------------------+
|[734.0,688.0,81.0,0.328,0.259]|
|[700.0,600.0,94.0,0.32,0.247] |
|[712.0,705.0,93.0,0.311,0.247]|
|[734.0,806.0,69.0,0.315,0.26] |
|[613.0,759.0,61.0,0.302,0.24] |
+------------------------------+
only showing top 5 rows



In [20]:
# Keep only what the model needs: the input vector ('features') and the
# target column ('output').
model_df=features_df.select('features','output')

In [21]:
# Shape of the modelling DataFrame as a (rows, columns) tuple.
model_rows = model_df.count()
model_cols = len(model_df.columns)
print((model_rows, model_cols))

(1232, 2)


In [23]:
# Split the data roughly 70/30 into training and test sets.
# A fixed seed keeps the split, and therefore every coefficient and metric
# computed from it, stable across re-runs of the notebook.
train_df,test_df=model_df.randomSplit([0.7,0.3],seed=42)

In [24]:
# Summary statistics of the training split (row count and the distribution of
# the target column).
train_df.describe().show()

+-------+-------------------+
|summary|             output|
+-------+-------------------+
|  count|                847|
|   mean|0.39792561983471036|
| stddev|0.03324087023468722|
|    min|              0.311|
|    max|              0.491|
+-------+-------------------+



In [26]:
# Set up the linear regression estimator: inputs come from the 'features'
# column and the target is the 'output' column.
lin_Reg = LinearRegression(featuresCol='features', labelCol='output')

In [27]:
# Fit the estimator on the training split; lr_model is the fitted model used
# for all inspection and evaluation below.
lr_model=lin_Reg.fit(train_df)

In [28]:
# Intercept term of the fitted linear model.
lr_model.intercept

0.18925366978382055

In [29]:
# Fitted coefficients, one per input in assembler order (var_1 ... var_5).
print(lr_model.coefficients)

[0.0003446158045290719,5.242645803594545e-05,0.00017141598758795015,-0.6411870757820619,0.4607673574698688]


In [30]:
# Evaluate the fitted model on the training split. Despite the variable name,
# the result is used as an evaluation summary (meanSquaredError, r2), not as
# a table of predictions.
training_predictions=lr_model.evaluate(train_df)

In [31]:
# Mean squared error on the training split.
training_predictions.meanSquaredError

0.00015419967481956147

In [32]:
# Coefficient of determination (R^2) on the training split.
training_predictions.r2

0.8602822010737656

In [33]:
# Evaluate the fitted model on the held-out test split; test_results exposes
# the residuals and error metrics inspected in the following cells.
test_results=lr_model.evaluate(test_df)

In [34]:
# View the first 10 residuals of the model on the test split.
test_results.residuals.show(10)

+--------------------+
|           residuals|
+--------------------+
|-0.01293466760616...|
|0.007390792573481297|
|-8.12504188299312...|
|-0.00653994951760...|
|0.008241031760976125|
|0.004439831487794865|
|-0.01218376981680...|
|-0.00184845145731...|
|-0.01136696743357...|
|-3.92323399864613...|
+--------------------+
only showing top 10 rows



In [35]:
# Coefficient of determination (R^2) on the test split.
test_results.r2

0.8884225703337666

In [36]:
# Root mean squared error on the test split.
test_results.rootMeanSquaredError

0.011119256312808162

In [37]:
# Mean squared error on the test split.
test_results.meanSquaredError

0.00012363786094992415