In [21]:
from pyspark.sql import SparkSession

# Step 1: Create the SparkSession Object

In [22]:
# Reuse the active Spark session if there is one; otherwise start a new one
# under the application name 'linear_regression'.
spark = (
    SparkSession.builder
    .appName('linear_regression')
    .getOrCreate()
)

# Step 2: Read the Dataset

In [23]:
# Load the CSV into a Spark DataFrame: the first line supplies the column
# names and Spark infers each column's type from the data.
df = spark.read.csv(
    'Linear_regression_dataset.csv',
    header=True,
    inferSchema=True,
)

# Step 3: Exploratory Data Analysis

In [24]:
def shape(df):
    """Return the dimensions of a DataFrame as ``(n_rows, n_columns)``.

    Args:
        df: An object exposing a ``count()`` method (number of rows) and a
            ``columns`` sequence (column names), such as a Spark DataFrame.

    Returns:
        A 2-tuple ``(row count, column count)``.
    """
    n_rows = df.count()
    n_columns = len(df.columns)
    return n_rows, n_columns

In [25]:
# Report the dataset dimensions as (row count, column count).
shape(df)

(1232, 6)

In [26]:
# Print each column's name, inferred type, and nullability.
df.printSchema()

root
 |-- var_1: integer (nullable = true)
 |-- var_2: integer (nullable = true)
 |-- var_3: integer (nullable = true)
 |-- var_4: double (nullable = true)
 |-- var_5: double (nullable = true)
 |-- output: double (nullable = true)



In [27]:
# Summary statistics for every column. describe() yields one row per
# statistic; show(3) prints only the first three of them (count, mean and
# stddev in the output below), so any further summary rows are not displayed.
df.describe().show(3)

+-------+-----------------+-----------------+------------------+--------------------+--------------------+-------------------+
|summary|            var_1|            var_2|             var_3|               var_4|               var_5|             output|
+-------+-----------------+-----------------+------------------+--------------------+--------------------+-------------------+
|  count|             1232|             1232|              1232|                1232|                1232|               1232|
|   mean|715.0819805194806|715.0819805194806| 80.90422077922078|  0.3263311688311693| 0.25927272727272715|0.39734172077922014|
| stddev| 91.5342940441652|93.07993263118064|11.458139049993724|0.015012772334166148|0.012907228928000298|0.03326689862173776|
+-------+-----------------+-----------------+------------------+--------------------+--------------------+-------------------+
only showing top 3 rows



In [28]:
# View the first 3 rows; head(3) returns them as a list of Row objects.
df.head(3)

[Row(var_1=734, var_2=688, var_3=81, var_4=0.328, var_5=0.259, output=0.418),
 Row(var_1=700, var_2=600, var_3=94, var_4=0.32, var_5=0.247, output=0.389),
 Row(var_1=712, var_2=705, var_3=93, var_4=0.311, var_5=0.247, output=0.417)]

In [29]:
# Compute the Pearson correlation (not the covariance) between each feature
# column var_1..var_5 and the output column.
from pyspark.sql.functions import corr

# var_1 seems to be most strongly correlated with the output column.
df.select(*[corr(f'var_{i+1}', 'output') 
            for i in range(5)]).show()

+-------------------+-------------------+-------------------+-------------------+-------------------+
|corr(var_1, output)|corr(var_2, output)|corr(var_3, output)|corr(var_4, output)|corr(var_5, output)|
+-------------------+-------------------+-------------------+-------------------+-------------------+
| 0.9187399607627283|0.43652698913681093| 0.4014958408311139| 0.7909100204842113| 0.7904806260381185|
+-------------------+-------------------+-------------------+-------------------+-------------------+



# Step 4: Feature Engineering

Spark's `VectorAssembler` combines all of the input feature columns into a single vector column that holds the input values for each row.

In [30]:
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import VectorAssembler

In [31]:
# List the column names available for feature assembly.
df.columns

['var_1', 'var_2', 'var_3', 'var_4', 'var_5', 'output']

In [32]:
# Pack the five predictor columns into one vector column named 'features'.
vec_assembler = VectorAssembler(
    outputCol='features',
    inputCols=['var_1', 'var_2', 'var_3', 'var_4', 'var_5'],
)
features_df = vec_assembler.transform(df)

# The schema now includes the additional 'features' vector column.
features_df.printSchema()

root
 |-- var_1: integer (nullable = true)
 |-- var_2: integer (nullable = true)
 |-- var_3: integer (nullable = true)
 |-- var_4: double (nullable = true)
 |-- var_5: double (nullable = true)
 |-- output: double (nullable = true)
 |-- features: vector (nullable = true)



In [33]:
# To build the linear regression model, we just need the assembled 'features'
# vector column and the 'output' column; the individual var_* columns are not
# carried forward.
model_df = features_df.select('features', 'output')
# Preview the first 5 rows of the modelling DataFrame.
model_df.show(5)

+--------------------+------+
|            features|output|
+--------------------+------+
|[734.0,688.0,81.0...| 0.418|
|[700.0,600.0,94.0...| 0.389|
|[712.0,705.0,93.0...| 0.417|
|[734.0,806.0,69.0...| 0.415|
|[613.0,759.0,61.0...| 0.378|
+--------------------+------+
only showing top 5 rows



# Step 5: Splitting the dataset

In [34]:
# Randomly split the rows into training and test sets using weights 0.7 / 0.3.
# A fixed seed is passed so the split can be repeated; the resulting sizes are
# only approximately 70/30, as the counts below show.
train_df, test_df = model_df.randomSplit([.7, .3], seed=42)
# Display the (rows, columns) shape of each split.
shape(train_df), shape(test_df)

((855, 2), (377, 2))

# Step 6: Build and train linear regression model

In [35]:
from pyspark.ml.regression import LinearRegression

In [36]:
# Choose the "output" column as the label. No featuresCol is passed, so the
# estimator falls back to PySpark's default feature column name, 'features',
# which is the column name produced by the VectorAssembler step.
lin_reg = LinearRegression(labelCol='output')

# Fit the model to the train dataset.
lr_model = lin_reg.fit(train_df)

In [37]:
# Fitted weights, one per entry of the features vector (var_1..var_5, in order).
lr_model.coefficients

DenseVector([0.0003, 0.0001, 0.0002, -0.6292, 0.4952])

In [38]:
# Fitted intercept (constant) term of the model.
lr_model.intercept

0.17778601018255152

In [39]:
# evaluate() returns a summary object exposing fit metrics (not a DataFrame of
# predictions, despite the variable name); here we read R^2 on the training set.
training_predictions = lr_model.evaluate(train_df)
training_predictions.r2

0.8749889968158224

# Step 7: Evaluate Linear Regression Model on Test Data

In [40]:
# Score the held-out test set; evaluate() returns a metrics summary, from which
# we read R^2 and the mean squared error.
test_predictions = lr_model.evaluate(test_df)
test_predictions.r2, test_predictions.meanSquaredError

(0.8547754016343414, 0.00015731589773936277)