In [1]:
import os, sys
from pyspark.sql import SparkSession
import pyspark.sql.functions as fun
from pyspark.sql.functions import col
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline

In [2]:
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPAK_DRIVER_PYTHON'] = sys.executable

In [3]:
spark = SparkSession.builder.master("local[2]").appName("ML_LinearRegression").getOrCreate()

In [4]:
spark

In [5]:
df = spark.read.csv("headbrain.csv", header=True)

In [6]:
df.printSchema()

root
 |-- Gender: string (nullable = true)
 |-- Age Range: string (nullable = true)
 |-- Head Size(cm^3): string (nullable = true)
 |-- Brain Weight(grams): string (nullable = true)



In [7]:
df.show(5)

+------+---------+---------------+-------------------+
|Gender|Age Range|Head Size(cm^3)|Brain Weight(grams)|
+------+---------+---------------+-------------------+
|     1|        1|           4512|               1530|
|     1|        1|           3738|               1297|
|     1|        1|           4261|               1335|
|     1|        1|           3777|               1282|
|     1|        1|           4177|               1590|
+------+---------+---------------+-------------------+
only showing top 5 rows



In [8]:
df.count(), len(df.columns)

(237, 4)

In [9]:
# Statistical description
df.select("Head Size(cm^3)", "Brain Weight(grams)").describe().show()

+-------+------------------+-------------------+
|summary|   Head Size(cm^3)|Brain Weight(grams)|
+-------+------------------+-------------------+
|  count|               237|                237|
|   mean|3633.9915611814345|  1282.873417721519|
| stddev| 365.2614224198132| 120.34044578645734|
|    min|              2720|               1012|
|    max|              4747|                955|
+-------+------------------+-------------------+



In [10]:
# EDA - Exploratory Data Analysis

In [11]:
df.groupBy("Gender").count().show()

+------+-----+
|Gender|count|
+------+-----+
|     1|  134|
|     2|  103|
+------+-----+



In [12]:
# Count/Check missing values in data

In [18]:
df.agg(*[fun.count(fun.when(fun.isnull(column),column)).alias(column) for column in df.columns]).show()

+------+---------+---------------+-------------------+
|Gender|Age Range|Head Size(cm^3)|Brain Weight(grams)|
+------+---------+---------------+-------------------+
|     0|        0|              0|                  0|
+------+---------+---------------+-------------------+



In [19]:
df.withColumn("Brain Weight(grams)", col("Brain Weight(grams)").cast("Integer"))

DataFrame[Gender: string, Age Range: string, Head Size(cm^3): string, Brain Weight(grams): int]

In [20]:
# convert all columns into integer
df = df.select(*(col(c).cast("Integer").alias(c) for c in df.columns))

In [21]:
df

DataFrame[Gender: int, Age Range: int, Head Size(cm^3): int, Brain Weight(grams): int]

In [22]:
features = df.drop(*["Gender","Age Range"])

In [23]:
features.show(5)

+---------------+-------------------+
|Head Size(cm^3)|Brain Weight(grams)|
+---------------+-------------------+
|           4512|               1530|
|           3738|               1297|
|           4261|               1335|
|           3777|               1282|
|           4177|               1590|
+---------------+-------------------+
only showing top 5 rows



In [24]:
# drop target column from dataset

X = features.drop("Brain Weight(grams)")
assembler = VectorAssembler(inputCols=X.columns, outputCol='features')

In [25]:
output = assembler.transform(features).select('features','Brain Weight(grams)')

In [26]:
output.show(5)

+--------+-------------------+
|features|Brain Weight(grams)|
+--------+-------------------+
|[4512.0]|               1530|
|[3738.0]|               1297|
|[4261.0]|               1335|
|[3777.0]|               1282|
|[4177.0]|               1590|
+--------+-------------------+
only showing top 5 rows



In [28]:
regression = LinearRegression(featuresCol='features', labelCol='Brain Weight(grams)')
regression_model = regression.fit(output)

In [29]:
# slope
regression_model.coefficients

DenseVector([0.2634])

In [30]:
# intercept
regression_model.intercept

325.5734210494322

In [31]:
regression_model.summary.rootMeanSquaredError

72.12062137837093

In [33]:
regression_model.summary.r2

0.6393117199570006