In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('abc').getOrCreate()

In [23]:
df = spark.read.csv('cereal.csv',header=True,inferSchema=True)
df.show(3)

+-----------------+---+----+--------+-------+---+------+-----+-----+------+------+--------+-----+------+----+---------+
|             name|mfr|type|calories|protein|fat|sodium|fiber|carbo|sugars|potass|vitamins|shelf|weight|cups|   rating|
+-----------------+---+----+--------+-------+---+------+-----+-----+------+------+--------+-----+------+----+---------+
|        100% Bran|  N|   C|      70|      4|  1|   130| 10.0|  5.0|     6|   280|      25|    3|   1.0|0.33|68.402973|
|100% Natural Bran|  Q|   C|     120|      3|  5|    15|  2.0|  8.0|     8|   135|       0|    3|   1.0| 1.0|33.983679|
|         All-Bran|  K|   C|      70|      4|  1|   260|  9.0|  7.0|     5|   320|      25|    3|   1.0|0.33|59.425505|
+-----------------+---+----+--------+-------+---+------+-----+-----+------+------+--------+-----+------+----+---------+
only showing top 3 rows



In [25]:
df = df.drop('sodium', 'fiber', 'carbo', 'sugars', 'potass', 'vitamins', 'shelf','mfr', 'type')

In [26]:
print(df.columns)

['name', 'calories', 'protein', 'fat', 'weight', 'cups', 'rating']


In [27]:
df.show(3)

+-----------------+--------+-------+---+------+----+---------+
|             name|calories|protein|fat|weight|cups|   rating|
+-----------------+--------+-------+---+------+----+---------+
|        100% Bran|      70|      4|  1|   1.0|0.33|68.402973|
|100% Natural Bran|     120|      3|  5|   1.0| 1.0|33.983679|
|         All-Bran|      70|      4|  1|   1.0|0.33|59.425505|
+-----------------+--------+-------+---+------+----+---------+
only showing top 3 rows



In [29]:
from pyspark.ml.feature import VectorAssembler
vec = VectorAssembler(inputCols=['calories', 'protein', 'fat', 'weight', 'cups'],outputCol='vectored_cols')

In [31]:
output = vec.transform(df)
output.show(2)

+-----------------+--------+-------+---+------+----+---------+--------------------+
|             name|calories|protein|fat|weight|cups|   rating|       vectored_cols|
+-----------------+--------+-------+---+------+----+---------+--------------------+
|        100% Bran|      70|      4|  1|   1.0|0.33|68.402973|[70.0,4.0,1.0,1.0...|
|100% Natural Bran|     120|      3|  5|   1.0| 1.0|33.983679|[120.0,3.0,5.0,1....|
+-----------------+--------+-------+---+------+----+---------+--------------------+
only showing top 2 rows



In [33]:
finaldf = output.select('rating','vectored_cols')
finaldf.show(2)

+---------+--------------------+
|   rating|       vectored_cols|
+---------+--------------------+
|68.402973|[70.0,4.0,1.0,1.0...|
|33.983679|[120.0,3.0,5.0,1....|
+---------+--------------------+
only showing top 2 rows



In [35]:
train,test = finaldf.randomSplit([0.70,0.30])

In [36]:
from pyspark.ml.regression import LinearRegression

model = LinearRegression(featuresCol='vectored_cols',labelCol='rating')
model = model.fit(finaldf)

In [37]:
model.coefficients

DenseVector([-0.4581, 6.4265, -3.0893, 6.8656, -2.9917])

In [38]:
model.intercept

73.78966034272224

In [40]:
pred = model.evaluate(test)
pred.predictions.show(5)

+---------+--------------------+-----------------+
|   rating|       vectored_cols|       prediction|
+---------+--------------------+-----------------+
|22.396513|[110.0,1.0,1.0,1....|30.60737400704666|
|23.804043|[110.0,1.0,1.0,1....|31.35528949071795|
|28.592785|[140.0,3.0,1.0,1....|33.14951359303556|
|28.742414|[110.0,1.0,0.0,1....|32.70945398307532|
|30.450843|[130.0,3.0,2.0,1....|34.42083218609504|
+---------+--------------------+-----------------+
only showing top 5 rows



In [41]:
pred.meanSquaredError

45.84308515927681