In [3]:
from matplotlib import pyplot as plt

In [4]:
import findspark
# Run findspark before `import pyspark` (next cell) so the local Spark
# installation can be located and made importable.
# NOTE(review): relies on SPARK_HOME or a default install location — confirm.
findspark.init()

In [5]:
import pyspark
import pandas as pd

In [6]:
# Load the iris CSV into a pandas DataFrame. The file is read as having no
# header row (header=None), so the column names are supplied explicitly.
l_file = pd.read_csv(
    '../../datasets/iris.csv',
    sep=',',
    decimal='.',
    header=None,
    names=[
        'sepal_length',
        'sepal_width',
        'petal_length',
        'petal_width',
        'target',
    ],
)

In [8]:
# Obtain the SparkContext for this notebook.
# getOrCreate() returns the already-running context when this cell is
# re-executed in a live kernel; constructing SparkContext(...) directly a
# second time raises "ValueError: Cannot run multiple SparkContexts at once".
# On a fresh kernel it creates a new context with the given application name.
sc = pyspark.SparkContext.getOrCreate(
    pyspark.SparkConf().setAppName("simple_linear_reg")
)

In [9]:
from pyspark.sql.types import *
from datetime import datetime

In [10]:
# Spark schema for the iris data: four nullable float measurement columns
# followed by a nullable string column holding the class label.
schema = StructType(
    [
        StructField(column_name, FloatType(), True)
        for column_name in ("sepal_length", "sepal_width", "petal_length", "petal_width")
    ]
    + [StructField("target", StringType(), True)]
)

In [11]:
from pyspark.sql import SQLContext

In [12]:
# SQL entry point bound to the SparkContext created above; used below to
# build Spark DataFrames.
# NOTE(review): SQLContext is deprecated in Spark 3.x in favour of
# SparkSession — confirm the target Spark version before migrating.
sqlctx = SQLContext(sparkContext=sc)

In [13]:
# Convert the pandas frame into a Spark DataFrame using the explicit schema.
# Despite its name, `my_rdd` is a DataFrame (the result of createDataFrame),
# not an RDD.
my_rdd = sqlctx.createDataFrame(data=l_file, schema=schema)

In [14]:
from pyspark.ml.feature import VectorAssembler

In [15]:
# Pack the single predictor column (sepal_length) into the vector column
# "features", which the regression estimator below reads as its input.
assembler = VectorAssembler(inputCols=["sepal_length"], outputCol="features")

In [17]:
# Keep only the predictor (sepal_length) and the label (petal_length), and
# discard any row in which either value is missing.
sepal_petal_model = (
    my_rdd
    .select("sepal_length", "petal_length")
    .na.drop()
)

In [18]:
# Append the "features" vector column to the two-column modelling frame.
model_assembled = assembler.transform(dataset=sepal_petal_model)

In [19]:
# Split the assembled data roughly 70% / 30% into training and test sets.
# randomSplit samples rows at random, so without a seed every run produces
# different partitions (and therefore different coefficients and metrics).
# A fixed seed keeps the split repeatable across Restart & Run All.
(train, test) = model_assembled.randomSplit([0.7, 0.3], seed=42)

In [20]:
from pyspark.ml.regression import LinearRegression

In [22]:
# Simple linear regression estimator: predicts petal_length from the
# "features" vector (sepal_length only), with an intercept term fitted.
lr = LinearRegression(featuresCol="features", labelCol="petal_length", fitIntercept=True)

In [23]:
# Fit the regression on the training split only.
lr_model = lr.fit(dataset=train)

In [24]:
# Evaluate the fitted model on the held-out test split.
# NOTE(review): `lr_summary` is not read by any later cell visible here; the
# metrics printed further down come from `lr_model.summary` instead.
lr_summary = lr_model.evaluate(dataset=test)

In [37]:
# Report the fitted slope (one coefficient, for sepal_length) and intercept.
for label, value in (
    ("Coefficients: ", lr_model.coefficients),
    ("Intercept: ", lr_model.intercept),
):
    print(label + str(value))


Coefficients: [1.8678632639198458]
Intercept: -7.150974611260485


In [38]:
# `lr_model.summary` describes the fit on the data the model was trained on,
# so the RMSE and r2 printed here are training-set figures, not held-out
# test metrics (those are in `lr_summary`, from lr_model.evaluate(test)).
trainingSummary = lr_model.summary
print("RMSE: %f" % (trainingSummary.rootMeanSquaredError,))
print("r2: %f" % (trainingSummary.r2,))

RMSE: 0.801866
r2: 0.790843


In [39]:
# Print summary statistics (count, mean, stddev, min, max) for the columns of
# the training split.
train.describe().show()

+-------+------------------+------------------+
|summary|      sepal_length|      petal_length|
+-------+------------------+------------------+
|  count|                97|                97|
|   mean| 5.840206170819469|3.7577319489311924|
| stddev|0.8391041421519595|1.7624459219776096|
|    min|               4.3|               1.0|
|    max|               7.7|               6.9|
+-------+------------------+------------------+



In [42]:
# Score the test split; transform() adds a "prediction" column. Show the
# first five predictions next to the true petal_length and the input vector.
lr_preds = lr_model.transform(dataset=test)
lr_preds.select(["prediction", "petal_length", "features"]).show(n=5)

+------------------+------------+-------------------+
|        prediction|petal_length|           features|
+------------------+------------+-------------------+
|1.0676239281201587|         1.3|[4.400000095367432]|
|1.6279823728961462|         1.6|[4.699999809265137]|
|1.8147694118214188|         1.4|[4.800000190734863]|
|1.8147694118214188|         1.6|[4.800000190734863]|
|1.8147694118214188|         1.9|[4.800000190734863]|
+------------------+------------+-------------------+
only showing top 5 rows



In [43]:
# Shut down the SparkContext now that the analysis is finished. Cells that
# use `sc` (or objects built on it) cannot be re-run until a new context is
# created.
sc.stop()