In [1]:
from matplotlib import pyplot as plt

In [2]:
import findspark
findspark.init()

In [3]:
import pyspark
import pandas as pd

In [4]:
l_file = pd.read_csv('../../datasets/iris.csv',sep=',', decimal='.',
header=None, names= ['sepal_length', 'sepal_width',
'petal_length', 'petal_width', 'target'])

In [5]:
sc = pyspark.SparkContext(appName="simple_linear_reg")

In [6]:
from pyspark.sql.types import *
from datetime import datetime

In [7]:
schema = StructType([StructField("sepal_length", FloatType(), True),StructField("sepal_width", FloatType(), True),
                    StructField("petal_length", FloatType(), True),StructField("petal_width", FloatType(), True),
                     StructField("target", StringType(), True)])

In [8]:
from pyspark.sql import SQLContext

In [9]:
sqlctx = SQLContext(sc)

In [10]:
my_rdd = sqlctx.createDataFrame(l_file,schema)

In [11]:
from pyspark.ml.feature import VectorAssembler

In [12]:
assembler = VectorAssembler(
  inputCols=["sepal_length"],
  outputCol="features"
)

In [13]:
sepal_petal_model = my_rdd.select("sepal_length", "petal_length").dropna()

In [14]:
model_assembled = assembler.transform(sepal_petal_model)

In [15]:
(train, test) = model_assembled.randomSplit([0.7, 0.3])

In [16]:
from pyspark.ml.regression import LinearRegression

In [17]:
lr = LinearRegression(
  featuresCol="features",
  labelCol="petal_length",
  fitIntercept=True
)

In [18]:
lr_model = lr.fit(train)

In [19]:
lr_summary = lr_model.evaluate(test)

In [20]:
print("Coefficients: " + str(lr_model.coefficients))
print("Intercept: " + str(lr_model.intercept))


Coefficients: [1.8598067977772306]
Intercept: -7.127047744726532


In [21]:
trainingSummary = lr_model.summary
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

RMSE: 0.882208
r2: 0.757335


In [22]:
train.describe().show()

+-------+------------------+------------------+
|summary|      sepal_length|      petal_length|
+-------+------------------+------------------+
|  count|               102|               102|
|   mean| 5.842156854330325|3.7382352866378485|
| stddev|0.8421373278710862|1.7997273720627796|
|    min|               4.3|               1.0|
|    max|               7.7|               6.9|
+-------+------------------+------------------+



In [23]:
lr_preds = lr_model.transform(test)
lr_preds.select("prediction","petal_length","features").show(5)

+------------------+------------+-------------------+
|        prediction|petal_length|           features|
+------------------+------------+-------------------+
|1.0561023428582796|         1.3|[4.400000095367432]|
| 1.242082845271005|         1.3|              [4.5]|
|1.4280633476837306|         1.4|[4.599999904632568]|
|  1.80002523933417|         1.4|[4.800000190734863]|
|  1.80002523933417|         1.9|[4.800000190734863]|
+------------------+------------+-------------------+
only showing top 5 rows



In [24]:
sc.stop()