In [1]:
# Make the local Spark installation importable before pyspark is imported below.
# NOTE(review): findspark.init() presumably relies on SPARK_HOME being discoverable — confirm on your machine.
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

# Create (or reuse) a session running on the local master with 4 threads.
# NOTE(review): with a local master the executors presumably share the driver
# JVM, so "spark.executor.memory" likely has no practical effect — confirm.
spark = (
    SparkSession.builder
    .appName("LinearRegression")
    .master("local[4]")
    .config("spark.driver.memory", "2g")
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)

In [4]:
import os

# Location of the Advertising dataset. The default is the original author's
# path; set the ADVERTISING_CSV environment variable to run the notebook on
# another machine without editing this cell.
advertising_csv = os.environ.get(
    "ADVERTISING_CSV",
    "C:/Users/Rulokat/Desktop/GitHub/apache-spark/Mllib-ApacheSpark/Advertising.csv",
)

# Read the CSV with a header row, a comma separator and inferred column types.
df = (
    spark.read.format("csv")
    .option("header", True)
    .option("sep", ",")
    .option("inferSchema", True)
    .load(advertising_csv)
)

In [5]:
# Preview the first five rows. limit() runs before toPandas() so only those
# rows are collected to the driver instead of the whole DataFrame.
df.limit(5).toPandas()

Unnamed: 0,_c0,TV,Radio,Newspaper,Sales
0,1,230.1,37.8,69.2,22.1
1,2,44.5,39.3,45.1,10.4
2,3,17.2,45.9,69.3,9.3
3,4,151.5,41.3,58.5,18.5
4,5,180.8,10.8,58.4,12.9


In [6]:
# Simple linear regression needs a single feature, so sum the three channel
# spends into "Advertisement", rename the target column to "label", and drop
# the original channel columns.
df2 = (
    df.withColumn("Advertisement", df["TV"] + df["Radio"] + df["Newspaper"])
    .withColumnRenamed("Sales", "label")
    .drop("TV", "Radio", "Newspaper")
)

# Preview five rows; limit() first so only those rows are collected to the driver.
df2.limit(5).toPandas()

Unnamed: 0,_c0,label,Advertisement
0,1,22.1,337.1
1,2,10.4,128.9
2,3,9.3,132.4
3,4,18.5,251.3
4,5,12.9,250.0


###### Simple linear regression works with a single feature and measures its impact on the label, so I summed the three features (TV, Radio, Newspaper) and use the total as one feature.

###### That's not the right way to model this data; it is just an example. As we will see, the impact is "meeh!"

In [12]:
# Explore the data: summary statistics for the target and the combined feature.
label_ad_stats = df2.describe("label", "Advertisement")
label_ad_stats.toPandas().head()

Unnamed: 0,summary,label,Advertisement
0,count,200.0,200.0
1,mean,14.022500000000004,200.86049999999992
2,stddev,5.217456565710477,92.9851805869837
3,min,1.6,11.7
4,max,27.0,433.6


#### VectorAssembler

In [8]:
from pyspark.ml.feature import VectorAssembler

# Pack the single "Advertisement" column into the vector column ("features")
# that the regression stage is configured to read.
vector_assembler = VectorAssembler(
    inputCols=["Advertisement"],
    outputCol="features",
)

#### Regression Model

In [9]:
from pyspark.ml.regression import LinearRegression

# Unfitted estimator: predicts "label" from the assembled "features" vector.
linear_reg_obj = LinearRegression(
    featuresCol="features",
    labelCol="label",
)

#### Pipeline

In [10]:
from pyspark.ml import Pipeline

# Chain feature assembly and regression so both are fitted and applied together.
pipeline_stages = [vector_assembler, linear_reg_obj]
pipeline_obj = Pipeline(stages=pipeline_stages)

#### Train test split

In [15]:
# Split with 0.8 / 0.2 weights into train and test sets; the fixed seed keeps the split repeatable.
train_df, test_df = df2.randomSplit([0.8, 0.2], seed=42)

#### Train Model

In [16]:
# Fit the pipeline stages (vector assembly, then linear regression) on the training split.
pipeline_model = pipeline_obj.fit(train_df)

#### Test model

In [17]:
# Apply the fitted pipeline to the held-out split; the result carries "features" and "prediction" columns.
result_df = pipeline_model.transform(test_df)

In [18]:
# Preview five scored rows; limit() first so only those rows are collected to
# the driver instead of the whole result.
result_df.limit(5).toPandas()

Unnamed: 0,_c0,label,Advertisement,features,prediction
0,2,10.4,128.9,[128.9],10.509158
1,3,9.3,132.4,[132.39999999999998],10.67992
2,4,18.5,251.3,[251.3],16.480936
3,14,9.7,112.3,[112.3],9.69926
4,17,12.5,218.4,[218.4],14.875777


In [19]:
# Show the fitted stages in pipeline order.
pipeline_model.stages

[VectorAssembler_4685b3b0e9beafd01861, LinearRegression_4534a2c93685610d70be]

In [20]:
# Stage 0 is the VectorAssembler, so index 1 is the fitted LinearRegression model.
lr_model = pipeline_model.stages[1]

In [21]:
# Fitted slope; a single entry because "Advertisement" is the only input feature.
lr_model.coefficients

DenseVector([0.0488])

In [22]:
# Fitted intercept of the regression line.
lr_model.intercept

4.220252050433769

In [23]:
# R-squared from the model's training summary.
# NOTE(review): per the Spark docs this summary is computed on the training data
# (train_df), not on test_df, so it is not a held-out score.
lr_model.summary.r2

0.759058125711146

In [24]:
# p-values from the training summary. Two values are returned for one feature:
# per the Spark docs, the last entry belongs to the intercept when one is fitted.
lr_model.summary.pValues

[0.0, 6.661338147750939e-16]

In [25]:
# t-statistics from the training summary, in the same order as the p-values
# (feature coefficient first; per the Spark docs the last entry is the intercept).
lr_model.summary.tValues

[23.07409768933377, 8.94809127087217]

In [26]:
# Root mean squared error from the training summary (see Spark docs: computed on the training data).
lr_model.summary.rootMeanSquaredError

2.5893098192488426

#### Here is our simple linear regression model

# y = 4.220252050433769 + 0.0488 * Advertisement

In [27]:
# Let's predict sales where advertisement is 100.0.
# Each value is wrapped in a 1-tuple so toDF() can build a single-column DataFrame.
df_predict_rdd = spark.sparkContext.parallelize([100.0])
df_predict = df_predict_rdd.map(lambda x: (x,)).toDF(["Advertisement"])

In [28]:
# Reuse the same assembler so the new row gets the "features" vector column the model reads.
df_pred_vec = vector_assembler.transform(df_predict)

In [29]:
# Score the assembled row with the fitted regression model and display the prediction.
lr_model.transform(df_pred_vec).toPandas().head()

Unnamed: 0,Advertisement,features,prediction
0,100.0,[100.0],9.099155
