# Linear Regression

In [1]:
from pyspark.ml.regression import LinearRegression
from pyspark.sql import SparkSession
import scipy.stats as st
import numpy as np
import pandas as pd
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator

### Randomly Generate the Data

In [2]:
# Randomly generate the data: y = m*x + c + standard-normal noise
m = 2      # true slope of the underlying line
c = 5      # true intercept of the underlying line
SEED = 42  # fixed seed so a fresh "Restart & Run All" reproduces the same noise

# Create a pandas DataFrame with 500 data points, x evenly spaced on [0, 10]
num_point = 500
x = np.linspace(start=0, stop=10, num=num_point)
noise = st.norm().rvs(size=num_point, random_state=SEED)
y = m * x + c + noise
df = pd.DataFrame({"x": x, "y": y})


In [3]:
# Create (or reuse) the Spark session
spark = SparkSession.builder.appName("asheesh").getOrCreate()

# Convert the pandas DataFrame to a Spark DataFrame.
# Guarded so that re-running this cell is a no-op: after the first run `df`
# is already a Spark DataFrame, and createDataFrame() rejects such input.
if isinstance(df, pd.DataFrame):
    df = spark.createDataFrame(df)

22/04/05 09:11:09 WARN Utils: Your hostname, a-Lenovo-Legion-Y530-15ICH resolves to a loopback address: 127.0.1.1; using 192.168.1.4 instead (on interface wlp7s0)
22/04/05 09:11:09 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
22/04/05 09:11:09 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/04/05 09:11:10 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/04/05 09:11:10 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


### Without Pipeline

In [4]:
# Assemble the single input column "x" into the vector column "rawFeatures"
vectorAssembler = VectorAssembler(inputCols=["x"], outputCol="rawFeatures")

# Keep only the feature vector and the label column, then preview the result
v_df = vectorAssembler.transform(df).select("rawFeatures", "y")
v_df.show()

+--------------------+------------------+
|         rawFeatures|                 y|
+--------------------+------------------+
|               [0.0]| 3.143243310075543|
|[0.02004008016032...|3.9351219271088858|
|[0.04008016032064...|5.1160815825151165|
|[0.06012024048096...| 4.556468499246165|
|[0.08016032064128...|5.8236679062038545|
|[0.1002004008016032]| 6.601993866649372|
|[0.12024048096192...| 5.915353255667452|
|[0.1402805611222445]| 4.690303476261306|
|[0.16032064128256...|6.6944377144739535|
|[0.18036072144288...| 4.875275466981254|
|[0.2004008016032064]| 4.550791169404153|
|[0.22044088176352...| 5.561599099942741|
|[0.24048096192384...|5.3541273971123315|
|[0.2605210420841683]|5.1884012925193534|
| [0.280561122244489]|3.7359781694187753|
|[0.3006012024048096]| 6.142590495383834|
|[0.32064128256513...|3.8532991444123104|
|[0.3406813627254509]|4.9704226332255885|
|[0.3607214428857715]|  5.98233622436208|
|[0.3807615230460922]| 6.754053972640565|
+--------------------+------------

[Stage 0:>                                                          (0 + 1) / 1]                                                                                

In [5]:
# Split the data into training (70%) and testing (30%) sets.
# A fixed seed keeps the sampling reproducible, so the fitted coefficients
# do not drift every time the notebook is re-run.
splits = v_df.randomSplit([0.7, 0.3], seed=42)
train_df, test_df = splits

In [6]:
# Configure a linear regression that reads "rawFeatures" and predicts "y"
lr = LinearRegression(
    featuresCol="rawFeatures",
    labelCol="y",
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
)
lr_model = lr.fit(train_df)

# Report the fitted coefficient vector and intercept
print(f"Coefficients: {lr_model.coefficients}")
print(f"Intercept: {lr_model.intercept}")

22/04/05 09:11:18 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
22/04/05 09:11:18 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS


Coefficients: [1.9158929086544758]
Intercept: 5.427339116827695


#### Save the Fitted Model

In [7]:
lr_model.write().overwrite().save("lr_model")
! ls -lrt lr_model

total 8
drwxrwxr-x 2 a a 4096 Apr  5 09:11 metadata
drwxrwxr-x 2 a a 4096 Apr  5 09:11 data


In [8]:
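# prediction
# NOTE: lr_model.transform(df).show() does not work here: `df` only has the
# columns x and y, while the model reads the assembled "rawFeatures" column.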

In [9]:
# Score every row. The bare model reads the "rawFeatures" column, so the raw
# frame has to go through the assembler first.
# show() only prints and returns None, so keep the DataFrame itself in
# `prediction_df` and display it as a separate step.
prediction_df = lr_model.transform(vectorAssembler.transform(df))
prediction_df.show()

+-------------------+------------------+--------------------+------------------+
|                  x|                 y|         rawFeatures|        prediction|
+-------------------+------------------+--------------------+------------------+
|                0.0| 3.143243310075543|               [0.0]| 5.427339116827695|
|0.02004008016032064|3.9351219271088858|[0.02004008016032...|  5.46573376429572|
|0.04008016032064128|5.1160815825151165|[0.04008016032064...| 5.504128411763746|
|0.06012024048096192| 4.556468499246165|[0.06012024048096...| 5.542523059231772|
|0.08016032064128256|5.8236679062038545|[0.08016032064128...| 5.580917706699797|
| 0.1002004008016032| 6.601993866649372|[0.1002004008016032]| 5.619312354167823|
|0.12024048096192384| 5.915353255667452|[0.12024048096192...| 5.657707001635848|
| 0.1402805611222445| 4.690303476261306|[0.1402805611222445]| 5.696101649103874|
|0.16032064128256512|6.6944377144739535|[0.16032064128256...|5.7344962965718995|
|0.18036072144288576| 4.8752

### With Pipeline

In [10]:
# Chain feature assembly and the regressor into a single estimator
pipeline_stages = [vectorAssembler, lr]
pipeline = Pipeline(stages=pipeline_stages)

In [11]:
# Split the raw (x, y) data into training (70%) and testing (30%) sets.
# A fixed seed keeps the sampling reproducible across notebook re-runs.
splits = df.randomSplit([0.7, 0.3], seed=42)
train_df, test_df = splits

In [12]:
# Fit every pipeline stage (assembler, then regression) on the training split
pipelineModel = pipeline.fit(dataset=train_df)

In [13]:
# The regression is the last stage of the fitted pipeline; report its
# coefficient vector and intercept
fitted_lr = pipelineModel.stages[-1]
print(f"Coefficients: {fitted_lr.coefficients}")
print(f"Intercept: {fitted_lr.intercept}")

Coefficients: [1.909621963904685]
Intercept: 5.478512612791287


In [14]:
# Score the held-out rows with the fitted pipeline and preview the first five
lr_predictions = pipelineModel.transform(test_df)
preview_cols = ["prediction", "y", "rawFeatures"]
lr_predictions.select(preview_cols).show(5)

+-----------------+------------------+--------------------+
|       prediction|                 y|         rawFeatures|
+-----------------+------------------+--------------------+
|5.555050567256405|5.1160815825151165|[0.04008016032064...|
|5.746395453419199| 4.690303476261306|[0.1402805611222445]|
|5.861202385116876| 4.550791169404153|[0.2004008016032064]|
|6.090816248512228|3.8532991444123104|[0.32064128256513...|
|6.129085225744787|4.9704226332255885|[0.3406813627254509]|
+-----------------+------------------+--------------------+
only showing top 5 rows



In [15]:
# Save pipeline
pipelineModel.write().overwrite().save("lr_pipeline")
! ls -lrt lr_pipeline

total 8
drwxrwxr-x 2 a a 4096 Apr  5 09:11 metadata
drwxrwxr-x 4 a a 4096 Apr  5 09:11 stages
