<a href="https://colab.research.google.com/github/abelsare348/Building-RAG-app-on-Datasets/blob/main/Linear_Regression/ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from pyspark.sql import SparkSession
spark=SparkSession.builder.appName("ML-ops").master("local").getOrCreate()

In [4]:
housingdf=spark.read.option("inferSchema",True).option("header",True).csv("/content/housing.csv")
housingdf.show()

+-----------+--------+---------+---+------+
|square_feet|bedrooms|bathrooms|age| price|
+-----------+--------+---------+---+------+
|       1000|       2|        1| 10|200000|
|       1500|       3|        2|  5|300000|
|       2000|       4|        3|  2|450000|
|       2500|       4|        2|  8|500000|
|       1200|       3|        1| 20|220000|
|       1800|       3|        2|  4|350000|
+-----------+--------+---------+---+------+



In [6]:
# Define input columns into one
feature_cols=["square_feet","bedrooms","bathrooms","age"]

In [8]:
from pyspark.ml.feature import VectorAssembler

# create assembler to feed to spark ml model in input and output form
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

In [13]:
data= assembler.transform(housingdf).select("features","price")

In [14]:
data.show()

+--------------------+------+
|            features| price|
+--------------------+------+
|[1000.0,2.0,1.0,1...|200000|
|[1500.0,3.0,2.0,5.0]|300000|
|[2000.0,4.0,3.0,2.0]|450000|
|[2500.0,4.0,2.0,8.0]|500000|
|[1200.0,3.0,1.0,2...|220000|
|[1800.0,3.0,2.0,4.0]|350000|
+--------------------+------+



In [16]:
# divide data for testing and training
train_data,test_data=data.randomSplit([0.8,0.2],seed=42)

In [18]:
from pyspark.ml.regression import LinearRegression

# build linear regression model on train_data
model= LinearRegression(featuresCol="features",labelCol="price").fit(train_data)

In [21]:
print("intercept" ,model.intercept)
print("coefficient" ,model.coefficients)

intercept 228749.9999999147
coefficient [-137.49999999989578,378749.99999987386,-317499.99999988085,-33124.99999998893]


In [23]:
# Evaluate on test data
predictions= model.transform(test_data)
predictions.select("features","price","prediction").show()

+--------------------+------+-----------------+
|            features| price|       prediction|
+--------------------+------+-----------------+
|[1500.0,3.0,2.0,5.0]|300000|358124.9999999863|
+--------------------+------+-----------------+



In [24]:
# performance summary

summary = model.summary
print("RMSE: ", summary.rootMeanSquaredError)
print("R2: ", summary.r2)

RMSE:  3.729607896255331e-09
R2:  1.0
