In [1]:
import os
import sys

# Point both the Spark driver and its Python workers at the interpreter
# that is running this kernel, so they cannot pick up a different Python.
for env_var in ('PYSPARK_PYTHON', 'PYSPARK_DRIVER_PYTHON'):
    os.environ[env_var] = sys.executable

In [2]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler #group dependent and independent features
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

In [3]:
# Bind the driver to the loopback address (local, single-machine run).
conf_spark = SparkConf().set("spark.driver.host", "127.0.0.1")

# Pass the conf to the builder; without `.config(conf=...)` the
# spark.driver.host setting above is never applied to the session.
spark = (
    SparkSession.builder
    .appName("Testing PySpark Example")
    .config(conf=conf_spark)
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/02/25 10:50:05 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Path to the Housing CSV. Set the HOUSING_CSV environment variable to read
# the file from somewhere else; the default is the original local path.
file_location = os.environ.get("HOUSING_CSV", "/Users/prateekpaul/Downloads/Housing.csv")
file_type = "csv"

# header=True     -> first row supplies the column names
# inferSchema=True -> Spark scans the data to pick column types instead of
#                     reading everything as string
df = (
    spark.read.format(file_type)
    .options(header=True, inferSchema=True)
    .load(file_location)
)
df.show()

+--------+-----+--------+---------+-------+--------+---------+--------+---------------+---------------+-------+--------+----------------+
|   price| area|bedrooms|bathrooms|stories|mainroad|guestroom|basement|hotwaterheating|airconditioning|parking|prefarea|furnishingstatus|
+--------+-----+--------+---------+-------+--------+---------+--------+---------------+---------------+-------+--------+----------------+
|13300000| 7420|       4|        2|      3|     yes|       no|      no|             no|            yes|      2|     yes|       furnished|
|12250000| 8960|       4|        4|      4|     yes|       no|      no|             no|            yes|      3|      no|       furnished|
|12250000| 9960|       3|        2|      2|     yes|       no|     yes|             no|             no|      2|     yes|  semi-furnished|
|12215000| 7500|       4|        2|      2|     yes|       no|     yes|             no|            yes|      3|     yes|       furnished|
|11410000| 7420|       4|        1

In [5]:
df.printSchema()  # column names and inferred types; `price` is the label (dependent), the rest are candidate features (independent)

root
 |-- price: integer (nullable = true)
 |-- area: integer (nullable = true)
 |-- bedrooms: integer (nullable = true)
 |-- bathrooms: integer (nullable = true)
 |-- stories: integer (nullable = true)
 |-- mainroad: string (nullable = true)
 |-- guestroom: string (nullable = true)
 |-- basement: string (nullable = true)
 |-- hotwaterheating: string (nullable = true)
 |-- airconditioning: string (nullable = true)
 |-- parking: integer (nullable = true)
 |-- prefarea: string (nullable = true)
 |-- furnishingstatus: string (nullable = true)



In [6]:
# Column names of the raw frame, as read from the CSV header.
df.columns

['price',
 'area',
 'bedrooms',
 'bathrooms',
 'stories',
 'mainroad',
 'guestroom',
 'basement',
 'hotwaterheating',
 'airconditioning',
 'parking',
 'prefarea',
 'furnishingstatus']

In [7]:
## handling categorical features

from pyspark.ml.feature import StringIndexer

In [8]:
# Encode the yes/no "guestroom" strings as numeric label indices.
# StringIndexer assigns one index per distinct label (by default the most
# frequent label gets 0.0).
indexer = StringIndexer(outputCol="guestroom_indexed", inputCol="guestroom")
guestroom_model = indexer.fit(df)
df_lr = guestroom_model.transform(df)
df_lr.show()

+--------+-----+--------+---------+-------+--------+---------+--------+---------------+---------------+-------+--------+----------------+-----------------+
|   price| area|bedrooms|bathrooms|stories|mainroad|guestroom|basement|hotwaterheating|airconditioning|parking|prefarea|furnishingstatus|guestroom_indexed|
+--------+-----+--------+---------+-------+--------+---------+--------+---------------+---------------+-------+--------+----------------+-----------------+
|13300000| 7420|       4|        2|      3|     yes|       no|      no|             no|            yes|      2|     yes|       furnished|              0.0|
|12250000| 8960|       4|        4|      4|     yes|       no|      no|             no|            yes|      3|      no|       furnished|              0.0|
|12250000| 9960|       3|        2|      2|     yes|       no|     yes|             no|             no|      2|     yes|  semi-furnished|              0.0|
|12215000| 7500|       4|        2|      2|     yes|       no|  

In [9]:
# Index every categorical column in a single pass, starting from the raw `df`.
# Fitting on `df` (instead of re-transforming `df_lr` into itself) keeps this
# cell safe to re-run: StringIndexer raises if an output column already
# exists in the frame it is asked to transform.
# NOTE(review): "parking" is already an integer column. StringIndexer casts it
# to string and assigns indices by label frequency, so `parking_indexed` is
# not guaranteed to equal the parking count. The column is still produced here
# because it is part of df_lr's established schema.
categorical_cols = ["guestroom", "basement", "parking", "prefarea"]
indexer = StringIndexer(
    inputCols=categorical_cols,
    outputCols=[f"{name}_indexed" for name in categorical_cols],
)
df_lr = indexer.fit(df).transform(df)
df_lr.show()

+--------+-----+--------+---------+-------+--------+---------+--------+---------------+---------------+-------+--------+----------------+-----------------+----------------+---------------+----------------+
|   price| area|bedrooms|bathrooms|stories|mainroad|guestroom|basement|hotwaterheating|airconditioning|parking|prefarea|furnishingstatus|guestroom_indexed|basement_indexed|parking_indexed|prefarea_indexed|
+--------+-----+--------+---------+-------+--------+---------+--------+---------------+---------------+-------+--------+----------------+-----------------+----------------+---------------+----------------+
|13300000| 7420|       4|        2|      3|     yes|       no|      no|             no|            yes|      2|     yes|       furnished|              0.0|             0.0|            2.0|             1.0|
|12250000| 8960|       4|        4|      4|     yes|       no|      no|             no|            yes|      3|      no|       furnished|              0.0|             0.0|    

In [10]:
# Confirm the *_indexed columns were appended after the original columns.
df_lr.columns

['price',
 'area',
 'bedrooms',
 'bathrooms',
 'stories',
 'mainroad',
 'guestroom',
 'basement',
 'hotwaterheating',
 'airconditioning',
 'parking',
 'prefarea',
 'furnishingstatus',
 'guestroom_indexed',
 'basement_indexed',
 'parking_indexed',
 'prefarea_indexed']

In [11]:
# Columns packed into the single feature vector the regressor consumes.
# `parking` is used directly: it is already an integer count, whereas a
# StringIndexer output for it is only a frequency rank of its values and
# would lose the numeric meaning whenever the rank and the count differ.
feature_cols = [
    'area', 'bedrooms', 'bathrooms', 'stories',
    'guestroom_indexed', 'basement_indexed', 'parking', 'prefarea_indexed',
]
feature_assembler = VectorAssembler(inputCols=feature_cols, outputCol="Independent Features")

# Appends the "Independent Features" vector column to every row of df_lr.
output = feature_assembler.transform(df_lr)

In [12]:
# Preview the assembled vectors. Rows printed as (size,[indices],[values])
# are the sparse form of the same kind of vector.
output.select('Independent Features').show()

+--------------------+
|Independent Features|
+--------------------+
|[7420.0,4.0,2.0,3...|
|[8960.0,4.0,4.0,4...|
|[9960.0,3.0,2.0,2...|
|[7500.0,4.0,2.0,2...|
|[7420.0,4.0,1.0,2...|
|[7500.0,3.0,3.0,1...|
|[8580.0,4.0,3.0,4...|
|(8,[0,1,2,3],[162...|
|[8100.0,4.0,1.0,2...|
|[5750.0,3.0,2.0,4...|
|[13200.0,3.0,1.0,...|
|[6000.0,4.0,3.0,2...|
|[6550.0,4.0,2.0,2...|
|[3500.0,4.0,2.0,2...|
|[7800.0,3.0,2.0,2...|
|[6000.0,4.0,1.0,2...|
|[6600.0,4.0,2.0,2...|
|[8500.0,3.0,2.0,4...|
|[4600.0,3.0,2.0,2...|
|[6420.0,3.0,2.0,2...|
+--------------------+
only showing top 20 rows



In [13]:
# Full frame: original columns, the *_indexed columns, and the assembled vector.
output.show()

+--------+-----+--------+---------+-------+--------+---------+--------+---------------+---------------+-------+--------+----------------+-----------------+----------------+---------------+----------------+--------------------+
|   price| area|bedrooms|bathrooms|stories|mainroad|guestroom|basement|hotwaterheating|airconditioning|parking|prefarea|furnishingstatus|guestroom_indexed|basement_indexed|parking_indexed|prefarea_indexed|Independent Features|
+--------+-----+--------+---------+-------+--------+---------+--------+---------------+---------------+-------+--------+----------------+-----------------+----------------+---------------+----------------+--------------------+
|13300000| 7420|       4|        2|      3|     yes|       no|      no|             no|            yes|      2|     yes|       furnished|              0.0|             0.0|            2.0|             1.0|[7420.0,4.0,2.0,3...|
|12250000| 8960|       4|        4|      4|     yes|       no|      no|             no|     

In [14]:
# Keep only what the regressor needs: the feature vector and the label.
model_columns = ["Independent Features", "price"]
finalized_data = output.select(model_columns)

In [15]:
# Sanity check: two columns only, feature vector and `price`.
finalized_data.show()

+--------------------+--------+
|Independent Features|   price|
+--------------------+--------+
|[7420.0,4.0,2.0,3...|13300000|
|[8960.0,4.0,4.0,4...|12250000|
|[9960.0,3.0,2.0,2...|12250000|
|[7500.0,4.0,2.0,2...|12215000|
|[7420.0,4.0,1.0,2...|11410000|
|[7500.0,3.0,3.0,1...|10850000|
|[8580.0,4.0,3.0,4...|10150000|
|(8,[0,1,2,3],[162...|10150000|
|[8100.0,4.0,1.0,2...| 9870000|
|[5750.0,3.0,2.0,4...| 9800000|
|[13200.0,3.0,1.0,...| 9800000|
|[6000.0,4.0,3.0,2...| 9681000|
|[6550.0,4.0,2.0,2...| 9310000|
|[3500.0,4.0,2.0,2...| 9240000|
|[7800.0,3.0,2.0,2...| 9240000|
|[6000.0,4.0,1.0,2...| 9100000|
|[6600.0,4.0,2.0,2...| 9100000|
|[8500.0,3.0,2.0,4...| 8960000|
|[4600.0,3.0,2.0,2...| 8890000|
|[6420.0,3.0,2.0,2...| 8855000|
+--------------------+--------+
only showing top 20 rows



In [16]:
## train test split

# 70/30 random split; the fixed seed keeps the partition stable between runs.
train_data, test_data = finalized_data.randomSplit(weights=[0.7, 0.3], seed=42)

# Linear regression with default hyperparameters: `price` is the label and
# the assembled vector column holds the features.
lr = LinearRegression(labelCol='price', featuresCol='Independent Features')
regressor_model = lr.fit(train_data)

25/02/25 10:50:09 WARN Instrumentation: [e47bed90] regParam is zero, which might cause numerical instability and overfitting.
25/02/25 10:50:09 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
25/02/25 10:50:09 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK


In [17]:
# Fitted weights, one per entry of the assembled feature vector.
regressor_model.coefficients

DenseVector([288.9593, 94722.2292, 1017496.5858, 601078.4562, 266473.1044, 423340.1135, 305785.0169, 742597.1194])

In [18]:
# Fitted bias term of the linear model.
regressor_model.intercept

-41576.470027862924

In [19]:
### predictions

# Score the held-out test split. The returned summary exposes `.predictions`
# and the error metrics (r2, MAE, MSE) read in the cells that follow.
pred_result = regressor_model.evaluate(test_data)

In [20]:
## final comparison

# Actual `price` next to the model's `prediction` for each test row.
pred_result.predictions.show()

+--------------------+-------+------------------+
|Independent Features|  price|        prediction|
+--------------------+-------+------------------+
|(8,[0,1,2,3],[200...|2660000| 2945440.094615579|
|(8,[0,1,2,3],[240...|2100000|3155746.0454283934|
|(8,[0,1,2,3],[247...|2380000|3177417.9932361143|
|(8,[0,1,2,3],[250...|3290000|2488841.2905145655|
|(8,[0,1,2,3],[274...|4200000| 4368233.738913052|
|(8,[0,1,2,3],[278...|2380000|  4379792.11107717|
|(8,[0,1,2,3],[280...|2660000| 2670251.310917085|
|(8,[0,1,2,3],[291...|1750000|2702036.8343684096|
|(8,[0,1,2,3],[300...|2135000|  2633320.94256604|
|(8,[0,1,2,3],[300...|2961000|3234399.3987185275|
|(8,[0,1,2,3],[300...|3640000|3234399.3987185275|
|(8,[0,1,2,3],[300...|2100000|3423843.8570617973|
|(8,[0,1,2,3],[304...|2870000| 2644879.314730158|
|(8,[0,1,2,3],[306...|3465000| 2745380.729983852|
|(8,[0,1,2,3],[312...|3220000|3363796.7443825165|
|(8,[0,1,2,3],[318...|2310000| 2685333.617304571|
|(8,[0,1,2,3],[318...|2520000|2780055.8464762056|


In [21]:
pred_result.r2 # performance metric: coefficient of determination (R^2) on the test split

0.7096604317074737

In [22]:
# Test-split error metrics, displayed as an (MAE, MSE) pair.
(pred_result.meanAbsoluteError, pred_result.meanSquaredError)

(779453.240768702, 1205323569215.0828)