In [2]:
# Import necessary libraries
from pyspark.sql import SparkSession

# Start a Spark session for this notebook (getOrCreate hands back an existing one if present)
spark = (
    SparkSession.builder
    .appName("Housing DataFrame")
    .getOrCreate()
)

# Load the housing CSV: the first row supplies the column names and
# Spark infers each column's type from the data
housing_df = (
    spark.read
    .options(header="true", inferSchema="true")
    .csv("/user/student/housing.csv")
)

# Print the inferred column names and types
housing_df.printSchema()


                                                                                

root
 |-- CRIM: double (nullable = true)
 |-- ZN: double (nullable = true)
 |-- INDUS: double (nullable = true)
 |-- CHAS: string (nullable = true)
 |-- NOX: double (nullable = true)
 |-- RM: double (nullable = true)
 |-- AGE: double (nullable = true)
 |-- DIS: double (nullable = true)
 |-- RAD: integer (nullable = true)
 |-- TAX: integer (nullable = true)
 |-- PTRATIO: double (nullable = true)
 |-- B: double (nullable = true)
 |-- LSTAT: double (nullable = true)
 |-- MEDV: double (nullable = true)



In [3]:
# Preview the first 20 rows, with long cell values truncated (the defaults of show())
housing_df.show(n=20, truncate=True)

+-------+----+-----+----+-----+-----+-----+------+---+---+-------+------+-----+----+
|   CRIM|  ZN|INDUS|CHAS|  NOX|   RM|  AGE|   DIS|RAD|TAX|PTRATIO|     B|LSTAT|MEDV|
+-------+----+-----+----+-----+-----+-----+------+---+---+-------+------+-----+----+
|0.00632|18.0| 2.31|   0|0.538|6.575| 65.2|  4.09|  1|296|   15.3| 396.9| 4.98|24.0|
|0.02731| 0.0| 7.07|   0|0.469|6.421| 78.9|4.9671|  2|242|   17.8| 396.9| 9.14|21.6|
|0.02729| 0.0| 7.07|   0|0.469|6.421| 78.9|4.9671|  2|242|   17.8| 396.9| 9.14|21.6|
|0.03237| 0.0| 2.18|   0|0.458|6.998| 45.8|6.0622|  3|222|   18.7|394.63| 2.94|33.4|
|0.06905| 0.0| 2.18|   0|0.458|7.147| 54.2|6.0622|  3|222|   18.7| 396.9| 5.33|36.2|
|0.02985| 0.0| 2.18|   0|0.458| 6.43| 58.7|6.0622|  3|222|   18.7|394.12| 5.21|28.7|
|0.08829|12.5| 7.87|null|0.524|6.012| 66.6|5.5605|  5|311|   15.2| 395.6|12.43|22.9|
|0.14455|12.5| 7.87|   0|0.524|6.172| 96.1|5.9505|  5|311|   15.2| 396.9|19.15|27.1|
|0.21124|12.5| 7.87|   0|0.524|5.631|100.0|6.0821|  5|311|   15.2

In [8]:
# na.drop() does not modify housing_df; it returns a NEW DataFrame without the
# rows that contain a null in any column. Keep that result under its own name
# so it is not thrown away. housing_df itself is left untouched, so any cell
# that should exclude incomplete rows has to read from housing_df_clean.
housing_df_clean = housing_df.na.drop()

# Echo the resulting DataFrame as the cell output
housing_df_clean

DataFrame[CRIM: double, ZN: double, INDUS: double, CHAS: string, NOX: double, RM: double, AGE: double, DIS: double, RAD: int, TAX: int, PTRATIO: double, B: double, LSTAT: double, MEDV: double]

In [9]:
# Every column name in the housing DataFrame
all_columns = housing_df.columns

# All columns except 'MEDV', which is used as the regression label further down
feature_columns = [name for name in all_columns if name != "MEDV"]

# Show the resulting feature list
print(feature_columns)


['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']


In [14]:
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.sql import SparkSession


# CHAS is read as a string column (see the printed schema) and VectorAssembler
# only accepts numeric, boolean and vector inputs, so encode it as a numeric
# index first. handleInvalid="keep" gives missing CHAS values their own index
# instead of raising when the rows are transformed.
chas_indexer = StringIndexer(
    inputCol="CHAS",
    outputCol="CHAS_indexed",
    handleInvalid="keep",
)

# Indexed copy of the data; the original CHAS column is kept alongside the
# new CHAS_indexed column. This is the DataFrame the assembling step consumes.
housing_df_indexed = chas_indexer.fit(housing_df).transform(housing_df)

# Feed the indexed column to the assembler in place of the raw string column,
# preserving the original feature order.
assembler_inputs = [
    "CHAS_indexed" if name == "CHAS" else name for name in feature_columns
]

# Combine all feature columns into a single vector column named "features"
assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="features")




In [15]:
# Append the assembled "features" vector column to the indexed housing data
assembled_df = assembler.transform(housing_df_indexed)

# Display the first 20 feature vectors in full (no truncation)
features_only = assembled_df.select("features")
features_only.show(n=20, truncate=False)

+----------------------------------------------------------------------------+
|features                                                                    |
+----------------------------------------------------------------------------+
|[0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98]     |
|[0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14]    |
|[0.02729,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14]    |
|[0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94]   |
|[0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33]    |
|[0.02985,0.0,2.18,0.0,0.458,6.43,58.7,6.0622,3.0,222.0,18.7,394.12,5.21]    |
|[0.08829,12.5,7.87,1.0,0.524,6.012,66.6,5.5605,5.0,311.0,15.2,395.6,12.43]  |
|[0.14455,12.5,7.87,0.0,0.524,6.172,96.1,5.9505,5.0,311.0,15.2,396.9,19.15]  |
|[0.21124,12.5,7.87,0.0,0.524,5.631,100.0,6.0821,5.0,311.0,15.2,386.63,29.93]|
|[0.17004,12.5,7.87,0.0,0.524,6.004,85.9,6.5921,5.0,

In [17]:

# Roughly 70% of the rows go to training and 30% to testing
weights = [0.7, 0.3]

# randomSplit samples rows at random; without a seed every run yields a
# different split, so the fitted model and its predictions cannot be
# reproduced. Fixing the seed makes the split repeatable for the same input.
train, test = assembled_df.randomSplit(weights, seed=42)


In [18]:
from pyspark.ml.regression import LinearRegression

# Configure the estimator: read inputs from the 'features' vector column
# and use 'MEDV' as the label to predict
regression_params = {"featuresCol": 'features', "labelCol": 'MEDV'}
lr = LinearRegression(**regression_params)

# Train on the training split to obtain the fitted model
lr_model = lr.fit(train)


2024-10-01 05:02:34,337 WARN util.Instrumentation: [63e6714a] regParam is zero, which might cause numerical instability and overfitting.
2024-10-01 05:02:34,899 WARN netlib.BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
2024-10-01 05:02:34,900 WARN netlib.BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
2024-10-01 05:02:34,973 WARN netlib.LAPACK: Failed to load implementation from: com.github.fommil.netlib.NativeSystemLAPACK
2024-10-01 05:02:34,973 WARN netlib.LAPACK: Failed to load implementation from: com.github.fommil.netlib.NativeRefLAPACK
2024-10-01 05:02:35,048 WARN util.Instrumentation: [63e6714a] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
2024-10-01 05:02:36,342 ERROR optimize.LBFGS: Failure! Resetting history: breeze.optimize.FirstOrderException: Line search zoom failed
2024-10-01 05:02:36,345 ERROR optimize.LBFGS: Failure! Resetting history: breeze.optimize.F

In [20]:

# Score the test split with the fitted model
predictions = lr_model.transform(test)

# Show the first five rows: feature vector, actual MEDV and predicted value
print("Predictions DataFrame:")
preview_columns = ['features', 'MEDV', 'prediction']
predictions.select(*preview_columns).show(n=5, truncate=False)


Predictions DataFrame:
+--------------------------------------------------------------------------+----+------------------+
|features                                                                  |MEDV|prediction        |
+--------------------------------------------------------------------------+----+------------------+
|[0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98]   |24.0|4.207161825791275 |
|[0.02729,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14]  |21.6|21.60075314695348 |
|[0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94] |33.4|35.34298822791544 |
|[0.08829,12.5,7.87,1.0,0.524,6.012,66.6,5.5605,5.0,311.0,15.2,395.6,12.43]|22.9|5.856243806697023 |
|[0.14455,12.5,7.87,0.0,0.524,6.172,96.1,5.9505,5.0,311.0,15.2,396.9,19.15]|27.1|18.937742182308227|
+--------------------------------------------------------------------------+----+------------------+
only showing top 5 rows



In [22]:

# Score the test split with the fitted model
predictions = lr_model.transform(test)

# Same preview as a feature-vector view, but with each raw input column shown
# separately next to the actual MEDV and the predicted value
display_columns = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
    'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT',
    'MEDV', 'prediction',
]
print("Predictions DataFrame:")
predictions.select(*display_columns).show(n=5, truncate=False)


Predictions DataFrame:
+-------+----+-----+----+-----+-----+----+------+---+---+-------+------+-----+----+------------------+
|CRIM   |ZN  |INDUS|CHAS|NOX  |RM   |AGE |DIS   |RAD|TAX|PTRATIO|B     |LSTAT|MEDV|prediction        |
+-------+----+-----+----+-----+-----+----+------+---+---+-------+------+-----+----+------------------+
|0.00632|18.0|2.31 |0   |0.538|6.575|65.2|4.09  |1  |296|15.3   |396.9 |4.98 |24.0|4.207161825791275 |
|0.02729|0.0 |7.07 |0   |0.469|6.421|78.9|4.9671|2  |242|17.8   |396.9 |9.14 |21.6|21.60075314695348 |
|0.03237|0.0 |2.18 |0   |0.458|6.998|45.8|6.0622|3  |222|18.7   |394.63|2.94 |33.4|35.34298822791544 |
|0.08829|12.5|7.87 |null|0.524|6.012|66.6|5.5605|5  |311|15.2   |395.6 |12.43|22.9|5.856243806697023 |
|0.14455|12.5|7.87 |0   |0.524|6.172|96.1|5.9505|5  |311|15.2   |396.9 |19.15|27.1|18.937742182308227|
+-------+----+-----+----+-----+-----+----+------+---+---+-------+------+-----+----+------------------+
only showing top 5 rows

