In [0]:
#final delivery 2
#https://www.kaggle.com/datasets/georgejnr/used-and-new-cars-datasets

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [0]:
spark=SparkSession.builder.appName('Used-Car-Price').getOrCreate()

In [0]:
df = spark.read.format("csv").option("header", "true").load("dbfs:/FileStore/shared_uploads/yanjwang222@gmail.com/car_data.csv")
df.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- Model: string (nullable = true)
 |-- Year: string (nullable = true)
 |-- Status: string (nullable = true)
 |-- Mileage: string (nullable = true)
 |-- Price: string (nullable = true)
 |-- MSRP: string (nullable = true)



In [0]:
columns = df.columns

for col in columns:
    null_count = df.filter(df[col].isNull()).count()
    print(f"Column '{col}' has {null_count} null values.")
    
total_null_count = df.select([count(when(isnull(c), c)).alias(c) for c in df.columns]).first()
total_null_count_dict = total_null_count.asDict()

print(f"Total null values in the dataframe: {total_null_count}")


Column '_c0' has 0 null values.
Column 'Model' has 0 null values.
Column 'Year' has 0 null values.
Column 'Status' has 0 null values.
Column 'Mileage' has 0 null values.
Column 'Price' has 0 null values.
Column 'MSRP' has 0 null values.
Total null values in the dataframe: Row(_c0=0, Model=0, Year=0, Status=0, Mileage=0, Price=0, MSRP=0)


In [0]:
df.show()

+---+--------------------+----+------+-------------+-------+---------------+
|_c0|               Model|Year|Status|      Mileage|  Price|           MSRP|
+---+--------------------+----+------+-------------+-------+---------------+
|  0|2022 Acura TLX A-...|2022|   New|Not available|$49,445|   MSRP $49,445|
|  1|2023 Acura RDX A-...|2023|   New|Not available|$50,895|  Not specified|
|  2|2023 Acura TLX Ty...|2023|   New|Not available|$57,745|  Not specified|
|  3|2023 Acura TLX Ty...|2023|   New|Not available|$57,545|  Not specified|
|  4|2019 Acura MDX Sp...|2019|  Used|   32,675 mi.|$40,990|$600 price drop|
|  5|2023 Acura TLX A-...|2023|   New|Not available|$50,195|   MSRP $50,195|
|  6|2023 Acura TLX A-...|2023|   New|Not available|$50,195|   MSRP $50,195|
|  7|2023 Acura TLX Ty...|2023|   New|Not available|$57,745|  Not specified|
|  8|2023 Acura TLX A-...|2023|   New|Not available|$47,995|  Not specified|
|  9|2022 Acura TLX A-...|2022|   New|Not available|$49,545|  Not specified|

In [0]:
df = df.drop('_c0')
df.show(10)

+--------------------+----+------+-------------+-------+---------------+
|               Model|Year|Status|      Mileage|  Price|           MSRP|
+--------------------+----+------+-------------+-------+---------------+
|2022 Acura TLX A-...|2022|   New|Not available|$49,445|   MSRP $49,445|
|2023 Acura RDX A-...|2023|   New|Not available|$50,895|  Not specified|
|2023 Acura TLX Ty...|2023|   New|Not available|$57,745|  Not specified|
|2023 Acura TLX Ty...|2023|   New|Not available|$57,545|  Not specified|
|2019 Acura MDX Sp...|2019|  Used|   32,675 mi.|$40,990|$600 price drop|
|2023 Acura TLX A-...|2023|   New|Not available|$50,195|   MSRP $50,195|
|2023 Acura TLX A-...|2023|   New|Not available|$50,195|   MSRP $50,195|
|2023 Acura TLX Ty...|2023|   New|Not available|$57,745|  Not specified|
|2023 Acura TLX A-...|2023|   New|Not available|$47,995|  Not specified|
|2022 Acura TLX A-...|2022|   New|Not available|$49,545|  Not specified|
+--------------------+----+------+-------------+---

In [0]:
element_count = df.groupBy('Mileage').count().orderBy('count', ascending=False)
element_count.show(7)

+-------------+-----+
|      Mileage|count|
+-------------+-----+
|Not available|47868|
|      310 mi.|  101|
|   23,000 mi.|   19|
|    1,000 mi.|   18|
|   29,000 mi.|   17|
|   35,000 mi.|   17|
|   26,000 mi.|   13|
+-------------+-----+
only showing top 7 rows



In [0]:
from pyspark.sql.functions import *
from pyspark.sql.functions import when, count, col

In [0]:
df = df.withColumn('Mileage', when(col('Mileage') == 'Not available', '0 mi.').otherwise(col('Mileage')))
df = df.withColumn('Mileage', regexp_replace(col('Mileage'), ' mi.', ''))
df = df.withColumn('Mileage', regexp_replace(col('Mileage'), ',', ''))
df.show()


+--------------------+----+------+-------+-------+---------------+
|               Model|Year|Status|Mileage|  Price|           MSRP|
+--------------------+----+------+-------+-------+---------------+
|2022 Acura TLX A-...|2022|   New|      0|$49,445|   MSRP $49,445|
|2023 Acura RDX A-...|2023|   New|      0|$50,895|  Not specified|
|2023 Acura TLX Ty...|2023|   New|      0|$57,745|  Not specified|
|2023 Acura TLX Ty...|2023|   New|      0|$57,545|  Not specified|
|2019 Acura MDX Sp...|2019|  Used|  32675|$40,990|$600 price drop|
|2023 Acura TLX A-...|2023|   New|      0|$50,195|   MSRP $50,195|
|2023 Acura TLX A-...|2023|   New|      0|$50,195|   MSRP $50,195|
|2023 Acura TLX Ty...|2023|   New|      0|$57,745|  Not specified|
|2023 Acura TLX A-...|2023|   New|      0|$47,995|  Not specified|
|2022 Acura TLX A-...|2022|   New|      0|$49,545|  Not specified|
|2023 Acura Integr...|2023|   New|      0|$36,895|   MSRP $36,895|
|2023 Acura TLX A-...|2023|   New|      0|$48,395|   MSRP $48,

In [0]:
from pyspark.sql.functions import col, when
from pyspark.sql.types import IntegerType

df = df.withColumn("Mileage", when(col("Mileage").rlike("^[0-9]+$"), col("Mileage")).otherwise("0"))
df = df.withColumn("Mileage", col("Mileage").cast(IntegerType()))
df.printSchema()
df.show()

root
 |-- Model: string (nullable = true)
 |-- Year: string (nullable = true)
 |-- Status: string (nullable = true)
 |-- Mileage: integer (nullable = true)
 |-- Price: string (nullable = true)
 |-- MSRP: string (nullable = true)

+--------------------+----+------+-------+-------+---------------+
|               Model|Year|Status|Mileage|  Price|           MSRP|
+--------------------+----+------+-------+-------+---------------+
|2022 Acura TLX A-...|2022|   New|      0|$49,445|   MSRP $49,445|
|2023 Acura RDX A-...|2023|   New|      0|$50,895|  Not specified|
|2023 Acura TLX Ty...|2023|   New|      0|$57,745|  Not specified|
|2023 Acura TLX Ty...|2023|   New|      0|$57,545|  Not specified|
|2019 Acura MDX Sp...|2019|  Used|  32675|$40,990|$600 price drop|
|2023 Acura TLX A-...|2023|   New|      0|$50,195|   MSRP $50,195|
|2023 Acura TLX A-...|2023|   New|      0|$50,195|   MSRP $50,195|
|2023 Acura TLX Ty...|2023|   New|      0|$57,745|  Not specified|
|2023 Acura TLX A-...|2023|   New

In [0]:
element_count_price = df.groupBy('Price').count().orderBy('count', ascending=False)
element_count_price.show(6)
price_count=df.select(col('Price')).count()
print('There are',price_count,'valid data in total')

+----------+-----+
|     Price|count|
+----------+-----+
|Not Priced|  652|
|   $29,995|  241|
|   $34,995|  226|
|   $72,010|  200|
|   $39,995|  192|
|   $24,995|  183|
+----------+-----+
only showing top 6 rows

There are 115762 valid data in total


In [0]:
#Since the proportion of not price data in the whole data is too small,  drop the not price data
df = df.filter(df.Price !='Not Priced')
df.select(col('Price')).count()


Out[13]: 115110

In [0]:
df.count()

Out[14]: 115110

In [0]:
len(df.columns)

Out[15]: 6

In [0]:
df = df.withColumn('Price', regexp_replace(col('Price'), ',', ''))
df.show()

+--------------------+----+------+-------+------+---------------+
|               Model|Year|Status|Mileage| Price|           MSRP|
+--------------------+----+------+-------+------+---------------+
|2022 Acura TLX A-...|2022|   New|      0|$49445|   MSRP $49,445|
|2023 Acura RDX A-...|2023|   New|      0|$50895|  Not specified|
|2023 Acura TLX Ty...|2023|   New|      0|$57745|  Not specified|
|2023 Acura TLX Ty...|2023|   New|      0|$57545|  Not specified|
|2019 Acura MDX Sp...|2019|  Used|  32675|$40990|$600 price drop|
|2023 Acura TLX A-...|2023|   New|      0|$50195|   MSRP $50,195|
|2023 Acura TLX A-...|2023|   New|      0|$50195|   MSRP $50,195|
|2023 Acura TLX Ty...|2023|   New|      0|$57745|  Not specified|
|2023 Acura TLX A-...|2023|   New|      0|$47995|  Not specified|
|2022 Acura TLX A-...|2022|   New|      0|$49545|  Not specified|
|2023 Acura Integr...|2023|   New|      0|$36895|   MSRP $36,895|
|2023 Acura TLX A-...|2023|   New|      0|$48395|   MSRP $48,395|
|2023 Acur

In [0]:
from pyspark.sql.functions import col, regexp_replace
df = df.withColumn('price', regexp_replace(col('price'), '[$€£¥]', ''))

df.printSchema()
df.show(5)

root
 |-- Model: string (nullable = true)
 |-- Year: string (nullable = true)
 |-- Status: string (nullable = true)
 |-- Mileage: integer (nullable = true)
 |-- price: string (nullable = true)
 |-- MSRP: string (nullable = true)

+--------------------+----+------+-------+-----+---------------+
|               Model|Year|Status|Mileage|price|           MSRP|
+--------------------+----+------+-------+-----+---------------+
|2022 Acura TLX A-...|2022|   New|      0|49445|   MSRP $49,445|
|2023 Acura RDX A-...|2023|   New|      0|50895|  Not specified|
|2023 Acura TLX Ty...|2023|   New|      0|57745|  Not specified|
|2023 Acura TLX Ty...|2023|   New|      0|57545|  Not specified|
|2019 Acura MDX Sp...|2019|  Used|  32675|40990|$600 price drop|
+--------------------+----+------+-------+-----+---------------+
only showing top 5 rows



In [0]:
df = df.withColumn('price', col('price').cast('integer'))
df.printSchema()
df.show(5)

root
 |-- Model: string (nullable = true)
 |-- Year: string (nullable = true)
 |-- Status: string (nullable = true)
 |-- Mileage: integer (nullable = true)
 |-- price: integer (nullable = true)
 |-- MSRP: string (nullable = true)

+--------------------+----+------+-------+-----+---------------+
|               Model|Year|Status|Mileage|price|           MSRP|
+--------------------+----+------+-------+-----+---------------+
|2022 Acura TLX A-...|2022|   New|      0|49445|   MSRP $49,445|
|2023 Acura RDX A-...|2023|   New|      0|50895|  Not specified|
|2023 Acura TLX Ty...|2023|   New|      0|57745|  Not specified|
|2023 Acura TLX Ty...|2023|   New|      0|57545|  Not specified|
|2019 Acura MDX Sp...|2019|  Used|  32675|40990|$600 price drop|
+--------------------+----+------+-------+-----+---------------+
only showing top 5 rows



In [0]:
element_count_Status = df.groupBy('Status').count().orderBy('Status', ascending=False)
element_count_Status.show()


+--------------------+-----+
|              Status|count|
+--------------------+-----+
|Volkswagen Certified|  797|
|                Used|61627|
|    Toyota Certified|  183|
|   Porsche Certified| 1914|
|                 New|47434|
|  INFINITI Certified|  865|
|      Ford Certified|   29|
|     Dodge Certified|  388|
| Chevrolet Certified|  211|
|       BMW Certified|  609|
|     Acura Certified| 1053|
+--------------------+-----+



In [0]:
from pyspark.sql.functions import when, col
df = df.withColumn('Status', when(col('Status').isin(['Used', 'New']), col('Status')).otherwise('Certified'))

element_count_Status = df.groupBy('Status').count().orderBy('Status', ascending=False)
element_count_Status.show()

+---------+-----+
|   Status|count|
+---------+-----+
|     Used|61627|
|      New|47434|
|Certified| 6049|
+---------+-----+



In [0]:
from pyspark.sql.functions import split
df = df.withColumn('Brand', split(df['Model'], ' ').getItem(1))
df.show()

+--------------------+----+------+-------+-----+---------------+-----+
|               Model|Year|Status|Mileage|price|           MSRP|Brand|
+--------------------+----+------+-------+-----+---------------+-----+
|2022 Acura TLX A-...|2022|   New|      0|49445|   MSRP $49,445|Acura|
|2023 Acura RDX A-...|2023|   New|      0|50895|  Not specified|Acura|
|2023 Acura TLX Ty...|2023|   New|      0|57745|  Not specified|Acura|
|2023 Acura TLX Ty...|2023|   New|      0|57545|  Not specified|Acura|
|2019 Acura MDX Sp...|2019|  Used|  32675|40990|$600 price drop|Acura|
|2023 Acura TLX A-...|2023|   New|      0|50195|   MSRP $50,195|Acura|
|2023 Acura TLX A-...|2023|   New|      0|50195|   MSRP $50,195|Acura|
|2023 Acura TLX Ty...|2023|   New|      0|57745|  Not specified|Acura|
|2023 Acura TLX A-...|2023|   New|      0|47995|  Not specified|Acura|
|2022 Acura TLX A-...|2022|   New|      0|49545|  Not specified|Acura|
|2023 Acura Integr...|2023|   New|      0|36895|   MSRP $36,895|Acura|
|2023 

In [0]:
element_count_Brand = df.groupBy('Brand').count().orderBy('Brand', ascending=False)
element_count_Brand.show()

+-------------+-----+
|        Brand|count|
+-------------+-----+
|   Volkswagen| 9968|
|       Toyota| 5709|
|        Tesla| 9068|
|      Porsche| 9961|
|Mercedes-Benz|10100|
|        Lexus| 9965|
|     INFINITI| 8664|
|      Hyundai| 8280|
|         Ford| 4465|
|        Dodge| 9819|
|    Chevrolet| 9914|
|          BMW| 9827|
|        Acura| 9370|
+-------------+-----+



In [0]:
df.select('Mileage','price').describe().show()

+-------+-----------------+-----------------+
|summary|          Mileage|            price|
+-------+-----------------+-----------------+
|  count|           115110|           115110|
|   mean| 28279.3212579272|51517.98868908001|
| stddev|38055.55058284502|37931.87068346887|
|    min|                0|             1800|
|    max|           974302|          2499900|
+-------+-----------------+-----------------+



In [0]:
quantiles = {
    c: dict(
        zip(["q1", "q3"], df.approxQuantile(c, [0.25, 0.75], 0))
    )
    for c in ["price"]
}
quantiles

Out[133]: {'price': {'q1': 29980.0, 'q3': 61280.0}}

In [0]:
for i in quantiles:
    iqr = quantiles[i]['q3'] - quantiles[i]['q1']
    quantiles[i]['lower_bound'] = quantiles[i]['q1'] - (iqr * 1.5)
    quantiles[i]['upper_bound'] = quantiles[i]['q3'] + (iqr * 1.5)
print(quantiles)

{'price': {'q1': 29980.0, 'q3': 61280.0, 'lower_bound': -16970.0, 'upper_bound': 108230.0}}


In [0]:
import pyspark.sql.functions as f
df_clean=df.select(
    "*",
    *[
        f.when(
            f.col(c).between(quantiles[c]['lower_bound'], quantiles[c]['upper_bound']),
            0
        ).otherwise(1).alias(c+"_out") 
        for c in ["price"]
    ]
)
df_clean.show(10)

+--------------------+----+------+-------+-----+---------------+-----+---------+
|               Model|Year|Status|Mileage|price|           MSRP|Brand|price_out|
+--------------------+----+------+-------+-----+---------------+-----+---------+
|2022 Acura TLX A-...|2022|   New|      0|49445|   MSRP $49,445|Acura|        0|
|2023 Acura RDX A-...|2023|   New|      0|50895|  Not specified|Acura|        0|
|2023 Acura TLX Ty...|2023|   New|      0|57745|  Not specified|Acura|        0|
|2023 Acura TLX Ty...|2023|   New|      0|57545|  Not specified|Acura|        0|
|2019 Acura MDX Sp...|2019|  Used|  32675|40990|$600 price drop|Acura|        0|
|2023 Acura TLX A-...|2023|   New|      0|50195|   MSRP $50,195|Acura|        0|
|2023 Acura TLX A-...|2023|   New|      0|50195|   MSRP $50,195|Acura|        0|
|2023 Acura TLX Ty...|2023|   New|      0|57745|  Not specified|Acura|        0|
|2023 Acura TLX A-...|2023|   New|      0|47995|  Not specified|Acura|        0|
|2022 Acura TLX A-...|2022| 

In [0]:
from pyspark.sql.functions import col
df_clean=df_clean.withColumn("outliers", col("price_out"))
df_clean.show()

+--------------------+----+------+-------+-----+---------------+-----+---------+--------+
|               Model|Year|Status|Mileage|price|           MSRP|Brand|price_out|outliers|
+--------------------+----+------+-------+-----+---------------+-----+---------+--------+
|2022 Acura TLX A-...|2022|   New|      0|49445|   MSRP $49,445|Acura|        0|       0|
|2023 Acura RDX A-...|2023|   New|      0|50895|  Not specified|Acura|        0|       0|
|2023 Acura TLX Ty...|2023|   New|      0|57745|  Not specified|Acura|        0|       0|
|2023 Acura TLX Ty...|2023|   New|      0|57545|  Not specified|Acura|        0|       0|
|2019 Acura MDX Sp...|2019|  Used|  32675|40990|$600 price drop|Acura|        0|       0|
|2023 Acura TLX A-...|2023|   New|      0|50195|   MSRP $50,195|Acura|        0|       0|
|2023 Acura TLX A-...|2023|   New|      0|50195|   MSRP $50,195|Acura|        0|       0|
|2023 Acura TLX Ty...|2023|   New|      0|57745|  Not specified|Acura|        0|       0|
|2023 Acur

In [0]:
# dropping outliers
df_clean = df_clean.filter((df_clean.outliers == 0) )
df_clean=df_clean.select(["Model","Year", "Status", "Mileage","price","MSRP", "Brand"])
df.select('price','Mileage').describe().show()

+-------+-----------------+-----------------+
|summary|            price|          Mileage|
+-------+-----------------+-----------------+
|  count|           115110|           115110|
|   mean|51517.98868908001| 28279.3212579272|
| stddev|37931.87068346887|38055.55058284502|
|    min|             1800|                0|
|    max|          2499900|           974302|
+-------+-----------------+-----------------+



In [0]:
import numpy as np
print("proportion of the lost Rows: ",np.round((df.count()-df_clean.count())/df.count(),4))

proportion of the lost Rows:  0.0712


In [0]:
df_clean.registerTempTable("dataclean")
display(sqlContext.sql("select * from dataclean"))



Model,Year,Status,Mileage,price,MSRP,Brand
2022 Acura TLX A-Spec,2022,New,0,49445,"MSRP $49,445",Acura
2023 Acura RDX A-Spec,2023,New,0,50895,Not specified,Acura
2023 Acura TLX Type S,2023,New,0,57745,Not specified,Acura
2023 Acura TLX Type S,2023,New,0,57545,Not specified,Acura
2019 Acura MDX Sport Hybrid 3.0L w/Technology Package,2019,Used,32675,40990,$600 price drop,Acura
2023 Acura TLX A-Spec,2023,New,0,50195,"MSRP $50,195",Acura
2023 Acura TLX A-Spec,2023,New,0,50195,"MSRP $50,195",Acura
2023 Acura TLX Type S,2023,New,0,57745,Not specified,Acura
2023 Acura TLX A-Spec,2023,New,0,47995,Not specified,Acura
2022 Acura TLX A-Spec,2022,New,0,49545,Not specified,Acura


Output can only be rendered in Databricks

In [0]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler

In [0]:
#Explore the relationship between vehicle age and mileage by first processing the data. Create a list of ages
df_clean = df_clean.withColumn("Status", when(df.Status == "new", 0).otherwise(1))
df_clean = df_clean.withColumn("age", lit(2023) - year("Year"))
df_clean.show()

+--------------------+----+------+-------+-----+---------------+-----+---+
|               Model|Year|Status|Mileage|price|           MSRP|Brand|age|
+--------------------+----+------+-------+-----+---------------+-----+---+
|2022 Acura TLX A-...|2022|     1|      0|49445|   MSRP $49,445|Acura|  1|
|2023 Acura RDX A-...|2023|     1|      0|50895|  Not specified|Acura|  0|
|2023 Acura TLX Ty...|2023|     1|      0|57745|  Not specified|Acura|  0|
|2023 Acura TLX Ty...|2023|     1|      0|57545|  Not specified|Acura|  0|
|2019 Acura MDX Sp...|2019|     1|  32675|40990|$600 price drop|Acura|  4|
|2023 Acura TLX A-...|2023|     1|      0|50195|   MSRP $50,195|Acura|  0|
|2023 Acura TLX A-...|2023|     1|      0|50195|   MSRP $50,195|Acura|  0|
|2023 Acura TLX Ty...|2023|     1|      0|57745|  Not specified|Acura|  0|
|2023 Acura TLX A-...|2023|     1|      0|47995|  Not specified|Acura|  0|
|2022 Acura TLX A-...|2022|     1|      0|49545|  Not specified|Acura|  1|
|2023 Acura Integr...|202

In [0]:
# Transform the features to vectors.
assembler = VectorAssembler(inputCols=["age", "Mileage", "Status"], outputCol="features")
df_clean = assembler.transform(df_clean)


In [0]:
#The data presentation shows that each vehicle has a vector of selected feature columns. It is used to bring in to build the model.
df_clean.show(10)

+--------------------+----+------+-------+-----+---------------+-----+---+-----------------+
|               Model|Year|Status|Mileage|price|           MSRP|Brand|age|         features|
+--------------------+----+------+-------+-----+---------------+-----+---+-----------------+
|2022 Acura TLX A-...|2022|     1|      0|49445|   MSRP $49,445|Acura|  1|    [1.0,0.0,1.0]|
|2023 Acura RDX A-...|2023|     1|      0|50895|  Not specified|Acura|  0|    [0.0,0.0,1.0]|
|2023 Acura TLX Ty...|2023|     1|      0|57745|  Not specified|Acura|  0|    [0.0,0.0,1.0]|
|2023 Acura TLX Ty...|2023|     1|      0|57545|  Not specified|Acura|  0|    [0.0,0.0,1.0]|
|2019 Acura MDX Sp...|2019|     1|  32675|40990|$600 price drop|Acura|  4|[4.0,32675.0,1.0]|
|2023 Acura TLX A-...|2023|     1|      0|50195|   MSRP $50,195|Acura|  0|    [0.0,0.0,1.0]|
|2023 Acura TLX A-...|2023|     1|      0|50195|   MSRP $50,195|Acura|  0|    [0.0,0.0,1.0]|
|2023 Acura TLX Ty...|2023|     1|      0|57745|  Not specified|Acura|

In [0]:
train_data,test_data = df_clean.randomSplit([0.7,0.3])

In [0]:
#Creating multiple regression models
lr = LinearRegression(featuresCol= 'features',labelCol = "price",maxIter=10, regParam=0.1, elasticNetParam=0.8)
model = lr.fit(train_data)

In [0]:
# Print the coefficients and intercept for linear regression
print("Coefficients: {} Intercept: {}".format(model.coefficients,model.intercept))

Coefficients: [-297.70634026134604,-0.29735411344423335,0.0] Intercept: 54108.69781631584


+ The coefficient of "status" is 0.
+ The 'status' feature is not significantly contributing to the prediction of car price in the model.

In [0]:
predictions = model.transform(test_data)

In [0]:
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(labelCol="price", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

Root Mean Squared Error (RMSE) on test data = 16416.4


In [0]:
evaluator = RegressionEvaluator(labelCol="price", predictionCol="prediction", metricName="r2")
r2_score = evaluator.evaluate(predictions)

print("R2 score on test data: {:.3f}".format(r2_score))

R2 score on test data: 0.363


In [0]:
from pyspark.ml.feature import VectorAssembler, PolynomialExpansion
# Then, let's apply a polynomial expansion to create polynomial features
poly_expansion = PolynomialExpansion(inputCol="features", outputCol="poly_features", degree=2)
df_expanded = poly_expansion.transform(df_clean)

In [0]:
train_data, test_data = df_expanded.randomSplit([0.7, 0.3], seed=1234)

# Define the linear regression model
lr = LinearRegression(featuresCol="poly_features", labelCol="price", maxIter=10, regParam=0.1)

# Fit the model to the training data
lr_model = lr.fit(train_data)

# Make predictions on the test data
predictions = lr_model.transform(test_data)

# Evaluate the model using R-squared
evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="price", metricName="r2")
r2_score = evaluator.evaluate(predictions)

# Print the R-squared score
print("R-squared score on test data: {:.3f}".format(r2_score))

R-squared score on test data: 0.387


The results of the linear h regression model were not satisfactory, and this model explained 38% of the variability in the dependent variable. In other words, the model did not fit very well and 62% of the variability in the dependent variable was not explained by the model. there was no linear relationship between Status and the dependent variable

We continued with other machine learning models to train the fitted data