In [270]:
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression

spark = SparkSession.builder.appName('Linear_regression').getOrCreate()
data = spark.read.csv('./car_prices.csv', header=True, inferSchema=True)
data.show(3)

+----+----+--------+----------+-----+------------+-----------------+-----+---------+--------+-----+--------+--------------------+-------+------------+--------------------+
|year|make|   model|      trim| body|transmission|              vin|state|condition|odometer|color|interior|              seller|    mmr|sellingprice|            saledate|
+----+----+--------+----------+-----+------------+-----------------+-----+---------+--------+-----+--------+--------------------+-------+------------+--------------------+
|2015| Kia| Sorento|        LX|  SUV|   automatic|5xyktca69fg566472|   ca|      5.0| 16639.0|white|   black|kia motors americ...|20500.0|     21500.0|Tue Dec 16 2014 1...|
|2015| Kia| Sorento|        LX|  SUV|   automatic|5xyktca69fg561319|   ca|      5.0|  9393.0|white|   beige|kia motors americ...|20800.0|     21500.0|Tue Dec 16 2014 1...|
|2014| BMW|3 Series|328i SULEV|Sedan|   automatic|wba3c1c51ek116351|   ca|     45.0|  1331.0| gray|   black|financial service...|31900.0|   

In [271]:
data.printSchema()

root
 |-- year: integer (nullable = true)
 |-- make: string (nullable = true)
 |-- model: string (nullable = true)
 |-- trim: string (nullable = true)
 |-- body: string (nullable = true)
 |-- transmission: string (nullable = true)
 |-- vin: string (nullable = true)
 |-- state: string (nullable = true)
 |-- condition: double (nullable = true)
 |-- odometer: double (nullable = true)
 |-- color: string (nullable = true)
 |-- interior: string (nullable = true)
 |-- seller: string (nullable = true)
 |-- mmr: double (nullable = true)
 |-- sellingprice: double (nullable = true)
 |-- saledate: string (nullable = true)



In [272]:
from pyspark.sql.functions import col
from pyspark.sql.types import IntegerType

# 创建StringIndexer对象
stringIndexer = StringIndexer(inputCol="make", outputCol="make_index")

# 使用StringIndexer对象进行转换
indexedData = stringIndexer.fit(data).transform(data)

# 将"make_index"列转换为整数类型
indexedData = indexedData.withColumn("make_index", col("make_index").cast(IntegerType()))

# 替换"make"列为"make_index"列
indexedData = indexedData.withColumn("make", col("make_index")).drop("make_index")

# 查看转换后的数据
data = indexedData
data.show()

+----+----+-------------------+--------------------+-----------+------------+-----------------+-----+---------+--------+------+--------+--------------------+-------+------------+--------------------+
|year|make|              model|                trim|       body|transmission|              vin|state|condition|odometer| color|interior|              seller|    mmr|sellingprice|            saledate|
+----+----+-------------------+--------------------+-----------+------------+-----------------+-----+---------+--------+------+--------+--------------------+-------+------------+--------------------+
|2015|   8|            Sorento|                  LX|        SUV|   automatic|5xyktca69fg566472|   ca|      5.0| 16639.0| white|   black|kia motors americ...|20500.0|     21500.0|Tue Dec 16 2014 1...|
|2015|   8|            Sorento|                  LX|        SUV|   automatic|5xyktca69fg561319|   ca|      5.0|  9393.0| white|   beige|kia motors americ...|20800.0|     21500.0|Tue Dec 16 2014 1...|


In [273]:
# 处理state列
stringIndexer = StringIndexer(inputCol="state", outputCol="state_index")
indexedData = stringIndexer.fit(data).transform(data)
indexedData = indexedData.withColumn("state_index", col("state_index").cast(IntegerType()))
indexedData = indexedData.withColumn("state", col("state_index")).drop("state_index")

data = indexedData
data.show()

+----+----+-------------------+--------------------+-----------+------------+-----------------+-----+---------+--------+------+--------+--------------------+-------+------------+--------------------+
|year|make|              model|                trim|       body|transmission|              vin|state|condition|odometer| color|interior|              seller|    mmr|sellingprice|            saledate|
+----+----+-------------------+--------------------+-----------+------------+-----------------+-----+---------+--------+------+--------+--------------------+-------+------------+--------------------+
|2015|   8|            Sorento|                  LX|        SUV|   automatic|5xyktca69fg566472|    1|      5.0| 16639.0| white|   black|kia motors americ...|20500.0|     21500.0|Tue Dec 16 2014 1...|
|2015|   8|            Sorento|                  LX|        SUV|   automatic|5xyktca69fg561319|    1|      5.0|  9393.0| white|   beige|kia motors americ...|20800.0|     21500.0|Tue Dec 16 2014 1...|


In [274]:
# 处理model列
stringIndexer = StringIndexer(inputCol="model", outputCol="model_index")
indexedData = stringIndexer.fit(data).transform(data)
indexedData = indexedData.withColumn("model_index", col("model_index").cast(IntegerType()))
indexedData = indexedData.withColumn("model", col("model_index")).drop("model_index")

data = indexedData
data.show()

+----+----+-----+--------------------+-----------+------------+-----------------+-----+---------+--------+------+--------+--------------------+-------+------------+--------------------+
|year|make|model|                trim|       body|transmission|              vin|state|condition|odometer| color|interior|              seller|    mmr|sellingprice|            saledate|
+----+----+-----+--------------------+-----------+------------+-----------------+-----+---------+--------+------+--------+--------------------+-------+------------+--------------------+
|2015|   8|   40|                  LX|        SUV|   automatic|5xyktca69fg566472|    1|      5.0| 16639.0| white|   black|kia motors americ...|20500.0|     21500.0|Tue Dec 16 2014 1...|
|2015|   8|   40|                  LX|        SUV|   automatic|5xyktca69fg561319|    1|      5.0|  9393.0| white|   beige|kia motors americ...|20800.0|     21500.0|Tue Dec 16 2014 1...|
|2014|   7|    8|          328i SULEV|      Sedan|   automatic|wba3c1c

In [275]:
# 处理transmission
stringIndexer = StringIndexer(inputCol="transmission", outputCol="transmission_index")
indexedData = stringIndexer.fit(data).transform(data)
indexedData = indexedData.withColumn("transmission_index", col("transmission_index").cast(IntegerType()))
indexedData = indexedData.withColumn("transmission", col("transmission_index")).drop("transmission_index")

data = indexedData
data.show()

+----+----+-----+--------------------+-----------+------------+-----------------+-----+---------+--------+------+--------+--------------------+-------+------------+--------------------+
|year|make|model|                trim|       body|transmission|              vin|state|condition|odometer| color|interior|              seller|    mmr|sellingprice|            saledate|
+----+----+-----+--------------------+-----------+------------+-----------------+-----+---------+--------+------+--------+--------------------+-------+------------+--------------------+
|2015|   8|   40|                  LX|        SUV|           0|5xyktca69fg566472|    1|      5.0| 16639.0| white|   black|kia motors americ...|20500.0|     21500.0|Tue Dec 16 2014 1...|
|2015|   8|   40|                  LX|        SUV|           0|5xyktca69fg561319|    1|      5.0|  9393.0| white|   beige|kia motors americ...|20800.0|     21500.0|Tue Dec 16 2014 1...|
|2014|   7|    8|          328i SULEV|      Sedan|           0|wba3c1c

In [276]:
# 处理trim
stringIndexer = StringIndexer(inputCol="trim", outputCol="trim_index")
indexedData = stringIndexer.fit(data).transform(data)
indexedData = indexedData.withColumn("trim_index", col("trim_index").cast(IntegerType()))
indexedData = indexedData.withColumn("trim", col("trim_index")).drop("trim_index")

data = indexedData
data.show()

+----+----+-----+----+-----------+------------+-----------------+-----+---------+--------+------+--------+--------------------+-------+------------+--------------------+
|year|make|model|trim|       body|transmission|              vin|state|condition|odometer| color|interior|              seller|    mmr|sellingprice|            saledate|
+----+----+-----+----+-----------+------------+-----------------+-----+---------+--------+------+--------+--------------------+-------+------------+--------------------+
|2015|   8|   40|   2|        SUV|           0|5xyktca69fg566472|    1|      5.0| 16639.0| white|   black|kia motors americ...|20500.0|     21500.0|Tue Dec 16 2014 1...|
|2015|   8|   40|   2|        SUV|           0|5xyktca69fg561319|    1|      5.0|  9393.0| white|   beige|kia motors americ...|20800.0|     21500.0|Tue Dec 16 2014 1...|
|2014|   7|    8| 636|      Sedan|           0|wba3c1c51ek116351|    1|     45.0|  1331.0|  gray|   black|financial service...|31900.0|     30000.0|Th

In [277]:
# 处理body
stringIndexer = StringIndexer(inputCol="body", outputCol="body_index")
indexedData = stringIndexer.fit(data).transform(data)
indexedData = indexedData.withColumn("body_index", col("body_index").cast(IntegerType()))
indexedData = indexedData.withColumn("body", col("body_index")).drop("body_index")

data = indexedData
data.show()

+----+----+-----+----+----+------------+-----------------+-----+---------+--------+------+--------+--------------------+-------+------------+--------------------+
|year|make|model|trim|body|transmission|              vin|state|condition|odometer| color|interior|              seller|    mmr|sellingprice|            saledate|
+----+----+-----+----+----+------------+-----------------+-----+---------+--------+------+--------+--------------------+-------+------------+--------------------+
|2015|   8|   40|   2|   1|           0|5xyktca69fg566472|    1|      5.0| 16639.0| white|   black|kia motors americ...|20500.0|     21500.0|Tue Dec 16 2014 1...|
|2015|   8|   40|   2|   1|           0|5xyktca69fg561319|    1|      5.0|  9393.0| white|   beige|kia motors americ...|20800.0|     21500.0|Tue Dec 16 2014 1...|
|2014|   7|    8| 636|   0|           0|wba3c1c51ek116351|    1|     45.0|  1331.0|  gray|   black|financial service...|31900.0|     30000.0|Thu Jan 15 2015 0...|
|2015|  26|  127|  91|

In [278]:
# 处理color
stringIndexer = StringIndexer(inputCol="color", outputCol="color_index")
indexedData = stringIndexer.fit(data).transform(data)
indexedData = indexedData.withColumn("color_index", col("color_index").cast(IntegerType()))
indexedData = indexedData.withColumn("color", col("color_index")).drop("color_index")

data = indexedData
data.show()

+----+----+-----+----+----+------------+-----------------+-----+---------+--------+-----+--------+--------------------+-------+------------+--------------------+
|year|make|model|trim|body|transmission|              vin|state|condition|odometer|color|interior|              seller|    mmr|sellingprice|            saledate|
+----+----+-----+----+----+------------+-----------------+-----+---------+--------+-----+--------+--------------------+-------+------------+--------------------+
|2015|   8|   40|   2|   1|           0|5xyktca69fg566472|    1|      5.0| 16639.0|    1|   black|kia motors americ...|20500.0|     21500.0|Tue Dec 16 2014 1...|
|2015|   8|   40|   2|   1|           0|5xyktca69fg561319|    1|      5.0|  9393.0|    1|   beige|kia motors americ...|20800.0|     21500.0|Tue Dec 16 2014 1...|
|2014|   7|    8| 636|   0|           0|wba3c1c51ek116351|    1|     45.0|  1331.0|    3|   black|financial service...|31900.0|     30000.0|Thu Jan 15 2015 0...|
|2015|  26|  127|  91|   0| 

In [279]:
# 处理interior
stringIndexer = StringIndexer(inputCol="interior", outputCol="interior_index")
indexedData = stringIndexer.fit(data).transform(data)
indexedData = indexedData.withColumn("interior_index", col("interior_index").cast(IntegerType()))
indexedData = indexedData.withColumn("interior", col("interior_index")).drop("interior_index")

data = indexedData
data.show()

+----+----+-----+----+----+------------+-----------------+-----+---------+--------+-----+--------+--------------------+-------+------------+--------------------+
|year|make|model|trim|body|transmission|              vin|state|condition|odometer|color|interior|              seller|    mmr|sellingprice|            saledate|
+----+----+-----+----+----+------------+-----------------+-----+---------+--------+-----+--------+--------------------+-------+------------+--------------------+
|2015|   8|   40|   2|   1|           0|5xyktca69fg566472|    1|      5.0| 16639.0|    1|       0|kia motors americ...|20500.0|     21500.0|Tue Dec 16 2014 1...|
|2015|   8|   40|   2|   1|           0|5xyktca69fg561319|    1|      5.0|  9393.0|    1|       2|kia motors americ...|20800.0|     21500.0|Tue Dec 16 2014 1...|
|2014|   7|    8| 636|   0|           0|wba3c1c51ek116351|    1|     45.0|  1331.0|    3|       0|financial service...|31900.0|     30000.0|Thu Jan 15 2015 0...|
|2015|  26|  127|  91|   0| 

In [280]:
# 处理seller
stringIndexer = StringIndexer(inputCol="seller", outputCol="seller_index")
indexedData = stringIndexer.fit(data).transform(data)
indexedData = indexedData.withColumn("seller_index", col("seller_index").cast(IntegerType()))
indexedData = indexedData.withColumn("seller", col("seller_index")).drop("seller_index")

data = indexedData
data.show()

+----+----+-----+----+----+------------+-----------------+-----+---------+--------+-----+--------+------+-------+------------+--------------------+
|year|make|model|trim|body|transmission|              vin|state|condition|odometer|color|interior|seller|    mmr|sellingprice|            saledate|
+----+----+-----+----+----+------------+-----------------+-----+---------+--------+-----+--------+------+-------+------------+--------------------+
|2015|   8|   40|   2|   1|           0|5xyktca69fg566472|    1|      5.0| 16639.0|    1|       0|    23|20500.0|     21500.0|Tue Dec 16 2014 1...|
|2015|   8|   40|   2|   1|           0|5xyktca69fg561319|    1|      5.0|  9393.0|    1|       2|    23|20800.0|     21500.0|Tue Dec 16 2014 1...|
|2014|   7|    8| 636|   0|           0|wba3c1c51ek116351|    1|     45.0|  1331.0|    3|       0|    14|31900.0|     30000.0|Thu Jan 15 2015 0...|
|2015|  26|  127|  91|   0|           0|yv1612tb4f1310987|    1|     41.0| 14282.0|    1|       0|   123|27500.0

In [281]:
# 显示"saledate"列的前10条数据
saledate_data = data.select("saledate").head(10)

# 打印前10条数据
for row in saledate_data:
    print(row.saledate)

from pyspark.sql.functions import substring, unix_timestamp


Tue Dec 16 2014 12:30:00 GMT-0800 (PST)
Tue Dec 16 2014 12:30:00 GMT-0800 (PST)
Thu Jan 15 2015 04:30:00 GMT-0800 (PST)
Thu Jan 29 2015 04:30:00 GMT-0800 (PST)
Thu Dec 18 2014 12:30:00 GMT-0800 (PST)
Tue Dec 30 2014 12:00:00 GMT-0800 (PST)
Wed Dec 17 2014 12:30:00 GMT-0800 (PST)
Tue Dec 16 2014 13:00:00 GMT-0800 (PST)
Thu Dec 18 2014 12:00:00 GMT-0800 (PST)
Tue Jan 20 2015 04:00:00 GMT-0800 (PST)


In [282]:
from pyspark.sql.functions import substring, unix_timestamp

# 将"timestamp_string"列转换为Unix时间戳
data = data.withColumn("timestamp_string", substring("saledate", 5, 20))
data = data.withColumn("timestamp", unix_timestamp("timestamp_string", "MMM dd yyyy HH:mm:ss"))

data = data.drop('saledate')
data = data.drop('timestamp_string')
data = data.drop('vin')

# 查看转换后的数据
data.show()

+----+----+-----+----+----+------------+-----+---------+--------+-----+--------+------+-------+------------+----------+
|year|make|model|trim|body|transmission|state|condition|odometer|color|interior|seller|    mmr|sellingprice| timestamp|
+----+----+-----+----+----+------------+-----+---------+--------+-----+--------+------+-------+------------+----------+
|2015|   8|   40|   2|   1|           0|    1|      5.0| 16639.0|    1|       0|    23|20500.0|     21500.0|1418704200|
|2015|   8|   40|   2|   1|           0|    1|      5.0|  9393.0|    1|       2|    23|20800.0|     21500.0|1418704200|
|2014|   7|    8| 636|   0|           0|    1|     45.0|  1331.0|    3|       0|    14|31900.0|     30000.0|1421267400|
|2015|  26|  127|  91|   0|           0|    1|     41.0| 14282.0|    1|       0|   123|27500.0|     27750.0|1422477000|
|2014|   7|  398| 139|   0|           0|    1|     43.0|  2641.0|    3|       0|    14|66000.0|     67000.0|1418877000|
|2015|   2|    0|  12|   0|           0|

In [283]:
# 将"timestamp"列丢弃末两位
data = data.withColumn("timestamp_str", data["timestamp"].cast("string"))
data = data.withColumn("timestamp_str_truncated", substring("timestamp_str", 0, 8))
data = data.withColumn("timestamp_int", data["timestamp_str_truncated"].cast("integer"))

data = data.drop('timestamp_str')
data = data.drop('timestamp_str_truncated')
data = data.drop('timestamp')
data = data.withColumnRenamed("timestamp_int", "timestamp")


data.show()

+----+----+-----+----+----+------------+-----+---------+--------+-----+--------+------+-------+------------+---------+
|year|make|model|trim|body|transmission|state|condition|odometer|color|interior|seller|    mmr|sellingprice|timestamp|
+----+----+-----+----+----+------------+-----+---------+--------+-----+--------+------+-------+------------+---------+
|2015|   8|   40|   2|   1|           0|    1|      5.0| 16639.0|    1|       0|    23|20500.0|     21500.0| 14187042|
|2015|   8|   40|   2|   1|           0|    1|      5.0|  9393.0|    1|       2|    23|20800.0|     21500.0| 14187042|
|2014|   7|    8| 636|   0|           0|    1|     45.0|  1331.0|    3|       0|    14|31900.0|     30000.0| 14212674|
|2015|  26|  127|  91|   0|           0|    1|     41.0| 14282.0|    1|       0|   123|27500.0|     27750.0| 14224770|
|2014|   7|  398| 139|   0|           0|    1|     43.0|  2641.0|    3|       0|    14|66000.0|     67000.0| 14188770|
|2015|   2|    0|  12|   0|           0|    1|  

In [284]:
data.printSchema()
print(data.dtypes)

root
 |-- year: integer (nullable = true)
 |-- make: integer (nullable = true)
 |-- model: integer (nullable = true)
 |-- trim: integer (nullable = true)
 |-- body: integer (nullable = true)
 |-- transmission: integer (nullable = true)
 |-- state: integer (nullable = true)
 |-- condition: double (nullable = true)
 |-- odometer: double (nullable = true)
 |-- color: integer (nullable = true)
 |-- interior: integer (nullable = true)
 |-- seller: integer (nullable = true)
 |-- mmr: double (nullable = true)
 |-- sellingprice: double (nullable = true)
 |-- timestamp: integer (nullable = true)

[('year', 'int'), ('make', 'int'), ('model', 'int'), ('trim', 'int'), ('body', 'int'), ('transmission', 'int'), ('state', 'int'), ('condition', 'double'), ('odometer', 'double'), ('color', 'int'), ('interior', 'int'), ('seller', 'int'), ('mmr', 'double'), ('sellingprice', 'double'), ('timestamp', 'int')]


In [285]:
# 1. 检查数据中是否存在 null 值
null_cols = [col for col in data.columns if data.select(col).where(data[col].isNull()).count() > 0]
if null_cols:
    print(f"Null values found in columns: {', '.join(null_cols)}")

In [286]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols = ['year', 'make', 'model', 'trim','body', 'transmission', 'state', 'condition', 'odometer', 'color', 'interior', 'seller', 'mmr', 'timestamp'], outputCol = 'features')
output = assembler.transform(data)
output.take(1)

[Row(year=2015, make=8, model=40, trim=2, body=1, transmission=0, state=1, condition=5.0, odometer=16639.0, color=1, interior=0, seller=23, mmr=20500.0, sellingprice=21500.0, timestamp=14187042, features=DenseVector([2015.0, 8.0, 40.0, 2.0, 1.0, 0.0, 1.0, 5.0, 16639.0, 1.0, 0.0, 23.0, 20500.0, 14187042.0]))]

In [287]:
output.printSchema()

root
 |-- year: integer (nullable = true)
 |-- make: integer (nullable = true)
 |-- model: integer (nullable = true)
 |-- trim: integer (nullable = true)
 |-- body: integer (nullable = true)
 |-- transmission: integer (nullable = true)
 |-- state: integer (nullable = true)
 |-- condition: double (nullable = true)
 |-- odometer: double (nullable = true)
 |-- color: integer (nullable = true)
 |-- interior: integer (nullable = true)
 |-- seller: integer (nullable = true)
 |-- mmr: double (nullable = true)
 |-- sellingprice: double (nullable = true)
 |-- timestamp: integer (nullable = true)
 |-- features: vector (nullable = true)



In [288]:
final_data = output.select(['features', 'sellingprice'])
final_data = final_data.dropna()
final_data.show(3, truncate=False)

+-------------------------------------------------------------------------------+------------+
|features                                                                       |sellingprice|
+-------------------------------------------------------------------------------+------------+
|[2015.0,8.0,40.0,2.0,1.0,0.0,1.0,5.0,16639.0,1.0,0.0,23.0,20500.0,1.4187042E7] |21500.0     |
|[2015.0,8.0,40.0,2.0,1.0,0.0,1.0,5.0,9393.0,1.0,2.0,23.0,20800.0,1.4187042E7]  |21500.0     |
|[2014.0,7.0,8.0,636.0,0.0,0.0,1.0,45.0,1331.0,3.0,0.0,14.0,31900.0,1.4212674E7]|30000.0     |
+-------------------------------------------------------------------------------+------------+
only showing top 3 rows



In [289]:
final_data.printSchema()

root
 |-- features: vector (nullable = true)
 |-- sellingprice: double (nullable = true)



In [290]:
train, test = final_data.randomSplit([0.8,0.2])

In [291]:
lr = LinearRegression(featuresCol='features',labelCol = 'sellingprice')
lr_model = lr.fit(train)

24/04/18 23:32:04 WARN Instrumentation: [db219cd9] regParam is zero, which might cause numerical instability and overfitting.


                                                                                

24/04/18 23:32:06 WARN InstanceBuilder$NativeLAPACK: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK


                                                                                

In [292]:
test_result = lr_model.evaluate(test)
test_result.residuals.show()



+-------------------+
|          residuals|
+-------------------+
|  649.7156936336105|
| -448.7918031239824|
| -103.8415829329897|
|-2213.6102735144523|
|  592.5781652863079|
|-210.76295282732463|
| -1156.676953950111|
|-1598.8941669862834|
| 2491.8675940604444|
|-1570.2115097727365|
| -654.9342387894139|
| 284.35505812345946|
|   661.407095612245|
|  681.9532207576412|
|    1315.2593939207|
|  37.65987456556468|
|  579.9958578528021|
| 240.50897955577238|
| 317.71410721387656|
| -666.5979251225071|
+-------------------+
only showing top 20 rows



In [293]:
test_result.rootMeanSquaredError

1630.70399003853

In [294]:
test_result.r2

0.9714209771385157

In [295]:
final_data.describe().show()

[Stage 626:===>                                                   (1 + 15) / 16]

+-------+-----------------+
|summary|     sellingprice|
+-------+-----------------+
|  count|           472325|
|   mean|13690.51205843434|
| stddev|9613.033737691603|
|    min|              1.0|
|    max|         230000.0|
+-------+-----------------+



                                                                                

In [296]:
unlabeled_data = test.select('features')
unlabeled_data.show()

+--------------------+
|            features|
+--------------------+
|(14,[0,1,2,6,7,8,...|
|(14,[0,1,2,7,8,11...|
|(14,[0,1,2,7,8,11...|
|(14,[0,1,2,7,8,11...|
|(14,[0,1,2,7,8,11...|
|(14,[0,1,2,7,8,11...|
|(14,[0,1,2,7,8,11...|
|(14,[0,1,3,7,8,11...|
|(14,[0,2,3,4,7,8,...|
|(14,[0,2,3,6,7,8,...|
|(14,[0,2,3,6,7,8,...|
|(14,[0,2,3,6,7,8,...|
|(14,[0,2,3,6,7,8,...|
|(14,[0,2,3,6,7,8,...|
|(14,[0,2,3,6,7,8,...|
|(14,[0,2,3,6,7,8,...|
|(14,[0,2,3,6,7,8,...|
|(14,[0,2,3,6,7,8,...|
|(14,[0,2,3,6,7,8,...|
|(14,[0,2,3,6,7,8,...|
+--------------------+
only showing top 20 rows



In [297]:
predictions = lr_model.transform(unlabeled_data)
predictions.show()

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|(14,[0,1,2,6,7,8,...| 15750.28430636639|
|(14,[0,1,2,7,8,11...| 948.7918031239824|
|(14,[0,1,2,7,8,11...| 903.8415829329897|
|(14,[0,1,2,7,8,11...| 5313.610273514452|
|(14,[0,1,2,7,8,11...|30407.421834713692|
|(14,[0,1,2,7,8,11...|34610.762952827325|
|(14,[0,1,2,7,8,11...| 29056.67695395011|
|(14,[0,1,3,7,8,11...| 9898.894166986283|
|(14,[0,2,3,4,7,8,...| 8908.132405939556|
|(14,[0,2,3,6,7,8,...|4870.2115097727365|
|(14,[0,2,3,6,7,8,...| 6954.934238789414|
|(14,[0,2,3,6,7,8,...| 12315.64494187654|
|(14,[0,2,3,6,7,8,...|12838.592904387755|
|(14,[0,2,3,6,7,8,...| 21418.04677924236|
|(14,[0,2,3,6,7,8,...|  14484.7406060793|
|(14,[0,2,3,6,7,8,...|14862.340125434435|
|(14,[0,2,3,6,7,8,...|18720.004142147198|
|(14,[0,2,3,6,7,8,...|18659.491020444228|
|(14,[0,2,3,6,7,8,...|12382.285892786123|
|(14,[0,2,3,6,7,8,...|12266.597925122507|
+--------------------+------------