In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler,StringIndexer,OneHotEncoder
from pyspark.ml.regression import RandomForestRegressor

# Train a random-forest regressor that predicts house price from
# area, age, room count and (one-hot encoded) city.

# Random forests are stochastic; pin the seed so that re-running this cell
# on a fresh kernel reproduces the same trees in the debug printout.
RF_SEED = 42

spark = SparkSession.builder.getOrCreate()

# The CSV is ';'-delimited and has a header row; column types are inferred.
houseDF = (
    spark.read
    .option('header', 'true')
    .option('delimiter', ';')
    .option('inferSchema', 'true')
    .csv('datasets/house-prices-april6-cities.csv')
)

# The regressor below is told to read its target from 'label'.
houseDF = houseDF.withColumnRenamed('price', 'label')

# city (string) -> city_indexed (numeric category index)
indexer = StringIndexer(inputCol='city', outputCol='city_indexed')
houseDF = indexer.fit(houseDF).transform(houseDF)

# city_indexed -> city_encoded (one-hot vector)
encoder = OneHotEncoder(inputCol='city_indexed', outputCol='city_encoded')
houseDF = encoder.fit(houseDF).transform(houseDF)

# Pack the numeric columns and the one-hot vector into a single 'features' vector.
vec = VectorAssembler(
    inputCols=['area', 'age', 'room', 'city_encoded'],
    outputCol='features',
)
houseDF = vec.transform(houseDF)

# Keep only what the estimator consumes.
houseDF = houseDF.select('features', 'label')

# Column names are spelled out so the dependency on the steps above is visible.
rf = RandomForestRegressor(featuresCol='features', labelCol='label', seed=RF_SEED)
rfModel = rf.fit(houseDF)
print('finished')
print(rfModel.toDebugString)

finished
RandomForestRegressionModel: uid=RandomForestRegressor_86c631a63695, numTrees=10, numFeatures=5
  Tree 0 (weight 1.0):
    If (feature 0 <= 115.0)
     Predict: 530000.0
    Else (feature 0 > 115.0)
     Predict: 998000.0
  Tree 1 (weight 1.0):
    If (feature 1 <= 1.0)
     Predict: 998000.0
    Else (feature 1 > 1.0)
     If (feature 4 in {0.0})
      If (feature 3 in {1.0})
       If (feature 2 <= 2.5)
        Predict: 430000.0
       Else (feature 2 > 2.5)
        Predict: 500000.0
      Else (feature 3 not in {1.0})
       Predict: 530000.0
     Else (feature 4 not in {0.0})
      Predict: 530000.0
  Tree 2 (weight 1.0):
    If (feature 2 <= 3.5)
     If (feature 3 in {1.0})
      Predict: 430000.0
     Else (feature 3 not in {1.0})
      Predict: 530000.0
    Else (feature 2 > 3.5)
     Predict: 900000.0
  Tree 3 (weight 1.0):
    If (feature 0 <= 155.0)
     If (feature 3 in {1.0})
      Predict: 430000.0
     Else (feature 3 not in {1.0})
      Predict: 530000.0
    El

In [None]:
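#Dense vs. sparse vector representation of the one-hot encoded city.
#Sparse format is (size, [indices of non-zero entries], [non-zero values]).
#City      -->  dense  --> sparse
#Istanbul  -->  [1,0]  --> (2,[0],[1.0])
#Ankara    -->  [0,1]  --> (2,[1],[1.0])
#Izmir     -->  [0,0]  --> (2,[],[])   (no non-zero entries: all zeros)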

In [None]:
#Dense vector rep.: every element is stored, zeros included.
#[1,2,3,0,0,0,1,...]                     --> 4 bytes * 1000000 elements
#[123,33,0,0,0,0,0,0,0,0,0,0,0,0,...]    --> 4 bytes * 1000000 elements (even though almost all entries are 0)

In [None]:
#Sparse vector rep.: only the size, the indices of the non-zero entries and their values are stored.
#[123,33,0,9,0,0,0,0,0,0,0,0,0,0,...] --> (1000000,[0,1,3],[123,33,9])

+----+---+----+--------+------+------------+-------------+------------------------------------+--------------------+
|area|age|room|    city| label|city_indexed| city_encoded|VectorAssembler_8b8acceb0840__output|            features|
+----+---+----+--------+------+------------+-------------+------------------------------------+--------------------+
| 120|  5|   3|Istanbul|500000|         0.0|(2,[0],[1.0])|                [120.0,5.0,3.0,1....|[120.0,5.0,3.0,1....|
| 190|  0|   4|  Ankara|998000|         1.0|(2,[1],[1.0])|                [190.0,0.0,4.0,0....|[190.0,0.0,4.0,0....|
|  90|  4|   2|Istanbul|430000|         0.0|(2,[0],[1.0])|                [90.0,4.0,2.0,1.0...|[90.0,4.0,2.0,1.0...|
| 110|  2|   2|   Izmir|530000|         2.0|    (2,[],[])|                [110.0,2.0,2.0,0....|[110.0,2.0,2.0,0....|
| 120|  4|   3|  Ankara|530000|         1.0|(2,[1],[1.0])|                [120.0,4.0,3.0,0....|[120.0,4.0,3.0,0....|
| 200| 10|   4|Istanbul|900000|         0.0|(2,[0],[1.0])|      