In [3]:
#Tree-based model
def housePrice(area,age,room):
    """Hand-written, tree-shaped price rule.

    Splits first on ``area`` (threshold 120) and then on ``room``
    (threshold 3); each leaf is a hard-coded price.

    Args:
        area: Value compared against 120.
        age: Not used by this rule; kept so the signature lists all three
            features.
        room: Value compared against 3.

    Returns:
        One of 550000, 600000, 650000 or 800000.
    """
    if area > 120:
        return 650000 if room < 3 else 800000
    return 550000 if room < 3 else 600000

# The rule takes exactly three features. Passing five positional arguments
# (as this call previously did) raises TypeError.
print(housePrice(110,5,4))

600000


In [5]:
#Linear model
def housePrice2(area,age,room):
    """Hand-written linear price rule.

    Args:
        area: Multiplied by 1000.
        age: Multiplied by -20000.
        room: Multiplied by 100000.

    Returns:
        ``1000*area - 20000*age + 100000*room``.
    """
    return 1000*area + -20000*age + 100000*room

# The rule takes exactly three features. Passing five positional arguments
# (as this call previously did) raises TypeError.
print(housePrice2(110,5,4))

410000


In [40]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import DecisionTreeRegressor

# Reuse the active Spark session, or start one if none exists yet.
spark = SparkSession.builder.getOrCreate()

# Semicolon-delimited CSV with a header row; column types are inferred.
# The target column `price` is renamed to `label`.
houseDF = (
    spark.read
    .option('header', 'true')
    .option('delimiter', ';')
    .option('inferSchema', 'true')
    .csv('datasets/house-prices-april6.csv')
    .withColumnRenamed('price', 'label')
)

# Pack the three numeric columns into a single `features` vector column.
# The order here fixes the indices: feature 0 = area, 1 = age, 2 = room.
vec = VectorAssembler(
    inputCols=['area', 'age', 'room'],
    outputCol='features',
)
houseDF = vec.transform(houseDF)
houseDF.show()

# `features` / `label` are the regressor's default column names, spelled out here.
# NOTE(review): `model` is rebound by the LinearRegression cell further down;
# cells that need the tree model depend on this cell having run last.
dtRegressor = DecisionTreeRegressor(featuresCol='features', labelCol='label')
model = dtRegressor.fit(houseDF)
print('Machine learned')

+----+---+----+------+----------------+
|area|age|room| label|        features|
+----+---+----+------+----------------+
| 120|  5|   3|500000| [120.0,5.0,3.0]|
| 190|  0|   4|998000| [190.0,0.0,4.0]|
|  90|  4|   2|430000|  [90.0,4.0,2.0]|
| 110|  2|   2|530000| [110.0,2.0,2.0]|
| 120|  4|   3|530000| [120.0,4.0,3.0]|
| 200| 10|   4|900000|[200.0,10.0,4.0]|
+----+---+----+------+----------------+

Machine learned


In [34]:
# Print the fitted tree as nested If/Else rules. Features are referred to by
# index, following the VectorAssembler inputCols order:
# feature 0 = area, feature 1 = age, feature 2 = room.
# NOTE(review): `model` is also assigned by the LinearRegression cell later in the
# notebook; this cell needs the decision-tree cell to be the last one that set it.
print(model.toDebugString)

DecisionTreeRegressionModel: uid=DecisionTreeRegressor_f50d61f8ec7b, depth=3, numNodes=9, numFeatures=3
  If (feature 0 <= 155.0)
   If (feature 0 <= 100.0)
    Predict: 430000.0
   Else (feature 0 > 100.0)
    If (feature 1 <= 5.0)
     Predict: 530000.0
    Else (feature 1 > 5.0)
     Predict: 500000.0
  Else (feature 0 > 155.0)
   If (feature 0 <= 195.0)
    Predict: 998000.0
   Else (feature 0 > 195.0)
    Predict: 900000.0



In [35]:
def learnedTreeBasedModelFromData(area,age,room):
    """Hand transcription of the tree printed by ``model.toDebugString``.

    Every split is a plain If/Else, as in the printed tree, so each path
    ends in a ``return``. The previous version used ``elif`` with the
    complementary condition, which left fall-through paths that returned
    ``None`` when neither comparison was true (e.g. a NaN input).

    Args:
        area: Feature 0 of the printed tree; split at 155.0, then 100.0 / 195.0.
        age: Feature 1 of the printed tree; split at 5.0 when 100 < area <= 155.
        room: Not used by any split in the printed tree; kept for a uniform
            signature.

    Returns:
        The leaf prediction as a float: 430000.0, 530000.0, 500000.0,
        998000.0 or 900000.0.
    """
    if area <= 155.0:
        if area <= 100.0:
            return 430000.0
        # 100 < area <= 155: the only branch that looks at age.
        if age <= 5.0:
            return 530000.0
        return 500000.0
    # area > 155
    if area <= 195.0:
        return 998000.0
    return 900000.0

In [38]:
# Spot-check the hand-transcribed tree on a single house.
print(learnedTreeBasedModelFromData(120, 6, 3))

# Load the unlabeled houses with the same CSV options as the training file.
newhousesDF = (
    spark.read
    .option('header', 'true')
    .option('delimiter', ';')
    .option('inferSchema', 'true')
    .csv('datasets/newhouses-april6.csv')
)

# Assemble `features` with the same VectorAssembler, then let the fitted model
# append its `prediction` column.
# NOTE(review): uses whichever `vec` / `model` were assigned most recently.
newhousesDF = model.transform(vec.transform(newhousesDF))
newhousesDF.show()

500000.0
+----+---+----+----------------+----------+
|area|age|room|        features|prediction|
+----+---+----+----------------+----------+
| 180| 10|   3|[180.0,10.0,3.0]|  998000.0|
| 210|  5|   4| [210.0,5.0,4.0]|  900000.0|
|  90|  3|   2|  [90.0,3.0,2.0]|  430000.0|
| 170|  3|   3| [170.0,3.0,3.0]|  998000.0|
+----+---+----+----------------+----------+



In [26]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

# Reuse the active Spark session, or start one if none exists yet.
spark = SparkSession.builder.getOrCreate()

# Same training file and CSV options as the decision-tree cell; `price` becomes `label`.
houseDF = (
    spark.read
    .option('header', 'true')
    .option('delimiter', ';')
    .option('inferSchema', 'true')
    .csv('datasets/house-prices-april6.csv')
    .withColumnRenamed('price', 'label')
)

# Pack area / age / room (in that order) into the `features` vector column.
vec = VectorAssembler(
    inputCols=['area', 'age', 'room'],
    outputCol='features',
)
houseDF = vec.transform(houseDF)
houseDF.show()

# `features` / `label` are the regressor's default column names, spelled out here.
# NOTE(review): this rebinds `model`, replacing the decision-tree model assigned
# by the earlier cell under the same name.
linearRegressor = LinearRegression(featuresCol='features', labelCol='label')
model = linearRegressor.fit(houseDF)
print('Machine learned')

+----+---+----+------+----------------+
|area|age|room| label|        features|
+----+---+----+------+----------------+
| 120|  6|   3|500000| [120.0,6.0,3.0]|
| 190|  0|   4|998000| [190.0,0.0,4.0]|
|  90|  4|   2|430000|  [90.0,4.0,2.0]|
| 110|  2|   2|530000| [110.0,2.0,2.0]|
| 120|  4|   3|530000| [120.0,4.0,3.0]|
| 200| 10|   4|900000|[200.0,10.0,4.0]|
+----+---+----+------+----------------+

Machine learned


In [27]:
# Show the fitted linear model's parameters: one coefficient per assembled
# feature (area, age, room, in VectorAssembler order) plus the intercept.
# NOTE(review): expects `model` to be the LinearRegression fit from the cell above.
print('Coefs : ',model.coefficients)
print('Intercept : ',model.intercept)

Coefs :  [6137.628742514902,-15272.574850299521,-41764.790419158424]
Intercept :  -9563.113772454697


In [28]:
def learnedLinearModelFromData(area,age,room):
    """Hand transcription of the fitted linear model.

    Uses the printed coefficients and intercept truncated to whole numbers,
    so the result is close to, but not exactly, what ``model.transform``
    produces.

    Args:
        area: Multiplied by 6137.
        age: Multiplied by -15272.
        room: Multiplied by -41764.

    Returns:
        The weighted sum of the three features plus the intercept -9563.
    """
    area_part = 6137 * area
    age_part = -15272 * age
    room_part = -41764 * room
    return area_part + age_part + room_part + -9563

learnedLinearModelFromData(120, 6, 3)

509953

In [31]:
# Load the unlabeled houses with the same CSV options as the training file.
newhousesDF = (
    spark.read
    .option('header', 'true')
    .option('delimiter', ';')
    .option('inferSchema', 'true')
    .csv('datasets/newhouses-april6.csv')
)

# Assemble `features`, then let the fitted model append its `prediction` column.
# NOTE(review): uses whichever `vec` / `model` were assigned most recently;
# here that is expected to be the LinearRegression fit.
newhousesDF = model.transform(vec.transform(newhousesDF))
newhousesDF.show()

+----+---+----+----------------+------------------+
|area|age|room|        features|        prediction|
+----+---+----+----------------+------------------+
| 180| 10|   3|[180.0,10.0,3.0]| 817189.9401197573|
| 210|  5|   4| [210.0,5.0,4.0]|1035916.8862275435|
|  90|  3|   2|  [90.0,3.0,2.0]| 413476.1676646712|
| 170|  3|   3| [170.0,3.0,3.0]| 862721.6766467049|
+----+---+----+----------------+------------------+

