In [112]:
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler, VectorIndexer
from pyspark.ml.regression import LinearRegression, RandomForestRegressor
from pyspark.mllib.evaluation import RegressionMetrics

In [80]:
# Load the CSV (first row is the header) and let Spark infer each column's type.
data = spark.read.load('obesity_dummy.csv', format='csv', header=True, inferSchema=True)

In [81]:
# List the column names of the loaded DataFrame; the model inputs below are
# chosen from these names.
data.columns

['_c0',
 'age',
 'height_rounded',
 'weight_rounded',
 'bmi',
 'veges_freq_Always',
 'veges_freq_Never',
 'veges_freq_Sometimes',
 'main_meals_num_More than 3',
 'main_meals_num_One/Two',
 'main_meals_num_Three',
 'daily_water_consumption_1-2L',
 'daily_water_consumption_<1L',
 'daily_water_consumption_>2L',
 'physical_activity_freq_0 days',
 'physical_activity_freq_1-2 days',
 'physical_activity_freq_2-4 days',
 'physical_activity_freq_4-5 days',
 'tech_devices_usage_0-2h',
 'tech_devices_usage_3-5h',
 'tech_devices_usage_>5h',
 'high_kcal_food_no',
 'high_kcal_food_yes',
 'transport_used_Automobile',
 'transport_used_Bike',
 'transport_used_Motorbike',
 'transport_used_Public_Transportation',
 'transport_used_Walking',
 'snacks_consuming_Always',
 'snacks_consuming_Frequently',
 'snacks_consuming_Sometimes',
 'snacks_consuming_no',
 'smoking_no',
 'smoking_yes',
 'kcal_monitoring_no',
 'kcal_monitoring_yes',
 'alcohol_consumption_Always',
 'alcohol_consumption_Frequently',
 'alcohol_

In [82]:
# Input columns packed into the model's feature vector, in vector-slot order.
# '_c0', 'height_rounded', 'weight_rounded' and the target 'bmi' are left out.
FEATURE_COLUMNS = [
    'age',
    'veges_freq_Always',
    'veges_freq_Never',
    'veges_freq_Sometimes',
    'main_meals_num_More than 3',
    'main_meals_num_One/Two',
    'main_meals_num_Three',
    'daily_water_consumption_1-2L',
    'daily_water_consumption_<1L',
    'daily_water_consumption_>2L',
    'physical_activity_freq_0 days',
    'physical_activity_freq_1-2 days',
    'physical_activity_freq_2-4 days',
    'physical_activity_freq_4-5 days',
    'tech_devices_usage_0-2h',
    'tech_devices_usage_3-5h',
    'tech_devices_usage_>5h',
    'high_kcal_food_no',
    'high_kcal_food_yes',
    'transport_used_Automobile',
    'transport_used_Bike',
    'transport_used_Motorbike',
    'transport_used_Public_Transportation',
    'transport_used_Walking',
    'snacks_consuming_Always',
    'snacks_consuming_Frequently',
    'snacks_consuming_Sometimes',
    'snacks_consuming_no',
    'smoking_no',
    'smoking_yes',
    'kcal_monitoring_no',
    'kcal_monitoring_yes',
    'alcohol_consumption_Always',
    'alcohol_consumption_Frequently',
    'alcohol_consumption_Sometimes',
    'alcohol_consumption_no',
    'gender_Female',
    'gender_Male',
]

# Fail here, with a readable message, if the CSV schema and the list above
# have drifted apart, rather than later inside Spark's transform.
missing_columns = [name for name in FEATURE_COLUMNS if name not in data.columns]
assert not missing_columns, f"missing expected columns: {missing_columns}"

# Combines the columns above into a single vector column named 'features'.
featureAssembler = VectorAssembler(inputCols=FEATURE_COLUMNS, outputCol='features')

In [83]:
# Append the assembled 'features' vector column to the loaded rows.
output = featureAssembler.transform(dataset=data)

In [84]:
# Preview the assembled vectors (first 20 rows, long cells truncated).
output.select("features").show(n=20, truncate=True)

+--------------------+
|            features|
+--------------------+
|(38,[0,3,6,7,10,1...|
|(38,[0,1,6,9,12,1...|
|(38,[0,3,6,7,13,1...|
|(38,[0,1,6,7,13,1...|
|(38,[0,3,5,7,10,1...|
|(38,[0,3,6,7,10,1...|
|(38,[0,1,6,7,11,1...|
|(38,[0,3,6,7,12,1...|
|(38,[0,1,6,7,11,1...|
|(38,[0,3,6,7,11,1...|
|(38,[0,1,6,9,13,1...|
|(38,[0,3,6,7,13,1...|
|(38,[0,1,6,9,13,1...|
|(38,[0,3,6,7,13,1...|
|(38,[0,1,5,8,11,1...|
|(38,[0,1,6,7,13,1...|
|(38,[0,3,5,8,11,1...|
|(38,[0,3,5,7,10,1...|
|(38,[0,1,4,8,10,1...|
|(38,[0,3,5,7,10,1...|
+--------------------+
only showing top 20 rows



In [85]:
# Keep only what the regressors need: the feature vector and the 'bmi' label.
final_data = output.select(['features', 'bmi'])
final_data.show(n=20, truncate=True)

+--------------------+------------------+
|            features|               bmi|
+--------------------+------------------+
|(38,[0,3,6,7,10,1...|19.753086419753085|
|(38,[0,1,6,9,12,1...| 18.42105263157895|
|(38,[0,3,6,7,13,1...| 21.38888888888889|
|(38,[0,1,6,7,13,1...|24.166666666666664|
|(38,[0,3,5,7,10,1...|25.224719101123597|
|(38,[0,3,6,7,10,1...|16.358024691358022|
|(38,[0,1,6,7,11,1...| 18.33333333333333|
|(38,[0,3,6,7,12,1...|16.158536585365855|
|(38,[0,1,6,7,11,1...| 17.97752808988764|
|(38,[0,3,6,7,11,1...|19.767441860465123|
|(38,[0,1,6,9,13,1...|28.378378378378372|
|(38,[0,3,6,7,13,1...|23.255813953488367|
|(38,[0,1,6,9,13,1...| 16.96969696969697|
|(38,[0,3,6,7,13,1...|              27.5|
|(38,[0,1,5,8,11,1...|16.949152542372882|
|(38,[0,1,6,7,13,1...|19.411764705882355|
|(38,[0,3,5,8,11,1...|26.424870466321245|
|(38,[0,3,5,7,10,1...|25.490196078431374|
|(38,[0,1,4,8,10,1...|23.976608187134506|
|(38,[0,3,5,7,10,1...| 21.21212121212121|
+--------------------+------------

In [86]:
# Hold out roughly 20% of the rows for evaluation. The explicit seed pins the
# random split, so the train/test partition (and every metric computed from it)
# is the same on each Restart & Run All instead of changing per execution.
train_data, test_data = final_data.randomSplit([0.8, 0.2], seed=42)

In [87]:
# Configure the linear-regression estimator, then fit it on the training split.
# `regressor` is bound to the fitted model, which the following cells inspect.
linear_estimator = LinearRegression(labelCol='bmi', featuresCol='features')
regressor = linear_estimator.fit(train_data)

In [88]:
# Fitted weights of the linear model, one per slot of the 'features' vector.
regressor.coefficients

DenseVector([0.31, 1.738, -1.3919, -1.4819, -2.5566, -0.858, 1.4872, -0.0859, -1.3125, 1.3937, 0.3895, 0.1536, -1.9568, -0.1296, -0.2269, 0.705, -1.1645, -1.645, 1.645, -1.7624, -2.427, 1.1122, 1.6406, -0.2302, -0.6924, -3.7609, 3.0344, -0.869, -0.3011, 0.3011, 1.508, -1.508, 4.9517, -0.6655, 0.7581, -0.7106, -0.8507, 0.8507])

In [89]:
# Constant (bias) term of the fitted linear model.
regressor.intercept

11.380997499904256

In [90]:
# Score the fitted linear model on the held-out rows; the returned summary
# exposes the per-row predictions used in the next cells.
pred_results = regressor.evaluate(dataset=test_data)

In [91]:
# Compare actual 'bmi' with the linear model's prediction (first 20 test rows).
pred_results.predictions.show(n=20, truncate=True)

+--------------------+------------------+------------------+
|            features|               bmi|        prediction|
+--------------------+------------------+------------------+
|(38,[0,1,4,7,10,1...|14.792899408284025| 20.65955818778487|
|(38,[0,1,4,7,10,1...|17.714285714285715| 20.09183465401273|
|(38,[0,1,4,7,10,1...|15.121301775147927|17.561344262091495|
|(38,[0,1,4,7,13,1...|22.279792746113987|15.948023202171772|
|(38,[0,1,4,7,13,1...|16.095628415300546|19.963945544878637|
|(38,[0,1,4,7,13,1...|16.216216216216214|19.963945544878637|
|(38,[0,1,4,7,13,1...|16.304347826086953|19.963945544878637|
|(38,[0,1,4,7,13,1...|16.670391061452513|19.963945544878637|
|(38,[0,1,4,7,13,1...|23.795580110497237|25.455690463058634|
|(38,[0,1,4,7,13,1...|14.892215568862277|14.600872339231124|
|(38,[0,1,4,7,13,1...|16.016304347826086| 20.89584381765748|
|(38,[0,1,4,7,13,1...|14.267241379310345|24.686231904006796|
|(38,[0,1,4,8,10,1...|23.976608187134506|15.082210617755473|
|(38,[0,1,4,8,13,1...| 2

In [109]:
# Mean absolute error of the linear model on the test split.
evaluator = RegressionEvaluator(metricName="mae", labelCol="bmi", predictionCol="prediction")
mae = evaluator.evaluate(dataset=pred_results.predictions)
mae

4.730916683657211

In [95]:
# Fit a random-forest regressor on the training split, predicting 'bmi' from
# the assembled 'features' vector. The explicit seed makes the forest's
# randomized training repeatable, so re-running the notebook yields the same
# model and the same error metric.
rf = RandomForestRegressor(featuresCol="features", labelCol='bmi', seed=42)
model = rf.fit(train_data)

In [98]:
# Apply the fitted random forest to the held-out rows; adds a 'prediction' column.
predictions = model.transform(dataset=test_data)

In [99]:
# Compare actual 'bmi' with the random forest's prediction (first 20 test rows).
predictions.show(n=20, truncate=True)

+--------------------+------------------+------------------+
|            features|               bmi|        prediction|
+--------------------+------------------+------------------+
|(38,[0,1,4,7,10,1...|14.792899408284025|15.849775807434076|
|(38,[0,1,4,7,10,1...|17.714285714285715|19.970701413526378|
|(38,[0,1,4,7,10,1...|15.121301775147927|15.577258864280656|
|(38,[0,1,4,7,13,1...|22.279792746113987|20.831729926846144|
|(38,[0,1,4,7,13,1...|16.095628415300546|18.183141695699895|
|(38,[0,1,4,7,13,1...|16.216216216216214|18.183141695699895|
|(38,[0,1,4,7,13,1...|16.304347826086953|18.183141695699895|
|(38,[0,1,4,7,13,1...|16.670391061452513|18.183141695699895|
|(38,[0,1,4,7,13,1...|23.795580110497237|23.371598391284408|
|(38,[0,1,4,7,13,1...|14.892215568862277| 15.92640199387299|
|(38,[0,1,4,7,13,1...|16.016304347826086|18.487388216018253|
|(38,[0,1,4,7,13,1...|14.267241379310345|23.230104490979937|
|(38,[0,1,4,8,10,1...|23.976608187134506|17.837775767925613|
|(38,[0,1,4,8,13,1...| 2

In [105]:
# Mean absolute error of the random forest on the test split.
evaluator = RegressionEvaluator(metricName="mae", labelCol="bmi", predictionCol="prediction")
mae = evaluator.evaluate(dataset=predictions)
mae

4.005118884415921

4.005118884415921