## Gradient-boosted trees regression (GBTRegressor)

In [2]:
import pyspark
import pandas as pd

import os
import sys
from pyspark.sql import SparkSession
# Set both PySpark interpreter environment variables to the interpreter that is
# running this kernel.
for _env_var in ('PYSPARK_PYTHON', 'PYSPARK_DRIVER_PYTHON'):
    os.environ[_env_var] = sys.executable

In [3]:
from pyspark.sql import SparkSession

# Configure the application name, then obtain the session from the builder.
session_builder = SparkSession.builder.appName("DataTransformations")
spark = session_builder.getOrCreate()
spark

In [4]:
# Read the CSV using its header row for column names and let Spark infer the
# column types, then preview the first five rows.
path = 'Boston_House_Prices.csv'
read_options = {'header': True, 'inferSchema': True}
df = spark.read.csv(path, **read_options)
df.show(5)

+-------+----+-----+----+-----+-----+----+------+---+-----+-------+------+-----+-----+
|   CRIM|  ZN|INDUS|CHAS|  NOX|   RM| AGE|   DIS|RAD|  TAX|PTRATIO|     B|LSTAT|PRICE|
+-------+----+-----+----+-----+-----+----+------+---+-----+-------+------+-----+-----+
|0.00632|18.0| 2.31| 0.0|0.538|6.575|65.2|  4.09|1.0|296.0|   15.3| 396.9| 4.98| 24.0|
|0.02731| 0.0| 7.07| 0.0|0.469|6.421|78.9|4.9671|2.0|242.0|   17.8| 396.9| 9.14| 21.6|
|0.02729| 0.0| 7.07| 0.0|0.469|7.185|61.1|4.9671|2.0|242.0|   17.8|392.83| 4.03| 34.7|
|0.03237| 0.0| 2.18| 0.0|0.458|6.998|45.8|6.0622|3.0|222.0|   18.7|394.63| 2.94| 33.4|
|0.06905| 0.0| 2.18| 0.0|0.458|7.147|54.2|6.0622|3.0|222.0|   18.7| 396.9| 5.33| 36.2|
+-------+----+-----+----+-----+-----+----+------+---+-----+-------+------+-----+-----+
only showing top 5 rows



In [5]:
# Dataset dimensions as a (row count, column count) pair.
row_count = df.count()
column_count = len(df.columns)
(row_count, column_count)

(506, 14)

In [6]:
# Print the column names, one per line.
column_names = df.columns
for name in column_names:
    print(name)

CRIM
ZN
INDUS
CHAS
NOX
RM
AGE
DIS
RAD
TAX
PTRATIO
B
LSTAT
PRICE


In [7]:
# Missing-value check

from pyspark.sql import functions as F

# isNull() alone does not flag NaN, which Spark stores as an ordinary
# floating-point value, so float/double columns are additionally tested with
# isnan(). Non-floating columns keep the plain NULL test.
floating_cols = {name for name, dtype in df.dtypes if dtype in ("float", "double")}


def _is_missing(column_name):
    """Return a boolean Column that is true where `column_name` is missing.

    "Missing" means NULL for every column, and additionally NaN for columns
    whose Spark type is float or double.
    """
    condition = F.col(column_name).isNull()
    if column_name in floating_cols:
        condition = condition | F.isnan(F.col(column_name))
    return condition


# Count the missing entries in each column; the result is a single-row frame.
df.select(
    [F.count(F.when(_is_missing(c), c)).alias(c) for c in df.columns]
).show()

+----+---+-----+----+---+---+---+---+---+---+-------+---+-----+-----+
|CRIM| ZN|INDUS|CHAS|NOX| RM|AGE|DIS|RAD|TAX|PTRATIO|  B|LSTAT|PRICE|
+----+---+-----+----+---+---+---+---+---+---+-------+---+-----+-----+
|   0|  0|    0|   0|  0|  0|  0|  0|  0|  0|      0|  0|    0|    0|
+----+---+-----+----+---+---+---+---+---+---+-------+---+-----+-----+



In [8]:
# Summary statistics for every column.
summary_stats = df.describe()
summary_stats.show()

+-------+------------------+------------------+------------------+------------------+-------------------+------------------+------------------+-----------------+-----------------+------------------+------------------+------------------+------------------+------------------+
|summary|              CRIM|                ZN|             INDUS|              CHAS|                NOX|                RM|               AGE|              DIS|              RAD|               TAX|           PTRATIO|                 B|             LSTAT|             PRICE|
+-------+------------------+------------------+------------------+------------------+-------------------+------------------+------------------+-----------------+-----------------+------------------+------------------+------------------+------------------+------------------+
|  count|               506|               506|               506|               506|                506|               506|               506|              506|              

In [9]:
# Take the DataFrame schema
schema = df.schema

# Bucket the column names by the string form of their data type
grouped_columns = {}
for field in schema.fields:
    type_name = str(field.dataType)
    if type_name not in grouped_columns:
        grouped_columns[type_name] = []
    grouped_columns[type_name].append(field.name)

# Print each data type followed by the comma-separated columns of that type
for data_type in grouped_columns:
    column_list = ", ".join(grouped_columns[data_type])
    print(f"{data_type}:\n{column_list}\n")


DoubleType():
CRIM, ZN, INDUS, CHAS, NOX, RM, AGE, DIS, RAD, TAX, PTRATIO, B, LSTAT, PRICE



In [10]:
# Build the `features` vector column

from pyspark.ml.feature import VectorAssembler

label_column = 'PRICE'

# `df` is rebound to the two-column (features, label) frame at the end of this
# cell, so running the cell a second time would operate on the wrong frame.
# Fail with a clear message instead of an obscure assembler error.
if label_column not in df.columns:
    raise ValueError(
        f"Column {label_column!r} not found in df; re-run the cell that loads "
        "the CSV before running this cell again."
    )

# Pick the predictors by name rather than by position, so the target can never
# end up inside the feature vector if the CSV column order changes.
x_columns = [c for c in df.columns if c != label_column]
assembler = VectorAssembler(inputCols=x_columns, outputCol='features')
output = assembler.transform(df)

# Keep only what the model needs: the assembled vector and the target,
# renamed to `label`.
df = output.select('features', output[label_column].alias('label'))
df.show()

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[0.00632,18.0,2.3...| 24.0|
|[0.02731,0.0,7.07...| 21.6|
|[0.02729,0.0,7.07...| 34.7|
|[0.03237,0.0,2.18...| 33.4|
|[0.06905,0.0,2.18...| 36.2|
|[0.02985,0.0,2.18...| 28.7|
|[0.08829,12.5,7.8...| 22.9|
|[0.14455,12.5,7.8...| 27.1|
|[0.21124,12.5,7.8...| 16.5|
|[0.17004,12.5,7.8...| 18.9|
|[0.22489,12.5,7.8...| 15.0|
|[0.11747,12.5,7.8...| 18.9|
|[0.09378,12.5,7.8...| 21.7|
|[0.62976,0.0,8.14...| 20.4|
|[0.63796,0.0,8.14...| 18.2|
|[0.62739,0.0,8.14...| 19.9|
|[1.05393,0.0,8.14...| 23.1|
|[0.7842,0.0,8.14,...| 17.5|
|[0.80271,0.0,8.14...| 20.2|
|[0.7258,0.0,8.14,...| 18.2|
+--------------------+-----+
only showing top 20 rows



In [11]:
# Split the data into train and test parts with 0.8 / 0.2 weights and a fixed seed.
split_weights = [0.8, 0.2]
train, test = df.randomSplit(split_weights, seed=42)

In [12]:
# show() prints the rows itself and returns None, so it is called directly;
# wrapping it in print() would emit an extra "None" line.
train.show(5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[0.00632,18.0,2.3...| 24.0|
|[0.00906,90.0,2.9...| 32.2|
|[0.01301,35.0,1.5...| 32.7|
|[0.01311,90.0,1.2...| 35.4|
|[0.0136,75.0,4.0,...| 18.9|
+--------------------+-----+
only showing top 5 rows

None


In [13]:
# show() prints the rows itself and returns None, so it is called directly;
# wrapping it in print() would emit an extra "None" line.
test.show(5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[0.01096,55.0,2.2...| 22.0|
|[0.01381,80.0,0.4...| 50.0|
|[0.01439,60.0,2.9...| 29.1|
|[0.01778,95.0,1.4...| 32.9|
|[0.02177,82.5,2.0...| 42.3|
+--------------------+-----+
only showing top 5 rows

None


In [15]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import GBTRegressor

# Gradient-boosted trees regressor with a fixed seed.
gbt = GBTRegressor()
gbt.setSeed(42)

# Build the parameter grid: 3 depths x 3 bin counts = 9 candidate settings
param_grid = (ParamGridBuilder()
              .addGrid(gbt.maxDepth, [5, 10, 15])
              .addGrid(gbt.maxBins, [20, 30, 40])
              .build())

# Create the cross-validation object.
# `seed` fixes the fold assignment so that model selection is repeatable
# across runs (seeding the estimator alone does not control the folds).
# The selection metric is spelled out instead of relying on the evaluator's
# default.
crossval = CrossValidator(estimator=gbt,
                          estimatorParamMaps=param_grid,
                          evaluator=RegressionEvaluator(metricName="rmse"),
                          numFolds=3,
                          seed=42)

# Train: fits every grid candidate on every fold
cv_model = crossval.fit(train)

# Select the best model found by cross-validation
model = cv_model.bestModel


In [16]:
# Importance scores, in the same order as the entries of the feature vector.
feature_Importances = list(model.featureImportances)

# Take the names from the assembler that built the vector, rather than from a
# positional slice of another frame's columns, so names and scores are
# guaranteed to line up.
feats = assembler.getInputCols()

# zip() would silently truncate on a length mismatch and mislabel the scores.
if len(feats) != len(feature_Importances):
    raise ValueError(
        f"Expected {len(feats)} feature importances, got {len(feature_Importances)}"
    )

d = dict(zip(feats, feature_Importances))
sorted_dict = dict(sorted(d.items(), key=lambda item: item[1], reverse=True))

# Print the features from most to least important
for feature, importance in sorted_dict.items():
    print(f"{feature}: {importance}")


LSTAT: 0.3633204950480282
RM: 0.20350629813268223
DIS: 0.09563236343962037
AGE: 0.077184290037895
TAX: 0.05994216127320513
CRIM: 0.04861801926322124
B: 0.038291070078874744
NOX: 0.032101111498420536
PTRATIO: 0.030945416360422414
INDUS: 0.021439798127227485
RAD: 0.021430720159297766
ZN: 0.00417554085125234
CHAS: 0.0034127157298525232


In [17]:
# Generate predictions on the training set with the selected model
train_predictions = model.transform(train)

In [18]:
from pyspark.ml.evaluation import RegressionEvaluator

print("Train seti")
# Report each metric in turn, building one evaluator per metric name.
for metric_name, line_format in (
    ("rmse", "RMSE = %g"),
    ("r2", "R Squared (R2) = %g"),
    ("mse", "MSE = %g"),
):
    evaluator = RegressionEvaluator(metricName=metric_name)
    print(line_format % evaluator.evaluate(train_predictions))

Train seti
RMSE = 1.28325
R Squared (R2) = 0.979583
MSE = 1.64674


In [19]:
# Preview the first three rows of the test split
test.show(3)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[0.01096,55.0,2.2...| 22.0|
|[0.01381,80.0,0.4...| 50.0|
|[0.01439,60.0,2.9...| 29.1|
+--------------------+-----+
only showing top 3 rows



In [20]:
# Apply the selected model to the test split and preview five rows of the result
predictions = model.transform(test)
predictions.show(5)

+--------------------+-----+------------------+
|            features|label|        prediction|
+--------------------+-----+------------------+
|[0.01096,55.0,2.2...| 22.0| 24.20113362991416|
|[0.01381,80.0,0.4...| 50.0| 47.82411261237669|
|[0.01439,60.0,2.9...| 29.1|25.871528653909127|
|[0.01778,95.0,1.4...| 32.9| 33.15614853367658|
|[0.02177,82.5,2.0...| 42.3| 47.69170633525798|
+--------------------+-----+------------------+
only showing top 5 rows



In [21]:
from pyspark.ml.evaluation import RegressionEvaluator

def print_regression_metrics(title, scored_df):
    """Print a heading followed by RMSE, R2 and MSE for a scored DataFrame.

    Args:
        title: Heading line printed before the metrics.
        scored_df: DataFrame handed to ``RegressionEvaluator.evaluate``. The
            evaluator is built with only ``metricName`` set, so its default
            label and prediction column names apply.
    """
    print(title)
    for metric_name, line_format in (
        ("rmse", "RMSE = %g"),
        ("r2", "R Squared (R2) = %g"),
        ("mse", "MSE = %g"),
    ):
        evaluator = RegressionEvaluator(metricName=metric_name)
        print(line_format % evaluator.evaluate(scored_df))


# Same report for both splits, test first, separated by a blank line.
print_regression_metrics("Test seti", predictions)
print()
print_regression_metrics("Train seti", train_predictions)

Test seti
RMSE = 2.88197
R Squared (R2) = 0.921285
MSE = 8.30575

Train seti
RMSE = 1.28325
R Squared (R2) = 0.979583
MSE = 1.64674
