In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer,OneHotEncoder,VectorAssembler
from pyspark.ml.regression import LinearRegression

# Start (or reuse) the Spark session for this notebook.
spark = SparkSession.Builder().appName('Spark DataFrame Introduction').getOrCreate()

# Read the ';'-separated housing CSV; the first row is the header and column
# types are inferred by Spark.
rawDF = (
    spark.read
    .option('inferSchema', 'true')
    .option('header', 'true')
    .option('delimiter', ';')
    .csv('datasets/evler.csv')
)

# Map each city string in 'Sehir' to a numeric index.
# A bare index should not be fed to linear models as-is (tree-based models
# may still give acceptable results with it).
indexer = StringIndexer(inputCol='Sehir', outputCol='indexedSehir')
indexModel = indexer.fit(rawDF)
indexedDF = indexModel.transform(rawDF)

# One-hot encoding the index afterwards is the safest choice
# (for both linear and tree-based algorithms).
encoder = OneHotEncoder(inputCol='indexedSehir', outputCol='encodedSehir')
encodeModel = encoder.fit(indexedDF)
encodedDF = encodeModel.transform(indexedDF)

# LinearRegression is created with its defaults below, so the target column
# 'fiyat' is renamed to 'label' and the inputs are packed into 'features'.
labeledDF = encodedDF.withColumnRenamed('fiyat', 'label')

vec = VectorAssembler(
    inputCols=['encodedSehir', 'Oda', 'salon'],
    outputCol='features',
)
assembledDF = vec.transform(labeledDF)
assembledDF.show()

# Keep only the two columns passed to the estimator.
pointsDF = assembledDF.select('features', 'label')
pointsDF.show()

lr = LinearRegression()
model = lr.fit(pointsDF)  # this is the line where the actual learning happens
print('Bitti')
print("Çarpanlar : ", model.coefficients)
print("Sabit Sayı : ", model.intercept)

+--------+---+-----+------+------------+-------------+-----------------+
|   Sehir|Oda|salon| label|indexedSehir| encodedSehir|         features|
+--------+---+-----+------+------------+-------------+-----------------+
|   izmir|  3|    1|500000|         1.0|(2,[1],[1.0])|[0.0,1.0,3.0,1.0]|
|istanbul|  3|    1|800000|         0.0|(2,[0],[1.0])|[1.0,0.0,3.0,1.0]|
|  ankara|  3|    1|600000|         2.0|    (2,[],[])|[0.0,0.0,3.0,1.0]|
|   izmir|  2|    1|300000|         1.0|(2,[1],[1.0])|[0.0,1.0,2.0,1.0]|
|istanbul|  2|    1|450000|         0.0|(2,[0],[1.0])|[1.0,0.0,2.0,1.0]|
|istanbul|  1|    1|250000|         0.0|(2,[0],[1.0])|[1.0,0.0,1.0,1.0]|
|  ankara|  2|    1|380000|         2.0|    (2,[],[])|[0.0,0.0,2.0,1.0]|
|   izmir|  1|    1|220000|         1.0|(2,[1],[1.0])|[0.0,1.0,1.0,1.0]|
|istanbul|  1|    1|245000|         0.0|(2,[0],[1.0])|[1.0,0.0,1.0,1.0]|
+--------+---+-----+------+------------+-------------+-----------------+

+-----------------+------+
|         features| lab

In [15]:
#SparseVector  --> DenseVector
#(2,[],[])     --> [0,0]
#(2,[1],[1.0]) --> [0,1]
#(2,[0],[1.0]) --> [1,0]
#[1,2,3,4] --> 4 *  32bit
#[11,0,0,0,34,0,23,0,0,0,0,0,....]  --> 1000000 * 32bit --> 4 MB #DenseVector
#(1000000, [0,4,6],[11,34,23] )  --> 7 * 32bit #SparseVector

In [23]:
# Load the listings whose price is not known, using the same reader options
# (inferred schema, header row, ';' delimiter).
newReader = (
    spark.read
    .option('inferSchema', 'true')
    .option('header', 'true')
    .option('delimiter', ';')
)
yenilerDF = newReader.csv('datasets/evler-fiyati-bilinmeyen.csv')

# Apply the already-fitted indexer and encoder, then the assembler, in the
# same order as before. Nothing is re-fitted on the new rows.
for stage in (indexModel, encodeModel, vec):
    yenilerDF = stage.transform(yenilerDF)
yenilerDF.show()

# Run the fitted regression model on the new rows.
yenilerDF = model.transform(yenilerDF)
yenilerDF.show()

+--------+---+-----+------------+-------------+-----------------+
|   Sehir|Oda|salon|indexedSehir| encodedSehir|         features|
+--------+---+-----+------------+-------------+-----------------+
|   izmir|  3|    1|         1.0|(2,[1],[1.0])|[0.0,1.0,3.0,1.0]|
|istanbul|  3|    1|         0.0|(2,[0],[1.0])|[1.0,0.0,3.0,1.0]|
|  ankara|  3|    1|         2.0|    (2,[],[])|[0.0,0.0,3.0,1.0]|
+--------+---+-----+------------+-------------+-----------------+

+--------+---+-----+------------+-------------+-----------------+-----------------+
|   Sehir|Oda|salon|indexedSehir| encodedSehir|         features|       prediction|
+--------+---+-----+------------+-------------+-----------------+-----------------+
|   izmir|  3|    1|         1.0|(2,[1],[1.0])|[0.0,1.0,3.0,1.0]|555476.1907165312|
|istanbul|  3|    1|         0.0|(2,[0],[1.0])|[1.0,0.0,3.0,1.0]|705595.2392853183|
|  ankara|  3|    1|         2.0|    (2,[],[])|[0.0,0.0,3.0,1.0]|597738.0952982359|
+--------+---+-----+------------+