In [2]:
import pyspark
from pyspark.sql import SparkSession
from pyspark import SparkContext

# Spark Configuration
#
# Build (or reuse) a SparkSession running on a local master, with the MongoDB
# Spark connector package requested and the input/output URIs pointing at a
# MongoDB instance on 127.0.0.1:27017.
spark = (
    SparkSession.builder
    .master('local')
    .config('spark.mongodb.input.uri', 'mongodb://127.0.0.1:27017/propertify')
    .config('spark.mongodb.output.uri', 'mongodb://127.0.0.1:27017/propertify')
    .config('spark.jars.packages', 'org.mongodb.spark:mongo-spark-connector_2.11:2.2.1')
    .getOrCreate()
)

# Reuse the context owned by the session. SparkContext.getOrCreate() accepts an
# optional SparkConf, not a master string, so the former
# SparkContext.getOrCreate("local") call only worked because a context already
# existed and the argument was ignored.
sc = spark.sparkContext

# Set the JVM default locale to en-US.
# NOTE(review): presumably so locale-sensitive Spark stages behave the same on
# every machine — confirm.
locale = spark._jvm.java.util.Locale
locale.setDefault(locale.forLanguageTag("en-US"))

# Load the property listings from MongoDB into a DataFrame.
# NOTE(review): the read options name database "finalproject" while the URIs
# above name "propertify" — confirm which database is intended.
property_df = (
    spark.read
    .format("com.mongodb.spark.sql.DefaultSource")
    .option("database", "finalproject")
    .option("collection", "property_db")
    .load()
)


In [3]:
# Data
# Inspect the loaded collection: print its schema, show the first five rows,
# and report the total number of rows (the last expression is the cell output).

property_df.printSchema()
property_df.show(5)
property_df.count()

root
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)
 |-- name: string (nullable = true)
 |-- source: string (nullable = true)
 |-- text: string (nullable = true)
 |-- url: string (nullable = true)

+--------------------+--------------------+--------------+--------------------+--------------------+
|                 _id|                name|        source|                text|                 url|
+--------------------+--------------------+--------------+--------------------+--------------------+
|[5f24c120d6a00603...|Kristal Garden Re...|Dotproperty.id|"KRISTAL GARDEN R...|https://www.dotpr...|
|[5f24c120d6a00603...|Rumah Minimalis K...|Dotproperty.id|Rumah Minimalis K...|https://www.dotpr...|
|[5f24c120d6a00603...|Ready Stock Rumah...|Dotproperty.id|Ready Stock Rumah...|https://www.dotpr...|
|[5f24c120d6a00603...|KPR Subsidi Progr...|Dotproperty.id|KPR Subsidi Progr...|https://www.dotpr...|
|[5f24c120d6a00603...|Dijual Rumah Mini...|Dotproperty.id|Dijual Ru

95

In [4]:
from pyspark.ml.feature import RegexTokenizer, CountVectorizer
from pyspark.ml.feature import StopWordsRemover, VectorAssembler
from pyspark.ml.feature import Word2Vec, Word2VecModel
from pyspark.ml.feature import IDF, HashingTF

from pyspark.ml import Pipeline, PipelineModel


In [5]:
# Text Processing

# Tokenize the 'text' column into 'token'. With gaps=False the pattern describes
# the tokens themselves (runs of word characters) rather than the separators.
# The pattern is a raw string so that "\w" is not treated as an (invalid)
# Python string escape sequence.
regexTokenizer = RegexTokenizer(gaps = False, pattern = r'\w+', inputCol = 'text', outputCol = 'token')

# Drop stop words from 'token' into 'nostopwrd'.
# NOTE(review): no stop-word list is passed, so the remover's default list is
# used; the listing text shown in this notebook looks Indonesian — confirm the
# default list is appropriate.
stopWordsRemover = StopWordsRemover(inputCol = 'token', outputCol = 'nostopwrd')

In [6]:
# Alternative term-frequency stage (not used): HashingTF(inputCol="nostopwrd", outputCol="rawFeature")
# Term-count vectors are built from the stop-word-filtered tokens ('nostopwrd' ->
# 'rawFeature'), then the IDF stage turns them into 'idf_vec'.
countVectorizer = CountVectorizer(inputCol="nostopwrd", outputCol="rawFeature")
iDF = IDF(inputCol="rawFeature", outputCol="idf_vec")

In [7]:
# Vector data pipeline
#
# Chain tokenizer -> stop-word remover -> count vectorizer -> IDF, fit the chain
# on the property listings, and apply the fitted model to the same DataFrame.
pipeline = Pipeline(
    stages=[
        regexTokenizer,
        stopWordsRemover,
        countVectorizer,
        iDF,
    ]
)
pipeline_mdl = pipeline.fit(property_df)
property_trf_df = pipeline_mdl.transform(property_df)

# Bring every (_id, idf_vec) pair back to the driver as a plain Python list.
all_property_vecs = (
    property_trf_df
    .select('_id', 'idf_vec')
    .rdd
    .map(lambda row: (row[0], row[1]))
    .collect()
)


In [8]:
# Print the schema of the transformed DataFrame and preview up to 20 rows of the
# original columns next to the generated feature columns.
property_trf_df.printSchema()
property_trf_df.select('_id', 'name', 'text', 'url', 'rawFeature', 'nostopwrd', 'idf_vec').show(20)

root
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)
 |-- name: string (nullable = true)
 |-- source: string (nullable = true)
 |-- text: string (nullable = true)
 |-- url: string (nullable = true)
 |-- token: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- nostopwrd: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- rawFeature: vector (nullable = true)
 |-- idf_vec: vector (nullable = true)

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                 _id|                name|                text|                 url|          rawFeature|           nostopwrd|             idf_vec|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|[5f24c120d6a00603...|Kristal Garden Re...|"KRISTAL GARDEN R...|https://ww

In [9]:
# Collect the (_id, idf_vec) pairs of every transformed listing to the driver.
# NOTE(review): the original inline note on this statement read
# "change Word2Vec" — presumably a reminder about switching the vector column;
# confirm.
all_property_vecs = (
    property_trf_df
    .select('_id', 'idf_vec')
    .rdd
    .map(lambda row: (row[0], row[1]))
    .collect()
)

In [10]:
import numpy as np

def cosine_sim(vec1, vec2):
    """Return the cosine similarity of two equal-length numeric vectors.

    Args:
        vec1: A sequence, NumPy array, or vector object exposing ``toArray()``.
        vec2: Same kinds of input as ``vec1``; must have the same length.

    Returns:
        float: ``dot(vec1, vec2) / |vec1| / |vec2|``. When either vector has
        zero norm (all zeros or empty) the ratio is undefined; ``0.0`` is
        returned instead of NaN so that scores stay comparable and sortable.
    """
    # Convert each input once; objects offering toArray() are converted through
    # it, anything else goes through np.asarray.
    a = vec1.toArray() if hasattr(vec1, "toArray") else np.asarray(vec1, dtype=float)
    b = vec2.toArray() if hasattr(vec2, "toArray") else np.asarray(vec2, dtype=float)

    norm_a = np.sqrt(np.dot(a, a))
    norm_b = np.sqrt(np.dot(b, b))
    # Guard against 0/0, which would otherwise yield NaN plus a RuntimeWarning.
    if norm_a == 0.0 or norm_b == 0.0:
        return 0.0

    # Same division order as before, so non-degenerate results are unchanged.
    return float(np.dot(a, b) / norm_a / norm_b)

In [11]:
from pyspark.sql.functions import col, isnan
from pyspark.ml.evaluation import RegressionEvaluator

def get_property_details(in_property):
    """Attach each scored property's name and sort by score, highest first.

    Inner-joins ``in_property`` (aliased "a") with the module-level
    ``property_df`` (aliased "b") on ``a.property_id == b._id``, keeps every
    column of ``in_property`` plus ``b.name``, and orders the result by
    ``a.score`` in descending order.
    """
    scored = in_property.alias("a")
    listings = property_df.alias("b")

    join_condition = col("a.property_id") == col("b._id")
    wanted_columns = [col('a.' + column_name) for column_name in scored.columns]
    wanted_columns.append(col('b.name'))

    joined = scored.join(listings, join_condition, 'inner')
    return joined.select(wanted_columns).orderBy("a.score", ascending=False)


In [12]:
# def get_keywords_recomendations(key_words, sim_bus_limit=20):
#     input_words_df = sc.parallelize([(0, key_words)]).toDF(['_id', 'text'])
#     input_words_df = pipeline_mdl.transform(input_words_df)
#     input_key_words_vec = input_words_df.select('idf_vec').collect()[0][0]
#     sim_property_byword_rdd = sc.parallelize((i[0], float(cosine_sim(input_key_words_vec, i[1]))) for i in all_property_vecs)
#     property_rdd = sim_property_byword_rdd.sortBy(lambda a: -a[1]).collect()
#     sim_property_byword_df = spark.createDataFrame(property_rdd) \
#          .withColumnRenamed('_1', 'property_id') \
#          .withColumnRenamed('_2', 'score')\
#          .orderBy("score", ascending=False)
#     result = sim_property_byword_df.filter(
#         (col('score')>0) & (~isnan('score'))
#     ).limit(sim_bus_limit)
#     return get_property_details(result)


In [13]:
# key_words = 'Rumah Minimalis Akses Mudah Subsidi Dekat Jakarta'

# keywords_recom_df = get_keywords_recomendations(key_words, 20)
# keywords_recom_df.show(truncate=False)

In [14]:
# def get_keywords_recom(key_words):
#     input_words_df = sc.parallelize([(0, key_words)]).toDF(['_id', 'text'])
#     input_words_df = pipeline_mdl.transform(input_words_df)
#     input_key_words_vec = input_words_df.select('idf_vec').collect()[0][0]
#     sim_property_byword_rdd = sc.parallelize((i[0], float(cosine_sim(input_key_words_vec, i[1]))) for i in all_property_vecs)
#     property_rdd = sim_property_byword_rdd.sortBy(lambda a: -a[1]).collect()
#     sim_property_byword_df = spark.createDataFrame(property_rdd) \
#          .withColumnRenamed('_1', 'property_id') \
#          .withColumnRenamed('_2', 'score')\
#          .orderBy("score", ascending=False)
#     result = sim_property_byword_df.filter(
#         (col('score')>0.01) & (col('score')<1.1) & (~isnan('score'))
#     )
#     return get_property_details(result)


In [39]:
# Free-text query that is scored against every property listing.
key_words = 'Rumah Minimalis KPR BTN Bersubsidi Sudah SHM Siap Huni di TangerangRumah KPR Bersubsidi Rasa Komersil, Sudah SHMKeunggulan Perumahan SURYAJAYA :* Perumahanan dibangun diatas lahan 200ha dan sudah dihuni oleh lebih 4.000KK* Fasilitas lengkap di dalam dan disekitar perumahan* Bangunan premium dengan desain rumah modern * DP  angsuran terjangkau * Lokasi strategis dekat ke stasiun /pintu toll/terminal dan juga dikelilingi kawasan kota mandiri* Akses transportasi 24jamAngsuran KPR per bulan :  + 20th = Rp. 1.000.000,- + 15th = Rp. 1.200.000,- + 10th = Rp. 1.600.000,-Lokasi strategis : + 10menit ke stasiun Tigaraksa + 15menit ke pintu toll/terminal + 30menit ke Citra Raya CikupaSegera hubungi : ( telp  whatsapp )0856 8573 888Info akun untuk SuryajayaJangan Lupa Follow IG : rumahsubsidibalarajaSubscribe Youtube : Rumah Subsidi BalarajaFacebook Fanpage : rumahsubsidibalarajaJangan Lupa Kunjungi Website Kami di www.suryaland.idRumah minimalis, desain rumah, desain rumah minimalis, jual rumah, rumah dijual, rumah sederhana, rumahdijual, rumah siap huni tangerang, rumah dijual murah, rumah subsidi kpr, rumah subsidi tangerang, rumah subsidi pemerintah, dp rumah subsidi, rumah murah btn, rumah murah tangerang, rumah kpr btn, kpr btn subsidi, rumah subsidi tangerang kota, rumah murah bank btn, dp rumah subsidi tangerang, annieland, bukit surya, suryajaya, rumah murah, rumah murah jabodetabek, rumah subsidi di jakarta, jual rumah,'

# Wrap the query in a one-row DataFrame and push it through the fitted pipeline
# so it receives the same columns as the listings; keep its 'idf_vec' value.
input_words_df = sc.parallelize([(0, key_words)]).toDF(['_id', 'text'])
input_words_df = pipeline_mdl.transform(input_words_df)
input_key_words_vec = input_words_df.select('idf_vec').first()[0]

# Score every listing vector against the query vector.
sim_property_byword_rdd = sc.parallelize(
    (prop_id, float(cosine_sim(input_key_words_vec, prop_vec)))
    for prop_id, prop_vec in all_property_vecs
)

# Despite its name, property_rdd is a list: collect() returns the sorted
# (id, score) pairs to the driver. The DataFrame below is ordered by score
# again, so that final orderBy determines the resulting row order.
property_rdd = sim_property_byword_rdd.sortBy(lambda pair: -pair[1]).collect()

sim_property_byword_df = (
    spark.createDataFrame(property_rdd)
    .withColumnRenamed('_1', 'property_id')
    .withColumnRenamed('_2', 'score')
    .orderBy("score", ascending=False)
)


In [46]:
# Display the transformed query DataFrame (up to 1000 rows).
input_words_df.show(1000)

+---+--------------------+--------------------+--------------------+--------------------+--------------------+
|_id|                text|               token|           nostopwrd|          rawFeature|             idf_vec|
+---+--------------------+--------------------+--------------------+--------------------+--------------------+
|  0|Rumah Minimalis K...|[rumah, minimalis...|[rumah, minimalis...|(1838,[0,1,2,3,4,...|(1838,[0,1,2,3,4,...|
+---+--------------------+--------------------+--------------------+--------------------+--------------------+



In [47]:
# Number of listings whose score is above 0.1, below 1.1, and not NaN.
relevan_data = (
    sim_property_byword_df
    .where((col('score') > 0.1) & (col('score') < 1.1) & (~isnan('score')))
    .count()
)
relevan_data



25

In [48]:
# Number of listings whose score is above 0.01, below 1.1, and not NaN.
retrieve_data = (
    sim_property_byword_df
    .where((col('score') > 0.01) & (col('score') < 1.1) & (~isnan('score')))
    .count()
)
retrieve_data


89

In [45]:
# Ratio of the 0.1-threshold count to the sum of both counts.
# NOTE(review): every row counted in relevan_data (score > 0.1) also satisfies
# the retrieve_data filter (score > 0.01), so the denominator counts those rows
# twice. If precision (relevant / retrieved) is what is intended, this should
# presumably be relevan_data / retrieve_data — confirm.
relevan_data/(relevan_data+retrieve_data)

0.21929824561403508

In [31]:
# def get_test_rmse(key_words, sim_bus_limit=20):
#     input_words_df = sc.parallelize([(0, key_words)]).toDF(['_id', 'text'])
#     input_words_df = pipeline_mdl.transform(input_words_df)
#     input_key_words_vec = input_words_df.select('idf_vec').collect()[0][0]
#     sim_property_byword_rdd = sc.parallelize((i[0], 1.0, float(cosine_sim(input_key_words_vec, i[1]))) for i in all_property_vecs)
#     property_rdd = sim_property_byword_rdd.sortBy(lambda a: -a[1]).collect()
#     sim_property_byword_df = spark.createDataFrame(property_rdd) \
#          .withColumnRenamed('_1', 'property_id') \
#          .withColumnRenamed('_2', 'test_value') \
#          .withColumnRenamed('_3', 'score')\
#          .orderBy("score", ascending=False)
#     result = sim_property_byword_df.filter(
#         (col('score')>0) & (~isnan('score'))
#     ).limit(sim_bus_limit)
#     return get_property_details(result)


In [30]:
# keyword_1 = "Rumah Minimalis Akses Mudah Subsidi Dekat Jakarta Booking Fee 1,5 JtRumah KPR Bersubsidi Rasa Komersil, Sudah SHMKeunggulan Perumahan SURYAJAYA :* Perumahanan dibangun diatas lahan 200ha dan sudah dihuni oleh lebih 4.000KK* Fasilitas lengkap di dalam dan disekitar perumahan* Bangunan premium dengan desain rumah modern * DP  angsuran terjangkau * Lokasi strategis dekat ke stasiun /pintu toll/terminal dan juga dikelilingi kawasan kota mandiri* Akses transportasi 24jamAngsuran KPR per bulan :  + 20th = Rp. 1.000.000,- + 15th = Rp. 1.200.000,- + 10th = Rp. 1.600.000,-Lokasi strategis : + 10menit ke stasiun Tigaraksa + 15menit ke pintu toll/terminal + 30menit ke Citra Raya CikupaSegera hubungi : ( telp  whatsapp )0856 8573 888Info akun untuk SuryajayaJangan Lupa Follow IG : rumahsubsidibalarajaSubscribe Youtube : Rumah Subsidi BalarajaFacebook Fanpage : rumahsubsidibalarajaJangan Lupa Kunjungi Website Kami di www.suryaland.idRumah minimalis, desain rumah, desain rumah minimalis, jual rumah, rumah dijual, rumah sederhana, rumahdijual, rumah siap huni tangerang, rumah dijual murah, rumah subsidi kpr, rumah subsidi tangerang, rumah subsidi pemerintah, dp rumah subsidi, rumah murah btn, rumah murah tangerang, rumah kpr btn, kpr btn subsidi, rumah subsidi tangerang kota, rumah murah bank btn, dp rumah subsidi tangerang, annieland, bukit surya, suryajaya, rumah murah, rumah murah jabodetabek, rumah subsidi di jakarta, jual rumah,"

# keywords_recom_df = get_test_rmse(keyword_1, 10)
# keywords_recom_df.show()
# evaluator = RegressionEvaluator(labelCol="test_value", predictionCol="score", metricName="rmse")
# rmse = evaluator.evaluate(keywords_recom_df)
# print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)


In [33]:
# keyword_2 = 'Rumah dekat stasiun tangerang'

# keywords_recom_df = get_test_rmse(keyword_2, 10)
# keywords_recom_df.show(truncate=False)
# evaluator = RegressionEvaluator(labelCol="test_value", predictionCol="score", metricName="rmse")
# rmse = evaluator.evaluate(keywords_recom_df)
# print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)


In [32]:
# keyword_2 = 'Makanan enak dari padang dan vietnam'

# keywords_recom_df = get_test_rmse(keyword_2, 10)
# keywords_recom_df.show()
# evaluator = RegressionEvaluator(labelCol="test_value", predictionCol="score", metricName="rmse")
# rmse = evaluator.evaluate(keywords_recom_df)
# print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)
