In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark import SparkContext

# Spark configuration
#
# Build (or reuse) a local SparkSession that pulls in the MongoDB Spark
# connector package and points its input/output URIs at the local MongoDB.
# NOTE(review): the connector URIs name the `propertify` database, while the
# read further down passes `finalproject` as the database option — confirm
# which database is intended.
spark = (
    SparkSession.builder
    .master('local')
    .config('spark.mongodb.input.uri', 'mongodb://127.0.0.1:27017/propertify')
    .config('spark.mongodb.output.uri', 'mongodb://127.0.0.1:27017/propertify')
    .config('spark.jars.packages', 'org.mongodb.spark:mongo-spark-connector_2.11:2.2.1')
    .getOrCreate()
)

# Reuse the SparkContext owned by the session. `SparkContext.getOrCreate`
# takes an optional SparkConf, not a master string, so passing "local" to it
# was never a valid argument.
sc = spark.sparkContext

# Set the JVM default locale to en-US.
locale = spark._jvm.java.util.Locale
locale.setDefault(locale.forLanguageTag("en-US"))

# Load the `property_db` collection into a DataFrame.
property_df = (
    spark.read
    .format("com.mongodb.spark.sql.DefaultSource")
    .option("database", "finalproject")
    .option("collection", "property_db")
    .load()
)


In [2]:
# Quick look at the loaded data: schema, a five-row sample, then the row count
# (the count is the cell's displayed value).
property_df.printSchema()
property_df.show(n=5)
property_df.count()

root
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)
 |-- name: string (nullable = true)
 |-- source: string (nullable = true)
 |-- text: string (nullable = true)
 |-- url: string (nullable = true)

+--------------------+--------------------+--------------+--------------------+--------------------+
|                 _id|                name|        source|                text|                 url|
+--------------------+--------------------+--------------+--------------------+--------------------+
|[5f1eacd2fa1b0b4d...|Kristal Garden Re...|Dotproperty.id|"KRISTAL GARDEN R...|https://www.dotpr...|
|[5f1eacd2fa1b0b4d...|Ready Stock Rumah...|Dotproperty.id|Ready Stock Rumah...|https://www.dotpr...|
|[5f1eacd2fa1b0b4d...|Rumah 1 Lantai Di...|Dotproperty.id|Rumah 1 Lantai Di...|https://www.dotpr...|
|[5f1eacd2fa1b0b4d...|Rumah Bersubsidi ...|Dotproperty.id|Rumah Bersubsidi ...|https://www.dotpr...|
|[5f1eacd2fa1b0b4d...|KPR Subsidi Progr...|Dotproperty.id|KPR Subsi

103

In [3]:
from pyspark.ml.feature import RegexTokenizer, CountVectorizer
from pyspark.ml.feature import StopWordsRemover, VectorAssembler
from pyspark.ml.feature import Word2Vec, Word2VecModel
from pyspark.ml.feature import IDF, HashingTF

from pyspark.ml import Pipeline, PipelineModel


In [4]:
# Text processing
#
# Tokenize the `text` column into the `token` column. With gaps=False the
# pattern describes the tokens themselves (runs of word characters) rather
# than the separators between them. The pattern is a raw string so that `\w`
# reaches the regex engine instead of being parsed as a string escape.
regexTokenizer = RegexTokenizer(gaps=False, pattern=r'\w+', inputCol='text', outputCol='token')

# Remove stop words from `token`, writing the result to `nostopwrd`.
# NOTE(review): no stop-word list is passed, so the library default is used;
# the listings and queries in this notebook look Indonesian — confirm the
# default list is appropriate for them.
stopWordsRemover = StopWordsRemover(inputCol='token', outputCol='nostopwrd')

In [5]:
# Term-count features built from the stop-word-free tokens.
# (A HashingTF stage writing to the same "rawFeature" column was sketched here
# earlier; it is not used.)
countVectorizer = CountVectorizer(
    inputCol="nostopwrd",
    outputCol="rawFeature",
)

# IDF stage: reads the raw counts and writes the weighted vector to "idf_vec".
iDF = IDF(
    inputCol="rawFeature",
    outputCol="idf_vec",
)

In [6]:
# Vector data pipeline
#
# Stages, in order: tokenize -> remove stop words -> term counts -> IDF.
pipeline = Pipeline(stages=[regexTokenizer, stopWordsRemover, countVectorizer, iDF])

# Fit the pipeline on the listings, then transform those same listings so each
# row gains the `token`, `nostopwrd`, `rawFeature` and `idf_vec` columns.
pipeline_mdl = pipeline.fit(property_df)
property_trf_df = pipeline_mdl.transform(property_df)

# `all_property_vecs` is collected in its own cell further down, before it is
# first used; collecting it here as well ran the identical Spark job twice.


In [7]:
# Inspect the transformed frame: full schema, then the first 20 rows of the
# columns of interest.
property_trf_df.printSchema()

preview_columns = ['_id', 'name', 'text', 'url', 'rawFeature', 'nostopwrd', 'idf_vec']
property_trf_df.select(*preview_columns).show(20)

root
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)
 |-- name: string (nullable = true)
 |-- source: string (nullable = true)
 |-- text: string (nullable = true)
 |-- url: string (nullable = true)
 |-- token: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- nostopwrd: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- rawFeature: vector (nullable = true)
 |-- idf_vec: vector (nullable = true)

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                 _id|                name|                text|                 url|          rawFeature|           nostopwrd|             idf_vec|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|[5f1eacd2fa1b0b4d...|Kristal Garden Re...|"KRISTAL GARDEN R...|https://ww

In [8]:
# Bring every (property _id, idf vector) pair to the driver as a plain list so
# keyword queries can be scored against it.
# The original note on this line read "change Word2Vec" — presumably the place
# to swap the feature column if a Word2Vec stage is used instead; confirm.
id_and_vector_df = property_trf_df.select('_id', 'idf_vec')
all_property_vecs = id_and_vector_df.rdd.map(lambda row: (row[0], row[1])).collect()

In [9]:
import numpy as np

def cosine_sim(vec1, vec2):
    """Return the cosine similarity between two equal-length vectors.

    Parameters
    ----------
    vec1, vec2 : array-like
        Anything ``np.asarray`` can turn into a 1-D numeric array. Objects
        that expose a ``toArray()`` method are converted through it once,
        instead of being re-converted element by element on every ``np.dot``.

    Returns
    -------
    float
        ``dot(vec1, vec2) / |vec1| / |vec2|``. When either vector has zero
        norm the similarity is undefined and ``nan`` is returned explicitly,
        without triggering a divide-by-zero RuntimeWarning.
    """
    dense = []
    for vec in (vec1, vec2):
        to_array = getattr(vec, 'toArray', None)
        dense.append(np.asarray(to_array() if callable(to_array) else vec, dtype=float))
    a, b = dense

    norm_a = np.sqrt(np.dot(a, a))
    norm_b = np.sqrt(np.dot(b, b))
    if norm_a == 0 or norm_b == 0:
        # Undefined for a zero vector; callers filter NaN scores out.
        return float('nan')

    # Divide sequentially (not by the product of the norms) to keep the
    # floating-point result identical to the previous implementation.
    return np.dot(a, b) / norm_a / norm_b

In [10]:
from pyspark.sql.functions import col, isnan

def get_property_details(in_property):
    """Attach listing details to a frame of scored property ids.

    Parameters
    ----------
    in_property : DataFrame
        Must contain a ``property_id`` column (matched against
        ``property_df._id``) and a ``score`` column.

    Returns
    -------
    DataFrame
        Every column of ``in_property`` followed by ``name``, ``url``,
        ``text`` and ``source`` from ``property_df``, inner-joined on the id
        and ordered by ``score`` descending.
    """
    scored = in_property.alias("a")
    listings = property_df.alias("b")

    same_property = col("a.property_id") == col("b._id")

    # Keep all scored-side columns, then append the listing details.
    wanted = [col('a.' + column_name) for column_name in scored.columns]
    wanted.extend([col('b.name'), col('b.url'), col('b.text'), col('b.source')])

    joined = scored.join(listings, same_property, 'inner')
    return joined.select(wanted).orderBy("a.score", ascending=False)


In [11]:
def get_keywords_recomendations(key_words, sim_bus_limit=20):
    """Recommend the listings whose text is most similar to ``key_words``.

    The keywords are run through the fitted pipeline (``pipeline_mdl``) to get
    an ``idf_vec`` vector, every entry of ``all_property_vecs`` is scored
    against it with ``cosine_sim``, and the best matches are joined with
    their listing details.

    Parameters
    ----------
    key_words : str
        Free-text search phrase(s).
    sim_bus_limit : int, default 20
        Maximum number of recommendations to return.

    Returns
    -------
    DataFrame
        Result of ``get_property_details``: ``property_id``, ``score`` and the
        listing's ``name``, ``url``, ``text``, ``source``, ordered by score
        descending. Rows with a non-positive or NaN score are excluded.
    """
    # Vectorize the query with the same fitted stages used for the listings.
    query_df = spark.createDataFrame([(0, key_words)], ['_id', 'text'])
    query_vec = pipeline_mdl.transform(query_df).select('idf_vec').collect()[0][0]

    # `all_property_vecs` is already a driver-side list, so score it locally
    # rather than shipping it to Spark and collecting it straight back.
    scored = [
        (property_id, float(cosine_sim(query_vec, property_vec)))
        for property_id, property_vec in all_property_vecs
    ]

    scored_df = spark.createDataFrame(scored, ['property_id', 'score'])
    top_matches = (
        scored_df
        .filter((col('score') > 0) & (~isnan('score')))
        .orderBy('score', ascending=False)
        .limit(sim_bus_limit)
    )
    return get_property_details(top_matches)


In [12]:
# Example query: comma-separated search phrases.
key_words = 'rumah murah jabodetabek, rumah subsidi di jakarta, jual rumah'

# Fetch up to 20 recommendations and print them.
keywords_recom_df = get_keywords_recomendations(key_words, sim_bus_limit=20)
keywords_recom_df.show()

+--------------------+-------------------+--------------------+--------------------+--------------------+--------------+
|         property_id|              score|                name|                 url|                text|        source|
+--------------------+-------------------+--------------------+--------------------+--------------------+--------------+
|[5f1eacd2fa1b0b4d...|0.41394174001564177|Rumah Minimalis A...|https://www.dotpr...|Rumah Minimalis A...|Dotproperty.id|
|[5f1eacd2fa1b0b4d...|0.39631826668140296|Rumah Minimalis K...|https://www.dotpr...|Rumah Minimalis K...|Dotproperty.id|
|[5f1eacd2fa1b0b4d...|  0.379660914972938|Rumah Subsidi Bar...|https://www.dotpr...|Rumah Subsidi Bar...|Dotproperty.id|
|[5f1eacd2fa1b0b4d...| 0.3754294459000138|Rumah Subsidi Dij...|https://www.dotpr...|Rumah Subsidi Dij...|Dotproperty.id|
|[5f1eacd2fa1b0b4d...| 0.3592444049983601|Rumah Tangerang B...|https://www.dotpr...|Rumah Tangerang B...|Dotproperty.id|
|[5f1eacd2fa1b0b4d...|0.35858178

In [25]:
import json

# Serialize each recommendation row to a JSON string, parse it into a dict,
# and collect the dicts to the driver (displayed as the cell's output).
recom_json_rdd = keywords_recom_df.toJSON()
recom_json_rdd.map(json.loads).collect()

[{'property_id': {'oid': '5f1eacd2fa1b0b4dcefce116'},
  'score': 0.41394174001564177,
  'name': 'Rumah Minimalis Akses Mudah Subsidi Dekat Jakarta Booking Fee 1,5 Jt',
  'url': 'https://www.dotproperty.id/rumah-dijual-dengan-2-kamar-tidur-di-balaraja-banten_4716159',
  'text': 'Rumah Minimalis Akses Mudah Subsidi Dekat Jakarta Booking Fee 1,5 JtRumah KPR Bersubsidi Rasa Komersil, Sudah SHMKeunggulan Perumahan SURYAJAYA :* Perumahanan dibangun diatas lahan 200ha dan sudah dihuni oleh lebih 4.000KK* Fasilitas lengkap di dalam dan disekitar perumahan* Bangunan premium dengan desain rumah modern * DP  angsuran terjangkau * Lokasi strategis dekat ke stasiun /pintu toll/terminal dan juga dikelilingi kawasan kota mandiri* Akses transportasi 24jamAngsuran KPR per bulan : \xa0+ 20th = Rp. 1.000.000,-\xa0+ 15th = Rp. 1.200.000,-\xa0+ 10th = Rp. 1.600.000,-Lokasi strategis :\xa0+ 10menit ke stasiun Tigaraksa\xa0+ 15menit ke pintu toll/terminal\xa0+ 30menit ke Citra Raya CikupaSegera hubungi : ( t