In [8]:
import pyspark
from pyspark.sql import SparkSession
from pyspark import SparkContext

# Spark Configuration

# Build (or reuse) a local Spark session configured for the MongoDB Spark
# connector; the connector jar is resolved through `spark.jars.packages`.
spark = (
    SparkSession.builder
    .master('local')
    .config('spark.mongodb.input.uri', 'mongodb://127.0.0.1:27017/propertify')
    .config('spark.mongodb.output.uri', 'mongodb://127.0.0.1:27017/propertify')
    .config('spark.jars.packages', 'org.mongodb.spark:mongo-spark-connector_2.11:2.2.1')
    .getOrCreate()
)

# Reuse the context owned by the session. SparkContext.getOrCreate() takes an
# optional SparkConf, not a master string such as "local"; the previous call
# only worked because the session above had already created the context.
sc = spark.sparkContext

# Set the JVM-wide default locale to en-US.
# NOTE(review): the reason this is needed is not visible in this notebook - confirm.
locale = spark._jvm.java.util.Locale
locale.setDefault(locale.forLanguageTag("en-US"))

# Read the `property` collection of the `finalproject` database through the
# MongoDB Spark connector.
# NOTE(review): the URIs configured on the session name the `propertify`
# database while this read targets `finalproject` - confirm this is intended.
property_df = (
    spark.read
    .format("com.mongodb.spark.sql.DefaultSource")
    .option("database", "finalproject")
    .option("collection", "property")
    .load()
)


In [9]:
# Data

# Inspect the loaded collection: print its schema, show the first five rows,
# then count all rows (the count is the cell's last expression, so the
# notebook displays it).
property_df.printSchema()
property_df.show(5)
property_df.count()

root
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)
 |-- name: string (nullable = true)
 |-- source: string (nullable = true)
 |-- text: string (nullable = true)
 |-- url: string (nullable = true)

+--------------------+--------------------+--------------+--------------------+--------------------+
|                 _id|                name|        source|                text|                 url|
+--------------------+--------------------+--------------+--------------------+--------------------+
|[5f03bf652a115591...|Kristal Garden Re...|Dotproperty.id|"KRISTAL GARDEN R...|https://www.dotpr...|
|[5f03bf662a115591...|Tanah hook, 132 M...|Dotproperty.id|* DIJUAL CEPAT *T...|https://www.dotpr...|
|[5f03bf662a115591...|Rumah Harga Murah...|Dotproperty.id|Rumah Harga Murah...|https://www.dotpr...|
|[5f03bf662a115591...|Rumah Sederhana S...|Dotproperty.id|Rumah Sederhana S...|https://www.dotpr...|
|[5f03bf662a115591...|Termurah! Rumah M...|Dotproperty.id|Termurah!

815

In [10]:
from pyspark.ml.feature import RegexTokenizer, CountVectorizer
from pyspark.ml.feature import StopWordsRemover, VectorAssembler
from pyspark.ml.feature import Word2Vec, Word2VecModel
from pyspark.ml.feature import IDF, HashingTF

from pyspark.ml import Pipeline, PipelineModel


In [11]:
# Text Processing

# Tokenise the `text` column. With gaps=False the pattern describes the tokens
# themselves (runs of word characters) rather than the separators between them.
# The pattern is a raw string so that `\w` is not parsed as an (invalid)
# Python string escape.
regexTokenizer = RegexTokenizer(gaps=False, pattern=r'\w+', inputCol='text', outputCol='token')

# Drop stop words from the token lists.
# NOTE(review): no `stopWords` list is passed, so the library default is used;
# the listing text appears to be Indonesian - confirm the default list is adequate.
stopWordsRemover = StopWordsRemover(inputCol='token', outputCol='nostopwrd')

In [12]:
# Unused alternative to CountVectorizer: HashingTF(inputCol="nostopwrd", outputCol="rawFeature")
# Term-count vectors built from the stop-word-filtered tokens ...
countVectorizer = CountVectorizer(
    inputCol="nostopwrd",
    outputCol="rawFeature",
)
# ... re-weighted by inverse document frequency into `idf_vec`.
iDF = IDF(
    inputCol="rawFeature",
    outputCol="idf_vec",
)

In [13]:
# Vector data pipeline

# Chain tokenising, stop-word removal, term counting and IDF weighting, fit the
# chain on the listings, then apply it to the same listings.
pipeline = Pipeline(
    stages=[regexTokenizer, stopWordsRemover, countVectorizer, iDF],
)
pipeline_mdl = pipeline.fit(property_df)
property_trf_df = pipeline_mdl.transform(property_df)

# Bring every (_id, idf_vec) pair to the driver as a plain list of tuples.
all_property_vecs = [
    (row[0], row[1])
    for row in property_trf_df.select('_id', 'idf_vec').collect()
]


In [15]:
# Inspect the transformed frame: the schema now includes the pipeline's output
# columns; show the first 20 rows of the identifying columns plus `idf_vec`.
property_trf_df.printSchema()
property_trf_df.select('_id', 'name', 'text', 'url', 'idf_vec').show(20)

root
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)
 |-- name: string (nullable = true)
 |-- source: string (nullable = true)
 |-- text: string (nullable = true)
 |-- url: string (nullable = true)
 |-- token: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- nostopwrd: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- rawFeature: vector (nullable = true)
 |-- idf_vec: vector (nullable = true)

+--------------------+--------------------+--------------------+--------------------+--------------------+
|                 _id|                name|                text|                 url|             idf_vec|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|[5f03bf652a115591...|Kristal Garden Re...|"KRISTAL GARDEN R...|https://www.dotpr...|(6450,[0,1,2,3,4,...|
|[5f03bf662a115591...|Tanah hook, 132 M...|* DIJUAL CEPAT *T...|https://www.dotpr...|(6450,[1

In [16]:
# Driver-side list of (_id, idf_vec) tuples, one per listing, used for
# similarity scoring.
# TODO (original note): change to Word2Vec.
all_property_vecs = [
    (row[0], row[1])
    for row in property_trf_df.select('_id', 'idf_vec').collect()
]

In [17]:
import numpy as np

def cosine_sim(vec1, vec2):
    """Return the cosine similarity of two vectors.

    Args:
        vec1: First vector; anything ``np.dot`` accepts.
        vec2: Second vector, same length as ``vec1``.

    Returns:
        ``dot(vec1, vec2) / (|vec1| * |vec2|)``, or ``0.0`` when either vector
        has zero norm (the ratio is undefined there and would otherwise come
        out as NaN together with a NumPy RuntimeWarning).
    """
    norm1 = np.sqrt(np.dot(vec1, vec1))
    norm2 = np.sqrt(np.dot(vec2, vec2))
    if norm1 == 0 or norm2 == 0:
        return 0.0
    # Divide sequentially (not by the product of the norms) so results stay
    # bit-identical to the previous implementation for non-zero vectors.
    return np.dot(vec1, vec2) / norm1 / norm2

In [18]:
from pyspark.sql.functions import col, isnan

def get_property_details(in_property):
    """Attach listing details to a frame of scored property ids.

    Args:
        in_property: DataFrame with at least `property_id` and `score` columns.

    Returns:
        DataFrame holding every column of ``in_property`` plus `name`, `url`,
        `text` and `source` from ``property_df`` (inner join on
        ``property_id == _id``), ordered by `score`, highest first.
    """
    scored = in_property.alias("a")
    listings = property_df.alias("b")

    # Keep all scored columns, then append the descriptive listing columns.
    wanted = [col('a.' + name) for name in scored.columns]
    wanted += [col('b.name'), col('b.url'), col('b.text'), col('b.source')]

    joined = scored.join(listings, col("a.property_id") == col("b._id"), 'inner')
    return joined.select(wanted).orderBy("a.score", ascending=False)


In [19]:
def get_keywords_recomendations(key_words, sim_bus_limit=20):
    """Recommend the listings whose text is most similar to a keyword query.

    The query is vectorised by the fitted pipeline (``pipeline_mdl``) and
    compared by cosine similarity against every pair in ``all_property_vecs``.

    Args:
        key_words: Free-text query string.
        sim_bus_limit: Maximum number of listings to return (default 20).

    Returns:
        DataFrame produced by ``get_property_details``: the best-scoring
        listings (score > 0 and not NaN), highest score first.
    """
    # Vectorise the query with the same fitted stages as the listings so both
    # sides are expressed in the same feature space.
    query_df = spark.createDataFrame([(0, key_words)], ['_id', 'text'])
    query_vec = pipeline_mdl.transform(query_df).select('idf_vec').collect()[0][0]

    # The listing vectors already live on the driver, so score them locally;
    # distributing them only to sort and collect them again added a Spark job
    # and a second sort without changing the result.
    scored = [
        (property_id, float(cosine_sim(query_vec, property_vec)))
        for property_id, property_vec in all_property_vecs
    ]

    # Drop non-matching (score <= 0) and undefined (NaN) scores, then keep the
    # top `sim_bus_limit` rows.
    top_matches = (
        spark.createDataFrame(scored, ['property_id', 'score'])
        .filter((col('score') > 0) & (~isnan('score')))
        .orderBy("score", ascending=False)
        .limit(sim_bus_limit)
    )
    return get_property_details(top_matches)


In [20]:
# Free-text query to search the listings with.
key_words = 'jaga karsa'

# Fetch up to 20 matching listings for the query and display them.
keywords_recom_df = get_keywords_recomendations(key_words, 20)
keywords_recom_df.show()

+--------------------+------------------+--------------------+--------------------+--------------------+---------+
|         property_id|             score|                name|                 url|                text|   source|
+--------------------+------------------+--------------------+--------------------+--------------------+---------+
|[5f03bf83921cfeeb...|0.6428073309373508|Rumah dan tanah d...|https://www.rumah...|Tanah dan rumah d...|Rumah.com|
+--------------------+------------------+--------------------+--------------------+--------------------+---------+



In [21]:
import json

# Serialise each recommendation row to JSON, parse it back into a dict and
# collect the dicts on the driver (displayed as the cell's output).
(
    keywords_recom_df
    .toJSON()
    .map(json.loads)
    .collect()
)

[{'property_id': {'oid': '5f03bf83921cfeebe0669027'},
  'score': 0.6428073309373508,
  'name': 'Rumah dan tanah dijual di Jaga karsa sangat bagus strategis',
  'url': 'https://www.rumah.com/listing-properti/dijual-rumah-dan-tanah-dijual-di-jaga-karsa-sangat-bagus-strategis-oleh-irham-thoha-16612979',
  'text': 'Tanah dan rumah daerah Jaga Karsa sangat strategis\n                                    Dijual Tanah dan Bangunan di daerah sangat strategis di Jl. Kahfi Cipedak Jaga Karsa Jakarta selatan deket tol kearah Antasari , Jabodetabek serta bandara Soeta deket kearah Thamrin dan Sudirman ada Bus way dan MRT serta LRT',
  'source': 'Rumah.com'}]