In [1]:
import warnings
warnings.simplefilter("ignore", DeprecationWarning)

In [2]:
import os
os.environ['PYSPARK_PYTHON'] = '/usr/bin/python3.5'

In [3]:
import timeit
import numpy as np

In [4]:
import os
os.environ['PYSPARK_PYTHON'] = '/usr/bin/python3.5'
os.environ["PYSPARK_DRIVER_PYTHON"] = 'ipython'
os.environ["PYSPARK_DRIVER_PYTHON_OPTS"] = 'notebook'

In [5]:
import pyspark
from pyspark import sql, SparkConf, SparkContext
from pyspark.sql.functions import lit
from pyspark.sql.session import SparkSession
from pyspark.serializers import Serializer

In [6]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer, StopWordsRemover, HashingTF, IDF
from pyspark.sql.functions import col, udf, isnan, when, count
from pyspark.sql.types import *

In [7]:
from sklearn.neighbors import LSHForest, NearestNeighbors
from spark_sklearn import GridSearchCV

In [8]:
import psutil

In [9]:
cpu_count = psutil.cpu_count()
av_memory = psutil.virtual_memory().total / (1024.0 ** 3)
memory = str(int(0.9 * av_memory)) + 'G'
number_of_instances = 1

In [10]:
cpu_count, memory

(4, '21G')

In [11]:
conftfos = SparkConf().setAll([('spark.executor.instances',number_of_instances),('spark.executor.cores', cpu_count),('spark.executor.memory', memory),('spark.dynamicAllocation.enabled','False')])

sc = SparkContext(master='local', appName='Barcelona',conf=conftfos)
spark = SparkSession(sc)

In [12]:
BarcelonaFILE = 'Barcelona.csv'
AmsterdamFILE = 'Amsterdam.csv'

In [13]:
df_Barcelona_listings = spark.read.option("header","true").csv(BarcelonaFILE)
df_Barcelona_listings.withColumn('City', lit('Barcelona'))
df_Amsterdam_listings = spark.read.option("header","true").csv(AmsterdamFILE)
df_Amsterdam_listings.withColumn('City', lit('Amsterdam'))

DataFrame[id: string, name: string, host_id: string, host_name: string, neighbourhood_group: string, neighbourhood: string, latitude: string, longitude: string, room_type: string, price: string, minimum_nights: string, number_of_reviews: string, last_review: string, reviews_per_month: string, calculated_host_listings_count: string, availability_365: string, City: string]

In [14]:
train_size = 1000

In [15]:
df_Barcelona_listings = df_Barcelona_listings.limit(train_size)
df_Amsterdam_listings = df_Amsterdam_listings.limit(train_size)

In [16]:
#df_listings = df_Amsterdam_listings.union(df_Barcelona_listings)

In [17]:
df_Barcelona_listings.head()

Row(id='18653', name='Modern apartment well connected', host_id='71615', host_name='Mireia And Maria', neighbourhood_group='Eixample', neighbourhood='la Sagrada Família', latitude='41.40921459338917', longitude='2.1738675526453535', room_type='Entire home/apt', price='151', minimum_nights='2', number_of_reviews='0', last_review=None, reviews_per_month=None, calculated_host_listings_count='30', availability_365='78')

In [18]:
df_Amsterdam_listings.head()

Row(id='20621335', name='Clean room Amsterdam. Metro 3min walk.Free parking', host_id='25403329', host_name='Victor', neighbourhood_group=None, neighbourhood='Bijlmer-Oost', latitude='52.319172968245226', longitude='4.981150531499213', room_type='Private room', price='52', minimum_nights='3', number_of_reviews='23', last_review='2017-11-28', reviews_per_month='6.83', calculated_host_listings_count='1', availability_365='12')

In [19]:
listings_names_Ams = df_Amsterdam_listings.select(col('name')).fillna('')
listings_names_Bar = df_Barcelona_listings.select(col('name')).fillna('')

In [20]:
listings_names_Ams.show()

+--------------------+
|                name|
+--------------------+
|Clean room Amster...|
|Sunny and cozy ro...|
|Pop B&B-private r...|
|Tastefully furnis...|
|Cozy room in the ...|
|Great room south ...|
|Lovely room in So...|
|spacious light ap...|
|Amsterdam South S...|
|B&B# green oasis ...|
|King  bedroom nea...|
|Spacious room in ...|
|Wow! Laid back to...|
|Amsterdam thrive ...|
|Large bedroom 15 ...|
|Comfortable  room...|
|3 rooms in green ...|
|Studio Amsterdam ...|
|Nice room 15 min ...|
|Nice appartment /...|
+--------------------+
only showing top 20 rows



In [21]:
listings_names_Bar.show()

+--------------------+
|                name|
+--------------------+
|Modern apartment ...|
|Flat with Sunny T...|
|Huge flat for 8 p...|
|Great Place in Sa...|
|Double room in Sp...|
|    Cozy single room|
|Nice and sunny du...|
|Comf. double room...|
|Room for 2,  Sagr...|
|Room for 2-3. Bar...|
|Aribau - in love ...|
|Gaudi Penthouse -...|
|Duplex with panor...|
|Sagrada Familia a...|
|Awesome Gallery a...|
|VIDRE HOME PLAZA ...|
|Double 02 Casanov...|
|Single 01 Casanov...|
|Single 02 Casanov...|
|Double 01 Casanov...|
+--------------------+
only showing top 20 rows



In [22]:
tokenizer = Tokenizer(inputCol="name", outputCol="words")
tokenized_Ams = tokenizer.transform(listings_names_Ams)
tokenized_Bar = tokenizer.transform(listings_names_Bar)

In [23]:
tokenized_Ams.show()

+--------------------+--------------------+
|                name|               words|
+--------------------+--------------------+
|Clean room Amster...|[clean, room, ams...|
|Sunny and cozy ro...|[sunny, and, cozy...|
|Pop B&B-private r...|[pop, b&b-private...|
|Tastefully furnis...|[tastefully, furn...|
|Cozy room in the ...|[cozy, room, in, ...|
|Great room south ...|[great, room, sou...|
|Lovely room in So...|[lovely, room, in...|
|spacious light ap...|[spacious, light,...|
|Amsterdam South S...|[amsterdam, south...|
|B&B# green oasis ...|[b&b#, green, oas...|
|King  bedroom nea...|[king, , bedroom,...|
|Spacious room in ...|[spacious, room, ...|
|Wow! Laid back to...|[wow!, laid, back...|
|Amsterdam thrive ...|[amsterdam, thriv...|
|Large bedroom 15 ...|[large, bedroom, ...|
|Comfortable  room...|[comfortable, , r...|
|3 rooms in green ...|[3, rooms, in, gr...|
|Studio Amsterdam ...|[studio, amsterda...|
|Nice room 15 min ...|[nice, room, 15, ...|
|Nice appartment /...|[nice, app

In [24]:
tokenized_Bar.show()

+--------------------+--------------------+
|                name|               words|
+--------------------+--------------------+
|Modern apartment ...|[modern, apartmen...|
|Flat with Sunny T...|[flat, with, sunn...|
|Huge flat for 8 p...|[huge, flat, for,...|
|Great Place in Sa...|[great, place, in...|
|Double room in Sp...|[double, room, in...|
|    Cozy single room|[cozy, single, room]|
|Nice and sunny du...|[nice, and, sunny...|
|Comf. double room...|[comf., double, r...|
|Room for 2,  Sagr...|[room, for, 2,, ,...|
|Room for 2-3. Bar...|[room, for, 2-3.,...|
|Aribau - in love ...|[aribau, -, in, l...|
|Gaudi Penthouse -...|[gaudi, penthouse...|
|Duplex with panor...|[duplex, with, pa...|
|Sagrada Familia a...|[sagrada, familia...|
|Awesome Gallery a...|[awesome, gallery...|
|VIDRE HOME PLAZA ...|[vidre, home, pla...|
|Double 02 Casanov...|[double, 02, casa...|
|Single 01 Casanov...|[single, 01, casa...|
|Single 02 Casanov...|[single, 02, casa...|
|Double 01 Casanov...|[double, 0

In [25]:
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
removed_Ams = remover.transform(tokenized_Ams)
removed_Bar = remover.transform(tokenized_Bar)

In [26]:
removed_Ams.show()

+--------------------+--------------------+--------------------+
|                name|               words|            filtered|
+--------------------+--------------------+--------------------+
|Clean room Amster...|[clean, room, ams...|[clean, room, ams...|
|Sunny and cozy ro...|[sunny, and, cozy...|[sunny, cozy, roo...|
|Pop B&B-private r...|[pop, b&b-private...|[pop, b&b-private...|
|Tastefully furnis...|[tastefully, furn...|[tastefully, furn...|
|Cozy room in the ...|[cozy, room, in, ...|[cozy, room, se, ...|
|Great room south ...|[great, room, sou...|[great, room, sou...|
|Lovely room in So...|[lovely, room, in...|[lovely, room, so...|
|spacious light ap...|[spacious, light,...|[spacious, light,...|
|Amsterdam South S...|[amsterdam, south...|[amsterdam, south...|
|B&B# green oasis ...|[b&b#, green, oas...|[b&b#, green, oas...|
|King  bedroom nea...|[king, , bedroom,...|[king, , bedroom,...|
|Spacious room in ...|[spacious, room, ...|[spacious, room, ...|
|Wow! Laid back to...|[wo

In [27]:
removed_Bar.show

<bound method DataFrame.show of DataFrame[name: string, words: array<string>, filtered: array<string>]>

In [28]:
numFeatures = 50

In [29]:
hashingTF = HashingTF(inputCol='filtered', outputCol='rawFeatures', numFeatures=numFeatures)
Ams_listings = hashingTF.transform(removed_Ams)
Bar_listings = hashingTF.transform(removed_Bar)
idf = IDF(inputCol='rawFeatures', outputCol='VectorSpace')
idfModel_Ams = idf.fit(Ams_listings)
idfModel_Bar = idf.fit(Bar_listings)
tfidf_Ams = idfModel_Ams.transform(Ams_listings)
tfidf_Bar = idfModel_Bar.transform(Bar_listings)

In [30]:
tfidf_Ams.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+
|                name|               words|            filtered|         rawFeatures|         VectorSpace|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|Clean room Amster...|[clean, room, ams...|[clean, room, ams...|(50,[7,18,33,40,4...|(50,[7,18,33,40,4...|
|Sunny and cozy ro...|[sunny, and, cozy...|[sunny, cozy, roo...|(50,[1,18,20,44,4...|(50,[1,18,20,44,4...|
|Pop B&B-private r...|[pop, b&b-private...|[pop, b&b-private...|(50,[11,19,25,37]...|(50,[11,19,25,37]...|
|Tastefully furnis...|[tastefully, furn...|[tastefully, furn...|(50,[8,21,28,46],...|(50,[8,21,28,46],...|
|Cozy room in the ...|[cozy, room, in, ...|[cozy, room, se, ...|(50,[1,23,28,44],...|(50,[1,23,28,44],...|
|Great room south ...|[great, room, sou...|[great, room, sou...|(50,[10,18,44,49]...|(50,[10,18,44,49]...|
|Lovely room in So...|[lovely, room, 

In [31]:
tfidf_Bar.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+
|                name|               words|            filtered|         rawFeatures|         VectorSpace|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|Modern apartment ...|[modern, apartmen...|[modern, apartmen...|(50,[0,7,22,27],[...|(50,[0,7,22,27],[...|
|Flat with Sunny T...|[flat, with, sunn...|[flat, sunny, ter...|(50,[18,28],[2.0,...|(50,[18,28],[3.29...|
|Huge flat for 8 p...|[huge, flat, for,...|[huge, flat, 8, p...|(50,[0,18,20,26,2...|(50,[0,18,20,26,2...|
|Great Place in Sa...|[great, place, in...|[great, place, sa...|(50,[10,23,29,41,...|(50,[10,23,29,41,...|
|Double room in Sp...|[double, room, in...|[double, room, sp...|(50,[0,12,18,20,2...|(50,[0,12,18,20,2...|
|    Cozy single room|[cozy, single, room]|[cozy, single, room]|(50,[1,31,44],[1....|(50,[1,31,44],[2....|
|Nice and sunny du...|[nice, and, sun

In [32]:
#tfidf_lim = tfidf.limit(train_size)

In [33]:
data_Ams = np.asarray(tfidf_Ams.select('VectorSpace').collect())
data_Bar = np.asarray(tfidf_Bar.select('VectorSpace').collect())

In [34]:
data_Ams = data_Ams.reshape((data_Ams.shape[0], data_Ams.shape[2]))
data_Bar = data_Bar.reshape((data_Bar.shape[0], data_Bar.shape[2]))

In [35]:
data_Ams.shape
data_Bar.shape

(1000, 50)

In [36]:
def scorer(estimator, X):
    estimator.fit(X)
    n_neighbors = estimator.get_params()['n_neighbors']
    neighbors_approx = estimator.kneighbors(X, return_distance=False)
    
    nbrs = NearestNeighbors(n_neighbors=n_neighbors, algorithm='brute', metric='cosine').fit(X)
    neighbors_exact = nbrs.kneighbors(X, return_distance=False)
    
    accuracy = []
    for i in range(len(neighbors_approx)):
        accuracy.append(np.sum(np.equal(neighbors_approx[i], neighbors_exact[i]))/n_neighbors)
    
    result = np.mean(accuracy)
    return result

In [37]:
param_grid = {
    'n_estimators':[20,30,40],
    'min_hash_match':[3,4,5],
    'n_candidates':[50,75,100],
    'n_neighbors': [3],
}
lshf = LSHForest(random_state=7)
grid_search = GridSearchCV(sc=sc, estimator=lshf, param_grid=param_grid, scoring=scorer)



In [38]:
%%time
grid_search.fit(data_Bar)



CPU times: user 116 ms, sys: 64 ms, total: 180 ms
Wall time: 7min 19s




GridSearchCV(cv=None, error_score='raise',
       estimator=LSHForest(min_hash_match=4, n_candidates=50, n_estimators=10, n_neighbors=5,
     radius=1.0, radius_cutoff_ratio=0.9, random_state=7),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_candidates': [50, 75, 100], 'n_estimators': [20, 30, 40], 'min_hash_match': [3, 4, 5], 'n_neighbors': [3]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       sc=<SparkContext master=local appName=Barcelona>,
       scoring=<function scorer at 0x7f5e51b91bf8>, verbose=0)

In [39]:
best_estimator = grid_search.best_estimator_

In [40]:
best_estimator

LSHForest(min_hash_match=4, n_candidates=75, n_estimators=40, n_neighbors=3,
     radius=1.0, radius_cutoff_ratio=0.9, random_state=7)

In [41]:
try:
    grid_search.best_score_
except Exception as e:
    print(e)
    print("Sad. But true :(")

'GridSearchCV' object has no attribute 'best_score_'
Sad. But true :(


In [42]:
best_score = scorer(best_estimator, data_Bar)

In [43]:
best_score

0.9836666666666667