In [1]:
import pandas as pd
import numpy as np

#re and string will be used to process the data
import re 
import string

#Import all the dependencies
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

#Just to get rid of warnings
import warnings
warnings.filterwarnings("ignore")

listingsDF = pd.read_csv('listings.csv')

In [2]:

#Instead of dropping we select what we need
listingsDF = listingsDF[['id', 'name','description', 
                         'host_id', 'host_name', 'property_type', 'price', 
                         'number_of_reviews', 'review_scores_rating']]

#We drop all rows with empty cells
listingsDF = listingsDF.dropna(axis=0, how='any')

#Remove the '$' from price
listingsDF.price = listingsDF.price.str.replace('[$]', '')

#Remove the ',' from price
listingsDF.price = listingsDF.price.str.replace(',', '')

#Convert price from object to float
listingsDF['price'] = listingsDF['price'].astype(float)

#listingsDF.head()
listingsDF.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3171 entries, 0 to 3814
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    3171 non-null   int64  
 1   name                  3171 non-null   object 
 2   description           3171 non-null   object 
 3   host_id               3171 non-null   int64  
 4   host_name             3171 non-null   object 
 5   property_type         3171 non-null   object 
 6   price                 3171 non-null   float64
 7   number_of_reviews     3171 non-null   int64  
 8   review_scores_rating  3171 non-null   float64
dtypes: float64(2), int64(3), object(4)
memory usage: 247.7+ KB


In [3]:
aDF = listingsDF[['description']]

In [4]:
# pd.set_option('display.max_colwidth', -1)

# def clean(text):
#     text = text.lower()
#     text = re.sub('[^\w\s]', '', text) #This removes all the punctuations
#     text = re.sub(r'\n',' ', text) #This relaces the \n with space
#     text = re.sub(r'\r','', text) #\r
#     text = re.sub('[^0-9a-z #+_]', '', text) #Special chars
    
#     return text

# aDF['description'] = aDF['description'].apply(clean)

In [5]:
desclist = list(aDF['description'])

In [6]:
desclist

["Make your self at home in this charming one-bedroom apartment, centrally-located on the west side of Queen Anne hill.   This elegantly-decorated, completely private apartment (bottom unit of a duplex) has an open floor plan, bamboo floors, a fully equipped kitchen, a TV,  DVD player, basic cable, and a very cozy bedroom with a queen-size bed. The unit sleeps up to four (two in the bedroom and two on the very comfortable fold out couch, linens included) and includes free WiFi and laundry. The apartment opens onto a private deck, complete with it's own BBQ, overlooking a garden and a forest of black bamboo.    The Apartment is perfectly-located just one block from the bus lines where you can catch a bus and be downtown Seattle in fifteen minutes or historic Ballard in ten or a quick five-minute walk will bring you to Whole Foods and Peet's Coffee or take a fifteen minute walk to the top of Queen Anne Hill where you will find a variety of eclectic shops, bars, and restaurants. There is 

In [7]:
train_corpus = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(desclist)]

In [8]:
train_corpus

[TaggedDocument(words=['make', 'your', 'self', 'at', 'home', 'in', 'this', 'charming', 'one-bedroom', 'apartment', ',', 'centrally-located', 'on', 'the', 'west', 'side', 'of', 'queen', 'anne', 'hill', '.', 'this', 'elegantly-decorated', ',', 'completely', 'private', 'apartment', '(', 'bottom', 'unit', 'of', 'a', 'duplex', ')', 'has', 'an', 'open', 'floor', 'plan', ',', 'bamboo', 'floors', ',', 'a', 'fully', 'equipped', 'kitchen', ',', 'a', 'tv', ',', 'dvd', 'player', ',', 'basic', 'cable', ',', 'and', 'a', 'very', 'cozy', 'bedroom', 'with', 'a', 'queen-size', 'bed', '.', 'the', 'unit', 'sleeps', 'up', 'to', 'four', '(', 'two', 'in', 'the', 'bedroom', 'and', 'two', 'on', 'the', 'very', 'comfortable', 'fold', 'out', 'couch', ',', 'linens', 'included', ')', 'and', 'includes', 'free', 'wifi', 'and', 'laundry', '.', 'the', 'apartment', 'opens', 'onto', 'a', 'private', 'deck', ',', 'complete', 'with', 'it', "'s", 'own', 'bbq', ',', 'overlooking', 'a', 'garden', 'and', 'a', 'forest', 'of', 'b

In [9]:
max_epochs = 40
vec_size = 30
alpha = 0.025

model = Doc2Vec(size=vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=1,
                dm =1)

model.build_vocab(train_corpus)

for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    model.train(train_corpus,
                total_examples=model.corpus_count,
                epochs=model.iter)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha

model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)
model.save("d2v.model")
print("Model Saved")

iteration 0
iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
iteration 11
iteration 12
iteration 13
iteration 14
iteration 15
iteration 16
iteration 17
iteration 18
iteration 19
iteration 20
iteration 21
iteration 22
iteration 23
iteration 24
iteration 25
iteration 26
iteration 27
iteration 28
iteration 29
iteration 30
iteration 31
iteration 32
iteration 33
iteration 34
iteration 35
iteration 36
iteration 37
iteration 38
iteration 39
Model Saved


In [10]:
input = word_tokenize("I want a cozy and huge room!".lower())
vector = model.infer_vector(input)
print(vector)

[ 0.01786365  0.2624814   0.19895165 -0.05529816 -0.02050978  0.03395355
  0.15671216  0.09599808  0.14127961 -0.04771169 -0.2778186  -0.19706813
 -0.04218343  0.06677701  0.16353922  0.07310545 -0.06578258 -0.3126985
 -0.0978848   0.11284652 -0.21961755  0.1383706   0.04500597 -0.14690374
 -0.10172186  0.05375253 -0.19541974  0.060777   -0.06109541 -0.07206152]


In [11]:
for doc_id in range(len(train_corpus)):
    inferred_vector = vector
    sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))


In [12]:
sims

[('1353', 0.7811964154243469),
 ('1354', 0.7598116993904114),
 ('475', 0.7045113444328308),
 ('682', 0.6955971717834473),
 ('172', 0.6932136416435242),
 ('2732', 0.6922073364257812),
 ('744', 0.6861609220504761),
 ('2540', 0.6839118599891663),
 ('2235', 0.6783003211021423),
 ('3004', 0.673907458782196),
 ('2626', 0.663776159286499),
 ('1617', 0.661430835723877),
 ('1182', 0.6596360206604004),
 ('3153', 0.658632218837738),
 ('720', 0.6583782434463501),
 ('2226', 0.6571862101554871),
 ('87', 0.6549673676490784),
 ('201', 0.6540331244468689),
 ('3124', 0.6514738202095032),
 ('2634', 0.6509668827056885),
 ('143', 0.6484715938568115),
 ('1944', 0.6474770307540894),
 ('863', 0.6473982334136963),
 ('3081', 0.6466875076293945),
 ('1804', 0.6457762718200684),
 ('3069', 0.643828272819519),
 ('2978', 0.6437361836433411),
 ('179', 0.6431601047515869),
 ('2369', 0.6428776979446411),
 ('734', 0.6424492597579956),
 ('649', 0.64178067445755),
 ('2927', 0.6398178935050964),
 ('956', 0.6365967392921448)

In [13]:
for index in range(0,5):
    print(sims[index], ' '.join(train_corpus[int(sims[index][0])].words))
    print('\n')

('1353', 0.7811964154243469) you will have the whole place to yourself ! it is centrally located in capitol hill near first hill within walking distance of downtown and pike 's place market .


('1354', 0.7598116993904114) this beautiful space could not be in a more convenient location ! sandwiched between downtown and capitol hill , it is walkable to pike place market , the ferry and light rail stations , both stadiums as well as plenty of restaurant/bar/retail spots !


('475', 0.7045113444328308) see space details below . our condo has gone through updates based on past guest experiences . similar to my other listing..but this one is for my entire place . amazing location in a comfortable and classy home . 2 bedrooms and 2 baths.. couch can be used for the 5th person . the couch is so comfortable that other guests prefer to sleep on the couch than my other two beds . main bedroom has a queen with a top of the line mattress - better than temperpedic . 2nd bedroom has a very comfy ful