In [1]:
import warnings
import pandas as pd
import sys
import os
# Make the project's "python" directory (one level above the notebook) importable.
module_path = os.path.abspath(os.path.join(".."))
# Build the directory with os.path.join so the separator is right on every OS,
# and test for the exact entry that gets appended so re-running the cell does
# not add the same directory again.
python_path = os.path.join(module_path, "python")
if python_path not in sys.path:
    sys.path.append(python_path)
from utilities import *
from preprocessing import *

# silence every warning for the rest of the session
warnings.filterwarnings(action="ignore")

# never truncate rows or columns when a frame is displayed
for option_name in ("display.max_rows", "display.max_columns"):
    pd.set_option(option_name, None)

#### Load the listings data set and keep only specific columns

In [2]:
from IPython.display import display

# the only columns this notebook works with
columns = ["id", "name", "description"]
# read the listings file, restricted to the columns above
listings = pd.read_csv("../data_sets/listings.csv", usecols=columns)

# show the dimensions, then the first rows
display(listings.shape)
display(listings.head(5))

(11393, 3)

Unnamed: 0,id,name,description
0,10595,"96m2, 3BR, 2BA, Metro, WI-FI etc...",Athens Furnished Apartment No6 is 3-bedroom ap...
1,10990,Athens Quality Apartments - Deluxe Apartment,Athens Quality Apartments - Deluxe apartment i...
2,10993,Athens Quality Apartments - Studio,The Studio is an -excellent located -close t...
3,10995,"47m2, close to metro,cable TV,wi-fi",AQA No2 is 1-bedroom apartment (47m2) -excell...
4,27262,"54m2, 1-br, cable tv, wi-fi, metro",Big 1-bedroom apartment that can accommodate 4...


#### Drop all rows that contain any NaN value

In [3]:
# keep only the rows in which every column has a value
listings = listings.dropna(axis="index", how="any")

# show the dimensions, then the first rows
display(listings.shape)
display(listings.head(5))

(11211, 3)

Unnamed: 0,id,name,description
0,10595,"96m2, 3BR, 2BA, Metro, WI-FI etc...",Athens Furnished Apartment No6 is 3-bedroom ap...
1,10990,Athens Quality Apartments - Deluxe Apartment,Athens Quality Apartments - Deluxe apartment i...
2,10993,Athens Quality Apartments - Studio,The Studio is an -excellent located -close t...
3,10995,"47m2, close to metro,cable TV,wi-fi",AQA No2 is 1-bedroom apartment (47m2) -excell...
4,27262,"54m2, 1-br, cable tv, wi-fi, metro",Big 1-bedroom apartment that can accommodate 4...


#### Concatenate name and description columns

In [4]:
# join the title and the description of every listing with a single space
separator = " "
listings["name_and_description"] = (listings["name"] + separator) + listings["description"]

# show the dimensions, then the first rows
display(listings.shape)
display(listings.head(5))

(11211, 4)

Unnamed: 0,id,name,description,name_and_description
0,10595,"96m2, 3BR, 2BA, Metro, WI-FI etc...",Athens Furnished Apartment No6 is 3-bedroom ap...,"96m2, 3BR, 2BA, Metro, WI-FI etc... Athens Fur..."
1,10990,Athens Quality Apartments - Deluxe Apartment,Athens Quality Apartments - Deluxe apartment i...,Athens Quality Apartments - Deluxe Apartment A...
2,10993,Athens Quality Apartments - Studio,The Studio is an -excellent located -close t...,Athens Quality Apartments - Studio The Studio ...
3,10995,"47m2, close to metro,cable TV,wi-fi",AQA No2 is 1-bedroom apartment (47m2) -excell...,"47m2, close to metro,cable TV,wi-fi AQA No2 is..."
4,27262,"54m2, 1-br, cable tv, wi-fi, metro",Big 1-bedroom apartment that can accommodate 4...,"54m2, 1-br, cable tv, wi-fi, metro Big 1-bedro..."


#### Preprocessing

In [5]:
# include more stop words for better results
# ("one" was listed twice; each extra stop word now appears exactly once)
more_stop_words = ["και", "one", "athens", "house", "fully", "located", "area"]

start_time = timer()

# clean our data: the helper comes from the project's preprocessing module and,
# judging by the displayed result, adds the "..._clean_stems_lemmas" and
# "..._clean_stems_lemmas_tokens" columns
listings = clean_stem_lemmatize_tokens_column(listings, "name_and_description", more_stop_words, True)

# display the data set
display(listings.shape, listings.head(5))

timer(start_time)

(11211, 6)

Unnamed: 0,id,name,description,name_and_description,name_and_description_clean_stems_lemmas,name_and_description_clean_stems_lemmas_tokens
0,10595,"96m2, 3BR, 2BA, Metro, WI-FI etc...",Athens Furnished Apartment No6 is 3-bedroom ap...,"96m2, 3BR, 2BA, Metro, WI-FI etc... Athens Fur...",br ba metro wifi etc furnish apart bedroom apa...,"[br, ba, metro, wifi, etc, furnish, apart, bed..."
1,10990,Athens Quality Apartments - Deluxe Apartment,Athens Quality Apartments - Deluxe apartment i...,Athens Quality Apartments - Deluxe Apartment A...,qualiti apart delux apart qualiti apart delux ...,"[qualiti, apart, delux, apart, qualiti, apart,..."
2,10993,Athens Quality Apartments - Studio,The Studio is an -excellent located -close t...,Athens Quality Apartments - Studio The Studio ...,qualiti apart studio studio excel close metro ...,"[qualiti, apart, studio, studio, excel, close,..."
3,10995,"47m2, close to metro,cable TV,wi-fi",AQA No2 is 1-bedroom apartment (47m2) -excell...,"47m2, close to metro,cable TV,wi-fi AQA No2 is...",close metroc tvwifi aqa bedroom apart excel cl...,"[close, metroc, tvwifi, aqa, bedroom, apart, e..."
4,27262,"54m2, 1-br, cable tv, wi-fi, metro",Big 1-bedroom apartment that can accommodate 4...,"54m2, 1-br, cable tv, wi-fi, metro Big 1-bedro...",br cabl tv wifi metro big bedroom apart accomm...,"[br, cabl, tv, wifi, metro, big, bedroom, apar..."


Time spent: 0:1:17


#### TF-IDF (Term Frequency - Inverse Document Frequency)

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

start_time = timer()

# calculate tf-idf for unigrams and bigrams
tf_idf_vectorizer = TfidfVectorizer(ngram_range = (1, 2), stop_words="english")
tf_idf = tf_idf_vectorizer.fit_transform(listings["name_and_description_clean_stems_lemmas"])

# calculate cosine similarity of every listing with every other listing
cosine = cosine_similarity(tf_idf, tf_idf)
# hold only 2 digits: round the whole matrix in place in a single vectorized
# call instead of formatting each cell to a string and back in a Python loop
# (exact .xx5 ties may round differently from the old string formatting)
cosine.round(2, out=cosine)

timer(start_time)

Time spent: 0:3:26


#### Calculate the top 100 most similar pairs of airbnbs

In [7]:
import heapq

start_time = timer()

top_100 = {}

# get id list
id_keys = listings["id"].tolist()
for index, list_values in enumerate(cosine):
    row_id = id_keys[index]
    # `cosine` compares the listings with themselves, so entry [i][j] and entry
    # [j][i] describe the same pair. Visiting only the columns to the right of
    # the diagonal therefore sees every pair exactly once and never produces
    # the (ID, ID) diagonal entry that previously had to be deleted again.
    for other_id, score in zip(id_keys[index + 1:], list_values[index + 1:]):
        # make sure that we don't push {(ID_A,ID_B): SCORE} AND {(ID_B,ID_A): SCORE}:
        # the larger id always comes first
        pair = (other_id, row_id) if other_id >= row_id else (row_id, other_id)
        top_100[pair] = score
    # keep the 100 best pairs seen so far; nlargest is equivalent to
    # sorted(..., reverse=True)[:100] (ties keep their insertion order)
    # without sorting every candidate
    top_100 = dict(heapq.nlargest(100, top_100.items(), key=lambda item: item[1]))

# display top 100 similar id pairs
for rank, (pair, score) in enumerate(top_100.items(), start=1):
    display(f"{rank}: {pair} : {score}")

timer(start_time)

'1: (27958734, 203174) : 1.0'

'2: (29786418, 5426236) : 1.0'

'3: (19646670, 6795028) : 1.0'

'4: (25778529, 6795028) : 1.0'

'5: (26617614, 6795028) : 1.0'

'6: (40422876, 6795028) : 1.0'

'7: (33764094, 8594460) : 1.0'

'8: (9548700, 9541663) : 1.0'

'9: (11403364, 11365978) : 1.0'

'10: (11402677, 11366771) : 1.0'

'11: (33840899, 11646666) : 1.0'

'12: (30793059, 11704139) : 1.0'

'13: (32437799, 11716595) : 1.0'

'14: (40910982, 11816352) : 1.0'

'15: (19374457, 13679827) : 1.0'

'16: (15226560, 15033500) : 1.0'

'17: (27774209, 15793210) : 1.0'

'18: (27918481, 18112699) : 1.0'

'19: (28132526, 18112699) : 1.0'

'20: (37487849, 19546148) : 1.0'

'21: (25778529, 19646670) : 1.0'

'22: (26617614, 19646670) : 1.0'

'23: (40422876, 19646670) : 1.0'

'24: (19683305, 19663089) : 1.0'

'25: (35362490, 19663089) : 1.0'

'26: (35432563, 19663089) : 1.0'

'27: (35362490, 19683305) : 1.0'

'28: (35432563, 19683305) : 1.0'

'29: (42019462, 20083273) : 1.0'

'30: (24878335, 20349967) : 1.0'

'31: (31693901, 20383968) : 1.0'

'32: (20692552, 20686418) : 1.0'

'33: (20704187, 20686418) : 1.0'

'34: (20705564, 20686418) : 1.0'

'35: (20706206, 20686418) : 1.0'

'36: (20707116, 20686418) : 1.0'

'37: (20704187, 20692552) : 1.0'

'38: (20705564, 20692552) : 1.0'

'39: (20706206, 20692552) : 1.0'

'40: (20707116, 20692552) : 1.0'

'41: (20705564, 20704187) : 1.0'

'42: (20706206, 20704187) : 1.0'

'43: (20707116, 20704187) : 1.0'

'44: (20706206, 20705564) : 1.0'

'45: (20707116, 20705564) : 1.0'

'46: (20707116, 20706206) : 1.0'

'47: (20708277, 20707394) : 1.0'

'48: (37106154, 22013976) : 1.0'

'49: (22171010, 22030136) : 1.0'

'50: (24201016, 22030136) : 1.0'

'51: (24202003, 22030136) : 1.0'

'52: (37048735, 22047314) : 1.0'

'53: (22074541, 22074163) : 1.0'

'54: (22074946, 22074774) : 1.0'

'55: (22075076, 22074774) : 1.0'

'56: (22075076, 22074946) : 1.0'

'57: (24201549, 22164074) : 1.0'

'58: (24201016, 22171010) : 1.0'

'59: (24202003, 22171010) : 1.0'

'60: (24214048, 22171981) : 1.0'

'61: (24214247, 22173279) : 1.0'

'62: (23554464, 23554220) : 1.0'

'63: (23554584, 23554220) : 1.0'

'64: (23554584, 23554464) : 1.0'

'65: (23943168, 23908316) : 1.0'

'66: (23943627, 23908316) : 1.0'

'67: (23943627, 23943168) : 1.0'

'68: (24202003, 24201016) : 1.0'

'69: (34699357, 24242835) : 1.0'

'70: (35103900, 24243325) : 1.0'

'71: (27448881, 24351998) : 1.0'

'72: (38635466, 24351998) : 1.0'

'73: (35101642, 24387622) : 1.0'

'74: (31066973, 24613722) : 1.0'

'75: (25207888, 25207595) : 1.0'

'76: (26118125, 25515948) : 1.0'

'77: (26118850, 25515948) : 1.0'

'78: (26617614, 25778529) : 1.0'

'79: (40422876, 25778529) : 1.0'

'80: (26118238, 26093644) : 1.0'

'81: (28639109, 26093644) : 1.0'

'82: (28639286, 26093644) : 1.0'

'83: (28715621, 26093644) : 1.0'

'84: (26117905, 26094355) : 1.0'

'85: (26118850, 26118125) : 1.0'

'86: (28639109, 26118238) : 1.0'

'87: (28639286, 26118238) : 1.0'

'88: (28715621, 26118238) : 1.0'

'89: (27282238, 26125014) : 1.0'

'90: (27686070, 26125014) : 1.0'

'91: (26237466, 26214131) : 1.0'

'92: (26237805, 26214131) : 1.0'

'93: (26237805, 26237466) : 1.0'

'94: (26620126, 26383438) : 1.0'

'95: (26402383, 26402249) : 1.0'

'96: (40422876, 26617614) : 1.0'

'97: (27621026, 27226394) : 1.0'

'98: (42421700, 27226394) : 1.0'

'99: (35825418, 27256671) : 1.0'

'100: (27686070, 27282238) : 1.0'

Time spent: 0:2:16


#### Create a new column that stores the 100 most similar airbnbs for each listing

In [8]:
start_time = timer()

# get id list
id_keys = listings["id"].tolist()

dictionaries = []
for position, similarities in enumerate(cosine):
    # map every listing id to its similarity with the current listing
    scores_by_id = dict(zip(id_keys, similarities))
    # remove diagonal value: a listing must not recommend itself
    scores_by_id.pop(id_keys[position])
    # rank by score, best first
    ranked = sorted(scores_by_id.items(), key=lambda entry: entry[1], reverse=True)
    # keep the first 100 recommendations as an ordered {id: score} dictionary
    dictionaries.append(dict(ranked[:100]))

# save dictionaries as new column
listings["dictionary"] = dictionaries
# display the data set
display(listings.shape, listings.head(5))

timer(start_time)

(11211, 7)

Unnamed: 0,id,name,description,name_and_description,name_and_description_clean_stems_lemmas,name_and_description_clean_stems_lemmas_tokens,dictionary
0,10595,"96m2, 3BR, 2BA, Metro, WI-FI etc...",Athens Furnished Apartment No6 is 3-bedroom ap...,"96m2, 3BR, 2BA, Metro, WI-FI etc... Athens Fur...",br ba metro wifi etc furnish apart bedroom apa...,"[br, ba, metro, wifi, etc, furnish, apart, bed...","{10990: 0.66, 10995: 0.63, 10993: 0.43, 265509..."
1,10990,Athens Quality Apartments - Deluxe Apartment,Athens Quality Apartments - Deluxe apartment i...,Athens Quality Apartments - Deluxe Apartment A...,qualiti apart delux apart qualiti apart delux ...,"[qualiti, apart, delux, apart, qualiti, apart,...","{10995: 0.76, 10595: 0.66, 10993: 0.55, 265509..."
2,10993,Athens Quality Apartments - Studio,The Studio is an -excellent located -close t...,Athens Quality Apartments - Studio The Studio ...,qualiti apart studio studio excel close metro ...,"[qualiti, apart, studio, studio, excel, close,...","{10995: 0.57, 10990: 0.55, 10595: 0.43, 265509..."
3,10995,"47m2, close to metro,cable TV,wi-fi",AQA No2 is 1-bedroom apartment (47m2) -excell...,"47m2, close to metro,cable TV,wi-fi AQA No2 is...",close metroc tvwifi aqa bedroom apart excel cl...,"[close, metroc, tvwifi, aqa, bedroom, apart, e...","{10990: 0.76, 10595: 0.63, 10993: 0.57, 265509..."
4,27262,"54m2, 1-br, cable tv, wi-fi, metro",Big 1-bedroom apartment that can accommodate 4...,"54m2, 1-br, cable tv, wi-fi, metro Big 1-bedro...",br cabl tv wifi metro big bedroom apart accomm...,"[br, cabl, tv, wifi, metro, big, bedroom, apar...","{2655090: 0.25, 35401022: 0.13, 13820241: 0.12..."


Time spent: 0:1:3


#### Recommend function

Above, we stored the 100 most similar airbnbs for each listing. Our recommend function now only has to look up these precomputed results, so it runs in linear time.

In [13]:
from itertools import islice

def recommend(dataframe, item_id, number_of_recommendations):
    """
    recommend: prints the N most similar airbnbs of a listing

    arguments:
        dataframe                : pandas dataframe with the columns "id", "name",
                                   "description" and "dictionary" (the latter maps
                                   other ids to scores, best match first)
        item_id                  : int
        number_of_recommendations: int

    returns:
        None, the recommendations are printed
    """
    # get row by id
    row = dataframe.loc[dataframe["id"] == item_id]
    # check if id exists in dataframe
    if len(row.index) == 0:
        display(f"Id '{item_id}' not found")
        return
    # get dictionary
    dictionary = row.iloc[0]["dictionary"]
    # get N first recommendations
    recommendations = list(islice(dictionary.items(), number_of_recommendations))
    # fetch every recommended row with one scan of the dataframe instead of
    # one full scan per recommendation; the first row wins for a repeated id
    recommended_ids = [l_id for l_id, _ in recommendations]
    recommended_rows = (
        dataframe.loc[dataframe["id"].isin(recommended_ids)]
        .drop_duplicates(subset="id")
        .set_index("id")
    )
    print(f"Recommending {number_of_recommendations} listings similar to {item_id}")
    print("---------------------------------------------------------")
    # print all recommendations
    for l_id, score in recommendations:
        # a stored id may be absent (e.g. the dataframe was filtered after the
        # dictionaries were built): report it instead of raising IndexError
        if l_id not in recommended_rows.index:
            print(f"Id: {l_id} not found in dataframe\n")
            continue
        listing = recommended_rows.loc[l_id]
        print(f"Id: {l_id}")
        print(f"Recommended: {listing['name']}")
        print(f"Description: {listing['description']}")
        print(f"(score: {score})\n")

# test recommend() function with two (listing id, number of recommendations) pairs
for listing_id, how_many in ((27958734, 1), (27686070, 2)):
    recommend(listings, listing_id, how_many)

Recommending 1 listings similar to 27958734
---------------------------------------------------------
Id: 203174
Recommended: Athen downtown New Private Bedroom 42
Description: This is a room in apartment where I don't live :)) 1. The room: This bedroom has a double large bed, 2 nightstand, mirror, TV, studio, large closet, access to balcony,TV,heater,WI-FI access (high). Wash and dry (5 euro) 2 . Location: - buss station, metro station(Omonoia and Metaxourgeio), you are conected with all areas in Athens.  - Central train station ( Larisa ), is 10/15 min. walking destination :  From Larisa station you can also take a train to all the major cities of Greece (Thessaloniki) as well as some cities outside the country, such as Bulgaria, Turkey, you can use ( InterRail).  - grocery store, bakery, super-market, greek traditional restaurants, tavernas, fast-food,cafes, banks and pharmacies .  - Acropolis-20 min walking  - Thissio - 15 min walking  - Monastiraki (the ancient market), 10 min wal

#### 10 pairs of words that often appear together

In [10]:
import nltk
from nltk.metrics import BigramAssocMeasures
from nltk.collocations import BigramCollocationFinder

# one list of tokens per listing, read straight from the column instead of
# iterating rows and looking each one up again with .loc
documents = listings["name_and_description_clean_stems_lemmas_tokens"].tolist()
# concatenate all tokens (flat list kept under its original notebook name)
tokens = [token for document in documents for token in document]
# bigram measurement
bigram_measures = BigramAssocMeasures()
# build the finder per document so that the last word of one listing and the
# first word of the next listing are not counted as a bigram
finder = BigramCollocationFinder.from_documents(documents)
# find 10 words that often appear together
scored = finder.nbest(bigram_measures.likelihood_ratio, 10)
# display the 10 most frequent pairs of words
for index, (word_a, word_b) in enumerate(scored, start=1):
    display(f"{index}: {word_a} {word_b}")

'1: metro station'

'2: live room'

'3: walk distanc'

'4: doubl bed'

'5: minut walk'

'6: air condit'

'7: hidden airbnb'

'8: wash machin'

'9: equip kitchen'

'10: brand new'