In [53]:
import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib as plt
from nltk.stem.porter import *
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import TreebankWordTokenizer
from sklearn import *

In [2]:
# Gzip-compressed CSV exports.
calendar, listings, reviews = (
    pd.read_csv(gz_path, compression='gzip')
    for gz_path in ('calendar.csv.gz', 'listings.csv.gz', 'reviews.csv.gz')
)

# Uncompressed CSV files.
listings_sum, neighborhoods = (
    pd.read_csv(csv_path)
    for csv_path in ('listings.csv', 'neighbourhoods.csv')
)

In [72]:
# Convert the raw price strings (presumably formatted like "$1,234.00") to floats.
# The dtype guard makes this cell idempotent: once `price` is numeric the `.str`
# accessor would raise, so a second run of the cell must be a no-op.
if not pd.api.types.is_numeric_dtype(listings['price']):
    listings['price'] = (
        listings['price']
        .str.lstrip('$')                    # drop the currency symbol only when it is present
        .str.replace(',', '', regex=False)  # drop thousands separators (literal match, not regex)
        .astype(float)
    )

In [61]:
# Number of columns in the `listings.csv` frame.
listings_sum.shape[1]

16

In [76]:
listings.groupby('room_type')['price'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
room_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Entire home/apt,6913.0,264.074063,368.88942,25.0,114.0,170.0,294.0,10000.0
Hotel room,24.0,133.25,84.020831,0.0,77.0,96.5,240.0,250.0
Private room,1676.0,114.02506,397.678764,10.0,50.0,69.0,99.0,9857.0
Shared room,75.0,111.933333,197.185763,20.0,35.0,40.0,75.0,1000.0


In [92]:
neighborhoods

Unnamed: 0,neighbourhood_group,neighbourhood
0,,Allied Gardens
1,,Alta Vista
2,,Amphitheater And Water Park
3,,Balboa Park
4,,Bario Logan
...,...,...
103,,Village Center
104,,Webster
105,,West University Heights
106,,Wooded Area


In [93]:
reviews

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,29967,62788,2010-07-09,151260,Debbie,When I booked our stay in San Diego at Dennis ...
1,29967,64568,2010-07-14,141552,Eric,This was my first experience with using airbnb...
2,29967,67502,2010-07-22,141591,David,We found the house to be very accommodating--e...
3,29967,70466,2010-07-29,125982,Anders,As advertised and more. Dennis was very helpfu...
4,29967,74876,2010-08-07,29835,Miyoko,We had a great time in San Diego. Denis' house...
...,...,...,...,...,...,...
418381,45879403,700933618,2020-10-16,356204795,Shanna,Jeffrey and his girlfriend were outstanding ho...
418382,45879403,701234991,2020-10-17,57040410,Liz,This place was perfect for me! It was really s...
418383,45879403,701918134,2020-10-18,310817121,Melissa,This place was beautiful! Didn’t have a doubt....
418384,45879403,702188172,2020-10-19,46954062,Steven,Highly recommend this place for anyone wanting...


In [96]:
neighborhoods_gpd = gpd.read_file('neighbourhoods.geojson')

In [97]:
neighborhoods_gpd

Unnamed: 0,neighbourhood,neighbourhood_group,geometry
0,Amphitheater And Water Park,,"MULTIPOLYGON (((-117.01452 32.58790, -117.0144..."
1,Bella Lago,,"MULTIPOLYGON (((-116.94128 32.68431, -116.9412..."
2,Bonita Long Canyon,,"MULTIPOLYGON (((-117.01149 32.66352, -117.0114..."
3,East Lake,,"MULTIPOLYGON (((-116.99302 32.66040, -116.9816..."
4,Eastlake Trails,,"MULTIPOLYGON (((-116.95512 32.64124, -116.9550..."
...,...,...,...
103,University City,,"MULTIPOLYGON (((-117.21464 32.87919, -117.2139..."
104,Valencia Park,,"MULTIPOLYGON (((-117.08555 32.70113, -117.0854..."
105,Webster,,"MULTIPOLYGON (((-117.11058 32.71833, -117.1102..."
106,West University Heights,,"MULTIPOLYGON (((-117.16408 32.76053, -117.1636..."


In [114]:
listings['host_about'][6]

'71 year old gay white Australian male who lives with a member of his chosen family. I train in the area of cultural diversity, and administer stipend programs which recruit graduating masters level students in couples and family therapy for the public mental health system in CA.'

In [120]:
listings.columns

Index(['id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'description',
       'neighborhood_overview', 'picture_url', 'host_id', 'host_url',
       'host_name', 'host_since', 'host_location', 'host_about',
       'host_response_time', 'host_response_rate', 'host_acceptance_rate',
       'host_is_superhost', 'host_thumbnail_url', 'host_picture_url',
       'host_neighbourhood', 'host_listings_count',
       'host_total_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified', 'neighbourhood',
       'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude',
       'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms',
       'bathrooms_text', 'bedrooms', 'beds', 'amenities', 'price',
       'minimum_nights', 'maximum_nights', 'minimum_minimum_nights',
       'maximum_minimum_nights', 'minimum_maximum_nights',
       'maximum_maximum_nights', 'minimum_nights_avg_ntm',
       'maximum_nights_avg_ntm', 'calendar_upd

In [129]:
# Element 0 of every `bathrooms_text` value via the `.str` accessor
# (the first character, assuming the column holds strings).
# NOTE(review): the saved output under this cell shows a single full string
# ("Private half-bath" at index 791), which this expression cannot produce —
# the output looks stale from an earlier version of the cell; re-run to confirm.
listings['bathrooms_text'].str[0]

791    Private half-bath
Name: bathrooms_text, dtype: object

In [13]:
# Count of distinct review ids (NaN, if any, counts as one value, matching `unique()`).
reviews['id'].nunique(dropna=False)

418386

In [31]:
# Per-reviewer count of rows with a non-null `listing_id`, largest first.
check = (
    reviews
    .groupby('reviewer_id')['listing_id']
    .count()
    .sort_values(ascending=False)
)
# (earlier exploration, kept for reference) check[:10000].hist(bins=67)

# Reviewer ids whose count is greater than one.
repeat_users = check.loc[check.gt(1)].index

In [50]:
# How often each distinct comment text occurs among reviews whose reviewer is in `repeat_users`.
reviews.loc[reviews['reviewer_id'].isin(repeat_users), 'comments'].value_counts()

Great place                                                                                                                                                                                                                                                                                                                                        122
Great place!                                                                                                                                                                                                                                                                                                                                       100
Nice place                                                                                                                                                                                                                                                                                                                

#### Reference Code

In [54]:
# Tokenizer whose `tokenize` method replaces the vectorizer's default tokenization.
tokenizer = TreebankWordTokenizer()

# Configure the vectorizer in a single constructor call.
tfidf_vectorizer = TfidfVectorizer(
    tokenizer=tokenizer.tokenize,
    stop_words='english',   # remove English stop words
    ngram_range=(1, 2),     # include 1-grams and 2-grams
    max_df=0.5,             # ignore terms that appear in more than 50% of the documents
    min_df=2,               # only keep terms that appear in at least 2 documents
)

# fit tokenizer
#tfidf_vectorizer.fit(category_train)

# Echo the configured vectorizer as the cell output.
tfidf_vectorizer

TfidfVectorizer(max_df=0.5, min_df=2, ngram_range=(1, 2), stop_words='english',
                tokenizer=<bound method TreebankWordTokenizer.tokenize of <nltk.tokenize.treebank.TreebankWordTokenizer object at 0x7f83a43190a0>>)

In [None]:
# tfidf_encode = tfidf_vectorizer.transform(category_train)
# tfidf_model = linear_model.LogisticRegression(C=10, fit_intercept=False, max_iter=1000)
# tfidf_model.fit(tfidf_encode, y)
# tfidf_preds = tfidf_model.predict(tfidf_encode)
# tfidf_acc = np.mean(np.array(tfidf_preds) == np.array(y))
# tfidf_acc

In [51]:
# Using cosine similarity to find similarity of reviews
def cosine(list_1, list_2):
    """Return the cosine similarity of two equal-length 1-D numeric vectors.

    Parameters
    ----------
    list_1, list_2 : array-like of numbers
        The two vectors to compare. They must have the same length.

    Returns
    -------
    float
        ``dot(a, b) / (||a|| * ||b||)``, or ``0.0`` when either vector has
        zero norm (this includes empty input), so the division is never by zero.

    Raises
    ------
    ValueError
        Propagated from ``np.dot`` when the two vectors differ in length.
    """
    # Cast to float up front: integer inputs would otherwise be squared and
    # multiplied in fixed-width ints, which overflow silently for large values.
    a1 = np.asarray(list_1, dtype=float)
    a2 = np.asarray(list_2, dtype=float)

    # Vectorised dot product and Euclidean norms (no Python-level iteration).
    numer = np.dot(a1, a2)
    denom = np.linalg.norm(a1) * np.linalg.norm(a2)

    if denom == 0:
        return 0.0

    return numer / denom

In [52]:
def find_most_similar(selected, all_docs, wordsList, wordsIds):
    """Return the document in ``all_docs`` most similar to ``selected``.

    Each document is turned into a TF-IDF vector by the notebook's helper
    functions and compared to ``selected`` with ``cosine``.

    Parameters
    ----------
    selected
        The reference document. One occurrence of it is excluded from the
        candidates if it is present in ``all_docs``.
    all_docs
        Iterable of documents. It is passed to ``get_idf`` as-is and is not mutated.
    wordsList, wordsIds
        Forwarded unchanged to the TF-IDF helpers
        (presumably the vocabulary and its word-to-index mapping — TODO confirm).

    Returns
    -------
    The candidate with the highest strictly positive cosine similarity, or
    ``None`` when no candidate scores above 0. On ties the earliest candidate wins.
    """
    best_similarity = 0
    most_similar = None

    # Get all idf values
    all_idfs = get_idf(all_docs, wordsIds)

    # Get selected tf-idf values
    # NOTE(review): the selected document is encoded with `tf_idf` while the
    # candidates below use `tf_idf_encode`; neither helper is visible here —
    # confirm both exist and produce comparable vectors.
    tf_idf_selected = tf_idf(selected, wordsList, wordsIds, all_idfs)

    # Candidates: a private list copy (works for any iterable) without the
    # selected document itself. Only the first occurrence is dropped.
    rest_of_docs = list(all_docs)
    try:
        rest_of_docs.remove(selected)
    except ValueError:
        # `selected` is not part of `all_docs`; compare against every document.
        pass

    # Iterate through each review, get tf-idf values, and compute similarity
    for review in rest_of_docs:
        tf_idf_compare = tf_idf_encode(review, wordsList, wordsIds, all_idfs)
        similarity = cosine(tf_idf_selected, tf_idf_compare)

        # If this similarity is the highest make it the best similarity so far, and save the review
        if similarity > best_similarity:
            best_similarity = similarity
            most_similar = review

    # Return review with highest similarity value
    return most_similar