## Dependencies

In [1]:
!pip install spacy
!python -m spacy download en_core_web_sm

[+] Download and installation successful
You can now load the model via spacy.load('en_core_web_sm')


In [109]:
import pandas as pd
import re
import matplotlib.pyplot as plt
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk

In [110]:

#https://towardsdatascience.com/named-entity-recognition-with-nltk-and-spacy-8c4a7d88e7da
#     pprint([(X.text, X.label_) for X in doc.ents])

def english(sent):
    try:
        return len(re.findall('[A-z]',sent))/len(sent) > 0.5
    except:
        return False 
def tokenize(comment,nlp):  
    doc = nlp(str(comment))
    return doc

def entity(comment):
    comment = [(X.text, X.label_) for X in comment.ents]
    return comment
def keyword(sent):
    #https://pythonprogramming.net/natural-language-toolkit-nltk-part-speech-tagging/
    out = [] 
    sent = sent.split()
    sent = nltk.pos_tag(sent)
    for word in sent:
        if word[0] in ['perfect','although','recommend','great','nice','near','close','old','noisy','convenient',]:
            out.append(word[0])
        try:
            word[1].index('JJ')
            out.append(word[0].lower())
        except:
            try:
                word[1].index('RB')
                out.append(word[0].lower())
            except:
                pass
    return out

def word(comment):
    return [ X for X in comment]

def stopword(sent,stop_list):
    sent = word_tokenize(sent)
    out = []
    for w in sent:
        if w not in stop_list and len(w) >2:
            out.append(str(w))
    return out
def clean(sent,lemma):
    out = []
    sent = nltk.pos_tag(sent)
    for word in sent:
        try:
            word[1].index('V')
            out.append(lemma.lemmatize(word[0].lower(),'v'))
        except:
            out.append(lemma.lemmatize(word[0].lower()))
    return " ".join(out)


In [112]:
review = pd.read_csv('./Data/reviews.csv',nrows=4000)

In [113]:
lemmatizer = WordNetLemmatizer()
stop_list = set(stopwords.words('english'))
nlp = en_core_web_sm.load()
print(review.shape)
print(review.columns)

review['eng'] = review['comments'].apply(english)
review['comments'] = review['comments'][review['eng'] == True].apply(stopword, args=(stop_list,))
review['comments'] = review['comments'][review['eng'] == True].apply(clean, args=(lemmatizer,))
review['comment_token'] = review['comments'][review['eng'] == True].apply(tokenize, args=(nlp,))
review['keyword'] = review['comments'][review['eng']== True].apply(keyword)
review['entities'] = review['comment_token'][review['eng'] == True].apply(entity)
# review['words'] = review['comment_token'][review['eng'] == True].apply(word)

(4000, 6)
Index(['listing_id', 'id', 'date', 'reviewer_id', 'reviewer_name', 'comments'], dtype='object')


In [38]:
review[review['keyword'].isna()].head(5)

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,eng,comment_token,keyword,entities,negCount
57,69074,121083577,2016-12-13,46336404,沉沉,,False,,,,0.0
61,69074,129739011,2017-02-01,105815180,Cecilia,,False,,,,0.0
70,69074,170223813,2017-07-15,110099295,Jichu,,False,,,,0.0
71,69074,171567632,2017-07-18,69885276,Yilia,,False,,,,0.0
72,69074,174454984,2017-07-26,110252333,刘昱钦,,False,,,,0.0


In [2]:
from gensim import models

w2v = models.KeyedVectors.load_word2vec_format('../GoogleNews.bin',binary=True)

array([ 0.04052734,  0.0625    , -0.01745605,  0.07861328,  0.03271484,
       -0.01263428,  0.00964355,  0.12353516, -0.02148438,  0.15234375,
       -0.05834961, -0.10644531,  0.02124023,  0.13574219, -0.13183594,
        0.17675781,  0.27148438,  0.13769531, -0.17382812, -0.14160156,
       -0.03076172,  0.19628906, -0.03295898,  0.125     ,  0.25390625,
        0.12695312, -0.15234375,  0.03198242,  0.01135254, -0.01361084,
       -0.12890625,  0.01019287,  0.23925781, -0.08447266,  0.140625  ,
        0.13085938, -0.04516602,  0.06494141,  0.02539062,  0.05615234,
        0.24609375, -0.20507812,  0.23632812, -0.00860596, -0.02294922,
        0.05078125,  0.10644531, -0.03564453,  0.08740234, -0.05712891,
        0.08496094,  0.23535156, -0.10107422, -0.03564453, -0.04736328,
        0.04736328, -0.14550781, -0.10986328,  0.14746094, -0.23242188,
       -0.07275391,  0.19628906, -0.37890625, -0.07226562,  0.04833984,
        0.11914062,  0.06103516, -0.12109375, -0.27929688,  0.05

In [151]:
def similarity(w1,w2):
    return sum(w1*w2) / sum(w1 *w2)**0.5/ sum(w2*w2)**0.5 

out = []
out2 = []
out3 = []
out4 = [] 
index =[]

bad = w2v['bad']
small = w2v['small']
dirty = w2v['dirty']
unfriend = w2v['unfriendly']
far = w2v['far']
inconven = w2v['inconvenient']
disappoint = w2v['disappointed']

good = w2v['good']
clean = w2v['clean']
friend = w2v['friendly']
close = (w2v['close']+w2v['near'])/2
conven = w2v['convenient']
recommend = w2v['recommend']
response = w2v['responsive']
comfortable = w2v['comfortable']

false_pos = ['quite','really','much','dirty','right','little','there','back','bad','enough','only','inconvenient','very','not','outside']
skip = ['quite','much','larger','clean', 'good', 'awesome', 'clean', 'large','big','happy', 'friendly','hospitable', 'friendliest','huge','close','convenient','recommend'] 
for i,words in review['keyword'][(review['eng'] == True)  & (review['keyword'].notna())].iteritems():
    negword = [] 
    posword = []
    index.append(i)
    for word in words:
        try:
            vec = w2v[word]
            if word in false_pos:
                pass
            elif word in ['cozy','modern','brilliant'] or similarity(good,vec) > 0.6 or similarity(clean,vec) > 0.6 or similarity(friend,vec) > 0.6 or similarity(close,vec) > 0.6 or similarity(conven,vec) > 0.6 or similarity(recommend,vec) > 0.6 or  similarity(response,vec) > 0.6 or  similarity(comfortable,vec) > 0.6:
                posword.append(word)
            if word in skip:
                continue
        except:
            continue
        if word in ['however','although','noisy','old'] or similarity(vec, bad) > 0.7 or similarity(vec,small) > 0.7 or similarity(vec, dirty) > 0.7 or similarity(vec,unfriend) > 0.7 or similarity(vec,far) > 0.7 or similarity(vec,inconven) > 0.7 or similarity(disappoint,vec) > 0.7:
            negword.append(word)
            count += 1
    posword = [word for word in posword if word not in negword]
    negword = [word for word in negword if word not in posword]
    
    out4.append(posword)
    out2.append(negword)
    out.append(len(negword))
    if ('perfect' in posword) or ('best' in posword): 
        out3.append(100)
    else:
        out3.append(len(posword))
out = pd.Series(out, index=index)
out2 = pd.Series(out2,index=index)
out3 = pd.Series(out3,index=index)
out4 = pd.Series(out4,index=index)

print(out.shape)
print(out2.shape)
print(out3.shape)

review['negCount'] = pd.Series([None] * review.shape[0])
review['negWord'] = pd.Series([None] * review.shape[0])
review['posCount'] =  pd.Series([None] * review.shape[0])
review['posWord'] =  pd.Series([None] * review.shape[0])

review['negCount'].iloc[index] = out
review['negWord'].iloc[index] = out2
review['posCount'].iloc[index] = out3
review['posWord'].iloc[index] = out4

review[['posCount','negWord','negCount','comments']][(review['eng'] == True) & (review['negCount'].notna())& (review['negCount'] > 0)].sample(10)

  


(3423,)
(3423,)
(3423,)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


Unnamed: 0,posCount,negWord,negCount,comments
171,6,"[old, old, smaller]",3,veronica hospitable enough wait late check-in ...
1387,12,[small],1,great location close busy soho quieter neighbo...
1702,9,[smallish],1,convenient smallish apartment fantastic locati...
1606,15,"[noisy, noisy, small]",3,wanchai definitely one convenient spot hong ko...
2874,8,"[old, old]",2,patrick kind helpful host always responses qui...
156,2,[smaller],1,great location walking distance many place dim...
2674,6,[small],1,overall great place stay hong kong want stay c...
186,100,[small],1,perfect location great price wonderful service...
1517,8,[small],1,great apartment experience host problem get bu...
425,4,"[smaller, sorry]",2,friend satisfied apartment though little small...


In [152]:
review['sentiment'] = ((review['negCount']*2 - review['posCount'] < 1) | (len(review['negWord']) == 0))
review[['sentiment','negWord','posWord','comments']][(review['sentiment'] == False) & (review['comments'].notna())].sample(5)

Unnamed: 0,sentiment,negWord,posWord,comments
412,False,"[old, old, far, old, old, dirty, small]","[allow, recommend, actually, nice, nice, unfor...",beware false advertising address brendan allow...
3340,False,"[smaller, small, however]","[great, great, big, comfortable, definitely]",family adult kid great stay apartment course a...
873,False,"[unpleasant, scary]",[],unpleasant experience while n't know exactly c...
3644,False,"[old, old, smaller, small]","[clean, unique]",the building old apartment clean cosy the room...
916,False,"[small, small, small]","[close, close]",the bedroom really really really small standar...


In [153]:
review.to_csv('./1013.csv')

In [100]:
similarity(w2v['comfortable'],w2v['cozy'])

0.5806707739438449

## Listing.csv

In [10]:
listing = pd.read_csv('./Data/listings.csv')
listing['neighbourhood'].value_counts()

Yau Tsim Mong        4764
Central & Western    2628
Wan Chai             2315
Islands               547
Kowloon City          465
Eastern               402
Sham Shui Po          261
Yuen Long             246
North                 232
Sai Kung              169
Sha Tin               123
Southern              113
Kwun Tong              68
Tsuen Wan              63
Tai Po                 55
Tuen Mun               54
Kwai Tsing             39
Wong Tai Sin           25
Name: neighbourhood, dtype: int64

In [11]:
listing.head(5)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,69074,Beautiful oasis of plants & art @ best location,160139,Amy,,Central & Western,22.28352,114.15018,Entire home/apt,1408,3,130,2019-05-02,1.27,1,159
1,101765,Lamma Island flat 2min walk from the beach!,532909,Michael,,Islands,22.20022,114.13461,Entire home/apt,430,2,11,2019-06-09,0.71,1,47
2,103760,Central Centre 5 min walk to/from Central MTR,304876,Brend,,Central & Western,22.28407,114.1557,Entire home/apt,853,2,260,2019-07-04,2.65,12,329
3,132773,Fabulous 2 Bdrm Aprt Open kitchen,304876,Brend,,Central & Western,22.28868,114.14494,Entire home/apt,1056,2,259,2019-07-03,2.66,12,308
4,133390,"Soho, Hong Kong 1 bedroom flat",654642,Robin,,Central & Western,22.28343,114.15539,Entire home/apt,939,2,27,2015-06-30,0.28,1,91


In [117]:
from sklearn.cluster import KMeans

def cluster_num(loc,km):
    '''
    k mean cluster the neighborhood based on geographical location
    '''
    return km.predict([loc])[0]


def build_kmean(cluster,X,fig):
    plt.figure(2,figsize=(12,10))
    km = KMeans(
        n_clusters=cluster, init='random',
        n_init=10, max_iter=300, 
        tol=1e-04, random_state=0
    )
    X = pd.DataFrame(X,columns=['x','y'])
    y_km = km.fit_predict(X)
    plt.figure(fig)
    for i in range(cluster):
        plt.scatter(
            X['x'][y_km == i], X['y'][y_km == i],
            s=50,
            marker='s', edgecolor='black',
            label='cluster '+ str(i)
        )
    return km
def neighbour(listing_id):
    try:
        tmp = listing['neighbourhood'][listing['id'] == listing_id].values.tolist()
        if len(tmp) == 1:
            return tmp[0]
        else:
            print(tmp)
            return "Nan"
    except:
        return "Nan"

def listing_loc(id_):
    '''
    append column on review
        -the location of the listing 
    '''
    return id2loc[id_]

In [119]:
global listing
listing = pd.read_csv('./Data/listings.csv')
print(listing.columns)
print(listing.shape)
listing.head(10)
global id2loc
id2loc = listing[['latitude','longitude','id']].values
id2loc = {z:[x,y] for x,y,z in id2loc }
X = pd.DataFrame([id2loc[x] for x in id2loc],columns=['x','y'])

Index(['id', 'name', 'host_id', 'host_name', 'neighbourhood_group',
       'neighbourhood', 'latitude', 'longitude', 'room_type', 'price',
       'minimum_nights', 'number_of_reviews', 'last_review',
       'reviews_per_month', 'calculated_host_listings_count',
       'availability_365'],
      dtype='object')
(12569, 16)


In [118]:
# find optimal number of k
distortions = []
for i in range(1, 11):
    km = KMeans(
        n_clusters=i, init='random',
        n_init=10, max_iter=300,
        tol=1e-04, random_state=0
    )
    km.fit(X)
    distortions.append(km.inertia_)

# plot distortion graph
plt.figure(99,figsize=(12,10))
plt.plot(range(1, 11), distortions, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Distortion')
cluster = 4

NameError: name 'X' is not defined

In [None]:
review['cluster'] = [-1] * review.shape[0]
print(review.columns)
for i,n in enumerate(['Yau Tsim Mong','Central & Western','Wan Chai']):
    X = review['loc'][review['neighbourhood'] == n ].values
    X = [[x[0],x[1]] for x in X]
    kmean = build_kmean(cluster,X,10+i)
    review['cluster'][review['neighbourhood'] == n ] = review['loc'][review['neighbourhood'] == n ].apply(cluster_num,args=(kmean,))
review[review['neighbourhood'] == 'Wan Chai'].sample(20)

In [12]:
len(review['neighbourhood'].unique())

18

### tf-idf (extract keyword of subcluster)

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
import json

def embedding(sent,transformer,cv):
    tf_idf_vector=transformer.transform(cv.transform([sent]))
    sorted_items=sort_coo(tf_idf_vector.tocoo())
    return sorted_items

def keyword(vector,feature_names):
    keywords=extract_topn_from_vector(feature_names,vector,5)
    return keywords

def sort_coo(coo_matrix):
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)

def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """get the feature names and tf-idf score of top n items"""
    #use only topn items from vector
    sorted_items = sorted_items[:topn]
    score_vals = []
    feature_vals = []
    # word index and corresponding tf-idf score
    for idx, score in sorted_items:
        #keep track of feature name and its corresponding score
        score_vals.append(round(score, 3))
        feature_vals.append(feature_names[idx])
    #create a tuples of feature,score
    results= {}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]]=score_vals[idx]
    return results

In [30]:
docs = review['keyword'][review['eng'] == True].values.tolist()
docs = [" ".join(x) for x in docs]
cv=CountVectorizer(max_df=0.85,stop_words=stop_list, max_features=10000)
word_count_vector=cv.fit_transform(docs)
print(list(cv.vocabulary_.keys())[:10])
feature_names=cv.get_feature_names()

tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
tfidf_transformer.fit(word_count_vector)
neighbor_keyword = {}
for p in list(range(cluster))+[-1]:
    for q in review['neighbourhood'].unique():
        comment = " ".join([str(x) for x in review['keyword'][(review['cluster'] == p) & (review['neighbourhood'] == q) & (review['eng'] ==True)].values.tolist()])
        neighbor_keyword[str(p)+"-"+str(q)] = keyword(embedding(comment,tfidf_transformer,cv),feature_names)

for key in neighbor_keyword:
    if len(neighbor_keyword[key]) > 0:
        print(key,neighbor_keyword[key])
listing_keyword = {}
for listing in review['listing_id'][review['eng'] == True].unique():
        comment = " ".join([str(x) for x in review['keyword'][(review['listing_id'] == listing) & (review['eng'] ==True)].values.tolist()])
        listing_keyword[str(listing)] = keyword(embedding(comment,tfidf_transformer,cv),feature_names)
with open("neighbor_keyword.json", 'w') as json_file:
    json.dump(neighbor_keyword, json_file)
with open("listing_keyword.json", 'w') as json_file:
    json.dump(listing_keyword, json_file)

review.sample(20)


['lovely', 'apartment', 'great', 'quickly', 'always', 'much', 'pleasant', 'apt', 'anonymous', 'also']
0-Central & Western {'great': 0.372, 'nice': 0.262, 'good': 0.244, 'clean': 0.229, 'flat': 0.221}
0-Yau Tsim Mong {'good': 0.33, 'great': 0.309, 'clean': 0.289, 'small': 0.246, 'nice': 0.216}
0-Wan Chai {'great': 0.341, 'good': 0.29, 'clean': 0.269, 'small': 0.206, 'really': 0.19}
1-Central & Western {'great': 0.408, 'good': 0.218, 'clean': 0.195, 'nice': 0.192, 'central': 0.188}
1-Yau Tsim Mong {'good': 0.334, 'great': 0.283, 'clean': 0.273, 'nice': 0.236, 'also': 0.222}
1-Wan Chai {'great': 0.365, 'good': 0.278, 'clean': 0.233, 'nice': 0.215, 'really': 0.198}
2-Central & Western {'great': 0.401, 'good': 0.23, 'clean': 0.219, 'nice': 0.218, 'well': 0.201}
2-Yau Tsim Mong {'good': 0.324, 'great': 0.299, 'clean': 0.286, 'small': 0.248, 'also': 0.214}
2-Wan Chai {'great': 0.347, 'nice': 0.239, 'good': 0.229, 'clean': 0.204, 'really': 0.204}
3-Central & Western {'great': 0.358, 'nice': 0.

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,eng,comment_token,keyword,entities,words,loc,neighbourhood,cluster
228006,25923064,367507978,2019-01-05,130822762,可可,,False,,,,,"[22.32262, 114.16795]",Yau Tsim Mong,1
177755,19830017,459925045,2019-05-28,72103688,Xinyi,Location great near Tsim Sha Tsui station The ...,True,"(Location, great, near, Tsim, Sha, Tsui, stati...","[great, small, clean, tidy, towel, really, fri...","[(Tsim Sha Tsui, FAC)]","[Location, great, near, Tsim, Sha, Tsui, stati...","[22.29658, 114.17338000000001]",Yau Tsim Mong,0
32314,3525683,31030467,2015-05-01,7604796,Emma,Wilson amaze host responsive question helpful ...,True,"(Wilson, amaze, host, responsive, question, he...","[responsive, flat, nice, nice, portable, wifi,...","[(Wilson, ORG), (Tin Hau, PERSON), (Hong Kong,...","[Wilson, amaze, host, responsive, question, he...","[22.28458, 114.1921]",Wan Chai,2
189597,21037375,263031820,2018-05-11,72267721,伟力,,False,,,,,"[22.31567, 114.17078000000001]",Yau Tsim Mong,3
243486,29264367,347809361,2018-11-12,223678528,Ming Hua,,False,,,,,"[22.296110000000002, 114.1726]",Yau Tsim Mong,0
9020,754511,3587912,2013-02-19,4989170,Harrison,Such great place The decor amenities cozy cute...,True,"(Such, great, place, The, decor, amenities, co...","[such, great, extremely, well, outfitted, show...","[(Bathroom, NORP)]","[Such, great, place, The, decor, amenities, co...","[22.27926, 114.19207]",Wan Chai,2
237287,27917900,428431624,2019-03-25,49833321,Reiko,,False,,,,,"[22.28472, 114.13668999999999]",Central & Western,3
104942,10913704,104273315,2016-09-25,47391151,Christy,Nice modern apartment amaze view Staff helpful...,True,"(Nice, modern, apartment, amaze, view, Staff, ...","[modern, amaze, view, friendly, stay]",[],"[Nice, modern, apartment, amaze, view, Staff, ...","[22.308770000000003, 114.18332]",Kowloon City,-1
98576,10187545,120980461,2016-12-12,36366054,Sergei,,False,,,,,"[22.31849, 114.17164]",Yau Tsim Mong,3
208079,23046977,261750425,2018-05-07,3248595,Kai,Fantastic location Great value Would recommend...,True,"(Fantastic, location, Great, value, Would, rec...",[fantastic],[],"[Fantastic, location, Great, value, Would, rec...","[22.28152, 114.15167]",Central & Western,1


## Check similarity of review sentiment for same listing

#### Note that tensorflow version must be 1.1

In [None]:
!pip uninstall tensorflow

In [None]:
!pip install tensorflow==1.1.0

In [None]:
import gensim.downloader as api 

def embed(sent, model):
    exception = 0 
    out = 0
    for i,word in enumerate(sent):
        if 1:
            if i == 0 : 
                out = model[word]
            else:
                out += model[word]
        else:
            print(word,"not in list")
            exception+=1
            pass
    
    return out/(len(sent)-exception)

model = api.load('word2vec-google-news-300')
review['embedding'] = review['cleaned_comment'].apply(embed, args=(model,))


In [None]:
from sklearn.cluster import KMeans, MiniBatchMeans

kmeans = MiniBatchMeans(n_clusters=6)
commentK6 = kmeans.fit_predict(review['embedding'])
review['cluster'] = pd.DataFrame(commentK6)

review['comment'][review['cluster']== 1].sample(20)

In [None]:
listing = pd.read_csv('./listings.csv')
print(listing.shape)
print(listing.head(5))

plt.figure(1)
plt.scatter(listing['availability_365'], listing['reviews_per_month'])

# abnormal in the sense that too many reviews compared to availability
abnormal = listing['id'][(listing['availability_365'] < 50) & (listing['reviews_per_month'] > 10)].tolist()
abnormal_host = listing['host_id'][(listing['availability_365'] < 50) & (listing['reviews_per_month'] > 10)].tolist()
print(abnormal, abnormal_host)

In [None]:
review['comments'][review['listing_id'].isin(abnormal)]

In [None]:
# abnormal in the sense that too less reviews compared to availability
abnormal = listing['id'][(listing['availability_365'] > 300) & (listing['reviews_per_month'] < 3)].tolist()
abnormal_host = listing['host_id'][(listing['availability_365'] > 300) & (listing['reviews_per_month'] < 3)].tolist()

review['comments'][review['listing_id'].isin(abnormal)].sample(20)

In [None]:
calendar = pd.read_csv('./calendar.csv')
calendar.head(10)