In [1]:
import numpy as np
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import tokenize
sid = SentimentIntensityAnalyzer()
from nltk.corpus import stopwords   
from nltk import wordpunct_tokenize
from langdetect import detect

In [12]:
#reviews = pd.read_csv("../data/reviews.csv",encoding='UTF-8')
# Load the cleaned reviews CSV into a DataFrame (the raw-file load above is kept disabled).
clean_reviews = pd.read_csv("../data/cleanedReviews_final.csv",encoding='UTF-8')

In [13]:
# `.size` is the total number of cells (rows x columns), not the number of reviews;
# use len(clean_reviews) for the row count.
clean_reviews.size

2160829

## Data preparation for sentiment analysis

We use **langdetect**, a port of Google's language-detection library, to detect the language of each review.

**Note:** the code in the cell below is commented out because the cleaning has already been done and the result exported as a CSV.

In [14]:
#clean_reviews.comments.apply(lambda x :detect(x) == 'en')

As we have around *200,000* records in our data set, we divided the data into partitions to run in batches.

In [15]:
## Sentiment analysis on batches.

## For performance reasons, we split the reviews data frame into ten
## consecutive subsets to perform the sentiment analysis.

N_PARTITIONS = 10

# Nominal number of rows per partition. Rounding can over- or under-shoot the
# real total, so `adj` corrects the end of the last slice to land exactly on
# len(clean_reviews).
partitions = int(round(len(clean_reviews) * 1.0 / N_PARTITIONS, 0))
adj = int(len(clean_reviews) - partitions * N_PARTITIONS)

# Slice boundaries: 0, p, 2p, ..., 9p, followed by the adjusted end of the frame.
partition_bounds = [partitions * i for i in range(N_PARTITIONS)]
partition_bounds.append(partitions * N_PARTITIONS + adj)

(df_reviews_1, df_reviews_2, df_reviews_3, df_reviews_4, df_reviews_5,
 df_reviews_6, df_reviews_7, df_reviews_8, df_reviews_9, df_reviews_10) = [
    clean_reviews[start:stop]
    for start, stop in zip(partition_bounds[:-1], partition_bounds[1:])
]

# Check the splits: together the partitions must contain every row exactly once.
# (The previous head()/tail() calls were discarded because only the last
# expression of a cell is displayed.)
assert sum(
    len(part)
    for part in (df_reviews_1, df_reviews_2, df_reviews_3, df_reviews_4, df_reviews_5,
                 df_reviews_6, df_reviews_7, df_reviews_8, df_reviews_9, df_reviews_10)
) == len(clean_reviews), "partitions do not cover clean_reviews"

len(clean_reviews)

196439

In [6]:
# Row count of the first partition.
len(df_reviews_1)

19933

In [None]:
#notnull = df_reviews_1.comments.apply(removeNull)
#df_reviews_1 = df_reviews_1[notnull]

In [6]:
def removeNull(r):
    """Tell whether ``r`` is a non-missing value (i.e. not None / NaN / NaT).

    Works element-wise when ``r`` is array-like, like ``pandas.notna``.
    """
    is_present = pd.notna(r)
    return is_present

In [7]:
# The ten review partitions, in order, gathered so they can be processed as a batch.
partition_frames = [
    df_reviews_1,
    df_reviews_2,
    df_reviews_3,
    df_reviews_4,
    df_reviews_5,
    df_reviews_6,
    df_reviews_7,
    df_reviews_8,
    df_reviews_9,
    df_reviews_10,
]

In [62]:
# Drop reviews whose comment text is missing from every partition.
# The list is rebuilt on purpose: assigning to the loop variable (`p = p[notnull]`)
# only rebinds a local name and leaves `partition_frames` unfiltered.
# Note that the individual df_reviews_N variables still refer to the unfiltered frames.
partition_frames = [p[p.comments.notnull()] for p in partition_frames]

In [16]:
# Stack the partitions back into a single DataFrame (row-wise concatenation).
final = pd.concat(partition_frames)

In [30]:
# Preview the first rows of the recombined frame.
final.head()

Unnamed: 0.1,Unnamed: 0,comments,compound,date,id,listing_id,neg,neu,pos,reviewer_id,reviewer_name
0,68694,Zyka always responded to my queries quickly an...,,7/25/2016,88691663,7540480,,,,38846847,Kate
1,113764,"Zvi's place was lovely. It was clean, newly re...",,5/11/2018,263247884,15016872,,,,37623753,Kimberly
2,113777,Zvi’s place was very clean and comfortable. He...,,8/12/2018,306466916,15016872,,,,61420142,Clare
3,113778,Zvi’s house is really nice. You will have tota...,,8/17/2018,309020118,15016872,,,,108981353,Xuan
4,113771,Zvi´s place is perfect. The description is exa...,,6/23/2018,280592843,15016872,,,,72528712,Diego


In [253]:
# NOTE(review): this writes to the same path `clean_reviews` is loaded from, and
# `to_csv` includes the index by default, so every save/load round trip adds an
# extra "Unnamed: 0"-style column. Consider `index=False` (and then adjust the
# later cell that deletes 'Unnamed: 0').
clean_reviews.to_csv('../data/cleanedReviews_final.csv')

In [31]:
# Reload the cleaned reviews from disk; the sentiment scores are attached to this frame.
clean_reviews_final = pd.read_csv("../data/cleanedReviews_final.csv",encoding='UTF-8')

We use NLTK to calculate the sentiment of each review: the polarity scores of each review are computed with the VADER sentiment analyzer.



The VADER sentiment analyzer handles stopword removal and special-character detection, so we do not need to do them explicitly again.

In [20]:
def get_sentiment(comments):
    """Compute the average per-sentence VADER polarity scores of each comment.

    Each comment is split into sentences, every sentence is scored with the
    module-level ``sid`` analyzer, and the four scores are averaged over the
    sentences of that comment.

    Parameters
    ----------
    comments : iterable
        Review texts. Entries that are not strings (e.g. NaN / None for a
        missing comment) or that contain no sentence (empty or whitespace-only
        text) cannot be scored.

    Returns
    -------
    list of dict
        One dict per input comment, in input order, with the keys ``'pos'``,
        ``'neu'``, ``'neg'`` and ``'compound'``. Comments that cannot be scored
        get NaN for every key, so they keep their position (and index
        alignment) without skewing later averages.
    """
    score_keys = ('pos', 'neu', 'neg', 'compound')
    row = []
    for comment in comments:
        # Missing values arrive as float NaN / None; the tokenizer only accepts text.
        sentences = tokenize.sent_tokenize(comment) if isinstance(comment, str) else []
        if not sentences:
            # Nothing to average: avoid dividing by zero and mark the row as unscored.
            row.append(dict.fromkeys(score_keys, float('nan')))
            continue

        totals = dict.fromkeys(score_keys, 0.0)
        for sentence in sentences:
            polarity = sid.polarity_scores(sentence)
            for key in score_keys:
                totals[key] += polarity[key]

        row.append({key: totals[key] / len(sentences) for key in score_keys})
    return row

In [21]:
# Score every review comment; the result is a list with one dict of averaged scores per comment.
rows = get_sentiment(clean_reviews_final.comments)

In [22]:
# Convert the list of per-review score dicts into a DataFrame (one column per score).
# Note: this rebinds `rows` from a list to a DataFrame.
rows = pd.DataFrame.from_dict(rows)

In [29]:
# Copy each sentiment score column onto the reviews frame (aligned on the row index).
for score_col in ('neg', 'neu', 'pos', 'compound'):
    clean_reviews_final[score_col] = rows[score_col]

clean_reviews_final.head()

Unnamed: 0,comments,compound,date,id,listing_id,neg,neu,pos,reviewer_id,reviewer_name
0,Zyka always responded to my queries quickly an...,0.67435,7/25/2016,88691663,7540480,0.0,0.6895,0.3105,38846847,Kate
1,"Zvi's place was lovely. It was clean, newly re...",0.4823,5/11/2018,263247884,15016872,0.0,0.63175,0.36825,37623753,Kimberly
2,Zvi’s place was very clean and comfortable. He...,0.654467,8/12/2018,306466916,15016872,0.0,0.418,0.582,61420142,Clare
3,Zvi’s house is really nice. You will have tota...,0.397933,8/17/2018,309020118,15016872,0.0,0.729667,0.270333,108981353,Xuan
4,Zvi´s place is perfect. The description is exa...,0.4706,6/23/2018,280592843,15016872,0.0,0.634,0.366,72528712,Diego


In [None]:
# Drop the index column left over from the CSV round trip. Guarded so the cell
# can be re-run (or run on a file saved without an index) without a KeyError.
if 'Unnamed: 0' in clean_reviews_final.columns:
    del clean_reviews_final['Unnamed: 0']

In [32]:
# Average every numeric column per listing. `numeric_only=True` is needed on
# pandas >= 2.0, where averaging the text columns (comments, date, reviewer_name)
# raises a TypeError instead of silently dropping them; on older versions it
# matches the previous default behaviour.
grouped_listings = clean_reviews_final.groupby('listing_id').mean(numeric_only=True)

In [33]:
# Display the per-listing averages (indexed by listing_id).
grouped_listings

Unnamed: 0_level_0,Unnamed: 0,compound,id,neg,neu,pos,reviewer_id
listing_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
3781,116341.076923,0.470737,9.419685e+07,0.014583,0.630566,0.329197,4.390298e+07
5506,88864.734177,0.478391,1.471820e+08,0.009203,0.635004,0.353260,5.267577e+07
6695,81550.455696,0.427414,1.154375e+08,0.013668,0.687838,0.292584,3.835746e+07
6976,86678.876712,0.489799,1.125462e+08,0.015034,0.635784,0.343475,3.321490e+07
8789,100002.600000,0.502091,1.343154e+08,0.006148,0.688655,0.305197,4.251248e+07
8792,121475.500000,0.401463,6.520611e+07,0.007910,0.705026,0.287063,2.129296e+07
9765,79050.555556,0.460258,1.927018e+07,0.000000,0.655230,0.344770,1.762882e+07
9824,96766.826087,0.467256,2.832557e+07,0.023045,0.622323,0.354643,1.024187e+07
9827,64744.875000,0.544579,3.485685e+06,0.002893,0.598286,0.398839,3.077992e+06
9855,101582.666667,0.692861,9.453433e+04,0.000000,0.513333,0.486667,2.051490e+05


In [38]:
# Read the listings table and expose its `id` column as `listing_id`, so it
# shares a key name with the per-listing review scores.
listings = (
    pd.read_csv("../data/listings.csv")
    .rename(columns={'id': 'listing_id'})
)
listings.head()

Unnamed: 0,listing_id,listing_url,scrape_id,last_scraped,name,summary,space,description,experiences_offered,neighborhood_overview,...,instant_bookable,is_business_travel_ready,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,3781,https://www.airbnb.com/rooms/3781,20190209175027,2019-02-09,HARBORSIDE-Walk to subway,Fully separate apartment in a two apartment bu...,This is a totally separate apartment located o...,Fully separate apartment in a two apartment bu...,none,"Mostly quiet ( no loud music, no crowed sidewa...",...,f,f,super_strict_30,f,f,1,1,0,0,0.32
1,5506,https://www.airbnb.com/rooms/5506,20190209175027,2019-02-09,**$79 Special ** Private! Minutes to center!,This is a private guest room with private bath...,**THE BEST Value in BOSTON!!*** PRIVATE GUEST ...,This is a private guest room with private bath...,none,"Peacful, Architecturally interesting, historic...",...,t,f,strict_14_with_grace_period,f,f,6,6,0,0,0.66
2,6695,https://www.airbnb.com/rooms/6695,20190209175027,2019-02-09,$99 Special!! Home Away! Condo,,** WELCOME *** FULL PRIVATE APARTMENT In a His...,** WELCOME *** FULL PRIVATE APARTMENT In a His...,none,"Peaceful, Architecturally interesting, histori...",...,t,f,strict_14_with_grace_period,f,f,6,6,0,0,0.73
3,6976,https://www.airbnb.com/rooms/6976,20190209175027,2019-02-09,Mexican Folk Art Haven in Boston Residential Area,Come stay with me in Boston's Roslindale neigh...,"This is a well-maintained, two-family house bu...",Come stay with me in Boston's Roslindale neigh...,none,The LOCATION: Roslindale is a safe and diverse...,...,f,f,moderate,t,f,1,0,1,0,0.64
4,8789,https://www.airbnb.com/rooms/8789,20190209175027,2019-02-09,Curved Glass Studio/1bd facing Park,"Bright, 1 bed with curved glass windows facing...",Fully Furnished studio with enclosed bedroom. ...,"Bright, 1 bed with curved glass windows facing...",none,Beacon Hill is a historic neighborhood filled ...,...,f,f,strict_14_with_grace_period,f,f,10,10,0,0,0.4


We merge the two data frames and export the result to `CSV`, which is used to show the listings on a map in the web browser.

In [39]:
# Join the per-listing sentiment averages with the listing metadata on `listing_id`
# (default inner join: only listings present in both frames are kept).
f = grouped_listings.merge(listings,on='listing_id')

In [40]:
# Keep only the columns needed downstream: the listing key, the four sentiment
# scores, and the listing's name, coordinates, rating and neighbourhood.
required_cols = [
    'listing_id',
    'pos', 'neu', 'neg', 'compound',
    'name',
    'latitude', 'longitude',
    'review_scores_rating',
    'neighbourhood_cleansed',
]
f = f[required_cols]
f.head()

Unnamed: 0,listing_id,pos,neu,neg,compound,name,latitude,longitude,review_scores_rating,neighbourhood_cleansed
0,3781,0.329197,0.630566,0.014583,0.470737,HARBORSIDE-Walk to subway,42.365241,-71.029361,99.0,East Boston
1,5506,0.35326,0.635004,0.009203,0.478391,**$79 Special ** Private! Minutes to center!,42.329809,-71.095595,95.0,Roxbury
2,6695,0.292584,0.687838,0.013668,0.427414,$99 Special!! Home Away! Condo,42.329941,-71.093505,97.0,Roxbury
3,6976,0.343475,0.635784,0.015034,0.489799,Mexican Folk Art Haven in Boston Residential Area,42.292438,-71.135765,98.0,Roslindale
4,8789,0.305197,0.688655,0.006148,0.502091,Curved Glass Studio/1bd facing Park,42.359187,-71.062651,92.0,Downtown


We took 5 places of interest around Boston, and for each one we suggest the *top 5* listings with the most **positive reviews** that are in the vicinity of the selected place of interest.

We used geopy to calculate the distance from the listings to places of interest
    

In [41]:
# `vincenty` was deprecated in geopy 1.13 and removed in geopy 2.0 in favour of
# `geodesic`. Prefer `geodesic`, fall back to `vincenty` on very old geopy, and
# keep the `vincenty` name bound so the distance cells below work on any version.
try:
    from geopy.distance import geodesic
except ImportError:  # geopy < 1.13 only provides vincenty
    from geopy.distance import vincenty as geodesic
vincenty = geodesic

In [42]:
# Latitude / longitude of the five places of interest, keyed by the same id in both dicts.
# As used by the distance columns below:
#   1 = airport, 2 = tophub, 3 = prudential, 4 = royale, 5 = harvard
lat_dict = {1: 42.3656, 2: 42.346516, 3: 42.347143, 4: 42.34992,5:42.380327}
lon_dict = {1: -71.0096, 2: -71.08385, 3: -71.082518,4: -71.065556,5:-71.13897}

In [43]:
# Add one `distance_<place>` column (in miles) per place of interest, measuring
# from each listing's coordinates to the place's entry in lat_dict / lon_dict.
for poi_key, poi_name in ((1, 'airport'), (2, 'tophub'), (3, 'prudential'),
                          (4, 'royale'), (5, 'harvard')):
    poi_coords = (lat_dict[poi_key], lon_dict[poi_key])
    f['distance_' + poi_name] = f.apply(
        lambda x: vincenty((x['latitude'], x['longitude']), poi_coords).miles,
        axis=1,
    )

  """Entry point for launching an IPython kernel.
  
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  """


As we are concentrating on reviews with positive sentiment, we sort the dataframe on the `pos` column.

In [45]:
# Order listings from highest to lowest average positive score.
sorted_df = f.sort_values(by=['pos'], ascending=False)

In [46]:
def getPositiveListings(df, radius_miles=2, top_n=5):
    """Return the first ``top_n`` listing ids near each place of interest.

    The function does not inspect sentiment itself: it preserves the row order
    of ``df``, so pass a frame already sorted by the positive score (highest
    first) to obtain the most positively reviewed nearby listings.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``listing_id`` and the columns ``distance_airport``,
        ``distance_tophub``, ``distance_prudential``, ``distance_royale`` and
        ``distance_harvard``.
    radius_miles : float, default 2
        A listing is kept for a place when its distance column is strictly
        smaller than this value (the notebook stores the distances in miles).
    top_n : int, default 5
        Maximum number of listing ids returned per place.

    Returns
    -------
    tuple of list
        ``(airport, tophub, prudential, royale, harvard)``; each list holds at
        most ``top_n`` listing ids in the row order of ``df``.
    """
    distance_columns = (
        'distance_airport',
        'distance_tophub',
        'distance_prudential',
        'distance_royale',
        'distance_harvard',
    )
    if df.empty:
        # Nothing to select; also covers a frame that has no columns at all.
        return tuple([] for _ in distance_columns)

    # Vectorised filter instead of iterrows(): no full row-by-row scan, and the
    # ids keep their column dtype rather than being coerced per row.
    return tuple(
        df.loc[df[column] < radius_miles, 'listing_id'].head(top_n).tolist()
        for column in distance_columns
    )

In [47]:
# Unpack in the function's return order:
# a = airport, b = tophub, c = prudential, d = royale, e = harvard.
a,b,c,d,e = getPositiveListings(sorted_df)

In [52]:
# Report the selected listing ids for each place of interest.
for place_label, listing_ids in (
    ("Logan Airport", a),
    ("Harvard", e),
    ("Prudential", c),
    ("Royale Club", d),
    ("Top of the Hub", b),
):
    print("listings near from " + place_label + " with postive sentiment : ", listing_ids)

listings near from Logan Airport with postive sentiment :  [16179346, 31295996, 29728508, 29585373, 29680644]
listings near from Harvard with postive sentiment :  [30505480, 26450789, 29929936, 21505991, 30320050]
listings near from Prudential with postive sentiment :  [22162098, 26362139, 28492660, 31261651, 29053226]
listings near from Royale Club with postive sentiment :  [26362139, 28492660, 31261651, 29053226, 14299184]
listings near from Top of the Hub with postive sentiment :  [22162098, 26362139, 28492660, 31261651, 29053226]


In [54]:
def getNegativeListings(df, radius_miles=2, top_n=5):
    """Return the first ``top_n`` listing ids near each place of interest.

    The function does not inspect sentiment itself: it preserves the row order
    of ``df``, so pass a frame already sorted by the negative score (highest
    first) to obtain the most negatively reviewed nearby listings.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``listing_id`` and the columns ``distance_airport``,
        ``distance_tophub``, ``distance_prudential``, ``distance_royale`` and
        ``distance_harvard``.
    radius_miles : float, default 2
        A listing is kept for a place when its distance column is strictly
        smaller than this value (the notebook stores the distances in miles).
    top_n : int, default 5
        Maximum number of listing ids returned per place.

    Returns
    -------
    tuple of list
        ``(airport, tophub, prudential, royale, harvard)``; each list holds at
        most ``top_n`` listing ids in the row order of ``df``.
    """
    distance_columns = (
        'distance_airport',
        'distance_tophub',
        'distance_prudential',
        'distance_royale',
        'distance_harvard',
    )
    if df.empty:
        # Nothing to select; also covers a frame that has no columns at all.
        return tuple([] for _ in distance_columns)

    # Vectorised filter instead of iterrows(): no full row-by-row scan, and the
    # ids keep their column dtype rather than being coerced per row.
    return tuple(
        df.loc[df[column] < radius_miles, 'listing_id'].head(top_n).tolist()
        for column in distance_columns
    )

In [56]:
# Rank listings by average negative score (highest first), then pick the first
# nearby ones per place. Return order: airport, tophub, prudential, royale, harvard.
by_negative = f.sort_values(by=['neg'], ascending=False)
a1, b1, c1, d1, e1 = getNegativeListings(by_negative)

for place_label, listing_ids in (
    ("Logan Airport", a1),
    ("Harvard", e1),
    ("Prudential", c1),
    ("Royale Club", d1),
    ("Top of the Hub", b1),
):
    print("listings near from " + place_label + " with negative sentiment : ", listing_ids)

listings near from Logan Airport with negative sentiment :  [17607186, 28546674, 11625336, 10098059, 25516679]
listings near from Harvard with negative sentiment :  [30320050, 21679735, 8034092, 13719227, 7527677]
listings near from Prudential with negative sentiment :  [30389976, 27609912, 28492641, 54487, 19173890]
listings near from Royale Club with negative sentiment :  [30389976, 27609912, 28492641, 17607186, 54487]
listings near from Top of the Hub with negative sentiment :  [30389976, 27609912, 28492641, 54487, 19173890]


In [57]:
# Export the listings (sorted by positive score, with sentiment and distance columns) as CSV.
# NOTE(review): this path is one level above the `../data/` folder used for the other files — confirm it is intended.
sorted_df.to_csv('../sentiment_data.csv')

In [58]:
# Serialise the same frame as a JSON array with one object per listing.
json_export = sorted_df.to_json(orient='records')
# NOTE(review): displaying the whole string dumps every listing into the output;
# consider previewing a slice (e.g. json_export[:500]) instead.
json_export

'[{"listing_id":22162098,"pos":1.0,"neu":0.0,"neg":0.0,"compound":0.6588,"name":"Beautiful 2 Bedroom in The heart of Boston!","latitude":42.3322050666,"longitude":-71.1128114605,"review_scores_rating":80.0,"neighbourhood_cleansed":"Mission Hill","distance_airport":5.765145509,"distance_tophub":1.781861021,"distance_prudential":1.8626137083,"distance_royale":2.7111107116,"distance_harvard":3.5812656612},{"listing_id":26362139,"pos":1.0,"neu":0.0,"neg":0.0,"compound":0.4404,"name":"Two Bedroom in Boston\'s Back Bay #201","latitude":42.3467607488,"longitude":-71.0797391682,"review_scores_rating":100.0,"neighbourhood_cleansed":"Back Bay","distance_airport":3.8188335262,"distance_tophub":0.2111544246,"distance_prudential":0.1447031121,"distance_royale":0.7582021855,"distance_harvard":3.8157221988},{"listing_id":28492660,"pos":1.0,"neu":0.0,"neg":0.0,"compound":0.6588,"name":"Classic 2BR in South End by Sonder","latitude":42.3429609231,"longitude":-71.0653139724,"review_scores_rating":100.0,