Libraries

In [43]:
import json
import pandas as pd
import numpy as np
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
 

Load data

In [44]:
# Documents: line-delimited JSON dumps of the Yelp academic dataset.
tip_path = "../../yelp_dataset/yelp_academic_dataset_tip.json"
business_path = "../../yelp_dataset/yelp_academic_dataset_business.json"

# Read inside context managers so both file handles are closed as soon as the
# lines are in memory (a bare open(...).readlines() never closes the file).
with open(tip_path, encoding="utf8") as tip_file:
    tip_doc = tip_file.readlines()

with open(business_path, encoding="utf8") as business_file:
    business_doc = business_file.readlines()

# Dataframes: every line of a dump is parsed as one standalone JSON record.
busList = [json.loads(bus) for bus in business_doc]
tipList = [json.loads(tip) for tip in tip_doc]

df_Bus = pd.DataFrame.from_records(busList)
df_Tip = pd.DataFrame.from_records(tipList)

# Legacy name kept for compatibility; it used to be a second, identical frame
# built from the same records, which doubled the memory for no benefit.
dfBus = df_Bus

# Pair every tip with the attributes of the business it was left for.
df_comb = pd.merge(df_Bus, df_Tip, on='business_id', how='outer')
# Keep only rows that carry a tip. The .copy() makes df_comb an independent
# frame, so adding columns to it later does not raise SettingWithCopyWarning.
df_comb = df_comb[df_comb['text'].notna()].copy()

## Task 1

#### Task 1a

In [45]:
# Score every tip with TextBlob and store the polarity in a new column.
# Series.map applies the scorer to each tip text; it replaces
# DataFrame.applymap, which is deprecated in recent pandas releases.
df_comb['sentiment'] = df_comb['text'].map(lambda tip: TextBlob(tip).sentiment.polarity)

In [46]:
# Preview the first three rows of the merged business/tip frame.
df_comb.head(3)

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours,user_id,text,date,compliment_count,sentiment
0,f9NumwFMBDn751xgFiRbNA,The Range At Lake Norman,10913 Bailey Rd,Cornelius,NC,28031,35.462724,-80.852612,3.5,36,1,"{'BusinessAcceptsCreditCards': 'True', 'BikePa...","Active Life, Gun/Rifle Ranges, Guns & Ammo, Sh...","{'Monday': '10:0-18:0', 'Tuesday': '11:0-20:0'...",CZ517SU7uQDc3Aa17SN6Bw,Their prices are enormously inflated. I was ch...,2013-02-17 22:29:53,0.0,0.055
1,f9NumwFMBDn751xgFiRbNA,The Range At Lake Norman,10913 Bailey Rd,Cornelius,NC,28031,35.462724,-80.852612,3.5,36,1,"{'BusinessAcceptsCreditCards': 'True', 'BikePa...","Active Life, Gun/Rifle Ranges, Guns & Ammo, Sh...","{'Monday': '10:0-18:0', 'Tuesday': '11:0-20:0'...",M0DM77NUD5_w4EHuH2vZNg,Go to Eagle Guns or Point Blank Range instead,2013-09-26 21:43:54,0.0,0.0
2,f9NumwFMBDn751xgFiRbNA,The Range At Lake Norman,10913 Bailey Rd,Cornelius,NC,28031,35.462724,-80.852612,3.5,36,1,"{'BusinessAcceptsCreditCards': 'True', 'BikePa...","Active Life, Gun/Rifle Ranges, Guns & Ammo, Sh...","{'Monday': '10:0-18:0', 'Tuesday': '11:0-20:0'...",46HhzhpBfTdTSB5ceTx_Og,Friendly staff!,2012-05-28 18:57:21,0.0,0.46875


#### Task 1b

In [47]:
# Average tip sentiment per state, as a flat frame with a "state" column and a
# "sentiment" column.
df_sentiment_by_state = (
    df_comb
    .groupby("state")["sentiment"]
    .mean()
    .reset_index()
)


In [48]:
print("Mean Sentiment by State")
print()

# Two-letter codes of the 50 US states plus DC. Rows whose "state" value is
# not in this list (for example Canadian provinces) are filtered out.
states = [
    "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DC", "DE", "FL", "GA",
    "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD",
    "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ",
    "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC",
    "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY",
]

is_us_state = df_sentiment_by_state['state'].isin(states)
df_sentiment_by_USstate = df_sentiment_by_state.loc[is_us_state]
print(df_sentiment_by_USstate)

Mean Sentiment by State

   state  sentiment
1     AL   0.177778
2     AR   0.400205
3     AZ   0.315210
5     CA   0.176626
6     CO   0.092188
7     CT   0.302083
8     HI   0.107143
9     IL   0.297415
11    MI   0.400000
12    MO   0.450000
13    NC   0.304907
14    NE   0.282143
15    NV   0.282558
16    NY   0.430124
17    OH   0.306991
19    PA   0.282545
21    SC   0.321160
22    TX   0.322010
23    VA   0.200000
24    VT  -0.450000
25    WA   0.062731
26    WI   0.287927


#### Task 1c

In [49]:
import plotly
import plotly.graph_objects as go
#plotly.offline.plot(fig, filename='map.html')


In [50]:
# Choropleth of the mean tip sentiment for each US state in the filtered frame.
fig = go.Figure(
    data=go.Choropleth(
        locations=df_sentiment_by_USstate["state"],  # two-letter state codes
        z=df_sentiment_by_USstate['sentiment'].astype(float),  # value that drives the colour
        locationmode='USA-states',
        colorscale='Bluered',
        colorbar_title="Sentiment",
    )
)

# The title is user-facing text; its spelling is corrected here.
fig.update_layout(
    title_text='Mean Yelp Tip Sentiment Analysis',
    geo_scope='usa'
)
fig.show()

In [56]:
# Write the figure to map.html; the call returns the file name, which the
# notebook echoes as this cell's output.
plotly.offline.plot(fig,filename='map.html')

'map.html'

## Task 2

#### Task 2a

In [51]:
# Arizona businesses that list at least one category. The explicit .copy()
# makes df_AZ an independent frame, so adding a column below does not trigger
# pandas' SettingWithCopyWarning.
df_AZ = df_Bus[df_Bus["state"] == "AZ"]
df_AZ = df_AZ[df_AZ['categories'].notna()].copy()

# Label: True when "Restaurants" occurs in the business's categories value.
# Series.map replaces DataFrame.applymap, which is deprecated in recent pandas.
df_AZ['is_restaurant'] = df_AZ['categories'].map(lambda cat_list: "Restaurants" in cat_list)

# Attach the tips to each Arizona business, then drop businesses without a tip.
df_tip_restaurant = pd.merge(df_AZ, df_Tip, on='business_id', how='left')
df_tip_restaurant = df_tip_restaurant[df_tip_restaurant['text'].notna()]  # filter on empty tips

# Only the tip text (model input) and the label are needed downstream.
df_tip_restaurant = df_tip_restaurant[['text', 'is_restaurant']]



In [52]:
# Preview the first three (tip text, is_restaurant) rows.
df_tip_restaurant.head(3)

Unnamed: 0,text,is_restaurant
2,Awesome service! In and out in less time than ...,False
3,"Great place to go for honest, good old fashion...",False
4,Wait 4 hours to fix an ac and charged me $559 ...,False


#### Task 2b

In [53]:
#generate feature matrix
corpus = df_tip_restaurant['text']
vectorizer=TfidfVectorizer(stop_words='english')
X=vectorizer.fit_transform(corpus)
y=df_tip_restaurant['is_restaurant']
print("There are %.0f features"%len(vectorizer.get_feature_names()))


There are 60142 features


In [54]:
# Logistic regression with default settings, scored by ROC AUC over 3
# cross-validation folds.
logreg = LogisticRegression()

cv_results = cross_val_score(logreg, X, y, cv=3, scoring='roc_auc')
# The message is user-facing text; the "logisric" typo is corrected here.
print("The mean cross validation score of a logistic regression is %.3f "%cv_results.mean())


The mean cross validation score of a logisric regression is 0.909 


#### Task 2c

In [55]:
# Written answer for Task 2c, printed as two paragraphs. The wording is
# unchanged apart from grammar and spelling fixes.
print(
    "To improve the classification, we should improve the is_restaurant heuristic to be less crude. "
    "I found a donut shop for example that was not tagged as a restaurant even though they sell lots of food. "
    "This adds noise to the data since people leaving tips may use words to compliment food taste "
    "for a business that is not tagged as restaurant."
)
print(
    "Another strategy could be to weigh tips by user. "
    "There are some users on Yelp who are very active users. "
    "It may be possible to weigh tips by how active a user is to help our model be robust "
    "against joke tips and unhelpful tips. "
    "This strategy relies on Yelp's content moderation as users who actively post many unhelpful tips "
    "will ruin the model."
)

To improve the classification. We should improve the is_restaurant heuristic to be less crude. I found a donut shop for example that was not tagged as a restaurant even though they sell lots of food. This adds noise to the data since people leaving tips may use words to compliment food taste for a business that is not tagged as restaurant.
Another strategy could be to weigh tips by user. There are some users on Yelp who are very active users. It may be possible to weigh tips by how active a user is to help out model be robust against joke tips and unhelpful tips. This strategy relies on Yelp's content moderation as users who activley post many unhelpful tips will ruin the model
