Libraries

In [32]:
import json
import pandas as pd
import numpy as np
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
 

Load data

In [None]:
# Documents: both Yelp files are read line by line, one JSON object per line.
tip_path="../../yelp_dataset/yelp_academic_dataset_tip.json"
business_path="../../yelp_dataset/yelp_academic_dataset_business.json"

# Context managers close the file handles as soon as the lines are read,
# instead of leaving them open until garbage collection.
with open(tip_path, encoding="utf8") as tip_file:
    tip_doc = tip_file.readlines()

with open(business_path, encoding="utf8") as business_file:
    business_doc = business_file.readlines()

# Dataframes: parse each non-blank line into a dict, then build one frame per file.
# Blank lines (e.g. a trailing newline at end of file) are skipped because
# json.loads would raise on them.
busList = [json.loads(bus_line) for bus_line in business_doc if bus_line.strip()]
tipList = [json.loads(tip_line) for tip_line in tip_doc if tip_line.strip()]

df_Bus = pd.DataFrame.from_records(busList)
df_Tip = pd.DataFrame.from_records(tipList)

# The original cell built the business frame twice under two names; `dfBus`
# is kept as an independent copy in case another cell still refers to it.
dfBus = df_Bus.copy()

# Outer merge on business_id, then keep only rows that actually carry tip text
# (businesses without any tip end up with a missing `text` and are dropped).
# .copy() makes df_comb an independent frame, so adding columns to it later
# does not trigger pandas' SettingWithCopyWarning.
df_comb = pd.merge(df_Bus, df_Tip, on='business_id', how='outer')
df_comb = df_comb[df_comb['text'].notna()].copy()

## Task 1

#### Task 1a

In [None]:
# Score every tip with TextBlob and store the polarity as a new `sentiment` column.
# Series.map is used on the single `text` column: DataFrame.applymap is deprecated
# in recent pandas releases, and mapping the Series yields a Series (not a
# one-column DataFrame) to assign.
df_comb['sentiment'] = df_comb['text'].map(lambda tip: TextBlob(tip).sentiment.polarity)

In [None]:
# Preview the first three merged rows, including the new sentiment column.
df_comb.head(n=3)

#### Task 1b

In [None]:
# Average tip sentiment per state, returned as a flat two-column frame
# (`state`, `sentiment`) rather than a Series indexed by state.
df_sentiment_by_state = (
    df_comb
    .groupby("state")['sentiment']
    .mean()
    .reset_index()
)


In [None]:
# Heading followed by an empty line.
print("Mean Sentiment by State", "", sep="\n")

# Two-letter codes of the 50 US states plus DC; anything else in the data
# (e.g. Canadian provinces) is filtered out below.
states = [
    "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DC", "DE", "FL", "GA",
    "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD",
    "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ",
    "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC",
    "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY",
]

# Keep only the rows whose state code is in the US list, then print the table.
is_us_state = df_sentiment_by_state['state'].isin(states)
df_sentiment_by_USstate = df_sentiment_by_state[is_us_state]
print(df_sentiment_by_USstate)

#### Task 1c

In [None]:
import plotly
import plotly.graph_objects as go
#plotly.offline.plot(fig, filename='map.html')


In [None]:
# Choropleth of the mean tip sentiment per US state.
# `locations` holds the two-letter state codes, matched via locationmode='USA-states';
# `z` is the value that drives the colour of each state.
fig = go.Figure(
    data=go.Choropleth(
        locations=df_sentiment_by_USstate["state"],
        z=df_sentiment_by_USstate['sentiment'].astype(float),
        locationmode='USA-states',
        colorscale='Bluered',
        colorbar_title="Sentiment",
    )
)

# Restrict the map to the USA and set the title (spelling of the title fixed).
fig.update_layout(
    title_text='Mean Yelp Tip Sentiment Analysis',
    geo_scope='usa',
)
fig.show()

## Task 2

#### Task 2a

In [None]:
# Arizona businesses that have category information.
# .copy() makes df_AZ an independent frame so the column added below does not
# raise pandas' SettingWithCopyWarning.
df_AZ = df_Bus[(df_Bus["state"] == "AZ") & df_Bus["categories"].notna()].copy()

# Label: does the business' category field contain "Restaurants"?
# Series.map replaces the deprecated DataFrame.applymap and yields a Series
# (not a one-column DataFrame) to assign.
df_AZ['is_restaurant'] = df_AZ['categories'].map(lambda cat_list: "Restaurants" in cat_list)

# Attach the tips to the Arizona businesses and drop businesses without tip text.
df_tip_restaurant = pd.merge(df_AZ, df_Tip, on='business_id', how='left')
df_tip_restaurant = df_tip_restaurant[df_tip_restaurant['text'].notna()]

# Only the tip text (feature source) and the label are needed downstream.
df_tip_restaurant = df_tip_restaurant[['text', 'is_restaurant']]



#### Task 2b

In [33]:
# Generate the TF-IDF feature matrix (English stop words removed) and the label vector.
corpus = df_tip_restaurant['text']
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(corpus)
y = df_tip_restaurant['is_restaurant']

# The number of features is the number of columns of X. This replaces
# len(vectorizer.get_feature_names()): that method no longer exists in current
# scikit-learn releases, and the column count is the same number without
# materialising the whole vocabulary list.
print("There are %.0f features" % X.shape[1])


There are 60142 features


In [36]:
# Logistic regression with default settings, scored by ROC AUC over 3 CV folds.
logreg = LogisticRegression()

cv_results = cross_val_score(estimator=logreg, X=X, y=y, scoring='roc_auc', cv=3)
mean_auc = cv_results.mean()
print("The mean cross validation score of a logisric regression is %.3f "%mean_auc)


The mean cross validation score of a logisric regression is 0.909 


#### Task 2c