In [None]:
% matplotlib inline
from pathlib import Path
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
import pyarrow as pa   
import pyarrow.parquet as pq
from fastparquet import ParquetFile
import matplotlib.pyplot as plt
import seaborn as sns
from textblob import TextBlob, Word
from sklearn.externals import joblib
import json

In [None]:
# Root folder of the Yelp dataset.
yelp_dir = Path('data', 'yelp')
# Folder that receives the converted parquet files.
parquet_dir = yelp_dir / 'parquet'
# parents=True also creates data/yelp when it is missing; exist_ok=True makes
# the cell safe to re-run, so no separate exists() check is needed.
parquet_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Convert the raw JSON-lines dumps to parquet.
# 'review' has to be processed before 'user': the user branch relies on
# `latest`, the newest review date, which is set in the review branch.
for file in ['review', 'user']:
    print(file)
    json_file = yelp_dir / 'json' / f'yelp_academic_dataset_{file}.json'
    parquet_file = parquet_dir / f'{file}.parquet'

    # Every non-blank line is parsed as one JSON record. Parsing line by line
    # avoids keeping the raw text, a joined copy of it and the parsed records
    # in memory at the same time.
    with json_file.open(encoding='utf-8') as lines:
        stripped = (line.strip() for line in lines)
        data = [json.loads(record) for record in stripped if record]
    df = json_normalize(data)

    if file == 'review':
        df['date'] = pd.to_datetime(df['date'])
        latest = df['date'].max()
        # Keep only year and month of the review date.
        df['year'] = df['date'].dt.year
        df['month'] = df['date'].dt.month
        df = df.drop(['date', 'business_id', 'review_id'], axis=1)
    if file == 'user':
        df['yelping_since'] = pd.to_datetime(df['yelping_since'])
        # member_yrs: days between the newest review and the join date,
        # divided by 365 and truncated to an integer.
        df = (df.assign(member_yrs=lambda x: (latest - x.yelping_since)
                        .dt.days.div(365).astype(int))
              .drop(['elite', 'friends', 'name', 'yelping_since'], axis=1))

    # Columns that are entirely missing are dropped before writing.
    df.dropna(how='all', axis=1).to_parquet(parquet_file, compression='gzip')

    # Read the file back as a check: try pyarrow first, and if that raises,
    # print the error and read with fastparquet instead.
    try:
        pd.read_parquet(parquet_file, engine='pyarrow')
    except Exception as e:
        print(e)
        pd.read_parquet(parquet_file, engine='fastparquet')

In [None]:
# Load the converted tables back from the parquet folder.
review_path = parquet_dir / 'review.parquet'
user_path = parquet_dir / 'user.parquet'
review = pd.read_parquet(review_path, engine='fastparquet')
user = pd.read_parquet(user_path)

In [None]:
# First five rows of the user table.
user.head(n=5)

In [None]:
# First five rows of the review table.
review.head(n=5)

Merge the review dataframe with the user dataframe. Columns that share a name keep their original name on the review side and get the suffix `_user` on the user side.

In [None]:
# Left-join the user columns onto each review; clashing user columns get the
# '_user' suffix, and the join key is dropped after the merge.
user_review = review.merge(
    user,
    how='left',
    on='user_id',
    suffixes=['', '_user'],
).drop(columns='user_id')

In [None]:
# Column labels of the merged frame.
user_review.columns

In [None]:
# Summary statistics of the star ratings.
user_review['stars'].describe()

We will remove the reviews that have zero stars.

In [None]:
# Keep only the rows whose star rating is above zero.
has_stars = user_review['stars'] > 0
user_review = user_review.loc[has_stars]

In [None]:
# Number of reviews per star rating, ordered by rating.
x = user_review['stars'].value_counts().sort_index()

plt.figure(figsize=(10, 6))
# x and y are passed by keyword: newer seaborn releases no longer accept the
# data vectors as positional arguments, while keywords work in old ones too.
ax = sns.barplot(x=x.index, y=x.values, alpha=0.8)
plt.title("Star Rating Distribution")
plt.ylabel('count')
plt.xlabel('Star Ratings')

# Write each count above its bar, horizontally centred on the bar.
rects = ax.patches
labels = x.values
for rect, label in zip(rects, labels):
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width() / 2, height + 5, label,
            ha='center', va='bottom')
plt.show()

It is good to know that most of the star ratings are pretty high and that there are not many terrible reviews. Obviously, there’s an incentive for businesses to solicit as many good reviews as possible.

In [None]:
# Number of reviews for each star rating.
user_review['stars'].value_counts()

In [None]:
# Two panels side by side: review counts per year (left) and a seaborn line
# plot of stars against year (right).
fig, axes = plt.subplots(ncols=2, figsize=(14, 4))
reviews_per_year = user_review['year'].value_counts().sort_index()
reviews_per_year.plot.bar(title='Reviews per Year', ax=axes[0])
sns.lineplot(data=user_review, x='year', y='stars', ax=axes[1])
axes[1].set_title('Stars per year');

In [None]:
# Number of rows in the merged frame for each membership length (in years).
user_review['member_yrs'].value_counts()

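Over 4,000 of these reviews were written by members who have been on Yelp since it was founded. Note that the counts above are reviews, not distinct members, because `user_review` has one row per review.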

In [None]:
# First three rows of the merged frame.
user_review.head(n=3)

Let's have a look at a sample review.

In [None]:
# Draw one review text to use as a worked example. The fixed seed means the
# same review is drawn on every run, so the example stays stable.
review_sample = user_review['text'].sample(1, random_state=42).iloc[0]
print(review_sample)

Let's check the polarity of this review sample. Polarity ranges from -1 (most negative) to 1 (most positive).

In [None]:
# Sentiment that TextBlob reports for the sampled review text.
TextBlob(review_sample).sentiment

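For the review that was sampled when this notebook was written, the polarity was about -0.06, meaning it is slightly negative, and the subjectivity was about 0.56, meaning it is fairly subjective. A different sampled review will give different values.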

To proceed faster, we will sample 1 million reviews from our current data and add a new column for polarity.

In [None]:
# Work on a random subset of the reviews. The size is capped at the number of
# available rows, because sample() raises when asked for more rows than exist
# without replacement; the seed makes the subset reproducible.
sample_reviews = (user_review[['stars', 'text']]
                  .sample(n=min(1000000, len(user_review)), random_state=42))

Define a function that accepts text and returns its polarity.

In [None]:
def detect_polarity(text):
    """Return the polarity that TextBlob assigns to ``text``."""
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity

In [None]:
# Score every sampled review text and store the result in a new column.
polarity_scores = sample_reviews['text'].apply(detect_polarity)
sample_reviews['polarity'] = polarity_scores
sample_reviews.head()

The first several rows look good: stars and polarity are in line with each other, meaning the higher the star rating, the higher the polarity, as it should be.

In [None]:
# Distribution of the polarity scores in the sampled reviews.
num_bins = 50
plt.figure(figsize=(10, 6))
n, bins, patches = plt.hist(
    sample_reviews['polarity'],
    bins=num_bins,
    facecolor='blue',
    alpha=0.5,
)
plt.title('Histogram of polarity')
plt.xlabel('Polarity')
plt.ylabel('Count')
plt.show()

Most polarity scores are above 0, which means most reviews express positive sentiment. This is in line with the star distribution.

In [None]:
# Polarity of the sampled reviews, grouped by star rating (seaborn boxen plot).
plt.figure(figsize=(10, 6))
sns.boxenplot(data=sample_reviews, x='stars', y='polarity')
plt.show()

In general, this is as good as we'd expect. Let's investigate further and see whether we can find anything interesting or any outliers.

In [None]:
# Allow up to 400 characters per column when pandas displays text.
pd.set_option('display.max_colwidth', 400)

Reviews that have the lowest polarity:

In [None]:
# Text of the first five sampled reviews whose polarity equals -1.
sample_reviews.loc[sample_reviews['polarity'] == -1, 'text'].head()

Reviews that have the lowest stars:

In [None]:
# Text of the first five sampled reviews rated with one star.
sample_reviews.loc[sample_reviews['stars'] == 1, 'text'].head()

Reviews that have the lowest polarity (most negative sentiment) but a 5-star rating:

In [None]:
# Up to ten sampled reviews rated five stars whose polarity equals -1.
is_five_star = sample_reviews['stars'] == 5
is_most_negative = sample_reviews['polarity'] == -1
sample_reviews[is_five_star & is_most_negative].head(10)

Reviews that have the highest polarity (most positive sentiment) but a 1-star rating:

In [None]:
# Up to ten sampled reviews rated one star whose polarity equals 1.
is_one_star = sample_reviews['stars'] == 1
is_most_positive = sample_reviews['polarity'] == 1
sample_reviews[is_one_star & is_most_positive].head(10)

TextBlob goes through the text finding words and phrases it can assign polarity and subjectivity to, and it averages them all together for longer text, such as our Yelp reviews. To learn how TextBlob calculates polarity and subjectivity, see this article from Aaron, which gives a simple but clear explanation: https://planspace.org/20150607-textblob_sentiment/