# Yelp MongoDB Analytics — Exploratory Analysis

This notebook explores restaurant data ingested from the Yelp Fusion API into MongoDB Atlas.

- Top categories by average rating
- Rating vs. review count
- Rating distribution by price level
- Pearson correlation between review count and rating

Note: Before running this notebook, run the ingestion script and configure `.env` (it is loaded from the parent directory, `../.env`).


In [None]:
# Imports and setup
import os
import sys

import matplotlib.pyplot as plt
import pandas as pd
from dotenv import load_dotenv
from pymongo import MongoClient
from scipy.stats import pearsonr

# Put the parent directory on sys.path so the `src` package imports below resolve.
sys.path.append(os.path.abspath(os.path.join('..')))

from src.utils import get_mongo_collection
from src.query_mongodb import (
    average_rating_per_category,
    price_level_distribution,
    rating_reviewcount_pairs,
    ratings_by_price_level,
)

# Inline plots
%matplotlib inline
plt.style.use('ggplot')

# Load env variables (for completeness if ran outside root)
load_dotenv(os.path.join('..', '.env'))

collection = get_mongo_collection()


In [None]:
# Top categories by average rating
MIN_BUSINESSES = 5  # threshold passed to the query as min_businesses
TOP_N = 20          # number of categories kept so the chart stays readable

results = average_rating_per_category(collection, min_businesses=MIN_BUSINESSES)
df_cat = pd.DataFrame(results)

if df_cat.empty:
    # Nothing matched the threshold: skip the chart instead of failing with a
    # KeyError on the missing 'category' / 'avg_rating' columns.
    print(f'No categories with at least {MIN_BUSINESSES} businesses were returned.')
    df_top = df_cat
else:
    # Rank explicitly rather than relying on the order the query helper returns.
    # mergesort is stable, so rows that tie keep their original relative order.
    df_top = df_cat.sort_values('avg_rating', ascending=False, kind='mergesort').head(TOP_N)

    fig, ax = plt.subplots(figsize=(8, 10))
    df_top.plot(kind='barh', x='category', y='avg_rating', legend=False, color='#1f77b4', ax=ax)
    ax.set_xlabel('Average Rating')
    ax.set_ylabel('Category')
    ax.set_title(f'Top Categories by Average Rating (min {MIN_BUSINESSES} businesses)')
    # barh draws the first row at the bottom; flip so the best category is on top.
    ax.invert_yaxis()
    fig.tight_layout()
    plt.show()

df_top.head()


In [None]:
# Rating vs. review count (scatter)
# min_review_count=0 is the lowest possible threshold.
# NOTE(review): presumably that means nothing is filtered out; confirm in src.query_mongodb.
results = rating_reviewcount_pairs(collection, min_review_count=0)
df_pairs = pd.DataFrame(results)

# One translucent marker per returned pair, so dense regions show up darker.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(
    x=df_pairs['review_count'],
    y=df_pairs['rating'],
    s=20,
    c='#ff7f0e',
    alpha=0.4,
)
ax.set(
    xlabel='Review Count',
    ylabel='Rating',
    title='Rating vs. Review Count',
)
fig.tight_layout()
plt.show()

# Summary statistics for the plotted columns
df_pairs.describe()


In [None]:
# Rating distribution by price level
results = ratings_by_price_level(collection)
df_price = pd.DataFrame(results)

# Normalize price labels.
# pd.Categorical turns any value outside `categories` into NaN, which would
# silently drop those rows from the boxplot and the summary table. Route
# missing or unrecognised labels into the 'Unknown' bucket first.
price_order = ['$', '$$', '$$$', '$$$$', 'Unknown']
price_labels = df_price['price'].where(df_price['price'].isin(price_order), 'Unknown')
df_price['price'] = pd.Categorical(price_labels, categories=price_order, ordered=True)

fig, ax = plt.subplots(figsize=(8, 6))
df_price.boxplot(column='rating', by='price', grid=False, ax=ax)
# pandas adds its own "Boxplot grouped by ..." suptitle; blank it out.
fig.suptitle('')
ax.set_title('Rating Distribution by Price Level')
ax.set_xlabel('Price Level')
ax.set_ylabel('Rating')
fig.tight_layout()
plt.show()

# observed=False keeps every price level in the table (even with zero rows)
# and avoids the pandas FutureWarning about the changing default.
df_price.groupby('price', observed=False)['rating'].describe()


In [None]:
# Correlation between review count and rating
# (pearsonr is imported with the other dependencies in the first code cell)

# Keep only complete pairs: a missing value in either column would make the
# coefficient NaN. reindex also copes with an empty df_pairs that has no
# columns at all (it yields an empty frame instead of raising KeyError).
pairs_complete = df_pairs.reindex(columns=['review_count', 'rating']).dropna()

# pearsonr needs at least two observations.
if len(pairs_complete) > 1:
    corr, pval = pearsonr(pairs_complete['review_count'], pairs_complete['rating'])
    print(f"Pearson r: {corr:.4f} (p={pval:.4g})")
else:
    print("Not enough data for correlation.")
