In [1]:
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm

In [10]:
# Input file locations, relative to the directory the notebook is run from.
michelin_restaurants_path = "data/michelin_my_maps.csv"

# Yelp dataset files: business records, reviews, and user profiles.
yelp_restaurants_path, yelp_reviews_path, yelp_users_path = (
    "data/yelp_dataset/yelp_academic_dataset_business.json",
    "data/yelp_dataset/yelp_academic_dataset_review.json",
    "data/yelp_dataset/yelp_academic_dataset_user.json",
)

In [3]:
# Load the Michelin table from CSV and the Yelp business records from a
# line-delimited JSON file (lines=True parses one JSON object per line).
# NOTE(review): yelp_restaurants is read from the *business* file and no
# restaurant-only filter is applied in this notebook section — confirm whether
# non-restaurant businesses need to be excluded before analysis.
michelin_restaurants = pd.read_csv(michelin_restaurants_path)
yelp_restaurants = pd.read_json(yelp_restaurants_path, lines=True)

In [4]:
# Load the Yelp user profiles (line-delimited JSON, one record per line).
yelp_users = pd.read_json(yelp_users_path, lines=True)

In [11]:
# Load the Yelp reviews (line-delimited JSON, one record per line). The whole
# file is parsed into memory in a single call.
# NOTE(review): if this file is large, consider read_json(..., chunksize=...)
# or a cached parquet copy so Restart & Run All stays feasible — confirm size.
yelp_reviews = pd.read_json(yelp_reviews_path, lines=True)

: 

In [None]:
# Preview the first rows of the Michelin table to check the columns loaded as expected.
michelin_restaurants.head()

In [None]:
# Preview the first rows of the Yelp business records.
yelp_restaurants.head()

In [None]:
# Preview the first rows of the Yelp user profiles.
# NOTE(review): the rendered output shows user-level fields (e.g. names) —
# clear or redact it before sharing the notebook if that is a concern.
yelp_users.head()

In [None]:
# Preview the first rows of the Yelp reviews.
yelp_reviews.head()

In [None]:
# Plot styling for every figure drawn below: seaborn's "whitegrid" theme.
# matplotlib (plt) and seaborn (sns) are imported with the rest of the
# dependencies in the notebook's first cell, so this cell only configures them.
sns.set_theme(style="whitegrid")

In [None]:
# Size overview: number of rows and columns of every loaded dataset.
datasets = {
    "michelin_restaurants": michelin_restaurants,
    "yelp_restaurants": yelp_restaurants,
    "yelp_reviews": yelp_reviews,
    "yelp_users": yelp_users,
}

# One record per dataset; the frame is built once from the list of records.
summary_rows = [
    {"dataset": label, "rows": len(frame), "columns": frame.shape[1]}
    for label, frame in datasets.items()
]

summary_df = pd.DataFrame(summary_rows)
summary_df

In [None]:
# Compare dataset sizes: one bar per dataset, largest first.
rows_by_dataset = summary_df.sort_values("rows", ascending=False)

fig, ax = plt.subplots(figsize=(6, 4))
sns.barplot(data=rows_by_dataset, x="dataset", y="rows", ax=ax)
ax.set(xlabel="Dataset", ylabel="Row count", title="Rows per dataset")
# Tilt the dataset names so the tick labels do not overlap.
plt.xticks(rotation=20, ha="right")
fig.tight_layout()
plt.show()

In [None]:
# Histogram of the Yelp "stars" column (missing values dropped first).
star_ratings = yelp_restaurants["stars"].dropna()

fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(star_ratings, bins=20, kde=False, ax=ax)
ax.set(xlabel="Stars", ylabel="Count", title="Yelp restaurant star ratings")
fig.tight_layout()
plt.show()

In [None]:
# Histogram of Yelp review_count. Values above the 99th percentile are capped
# at that percentile so the long right tail does not flatten the rest of the plot.
review_counts = yelp_restaurants["review_count"].dropna()
clip_val = review_counts.quantile(0.99)
review_counts_clipped = review_counts.clip(upper=clip_val)

fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(review_counts_clipped, bins=40, kde=False, ax=ax)
ax.set(
    xlabel="Review count (clipped at 99th percentile)",
    ylabel="Count",
    title="Yelp review_count distribution",
)
fig.tight_layout()
plt.show()

# Report the cap that was applied, rounded to a whole number of reviews.
print(f"99th percentile review_count: {clip_val:.0f}")

In [None]:
# Number of Michelin entries per award level, shown as a bar chart.
# The chart is skipped with a message when the table has no "Award" column.
if "Award" not in michelin_restaurants.columns:
    print("Column 'Award' not found in michelin_restaurants")
else:
    # Two-column frame: one row per distinct award, ordered by frequency.
    award_counts = (
        michelin_restaurants["Award"]
        .value_counts()
        .rename_axis("award")
        .reset_index(name="count")
    )

    fig, ax = plt.subplots(figsize=(6, 4))
    sns.barplot(data=award_counts, x="award", y="count", ax=ax)
    ax.set(xlabel="Award", ylabel="Count", title="Michelin awards distribution")
    plt.xticks(rotation=30, ha="right")
    fig.tight_layout()
    plt.show()

In [None]:
# The 20 most frequent values of the Yelp "city" column, as a bar chart.
# NOTE(review): cities are counted on the raw strings; if the column contains
# spelling or case variants of the same city they are counted separately — verify.
city_counts = (
    yelp_restaurants["city"]
    .value_counts()
    .head(20)
    .rename_axis("city")
    .reset_index(name="count")
)

fig, ax = plt.subplots(figsize=(7, 4))
sns.barplot(data=city_counts, x="city", y="count", ax=ax)
ax.set(
    xlabel="City",
    ylabel="Restaurant count",
    title="Top 20 Yelp cities by restaurant count",
)
plt.xticks(rotation=45, ha="right")
fig.tight_layout()
plt.show()

In [None]:
# Top 20 users by review count
#
# Display names are not guaranteed to be unique, and seaborn's barplot treats
# the y values as categories: rows sharing a label are aggregated into a single
# bar (mean plus an error bar). Plotting by raw name could therefore merge
# different users and show fewer than 20 bars with averaged counts.
# Prefixing each name with its rank yields a distinct label per user, so every
# bar shows exactly one user's own review_count.
top_users = (
    yelp_users.nlargest(20, "review_count")[["name", "review_count"]]
    .reset_index(drop=True)
    .assign(
        label=lambda users: [
            f"{rank}. {name}" for rank, name in enumerate(users["name"], start=1)
        ]
    )
)

fig, ax = plt.subplots(figsize=(7, 4))
sns.barplot(data=top_users, x="review_count", y="label", ax=ax)
ax.set_xlabel("Review count")
ax.set_ylabel("User name")
ax.set_title("Top 20 users by review count")
plt.tight_layout()
plt.show()