# Yelp Business Insights Project - Teacher Version

This notebook provides a complete solution for the Yelp Business Insights Project. It integrates NLP, deep learning, and reinforcement learning using the Yelp Open Dataset.


In [None]:
# Download and extract the Yelp dataset
!wget -O yelp_dataset.tar.gz "https://s3.amazonaws.com/yelp-dataset/yelp_dataset_challenge_academic_dataset.tar.gz"
!tar -xzf yelp_dataset.tar.gz


## Data Preprocessing and Exploratory Analysis

The following code loads a sample of Yelp reviews and performs basic exploratory analysis.


In [None]:
import json
import pandas as pd

# Load a sample from review.json (adjust sample size as needed)
reviews = []
with open('yelp_academic_dataset_review.json', 'r') as f:
    for i, line in enumerate(f):
        if i >= 10000:  # sample first 10,000 reviews
            break
        reviews.append(json.loads(line))
        
df_reviews = pd.DataFrame(reviews)
print(df_reviews.head())


## Challenge 1: NLP – Sentiment Analysis of Yelp Reviews

**Goal:** Build a sentiment analysis classifier using Yelp review texts. Label reviews with stars ≥ 4 as positive (1) and the rest as negative (0).  
**Pass Criterion:** The model must achieve at least **70% accuracy** on a held-out validation set.


In [None]:
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import torch
import torch.nn as nn
import torch.optim as optim

# Labeling: star rating ≥ 4 -> positive (1), else negative (0)
df_reviews['sentiment'] = (df_reviews['stars'] >= 4).astype(int)
texts = df_reviews['text'].tolist()
labels = df_reviews['sentiment'].tolist()

# Create a bag-of-words representation
vectorizer = CountVectorizer(stop_words='english', max_features=5000)
X = vectorizer.fit_transform(texts).toarray()

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, labels, test_size=0.2, random_state=42)

# Define the sentiment analysis model using PyTorch
class SentimentMLP(nn.Module):
    def __init__(self, input_dim):
        super(SentimentMLP, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )
    def forward(self, x):
        return torch.sigmoid(self.net(x))

input_dim = X_train.shape[1]
model = SentimentMLP(input_dim)

criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Convert data to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.float32).unsqueeze(1)

# Training loop
num_epochs = 20
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()
    
    model.eval()
    with torch.no_grad():
        val_outputs = model(X_val_tensor)
        predictions = (val_outputs > 0.5).float()
        accuracy = (predictions.eq(y_val_tensor).sum() / float(y_val_tensor.shape[0])).item()
    if (epoch+1) % 5 == 0:
        print(f"Epoch {epoch+1}/{num_epochs} - Loss: {loss.item():.4f}, Val Accuracy: {accuracy:.2f}")

# Final check for pass criterion
if accuracy >= 0.70:
    print("PASS: Sentiment analysis accuracy meets the threshold.")
else:
    print("FAIL: Please review your model and preprocessing steps.")


## Challenge 2: Deep Learning – Restaurant Rating Prediction

**Goal:** Build a neural network to predict a restaurant’s star rating using features extracted from the Yelp dataset.  
**Pass Criterion:** The model should achieve a test accuracy of **at least 80%**.


In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
import torch.nn.functional as F

# For demonstration, we simulate a feature matrix and ratings.
n_samples = 1000
X_features = np.random.rand(n_samples, 5).astype(np.float32)
y_ratings = np.random.randint(0, 5, size=(n_samples,))

X_train, X_test, y_train, y_test = train_test_split(X_features, y_ratings, test_size=0.3, random_state=42)

# Define the rating prediction model using PyTorch
class RatingMLP(nn.Module):
    def __init__(self, input_dim, num_classes):
        super(RatingMLP, self).__init__()
        self.fc1 = nn.Linear(input_dim, 32)
        self.fc2 = nn.Linear(32, 16)
        self.fc3 = nn.Linear(16, num_classes)
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)

num_classes = 5
model_rating = RatingMLP(input_dim=5, num_classes=num_classes)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model_rating.parameters(), lr=0.01)

X_train_tensor = torch.tensor(X_train)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_test_tensor = torch.tensor(X_test)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

# Training loop for rating prediction model
num_epochs = 50
for epoch in range(num_epochs):
    model_rating.train()
    optimizer.zero_grad()
    outputs = model_rating(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()
    
    if (epoch+1) % 10 == 0:
        model_rating.eval()
        with torch.no_grad():
            test_outputs = model_rating(X_test_tensor)
            _, predicted = torch.max(test_outputs, 1)
            accuracy = (predicted == y_test_tensor).float().mean().item()
        print(f"Epoch {epoch+1}/{num_epochs} - Loss: {loss.item():.4f}, Test Accuracy: {accuracy:.2f}")

# Final check for pass criterion
if accuracy >= 0.80:
    print("PASS: Rating prediction model meets the accuracy threshold.")
else:
    print("FAIL: Please review your model and feature engineering steps.")


NameError: name 'nn' is not defined

## Challenge 3: Reinforcement Learning – Restaurant Recommendation Simulation

**Goal:** Simulate a restaurant recommendation system using an epsilon-greedy multi-armed bandit.  
**Pass Criterion:** The simulation must achieve an average reward of at least **0.65** with epsilon = 0.1 over 1000 steps.


In [None]:
import numpy as np

def simulate_bandit(epsilon, steps=1000):
    np.random.seed(42)
    # Define true conversion probabilities for 3 recommendations
    true_probs = [0.2, 0.5, 0.7]
    num_arms = len(true_probs)
    
    Q_estimates = np.zeros(num_arms)
    counts = np.zeros(num_arms)
    rewards = []
    
    for step in range(steps):
        if np.random.rand() < epsilon:
            chosen_arm = np.random.choice(num_arms)
        else:
            chosen_arm = np.argmax(Q_estimates)
            
        reward = 1 if np.random.rand() < true_probs[chosen_arm] else 0
        counts[chosen_arm] += 1
        Q_estimates[chosen_arm] = Q_estimates[chosen_arm] + (reward - Q_estimates[chosen_arm]) / counts[chosen_arm]
        rewards.append(reward)
    
    average_reward = np.mean(rewards)
    return average_reward, Q_estimates, counts

avg_reward, Q_estimates, counts = simulate_bandit(epsilon=0.1, steps=1000)
print("Average Reward:", avg_reward)
print("Estimated Q-values:", Q_estimates)
print("Counts:", counts)

threshold = 0.65
if avg_reward >= threshold:
    print(f"PASS: Average reward of {avg_reward:.3f} meets or exceeds the threshold.")
else:
    print(f"FAIL: Average reward of {avg_reward:.3f} is below the threshold.")


## Final Reflection and Submission

Reflect on how each approach (NLP, deep learning, reinforcement learning) applies to real business scenarios using Yelp data. Discuss any challenges encountered and potential improvements.  
- Ensure the notebook runs from start to finish without errors.  
- Submit the completed notebook.
