In [None]:
# Basic data + plotting
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Text + NLP
import re
import nltk
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
nltk.download('punkt_tab')

# ML / metrics
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# For BERT embeddings
!pip install -q transformers
from transformers import AutoTokenizer, AutoModel
import torch

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('vader_lexicon')

In [None]:
# Source files (JSON-lines, one record per line).
review_path = "/content/drive/MyDrive/Colab Notebooks/review.json"
business_path = "/content/drive/MyDrive/Colab Notebooks/business.json"
user_path = "/content/drive/MyDrive/Colab Notebooks/user.json"


def _load_rows_matching(path, key, wanted_ids, chunksize=100_000):
    """Stream a JSON-lines file and keep only the rows referenced by `wanted_ids`.

    Parameters
    ----------
    path : str
        Location of the JSON-lines file.
    key : str
        Column whose value is compared against `wanted_ids`.
    wanted_ids : iterable
        Identifier values to keep.
    chunksize : int, default 100_000
        Number of lines parsed per chunk; only the matching rows of each
        chunk are retained.

    Returns
    -------
    pandas.DataFrame
        Matching rows from all chunks, with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        From `pd.concat` if the file yields no chunks at all.
    """
    wanted = set(wanted_ids)
    kept = []
    with pd.read_json(path, lines=True, chunksize=chunksize) as reader:
        for chunk in reader:
            kept.append(chunk[chunk[key].isin(wanted)])
    return pd.concat(kept, ignore_index=True)


# Only the review file is capped. Capping business/user at their first 10,000
# lines as well would leave the left joins further down with matches only for
# reviews whose business/user happens to sit in those first lines, so instead
# the lookup tables are filtered to the ids this review sample actually uses.
review_df = pd.read_json(
    review_path,
    lines=True,
    nrows=10000
)
business_df = _load_rows_matching(
    business_path, 'business_id', review_df['business_id']
)
user_df = _load_rows_matching(
    user_path, 'user_id', review_df['user_id']
)

In [None]:
# Preview the first rows of the review frame as a sanity check on the load.
review_df.head()

In [None]:
# Preview the first rows of the business frame as a sanity check on the load.
business_df.head()

In [None]:
# Preview the first rows of the user frame as a sanity check on the load.
user_df.head()

In [None]:
# Trim each frame down to the fields used by the joins and features below.
review_df = review_df.loc[:, [
    'review_id', 'user_id', 'business_id',
    'stars', 'useful', 'funny', 'cool',
    'text', 'date',
]]

business_df = business_df.loc[:, [
    'business_id', 'name', 'city', 'state',
    'categories', 'stars',
]]

user_df = user_df.loc[:, ['user_id', 'review_count', 'average_stars', 'fans']]

In [None]:
# Enrich every review with its business (city, categories, business stars)
# and then with its author's stats. Both joins are left joins, so each review
# row is kept even when no matching business/user row exists. `stars` is
# selected in both review_df and business_df, hence the suffixes on that join.
reviews_full = (
    review_df
    .merge(
        business_df,
        how='left',
        on='business_id',
        suffixes=('_review', '_business'),
    )
    .merge(
        user_df,
        how='left',
        on='user_id',
    )
)

In [None]:
# Preview the joined frame (review + business + user columns).
reviews_full.head()

In [None]:
# Target: helpfulness.
# `helpful` is a plain copy of the review's `useful` vote count; the cells
# below aggregate and plot this column.
reviews_full['helpful'] = reviews_full['useful']

In [None]:
# Histogram of the target with a log-scaled count axis.
fig, ax = plt.subplots()
ax.hist(reviews_full['helpful'], bins=50)
ax.set_yscale('log')
ax.set_xlabel('Helpful votes')
ax.set_ylabel('Count (log scale)')
ax.set_title('Distribution of Helpful Votes')
plt.show()

In [None]:
# The ten cities with the most reviews in the joined frame.
top_cities = reviews_full['city'].value_counts().index[:10]

# Mean helpful votes per city, restricted to those ten, highest first.
city_group = (
    reviews_full.loc[reviews_full['city'].isin(top_cities)]
    .groupby('city')['helpful']
    .mean()
    .sort_values(ascending=False)
)

ax = city_group.plot.bar()
ax.set_ylabel('Average Helpful Votes')
ax.set_title('Average Helpfulness by City (Top 10 cities by review count)')
plt.show()

In [None]:
def extract_primary_category(cat_str):
    """Return the first comma-separated entry of a category string.

    Missing input (anything `pd.isna` reports as NA) yields ``None``.
    Otherwise the text before the first comma is returned with surrounding
    whitespace stripped; a string without a comma is returned stripped as-is.
    """
    if not pd.isna(cat_str):
        leading, _sep, _rest = cat_str.partition(',')
        return leading.strip()
    return None

# Tag each review with the first entry of its business's category string.
reviews_full['primary_category'] = reviews_full['categories'].apply(
    extract_primary_category
)

# The ten most frequent primary categories.
top_cats = reviews_full['primary_category'].value_counts().index[:10]

# Mean helpful votes per primary category, restricted to those ten, highest first.
cat_group = (
    reviews_full.loc[reviews_full['primary_category'].isin(top_cats)]
    .groupby('primary_category')['helpful']
    .mean()
    .sort_values(ascending=False)
)

ax = cat_group.plot.bar()
ax.set_ylabel('Average Helpful Votes')
ax.set_title('Average Helpfulness by Primary Category (Top 10)')
plt.show()

In [None]:
# Calendar parts of the review timestamp (uses the `.dt` accessor, so `date`
# must already be a datetime column).
reviews_full['year'] = reviews_full['date'].dt.year
reviews_full['month'] = reviews_full['date'].dt.month

# Mean helpful votes for each calendar month, pooled across years.
month_group = reviews_full.groupby('month')['helpful'].mean()

ax = month_group.plot.line(marker='o')
ax.set_xticks(range(1, 13))  # one tick per month
ax.set_xlabel('Month')
ax.set_ylabel('Average Helpful Votes')
ax.set_title('Helpfulness by Month (Seasonality)')
plt.show()

In [None]:
def _plot_useful_vs(frame, vote_col, label):
    """Scatter `useful` votes against another vote column of `frame`.

    `vote_col` is the column plotted on the x axis and `label` is the word
    used for it in the axis label and title.
    """
    fig, ax = plt.subplots()
    ax.scatter(frame[vote_col], frame['useful'], alpha=0.3)
    ax.set_xlabel(f'{label} votes')
    ax.set_ylabel('Useful votes')
    ax.set_title(f'Useful vs {label} votes (sample)')
    plt.show()


# Pairwise correlation of `useful` with each of the other two vote counters;
# .iloc[0, 1] picks the off-diagonal entry of the 2x2 correlation matrix.
print("Correlation between useful and cool:",
      reviews_full[['useful', 'cool']].corr().iloc[0,1])

print("Correlation between useful and funny:",
      reviews_full[['useful', 'funny']].corr().iloc[0,1])

# Scatter (use subset for speed). The requested size is clamped to the frame
# length: DataFrame.sample without replacement raises ValueError when asked
# for more rows than the frame holds.
sample = reviews_full.sample(n=min(10000, len(reviews_full)), random_state=42)

_plot_useful_vs(sample, 'cool', 'Cool')
_plot_useful_vs(sample, 'funny', 'Funny')


In [None]:
def _word_count(text):
    """Number of NLTK word tokens in `text`; 0 for anything that is not a str."""
    if not isinstance(text, str):
        return 0
    return len(nltk.word_tokenize(text))


# Review length measured in characters...
reviews_full['len_chars'] = reviews_full['text'].str.len()

# ...and in tokens.
reviews_full['len_words'] = reviews_full['text'].apply(_word_count)

# Simple EDA: how both length measures correlate with the target.
length_cols = ['helpful', 'len_chars', 'len_words']
print(reviews_full[length_cols].corr())

In [None]:
# Review length vs. helpful votes on a capped random sample. The requested
# size is clamped to the frame length because DataFrame.sample without
# replacement raises ValueError when asked for more rows than exist.
sample = reviews_full.sample(n=min(10000, len(reviews_full)), random_state=42)

fig, ax = plt.subplots()
ax.scatter(sample['len_words'], sample['helpful'], alpha=0.2)
ax.set_xscale('log')
# symlog instead of log on y: a plain log axis has no position for 0, so any
# review with zero helpful votes would silently drop out of the picture.
# linthresh=1 keeps the 0..1 band linear and everything above it logarithmic.
ax.set_yscale('symlog', linthresh=1)
ax.set_xlabel('Word count (log)')
ax.set_ylabel('Helpful votes (symlog)')
ax.set_title('Helpfulness vs Text Length (sample)')
plt.show()

In [None]:
# English stopword list from the NLTK corpus, held as a set so the per-token
# membership test in compute_lexical_features is a hash lookup.
stop_words = set(stopwords.words('english'))

def compute_lexical_features(text):
    """Compute two token-level ratios for a single review text.

    The text is lower-cased and tokenized with ``nltk.word_tokenize``.

    Returns a ``pd.Series`` with:
      * ``lexical_richness`` - distinct tokens divided by total tokens
      * ``stopword_ratio``   - tokens found in ``stop_words`` divided by total tokens

    Both entries are 0 when `text` is not a string, is empty, or produces
    no tokens.
    """
    usable = isinstance(text, str) and bool(text)
    tokens = nltk.word_tokenize(text.lower()) if usable else []
    if not tokens:
        return pd.Series({'lexical_richness': 0,
                          'stopword_ratio': 0})

    total = len(tokens)
    distinct = len(set(tokens))
    n_stop = sum(tok in stop_words for tok in tokens)
    return pd.Series({'lexical_richness': distinct / total,
                      'stopword_ratio': n_stop / total})

# Compute the lexical features for at most the first 50,000 reviews; each
# call returns a two-entry Series, so `lex_feats` has one column per feature.
lex_feats = reviews_full['text'].head(50_000).apply(compute_lexical_features)
# Write the values back onto the same index labels. `.values` makes this a
# positional assignment, so it relies on lex_feats' column order being
# (lexical_richness, stopword_ratio), as returned by compute_lexical_features.
# NOTE(review): rows past the head() cutoff, if any, get no value here - confirm
# that is intended before using these columns on a larger frame.
reviews_full.loc[lex_feats.index, ['lexical_richness', 'stopword_ratio']] = lex_feats.values
# Correlation of both lexical features with the target.
print(reviews_full[['helpful', 'lexical_richness', 'stopword_ratio']].corr())