The goal of this task is to explore the Yelp data set to get a sense about what the data look like and their characteristics. You can think about the goal as being to answer questions such as:

What are the major topics in the reviews? Are they different in the positive and negative reviews? Are they different for different cuisines? What does the distribution of the number of reviews over other variables (e.g., cuisine, location) look like? What does the distribution of ratings look like? In general, you can address such questions by showing visualization of statistics computed based on the data set or topics extracted from review text.

You must complete the following specific tasks.

Task 1.1:

Use a topic model (e.g., PLSA or LDA) to extract topics from all the review text (or a large sample of them) and visualize the topics to understand what people have talked about in these reviews. For example, after applying LDA to a sample of the reviews, we obtained the following visualization. Here the opacity of each node corresponds to its weight in each topic.

Task 1.2:

Do the same for two subsets of reviews that are interesting to compare (e.g., positive vs. negative reviews for a particular cuisine or restaurant), and visually compare the topics extracted from the two subsets to help understand the similarity and differences between these topics extracted from the two subsets. You can form these two subsets in any way that you think is interesting. Here we show a sample visualization for a sample of reviews with high and low ratings.

In [None]:
import json
import os
import pandas as pd
import glob
import plotly.graph_objects as go
import nltk
from plotly.graph_objects import Scatter, Layout, Marker, Line
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [None]:
# already executed
# def sample_json_data(file_pattern, sample_size=100):
#     """Reads JSON files and returns a sample of the data."""
#     all_files = glob.glob(file_pattern)
#     sample_data = []
#     for file in all_files:
#         with open(file, 'r') as f:
#             for line in f:
#                 try:
#                     json_data = json.loads(line.strip())
#                     sample_data.append(json_data)
#                     if len(sample_data) >= sample_size:
#                         return pd.DataFrame(sample_data)
#                 except json.JSONDecodeError as e:
#                     print(f"Error decoding JSON: {e}")
#     return pd.DataFrame(sample_data)

# df_sample = sample_json_data(r'D:\LEARNING AND COURSES\COURSERA\Data Mining Specialization\6. Data Mining Project\yelp_dataset_challenge_academic_dataset\dataset\yelp_academic_dataset_review.json', sample_size=500)
# df_sample.to_json('sample_data.json', orient='records', lines=True)

In [None]:
# Download the NLTK resources this notebook relies on: the 'punkt' tokenizer
# models (used by word_tokenize) and the English stop-word list (used by
# stopwords.words('english')).
# NOTE(review): recent NLTK releases may additionally need 'punkt_tab' for
# word_tokenize — confirm against the installed version.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Mount Google Drive at /content/drive so the dataset files referenced by the
# '/content/drive/MyDrive/...' paths in this notebook can be opened like local
# files. The google.colab module is only importable inside a Colab runtime.

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Check Duplicates

In [None]:
# Function to check for duplicates in a multi-line JSON file
def check_duplicates_in_multiline_json(file_path, key=None):
    """Report duplicate records in a JSON Lines file (one JSON value per line).

    Records are compared through a canonical JSON fingerprint
    (``json.dumps(..., sort_keys=True)``), so nested dicts/lists are handled
    and the order of keys inside an object does not matter.

    Args:
        file_path: Path of the file to scan.
        key: Optional field name. When given (and truthy), two records are
            duplicates if they hold the same value for this field; records
            that lack the field all share the value ``None``. When omitted,
            whole records are compared.

    Returns:
        None. Findings and any problems are printed to stdout.
    """
    duplicates = []
    seen = set()

    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            for raw_line in file:
                stripped = raw_line.strip()
                if not stripped:
                    # Blank lines carry no record; skip them quietly.
                    continue

                try:
                    item = json.loads(stripped)
                except json.JSONDecodeError as e:
                    print(f"Error parsing line: {raw_line}")
                    print(f"JSONDecodeError: {e}")
                    continue

                if key:
                    if not isinstance(item, dict):
                        # Only JSON objects have fields to look up.
                        print(f"Skipping non-object record: {item}")
                        continue
                    value = item.get(key)
                else:
                    value = item

                # A string fingerprint is always hashable, unlike a tuple that
                # contains dict or list values.
                fingerprint = json.dumps(value, sort_keys=True)
                if fingerprint in seen:
                    duplicates.append(item)
                else:
                    seen.add(fingerprint)
    except (OSError, UnicodeDecodeError) as e:
        print(f"Error processing file: {e}")
        return

    # Output duplicates if found
    if duplicates:
        print(f"Found {len(duplicates)} duplicates in the JSON file.")
        for i, duplicate in enumerate(duplicates, 1):
            print(f"Duplicate {i}: {duplicate}")
    else:
        print("No duplicates found.")

# Path of the Yelp business file (JSON Lines: one business object per line)
file_path = '/content/drive/MyDrive/Data Mining Project UIUC/dataset/yelp_academic_dataset_business.json'

# Scan the business file for repeated 'business_id' values.
# Omit the key argument to compare whole records instead of a single field.
check_duplicates_in_multiline_json(file_path, key='business_id')

No duplicates found.


Topics Extraction and Visualization  

Extract 10 topics from 100 sampled reviews using LDA. Then visualize the topics with a radial dendrogram (built with the D3.js library) and a cluster bubble chart.

In [None]:
def read_json_files(file):
    """Load a JSON Lines file (one JSON object per line) into a DataFrame.

    The file is decoded as UTF-8 regardless of the platform default. Blank
    lines are skipped; lines that are not valid JSON are reported on stdout
    and skipped.

    Args:
        file: Path of the file to read.

    Returns:
        A pandas DataFrame with one row per successfully parsed line.
    """
    records = []
    with open(file, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON: {e}")
    return pd.DataFrame(records)

# Stop-word sets keyed by language, filled on first use so the NLTK corpus is
# read once instead of once per review.
_STOP_WORDS_CACHE = {}


def preprocess_text(text):
    """Lower-case a review, tokenize it and drop stop words and punctuation.

    Args:
        text: Raw review text. Anything that is not a ``str`` (for example
            ``None`` or the NaN pandas uses for a missing field) is treated
            as an empty review.

    Returns:
        The kept tokens joined by single spaces. A token is kept when it is
        alphanumeric (``str.isalnum``) and not an English stop word.
    """
    if not isinstance(text, str):
        return ''

    stop_words = _STOP_WORDS_CACHE.get('english')
    if stop_words is None:
        stop_words = frozenset(stopwords.words('english'))
        _STOP_WORDS_CACHE['english'] = stop_words

    tokens = word_tokenize(text.lower())
    return ' '.join(word for word in tokens if word.isalnum() and word not in stop_words)

def readdata(file, topic_count, n_top_words=10):
    """Fit an LDA topic model on the reviews stored in a JSON Lines file.

    The 'text' column of the file is cleaned with ``preprocess_text``, turned
    into a bag-of-words matrix and passed to LDA (fixed ``random_state=0``).

    Args:
        file: Path of the JSON Lines file; each record needs a 'text' field.
        topic_count: Number of LDA topics to fit.
        n_top_words: How many of the highest-weighted words to keep per topic
            (default 10, the value that used to be hard-coded).

    Returns:
        A ``(topics, topic_frequencies)`` tuple. ``topics`` maps the topic
        index (0-based int) to its top words, ordered from lowest to highest
        weight among those kept. ``topic_frequencies`` is an array holding,
        per topic, the sum of that topic's weight over all documents.

    Raises:
        ValueError: If ``n_top_words`` is smaller than 1.
    """
    if n_top_words < 1:
        # A slice of [-0:] would silently select every word in the vocabulary.
        raise ValueError("n_top_words must be at least 1")

    df = read_json_files(file)
    df['processed_text'] = df['text'].apply(preprocess_text)

    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(df['processed_text'])

    lda = LatentDirichletAllocation(n_components=topic_count, random_state=0)
    topic_distribution = lda.fit_transform(X)  # one row of topic weights per document

    # Total weight each topic receives across the whole corpus
    topic_frequencies = topic_distribution.sum(axis=0)

    feature_names = vectorizer.get_feature_names_out()
    topics = {}
    for topic_idx, topic in enumerate(lda.components_):
        # argsort is ascending, so the tail holds the highest-weighted words
        top_features_ind = topic.argsort()[-n_top_words:]
        topics[topic_idx] = [feature_names[i] for i in top_features_ind]

    return topics, topic_frequencies

def export_topics_to_json(topics, file_name='topics.json'):
    """Serialize the topics mapping as JSON into ``file_name``.

    Any existing file at that path is overwritten.
    """
    with open(file_name, mode='w') as out_file:
        json.dump(topics, out_file)

In [None]:
def build_text(key, values):
    """Return a one-element list holding the label 'Topic <key>: w1 w2 ... '.

    Every word is followed by a single space, so the label ends with a
    trailing space whenever ``values`` is non-empty.
    """
    header = 'Topic ' + str(key) + ': '
    words = ''.join(word + ' ' for word in values)
    return [header + words]

def build_trace(dataset, topic_frequencies):
    """Build one single-point bubble trace per topic.

    Args:
        dataset: Mapping from 0-based topic index to that topic's words.
        topic_frequencies: Sequence with one frequency value per topic, in
            topic-index order.

    Returns:
        A list of ``go.Scatter`` traces (empty when there are no topics).
        Each bubble sits at x = 1-based topic number, y = frequency; its size
        and opacity are proportional to the frequency relative to the largest
        one. Trace name and hover text use the same 1-based topic number.
    """
    if len(topic_frequencies) == 0:
        # Nothing to plot; max() below would fail on an empty sequence.
        return []

    traces = []
    max_frequency = max(topic_frequencies)  # To scale sizes

    for key, frequency in enumerate(topic_frequencies):
        topic_number = key + 1  # Topic ID starts from 1
        relative = frequency / max_frequency
        trace = go.Scatter(
            x=[topic_number],
            y=[frequency],  # Group Frequency based on calculated values
            # Label with the 1-based number so hover text matches name and x.
            text=build_text(topic_number, dataset[key]),
            mode='markers',
            name='Topic ' + str(topic_number),
            marker=go.scatter.Marker(
                sizemode='diameter',
                # NOTE(review): sizeref is derived from the raw max frequency
                # while size is already normalised to 0..30 — confirm the
                # resulting bubble sizes are the intended ones.
                sizeref=2. * max_frequency / (30. ** 2),
                size=[30. * relative],  # Scale size by frequency
                opacity=relative,  # Adjust opacity
                line=go.scatter.marker.Line(width=2),
            )
        )
        traces.append(trace)
    return traces

def build_layout():
    """Return the fixed 800x600 Plotly layout used for the topic bubble chart."""
    grid_white = 'rgb(255, 255, 255)'
    background_grey = 'rgb(243, 243, 243)'

    # Both axes share the same styling; only the title and grid width differ.
    x_axis = {
        'title': 'Topic ID',
        'gridcolor': grid_white,
        'zerolinewidth': 1,
        'ticklen': 5,
        'gridwidth': 1,
    }
    y_axis = {
        'title': 'Group Frequency (1000x)',
        'gridcolor': grid_white,
        'zerolinewidth': 1,
        'ticklen': 5,
        'gridwidth': 2,
    }

    return Layout(
        title='Cluster of Topics by LDA',
        showlegend=False,
        height=600,
        width=800,
        xaxis=x_axis,
        yaxis=y_axis,
        paper_bgcolor=background_grey,
        plot_bgcolor=background_grey,
    )

In [None]:
# Main function to use topic frequencies
def main(file):
    """Run the topic pipeline on ``file`` and write its two output files.

    Fits 10 LDA topics, saves them to 'topics.json' and renders the bubble
    chart to 'topic_clusters.html' (both in the current working directory).
    """
    topics, frequencies = readdata(file, 10)

    export_topics_to_json(topics)
    print("Topics successfully exported to topics.json")

    figure = go.Figure(
        data=build_trace(topics, frequencies),
        layout=build_layout(),
    )
    figure.write_html('topic_clusters.html')

# Run the pipeline on the sampled reviews. Inside a notebook __name__ is
# '__main__', so this executes every time the cell is run.
if __name__ == '__main__':
    main('/content/drive/MyDrive/Data Mining Project UIUC/task1/sample_data.json')

Topics successfully exported to topics.json


**Extract all categories in the business dataset**  

In this section, we group the reviews of each business by star rating (1–5) and extract LDA topics from each group. We then compare a few restaurants based on the topics found at each rating. We chose Mexican food restaurants for this comparison.

In [None]:
# Paths of the Yelp business and review files on the mounted Drive.
# Both are read line by line, one JSON object per line.
business_file_path = '/content/drive/MyDrive/Data Mining Project UIUC/dataset/yelp_academic_dataset_business.json'
review_file_path = '/content/drive/MyDrive/Data Mining Project UIUC/dataset/yelp_academic_dataset_review.json'

In [None]:
# Function to load business information
def load_business_info(business_file):
    """Index the businesses of a JSON Lines file by their 'business_id'.

    The file is decoded as UTF-8 and blank lines are skipped. A line that is
    not valid JSON, or a record without 'business_id', 'name' or 'stars',
    still raises.

    Args:
        business_file: Path of the business file (one JSON object per line).

    Returns:
        A dict mapping business_id to ``{'name', 'stars', 'reviews'}``.
        'name' has every '/' replaced by '&' (the name is later used inside a
        file name) and 'reviews' starts as an empty dict.
    """
    business_info = {}
    with open(business_file, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                # An empty line is not a record; json.loads would reject it.
                continue
            business_json = json.loads(line)
            business_info[business_json['business_id']] = {
                'name': business_json['name'].replace('/', '&'),
                'stars': business_json['stars'],
                'reviews': {},
            }
    return business_info

# Function to map reviews to businesses
def map_reviews_to_business(review_file, business_info):
    """Attach review texts to their businesses, grouped by star rating.

    The file is decoded as UTF-8 and blank lines are skipped. Reviews whose
    'business_id' is not a key of ``business_info`` are ignored.

    Args:
        review_file: Path of the review file (one JSON object per line with
            'business_id', 'stars' and 'text').
        business_info: Mapping as produced by ``load_business_info``. It is
            updated in place.

    Returns:
        The same ``business_info`` object, where each entry's 'reviews' dict
        maps an int star rating to the list of review texts with that rating.
    """
    with open(review_file, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                # An empty line is not a record; json.loads would reject it.
                continue
            review_json = json.loads(line)
            info = business_info.get(review_json['business_id'])
            if info is None:
                continue
            stars = int(review_json['stars'])
            info['reviews'].setdefault(stars, []).append(review_json['text'])
    return business_info

# Function to apply LDA and extract topics with word scores
def extract_lda_topics_with_scores(reviews, n_topics=5, n_top_words=10):
    """Fit LDA on a list of review texts and return each topic's top words.

    The texts are vectorized with English stop words removed and the
    vocabulary capped at 5000 terms; LDA uses ``random_state=0``.

    Args:
        reviews: Iterable of raw review strings.
        n_topics: Number of LDA topics to fit.
        n_top_words: How many of the highest-weighted words to keep per topic
            (default 10, the value that used to be hard-coded).

    Returns:
        A dict mapping 'Topic <i>' (0-based) to ``{word: weight}``, where the
        weight is the word's entry in that topic's row of ``lda.components_``.
        Words are inserted from lowest to highest weight among those kept.

    Raises:
        ValueError: If ``n_top_words`` is smaller than 1. The vectorizer also
            raises ValueError when no usable vocabulary remains (e.g. the
            texts contain only stop words).
    """
    if n_top_words < 1:
        # A slice of [-0:] would silently select every word in the vocabulary.
        raise ValueError("n_top_words must be at least 1")

    vectorizer = CountVectorizer(stop_words='english', max_features=5000)
    X = vectorizer.fit_transform(reviews)
    lda = LatentDirichletAllocation(n_components=n_topics, random_state=0)
    lda.fit(X)

    feature_names = vectorizer.get_feature_names_out()
    topics = {}
    for topic_idx, topic in enumerate(lda.components_):
        # argsort is ascending, so the tail holds the highest-weighted words
        top_features_ind = topic.argsort()[-n_top_words:]
        topics[f'Topic {topic_idx}'] = {feature_names[i]: topic[i] for i in top_features_ind}
    return topics

# Function to save topics with scores for each restaurant and each rating
def save_lda_topics_with_scores_by_rating(business_info, output_path='lda_output'):
    """Write one LDA-topics JSON file per (business, star rating) group.

    For every business and every rating that has at least one review, topics
    are extracted with ``extract_lda_topics_with_scores`` and saved as
    '<output_path>/<name>_rating_<rating>_lda_topics.json'. A group whose
    extraction raises ValueError (for instance "empty vocabulary" when the
    reviews contain only stop words) is reported and skipped, so one bad
    group no longer aborts the whole run.

    NOTE(review): the file name depends only on the business name and the
    rating, so businesses that share a name (e.g. several locations of a
    chain) write to the same file and the later one overwrites the earlier.

    Args:
        business_info: Mapping of business_id to a dict with 'name' and
            'reviews' (rating -> list of review texts).
        output_path: Directory for the output files; created if missing.

    Returns:
        None.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_path, exist_ok=True)

    for info in business_info.values():
        restaurant_name = info['name']
        for rating, reviews in info['reviews'].items():
            if not reviews:
                continue
            try:
                topics_with_scores = extract_lda_topics_with_scores(reviews)
            except ValueError as e:
                print(f"Skipped LDA topics for {restaurant_name} (Rating {rating}): {e}")
                continue
            output_file = f"{output_path}/{restaurant_name}_rating_{rating}_lda_topics.json"
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(topics_with_scores, f, indent=4)
            print(f"Saved LDA topics with scores for {restaurant_name} (Rating {rating})")

# Load business information (business_id -> name, stars, empty reviews dict)
business_info = load_business_info(business_file_path)

# Map reviews to businesses: every review text of a known business is kept
# in memory, grouped by star rating
business_info = map_reviews_to_business(review_file_path, business_info)

# Save LDA topics with word scores for each restaurant and each rating.
# One LDA model is fitted per (business, rating) group that has reviews.
save_lda_topics_with_scores_by_rating(business_info, output_path='/content/drive/MyDrive/Data Mining Project UIUC/task1/lda_topics_with_scores_output')

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Saved LDA topics with scores for Super Dragon (Rating 5)
Saved LDA topics with scores for Desert Mission Food Bank (Rating 4)
Saved LDA topics with scores for Desert Mission Food Bank (Rating 5)
Saved LDA topics with scores for China Super Buffet (Rating 2)
Saved LDA topics with scores for China Super Buffet (Rating 3)
Saved LDA topics with scores for China Super Buffet (Rating 4)
Saved LDA topics with scores for Stumpys Pizza (Rating 2)
Saved LDA topics with scores for Stumpys Pizza (Rating 4)
Saved LDA topics with scores for Stumpys Pizza (Rating 5)
Saved LDA topics with scores for Stumpys Pizza (Rating 1)
Saved LDA topics with scores for Stumpys Pizza (Rating 3)
Saved LDA topics with scores for Jiffy Lube (Rating 5)
Saved LDA topics with scores for Jiffy Lube (Rating 4)
Saved LDA topics with scores for Jiffy Lube (Rating 2)
Saved LDA topics with scores for Lantana Grille (Rating 3)
Saved LDA topics with scores for Lant

ValueError: empty vocabulary; perhaps the documents only contain stop words

**Compare the Business**

In [None]:
# input_path: directory holding the per-restaurant, per-rating LDA topic files
# output_path: JSON file that will receive the aggregated comparison data
input_path = '/content/drive/MyDrive/Data Mining Project UIUC/task1/lda_topics_with_scores_output'
output_path = '/content/drive/MyDrive/Data Mining Project UIUC/task1/compare_business_viz.json'

In [None]:
# The three restaurants to compare; each name must match the prefix of its
# saved '<name>_rating_<n>_lda_topics.json' files.
selected_restaurants = [
    "Filiberto's Mexican Food",
    "Carolina's Mexican Food",
    "Elvira's Mexican Food"
]

# restaurant name -> {"Rating_<n>": [{"word": ..., "score": ...}, ...]}
visualization_data = {}

for restaurant in selected_restaurants:
    restaurant_data = {}
    for rating in range(1, 6):  # star ratings 1 through 5
        filename = f"{input_path}/{restaurant}_rating_{rating}_lda_topics.json"
        if not os.path.exists(filename):
            print(f"File not found: {filename}")
            continue

        with open(filename, 'r') as f:
            topics_data = json.load(f)

        # Add up each word's score over every topic in which it appears
        word_scores = {}
        for words_scores in topics_data.values():
            for word, score in words_scores.items():
                word_scores[word] = word_scores.get(word, 0) + score

        # Keep the five words with the largest summed score
        top_words = sorted(word_scores.items(), key=lambda x: x[1], reverse=True)[:5]
        top_words_data = [{"word": word, "score": score} for word, score in top_words]
        restaurant_data[f"Rating_{rating}"] = top_words_data

    visualization_data[restaurant] = restaurant_data

# Write the aggregated data for the visualization
with open(output_path, 'w') as f:
    json.dump(visualization_data, f, indent=4)

print(f"Visualization data saved to {output_path}")

Visualization data saved to /content/drive/MyDrive/Data Mining Project UIUC/task1/compare_business_viz.json
