# Project Setup

In [1]:
!pip install spacy nltk
!pip install transformers torch
!pip install accelerate -U
!python -m spacy download en_core_web_sm

!wget --no-check-certificate 'https://drive.usercontent.google.com/uc?id=1rQeOakOJ6xxIY-q--u3SlSE3g7qlxVTE&authuser=0&export=download' -O combined_dataset.csv
!wget --no-check-certificate 'https://drive.usercontent.google.com/uc?id=18tJoEfHHKp8hQaVP0FP9FeOItcm6FWTo&authuser=0&export=download' -O frequent_patterns.pkl

#from google.colab import drive
#drive.mount('/content/drive')

Collecting accelerate
  Downloading accelerate-0.33.0-py3-none-any.whl.metadata (18 kB)
Downloading accelerate-0.33.0-py3-none-any.whl (315 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m315.1/315.1 kB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.32.1
    Uninstalling accelerate-0.32.1:
      Successfully uninstalled accelerate-0.32.1
Successfully installed accelerate-0.33.0
Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m70.3 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Ju

In [2]:
import spacy
import json
import pandas as pd
import random
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import nltk
import re
from collections import Counter
import csv
import pickle
import torch
from transformers import DistilBertForQuestionAnswering, DistilBertTokenizer

nlp = spacy.load("en_core_web_sm")

nltk.download('punkt')
nltk.download('stopwords')

porter = PorterStemmer()
stop_words = set(stopwords.words('english'))

# DRIVE = 'drive/MyDrive/'
DRIVE = ''
DATASET_NAME = 'combined_dataset.csv'
MINED_PATTERNS_NAME = 'frequent_patterns.pkl'
PATTERN_PATH = DRIVE + MINED_PATTERNS_NAME
DATASET_PATH = DRIVE + DATASET_NAME

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
review_df = pd.read_csv(DATASET_PATH)
mined_patterns = pickle.load(open(PATTERN_PATH, 'rb'))

# Get all reviews for a random product

In [4]:
def get_random_reviews(df: pd.DataFrame):
  # Select a random parent_asin from the DataFrame
  random_parent_asin = random.choice(df['parent_asin'].unique())

  # Filter the DataFrame to include only rows with the selected parent_asin
  filtered_df = df[df['parent_asin'] == random_parent_asin]

  # Convert the filtered DataFrame to the list of dictionaries
  reviews_list = filtered_df.to_dict(orient='records')

  return reviews_list

# Identify frequent nouns in the reviews

In [5]:
def get_frequent_nouns(reviews):
  total_sentences = 0
  noun_phrase_counter = Counter()
  for review in reviews:
    cleaned_text = remove_html_tags(review['review'])
    noun_phrases = extract_noun_phrases(cleaned_text)
    cleaned_noun_phrases = [phrase for phrase in noun_phrases if phrase]
    noun_phrase_counter.update(cleaned_noun_phrases)
    total_sentences += count_sentences(cleaned_text)

  threshold = total_sentences * 0.01
  frequent_noun_phrases = [phrase for phrase, count in noun_phrase_counter.items() if count > threshold]

  return frequent_noun_phrases

def remove_html_tags(text):
  clean = re.compile('<.*?>')
  return re.sub(clean, '', text)

def pos_tag(text):
  doc = nlp(text)
  pos_tags = [(chunk.text, chunk.pos_) for chunk in doc.noun_chunks]
  return pos_tags

def stem_nouns(pos_tagged_text):
  stemmed_nouns = []
  for word, tag in pos_tagged_text:
    if tag == 'NOUN':
      stemmed_nouns.append(porter.stem(word))
  return stemmed_nouns

def filter_stop_words(nouns):
  return [word for word in nouns if word.lower() not in stop_words]

def extract_noun_phrases(text):
  doc = nlp(text)
  noun_phrases = []

  for chunk in doc.noun_chunks:
    noun_only_phrase = ' '.join([porter.stem(token.text) for token in chunk if token.pos_ == 'NOUN' and token.text.lower() not in stop_words])
    if noun_only_phrase:
      noun_phrases.append(noun_only_phrase)

  return noun_phrases

def count_sentences(text):
  doc = nlp(text)
  return len(list(doc.sents))

# Get the "unstemmed" version of the frequent nouns in a review

In [6]:
def unstem_nouns_in_review(frequent_nouns, review_text):
    review_words = review_text.split()

    noun_map = {}

    for word in review_words:
        stemmed_word = porter.stem(word)

        if stemmed_word in frequent_nouns:
            if stemmed_word not in noun_map:
                noun_map[stemmed_word] = word

    return list(noun_map.values())

# Get "unstemmed" frequent nouns in a review that match the mined patterns for product aspects

In [7]:
def extract_nouns_from_review(review, mined_patterns, nouns_in_review):
    # Clean the review and tokenize it
    cleaned_text = remove_html_tags(review)
    doc = nlp(cleaned_text)
    sentences = [sent.text for sent in doc.sents]

    extracted_aspects = []

    for sentence in sentences:
        doc_sentence = nlp(sentence)

        # Iterate through noun chunks in the sentence
        for chunk in doc_sentence.noun_chunks:
            pos_pattern = []
            aspect_nouns = []

            # Generate the POS pattern and identify aspect nouns
            for token in chunk:
                if token.pos_ == 'NOUN':
                    pos_pattern.append('_ASP')
                    aspect_nouns.append(token.text)
                else:
                    pos_pattern.append(token.pos_)

            pattern_str = ' '.join(pos_pattern)

            # Check if the pattern matches any mined pattern
            if pattern_str in mined_patterns:
                # Join the aspect nouns to handle multi-word nouns
                extracted_aspect = ' '.join(aspect_nouns)

                # Check if the extracted aspect matches any noun in the review noun list
                for noun in nouns_in_review:
                    # Use regex to handle matching with multi-word nouns
                    if re.search(r'\b' + re.escape(extracted_aspect) + r'\b', noun):
                        extracted_aspects.append(extracted_aspect)

    return extracted_aspects

# Get review key points for all extracted product aspects using the QA system

In [8]:
model_checkpoint = 'noahjl/distilbert-base-cased-distilled-squad-finetuned-squad'

tokenizer = DistilBertTokenizer.from_pretrained(model_checkpoint)
model = DistilBertForQuestionAnswering.from_pretrained(model_checkpoint)

def answer_question(context: str, question: str) -> str:
    # Tokenize the input (context and question)
    inputs = tokenizer.encode_plus(question, context, return_tensors='pt')

    # Forward pass through the model to get start and end logits
    with torch.no_grad():
        outputs = model(**inputs)
        start_logits = outputs.start_logits
        end_logits = outputs.end_logits

    # Get the most likely start and end token positions
    start_index = torch.argmax(start_logits)
    end_index = torch.argmax(end_logits) + 1

    # Decode the answer from token ids
    answer = tokenizer.convert_tokens_to_string(
        tokenizer.convert_ids_to_tokens(inputs['input_ids'][0][start_index:end_index])
    )

    return answer

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/669k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/597 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/261M [00:00<?, ?B/s]

In [9]:
"""def get_review_key_points(product_reviews):
  frequent_nouns = get_frequent_nouns(product_reviews)
  extracted_review_key_points = []

  for review in product_reviews:
    unstemmed_nouns = unstem_nouns_in_review(frequent_nouns, review['review'])
    extracted_aspects = extract_nouns_from_review(review['review'], mined_patterns, unstemmed_nouns)
    review_key_points = []
    for aspect in extracted_aspects:
      try:
        question = f"What does the reviewer say about the {aspect}?"
        answer = answer_question(review['review'], aspect)
        if "[SEP]" in answer:
          answer = answer.split("[SEP]")[1]
        if answer != "" and "[CLS]" not in answer:
          answer_words = answer.split()
          if len(answer_words) > 1 and len(answer_words) < 10:
            review_key_points.append(answer)
      except:
        continue
    extracted_review_key_points.extend(list(set(review_key_points)))
  return extracted_review_key_points"""

def get_review_key_points(product_reviews):
    frequent_nouns = get_frequent_nouns(product_reviews)
    extracted_reviews = []

    for review in product_reviews:
        unstemmed_nouns = unstem_nouns_in_review(frequent_nouns, review['review'])
        extracted_aspects = extract_nouns_from_review(review['review'], mined_patterns, unstemmed_nouns)
        review_key_points = []

        for aspect in extracted_aspects:
            try:
                question = f"What does the reviewer mention about the {aspect}?"
                answer = answer_question(review['review'], aspect)

                if "[SEP]" in answer:
                    answer = answer.split("[SEP]")[1]

                if answer != "" and "[CLS]" not in answer:
                    answer_words = answer.split()

                    if len(answer_words) > 1 and len(answer_words) < 10:
                        review_key_points.append(answer)

            except Exception as e:
                continue

        # Remove duplicates from the review key points
        review_key_points = list(set(review_key_points))

        extracted_reviews.append({
            "review_text": review['review'],
            "review_key_points": review_key_points,
            "rating": review.get('rating', None)
        })

    return extracted_reviews


# More Preparation

In [None]:
# Sentiment
!mkdir absa_handler/
!mkdir output/

!wget https://raw.githubusercontent.com/burakyuslu/predicting_ratings_using_graphs/dataset_and_qa/absa_handler/__init__.py -O absa_handler/__init__.py
!wget https://raw.githubusercontent.com/burakyuslu/predicting_ratings_using_graphs/dataset_and_qa/absa_handler/add_predicted_sentiments.py -O absa_handler/add_predicted_sentiments.py
!wget --no-check-certificate 'https://drive.usercontent.google.com/download?id=1S3juHft_G2No7tJtXg9bQbcCsrMqdF0G&export=download&authuser=0&confirm=t' -O absa_handler/sentiment_model_completed.zip

import zipfile
with zipfile.ZipFile('absa_handler/sentiment_model_completed.zip', 'r') as zip_ref:
    zip_ref.extractall('')

import sys
import os
sys.path.append('/content/absa_handler')

from add_predicted_sentiments import add_predicted_sentiments

# Graph

!pip install pyvis
!pip install networkx

!mkdir graph_handler/
!mkdir prediction_handler/

!wget https://raw.githubusercontent.com/burakyuslu/predicting_ratings_using_graphs/dataset_and_qa/graph_handler/__init__.py -O graph_handler/__init__.py
!wget https://raw.githubusercontent.com/burakyuslu/predicting_ratings_using_graphs/dataset_and_qa/graph_handler/graph_handler.py -O graph_handler/graph_handler.py
!wget https://raw.githubusercontent.com/burakyuslu/predicting_ratings_using_graphs/dataset_and_qa/prediction_handler/__init__.py -O prediction_handler/__init__.py
!wget https://raw.githubusercontent.com/burakyuslu/predicting_ratings_using_graphs/dataset_and_qa/prediction_handler/prediction_handler.py -O prediction_handler/prediction_handler.py

from graph_handler import GraphHandler
from prediction_handler import PredictionHandler  # Ensure this is defined in another file or added here
import gensim.downloader as api

word2vec_model = api.load('word2vec-google-news-300')

# Usage

In [31]:
# Review Key Points
product_reviews = get_random_reviews(review_df)
data = get_review_key_points(product_reviews)
cleaned_data = [item for item in data if item.get('review_key_points')]


with open('output/review_keypoints.json', 'w') as json_file:
    json.dump(cleaned_data, json_file, indent=4)

# Sentiment
add_predicted_sentiments('output/review_keypoints.json', 'output/review_keypoints_sentiments.json')

# Graph
# Initialize GraphHandler with the Word2Vec model
graph_handler = GraphHandler(word2vec_model)

# Load reviews from JSON
input_json_path = '/content/output/review_keypoints_sentiments.json'
reviews = graph_handler.load_reviews(input_json_path)

# Generate embeddings and labels for each RKP
embeddings, rkp_labels, rkp_ratings, sentiments = graph_handler.generate_embeddings(reviews)

# perform sentiment-aware clustering
num_clusters = 7
labels = graph_handler.perform_sentiment_aware_clustering(embeddings, rkp_labels, sentiments, num_clusters)

# construct and save the graph
output_graph_path = '/content/output/output_graph.gpickle'
graph = graph_handler.construct_graph(reviews, (embeddings, rkp_labels, rkp_ratings, sentiments), labels)
graph_handler.save_graph(graph, output_graph_path)

# Save cluster information (mostly debug, but it is also something interesting by itself)
output_cluster_path = '/content/output/clusters.txt'
graph_handler.save_clusters(rkp_labels, labels, output_cluster_path)

# Visualize the graph
output_directory = '/content/output/graph_visualization.html'
graph_handler.visualize_graph(graph, output_directory)

# predict and evaluate ratings
prediction_handler = PredictionHandler(graph, word2vec_model)

prediction_input_json_path = '/content/output/review_keypoints_sentiments.json'
reviews_to_predict = graph_handler.load_reviews(prediction_input_json_path)

prediction_results, accuracy = prediction_handler.evaluate_predictions(reviews_to_predict)
prediction_handler.save_results(prediction_results, accuracy, '/content/output/results.txt')

prediction_results, metrics = prediction_handler.evaluate_predictions_three_class(reviews_to_predict)
prediction_handler.save_results(prediction_results, metrics, '/content/output/results_three_class.txt')

Sentiments added


  super()._check_params_vs_input(X, default_n_init=10)


ValueError: n_samples=3 should be >= n_clusters=6.