# Project Setup

In [None]:
# Install the libraries this notebook imports: spaCy and NLTK for the text
# processing, transformers + torch (+ accelerate) for the question-answering model.
!pip install spacy nltk
!pip install transformers torch
!pip install accelerate -U
# Download the small English spaCy pipeline that is loaded with spacy.load("en_core_web_sm").
!python -m spacy download en_core_web_sm

# Fetch the review dataset and the pickled mined patterns into the working directory.
# NOTE(review): --no-check-certificate switches off TLS certificate verification
# for these downloads, and the .pkl file is later read with pickle.load.
!wget --no-check-certificate 'https://drive.usercontent.google.com/uc?id=1rQeOakOJ6xxIY-q--u3SlSE3g7qlxVTE&authuser=0&export=download' -O combined_dataset.csv
!wget --no-check-certificate 'https://drive.usercontent.google.com/uc?id=18tJoEfHHKp8hQaVP0FP9FeOItcm6FWTo&authuser=0&export=download' -O frequent_patterns.pkl

# Disabled alternative: mount Google Drive and read the input files from there.
#from google.colab import drive
#drive.mount('/content/drive')

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-

In [None]:
import spacy
import json
import pandas as pd
import random
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import nltk
import re
from collections import Counter
import csv
import pickle
import torch
from transformers import DistilBertForQuestionAnswering, DistilBertTokenizer

# Load the small English spaCy pipeline once; the helper functions below share it.
nlp = spacy.load("en_core_web_sm")

# Fetch the NLTK resources. 'stopwords' backs stopwords.words('english') below.
# NOTE(review): no code visible in this notebook references the 'punkt'
# tokenizer data - confirm it is still needed.
nltk.download('punkt')
nltk.download('stopwords')

# Shared Porter stemmer and English stop-word set used when normalising nouns.
porter = PorterStemmer()
stop_words = set(stopwords.words('english'))

# Path prefix for the input files; the empty string means the current working
# directory. Swap in the commented value to read from a mounted Google Drive.
# DRIVE = 'drive/MyDrive/'
DRIVE = ''
DATASET_NAME = 'combined_dataset.csv'
MINED_PATTERNS_NAME = 'frequent_patterns.pkl'
# Full paths are the prefix and the file name joined by plain string concatenation.
PATTERN_PATH = DRIVE + MINED_PATTERNS_NAME
DATASET_PATH = DRIVE + DATASET_NAME

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Load the review table that the rest of the notebook works on.
review_df = pd.read_csv(DATASET_PATH)

# Load the mined POS patterns. The context manager closes the file handle as
# soon as the load finishes instead of leaving it open until garbage collection.
# NOTE(security): pickle.load can execute arbitrary code while deserialising;
# only point PATTERN_PATH at a file from a trusted source.
with open(PATTERN_PATH, 'rb') as pattern_file:
    mined_patterns = pickle.load(pattern_file)

# Get all reviews for a random product

In [None]:
def get_random_reviews(df: pd.DataFrame):
  """Return all review rows of one randomly chosen product.

  Args:
    df: Review table with a 'parent_asin' column identifying the product.

  Returns:
    A list with one dict per row (column name -> value) for the chosen
    product, or an empty list when no row carries a product id.

  Raises:
    KeyError: If ``df`` has no 'parent_asin' column.
  """
  # Missing ids are dropped before drawing: a NaN/None id can never satisfy the
  # equality filter below, so picking it would silently return no reviews.
  # tolist() hands random.choice a plain Python sequence.
  product_ids = df['parent_asin'].dropna().unique().tolist()
  if not product_ids:
    return []

  chosen_id = random.choice(product_ids)

  # Keep only the rows of the chosen product and convert them to records.
  return df[df['parent_asin'] == chosen_id].to_dict(orient='records')

# Identify frequent nouns in the reviews

In [None]:
def get_frequent_nouns(reviews, min_support=0.01):
  """Collect the stemmed noun phrases that occur frequently across reviews.

  Each review's text is stripped of HTML tags, its noun phrases are counted,
  and its sentences are added to a running total. A phrase is frequent when
  its count is strictly greater than ``min_support * total_sentences``.

  Args:
    reviews: Iterable of dicts that each hold the review text under 'review'.
    min_support: Fraction of the total sentence count a phrase must exceed.
      The default 0.01 reproduces the previously hard-coded threshold.

  Returns:
    List of frequent noun phrases, in first-seen order.
  """
  total_sentences = 0
  noun_phrase_counter = Counter()
  for review in reviews:
    review_text = review['review']
    # Empty CSV cells arrive from pandas as NaN (a float), which the regex and
    # spaCy helpers cannot process; such entries contribute nothing.
    if not isinstance(review_text, str):
      continue

    cleaned_text = remove_html_tags(review_text)
    noun_phrase_counter.update(
        phrase for phrase in extract_noun_phrases(cleaned_text) if phrase
    )
    total_sentences += count_sentences(cleaned_text)

  threshold = total_sentences * min_support
  return [phrase for phrase, count in noun_phrase_counter.items() if count > threshold]

def remove_html_tags(text):
  """Return ``text`` with every ``<...>`` span deleted.

  The match is non-greedy, so each tag is removed on its own, and ``.`` does
  not cross newlines, so a tag split over several lines is left in place.
  """
  return re.compile('<.*?>').sub('', text)

def pos_tag(text):
  """Pair every noun chunk in ``text`` with the POS tag of its root token.

  Args:
    text: Raw text to run through the shared spaCy pipeline.

  Returns:
    List of ``(chunk_text, pos)`` tuples, one per noun chunk.
  """
  doc = nlp(text)
  # noun_chunks yields Span objects, and a Span has no `pos_` attribute of its
  # own (coarse POS tags live on tokens), so the tag is read from the chunk's
  # root token.
  return [(chunk.text, chunk.root.pos_) for chunk in doc.noun_chunks]

def stem_nouns(pos_tagged_text):
  """Return the Porter stems of the words tagged 'NOUN'.

  Args:
    pos_tagged_text: Iterable of ``(word, tag)`` pairs.

  Returns:
    Stems of the 'NOUN'-tagged words, in input order; other pairs are dropped.
  """
  return [porter.stem(word) for word, tag in pos_tagged_text if tag == 'NOUN']

def filter_stop_words(nouns):
  """Return the entries of ``nouns`` whose lower-cased form is not a stop word."""
  kept = []
  for candidate in nouns:
    if candidate.lower() in stop_words:
      continue
    kept.append(candidate)
  return kept

def extract_noun_phrases(text):
  """Reduce each noun chunk of ``text`` to its stemmed, non-stop-word nouns.

  For every spaCy noun chunk, the tokens tagged 'NOUN' that are not stop words
  are Porter-stemmed and joined with single spaces. Chunks that yield an empty
  string are dropped.

  Returns:
    List of the resulting phrases, in document order.
  """
  phrases = []
  for chunk in nlp(text).noun_chunks:
    stems = []
    for token in chunk:
      if token.pos_ != 'NOUN':
        continue
      if token.text.lower() in stop_words:
        continue
      stems.append(porter.stem(token.text))

    phrase = ' '.join(stems)
    if phrase:
      phrases.append(phrase)
  return phrases

def count_sentences(text):
  """Return how many sentences the shared spaCy pipeline finds in ``text``."""
  return sum(1 for _ in nlp(text).sents)

# Get the "unstemmed" version of the frequent nouns in a review

In [None]:
def unstem_nouns_in_review(frequent_nouns, review_text):
    """Map the frequent (stemmed) nouns back to words as written in a review.

    The review is split on whitespace, each word is Porter-stemmed, and the
    first word seen for every stem found in ``frequent_nouns`` is kept.

    Args:
        frequent_nouns: Collection of stemmed nouns to look for.
        review_text: Raw review text.

    Returns:
        List of original words, one per matched stem, in first-seen order.
    """
    # Membership is checked once per word; a set makes each check constant
    # time instead of a scan over the whole list.
    frequent_stems = set(frequent_nouns)

    surface_by_stem = {}
    # NOTE(review): splitting on whitespace leaves punctuation attached to
    # words (e.g. "skin,"), and single words cannot equal a multi-word entry
    # of frequent_nouns - confirm whether those misses are acceptable.
    for word in review_text.split():
        stem = porter.stem(word)
        if stem in frequent_stems:
            # setdefault keeps the first surface form recorded for a stem.
            surface_by_stem.setdefault(stem, word)

    return list(surface_by_stem.values())

# Get "unstemmed" frequent nouns in a review that match the mined patterns for product aspects

In [None]:
def extract_nouns_from_review(review, mined_patterns, nouns_in_review):
    """Extract aspect nouns whose noun-chunk POS pattern was mined as frequent.

    Every noun chunk is turned into a space-separated pattern in which tokens
    tagged 'NOUN' become '_ASP' and all other tokens become their POS tag. If
    that pattern is in ``mined_patterns``, the chunk's nouns (joined by spaces)
    form a candidate aspect. The aspect is kept when it occurs as a whole word
    or phrase inside at least one entry of ``nouns_in_review``.

    Args:
        review: Raw review text (may contain HTML tags).
        mined_patterns: Container of pattern strings supporting ``in``.
        nouns_in_review: Words from the review that map to frequent nouns.

    Returns:
        List of aspects in document order. An aspect is added at most once per
        noun chunk but may repeat if several chunks produce it.
    """
    cleaned_text = remove_html_tags(review)
    doc = nlp(cleaned_text)

    extracted_aspects = []

    for sentence in doc.sents:
        # Each sentence is parsed again on its own, exactly as before.
        # NOTE(review): the sentence spans of `doc` could probably be used
        # directly, but that may change the tags - confirm before changing.
        doc_sentence = nlp(sentence.text)

        for chunk in doc_sentence.noun_chunks:
            aspect_nouns = [token.text for token in chunk if token.pos_ == 'NOUN']

            # Without a noun the aspect would be the empty string, whose regex
            # (two adjacent word boundaries) matches every word and would add
            # empty aspects to the result.
            if not aspect_nouns:
                continue

            pattern_str = ' '.join(
                '_ASP' if token.pos_ == 'NOUN' else token.pos_ for token in chunk
            )
            if pattern_str not in mined_patterns:
                continue

            # Join the aspect nouns to handle multi-word nouns.
            extracted_aspect = ' '.join(aspect_nouns)

            # Compile once per chunk and stop at the first matching noun so the
            # aspect is not appended again for every further noun that matches.
            aspect_regex = re.compile(r'\b' + re.escape(extracted_aspect) + r'\b')
            if any(aspect_regex.search(noun) for noun in nouns_in_review):
                extracted_aspects.append(extracted_aspect)

    return extracted_aspects

# Get review key points for all extracted product aspects using the QA system

In [None]:
# Identifier of the DistilBERT question-answering checkpoint passed to from_pretrained below.
model_checkpoint = 'noahjl/distilbert-base-cased-distilled-squad-finetuned-squad'

# Load the tokenizer and the question-answering model for that checkpoint;
# both are module-level objects shared by answer_question.
tokenizer = DistilBertTokenizer.from_pretrained(model_checkpoint)
model = DistilBertForQuestionAnswering.from_pretrained(model_checkpoint)

def answer_question(context: str, question: str) -> str:
    """Return the span of ``context`` the QA model selects as the answer.

    The question and context are encoded as one sequence pair, the model's
    most likely start and end positions are taken, and the tokens between them
    are decoded back into a string.

    Args:
        context: Text to search for the answer.
        question: Question to ask about ``context``.

    Returns:
        The decoded answer. It is empty when the predicted end precedes the
        predicted start, and it can contain special tokens such as ``[CLS]``
        or ``[SEP]`` when the predicted span covers them.
    """
    # Truncate only the context (the second sequence) so that long reviews fit
    # the model's position limit instead of raising inside the forward pass.
    inputs = tokenizer(
        question,
        context,
        truncation='only_second',
        max_length=model.config.max_position_embeddings,
        return_tensors='pt',
    )

    # Inference only: no gradients are needed for the start/end logits.
    with torch.no_grad():
        outputs = model(**inputs)

    # Most likely start position and (exclusive) end position of the answer.
    start_index = int(torch.argmax(outputs.start_logits))
    end_index = int(torch.argmax(outputs.end_logits)) + 1

    # Decode the selected token ids of the single encoded sequence pair.
    answer_ids = inputs['input_ids'][0][start_index:end_index].tolist()
    return tokenizer.convert_tokens_to_string(
        tokenizer.convert_ids_to_tokens(answer_ids)
    )

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/669k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/597 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/261M [00:00<?, ?B/s]

In [None]:
"""def get_review_key_points(product_reviews):
  frequent_nouns = get_frequent_nouns(product_reviews)
  extracted_review_key_points = []

  for review in product_reviews:
    unstemmed_nouns = unstem_nouns_in_review(frequent_nouns, review['review'])
    extracted_aspects = extract_nouns_from_review(review['review'], mined_patterns, unstemmed_nouns)
    review_key_points = []
    for aspect in extracted_aspects:
      try:
        question = f"What does the reviewer say about the {aspect}?"
        answer = answer_question(review['review'], aspect)
        if "[SEP]" in answer:
          answer = answer.split("[SEP]")[1]
        if answer != "" and "[CLS]" not in answer:
          answer_words = answer.split()
          if len(answer_words) > 1 and len(answer_words) < 10:
            review_key_points.append(answer)
      except:
        continue
    extracted_review_key_points.extend(list(set(review_key_points)))
  return extracted_review_key_points"""

def get_review_key_points(product_reviews):
    """Extract short key points about product aspects from each review.

    For every review, the aspects matching the mined patterns are extracted
    and the QA model is asked what the reviewer mentions about each of them.
    Answers of 2 to 9 words that contain no ``[CLS]`` token are kept.

    Args:
        product_reviews: List of dicts holding the text under 'review' and,
            optionally, a 'rating'.

    Returns:
        One dict per review with the keys 'review_text', 'review_key_points'
        (unique key points in first-seen order) and 'rating' (None if absent).
    """
    import warnings  # local import so the block does not rely on the import cell

    frequent_nouns = get_frequent_nouns(product_reviews)
    extracted_reviews = []

    for review in product_reviews:
        review_text = review['review']
        review_key_points = []

        # Empty CSV cells arrive from pandas as NaN (a float); such reviews get
        # an entry with no key points instead of crashing the text helpers.
        if isinstance(review_text, str):
            unstemmed_nouns = unstem_nouns_in_review(frequent_nouns, review_text)
            extracted_aspects = extract_nouns_from_review(review_text, mined_patterns, unstemmed_nouns)

            for aspect in extracted_aspects:
                question = f"What does the reviewer mention about the {aspect}?"
                try:
                    # Ask the full question; the bare aspect was passed before
                    # and the question string was never used.
                    answer = answer_question(review_text, question)
                except Exception as error:
                    # Still best-effort, but a failing aspect is now reported
                    # instead of vanishing silently.
                    warnings.warn(f"Skipping aspect {aspect!r}: {error}")
                    continue

                # Keep the part after the first [SEP] when the span crosses it.
                if "[SEP]" in answer:
                    answer = answer.split("[SEP]")[1]

                # Strip padding left by the split so ' Nice moisturizer' and
                # 'Nice moisturizer' count as the same key point.
                answer = answer.strip()

                if answer and "[CLS]" not in answer and 1 < len(answer.split()) < 10:
                    review_key_points.append(answer)

        extracted_reviews.append({
            "review_text": review_text,
            # dict.fromkeys removes duplicates while keeping a stable order.
            "review_key_points": list(dict.fromkeys(review_key_points)),
            "rating": review.get('rating', None)
        })

    return extracted_reviews


# Usage

In [None]:
# Pick one product at random and collect all of its reviews.
product_reviews = get_random_reviews(review_df)

# Extract the key points per review. As the last expression of the cell, the
# returned list (one dict per review) is displayed as the cell output.
get_review_key_points(product_reviews)

['glass jar',
 'irritated , and itchy skin',
 'skin feeling dry',
 'Good for the eyes and dry skin',
 'First time using snail cream',
 ' Nice moisturizer',
 'dry skin',
 'amazing cream',
 'Nice moisturizer',
 'Sensitive skin',
 'Very nice texture',
 'nice feeling on skin',
 'moisturizers that I have used',
 'cream is completely I scented',
 'product is pretty good',
 'Great moisturizer',
 'Great price',
 'life - saver moisturizer']