# Project Setup

In [None]:
!pip install spacy nltk
!pip install transformers torch
!pip install accelerate -U
!python -m spacy download en_core_web_sm

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so the CSV files stored there can be opened by path.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import spacy
import json
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import nltk
import re
from collections import Counter
import csv

# Load the small English spaCy pipeline; `nlp` is used by the cells below for
# POS tags, noun chunks and sentence splitting.
nlp = spacy.load("en_core_web_sm")

# Fetch the NLTK resources: the 'punkt' tokenizer data and the stop-word lists.
nltk.download('punkt')
nltk.download('stopwords')

# Shared stemmer and English stop-word set used by the text helpers below.
porter = PorterStemmer()
stop_words = set(stopwords.words('english'))

# Absolute path under the mount point passed to drive.mount(), so file access
# does not depend on the notebook's current working directory.
DRIVE = '/content/drive/MyDrive/'
DATASET_NAME = 'combined_dataset.csv'
PRODUCT_ASPECTS_NAME = 'product_aspects.csv'
DATASET_PATH = DRIVE + DATASET_NAME
PRODUCT_ASPECTS_PATH = DRIVE + PRODUCT_ASPECTS_NAME

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Read every review row into a plain dict keyed by the CSV header.
# The encoding is spelled out so the result does not depend on the platform's
# default locale; newline='' is what the csv module expects for file objects.
with open(DATASET_PATH, mode='r', newline='', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile)
    reviews = [dict(row) for row in reader]

# Read the list of known product aspects from the 'product_aspect' column.
with open(PRODUCT_ASPECTS_PATH, mode='r', newline='', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile)
    product_aspects = [row['product_aspect'] for row in reader]

# Pattern Mining
As a first step, we mine the part-of-speech (POS) patterns that link known product aspects to their nearest adjective. We can then use these patterns to extract matching product aspects from any new review. This requires a sufficiently large list of known product aspects.

In [None]:
def remove_html_tags(text):
    """Return ``text`` with every ``<...>`` span deleted.

    The match is non-greedy, so each span runs from a ``<`` to the first
    following ``>``. ``.`` does not match a newline here, so a span that is
    broken across lines is left in place.
    """
    tag_pattern = '<.*?>'
    return re.sub(tag_pattern, '', text)

In [None]:
def pos_tag(text):
    """Tag every token of ``text`` with its coarse part-of-speech label.

    The previous version iterated over ``doc.noun_chunks`` and read
    ``chunk.pos_``; noun chunks are spaCy ``Span`` objects, which carry no
    ``pos_`` attribute, so the call failed. Tagging is now done per token,
    which also yields single words that ``stem_nouns`` can filter and stem.

    Args:
        text: Raw text to analyse.

    Returns:
        A list of ``(token_text, pos)`` tuples in document order, where
        ``pos`` is spaCy's ``Token.pos_`` value (for example ``'NOUN'``).
    """
    doc = nlp(text)
    return [(token.text, token.pos_) for token in doc]

In [None]:
def stem_nouns(pos_tagged_text):
    """Stem the words tagged ``'NOUN'`` and drop everything else.

    Args:
        pos_tagged_text: Iterable of ``(word, tag)`` pairs.

    Returns:
        List with the Porter stem of each word whose tag equals ``'NOUN'``,
        in input order.
    """
    return [porter.stem(word) for word, tag in pos_tagged_text if tag == 'NOUN']

In [None]:
def filter_stop_words(nouns):
    """Return the entries of ``nouns`` that are not stop words.

    The comparison lower-cases each entry before checking it against the
    module-level ``stop_words`` set; the original casing is kept in the result.
    """
    kept = []
    for candidate in nouns:
        if candidate.lower() in stop_words:
            continue
        kept.append(candidate)
    return kept

In [None]:
def extract_noun_phrases(text, nouns_only=True):
    """Extract stemmed, stop-word-free phrases from the noun chunks of ``text``.

    This cell used to define the function twice; the second definition
    silently replaced the first. Both variants are now covered by a single
    definition whose default matches the one that was actually in effect.

    Args:
        text: Raw text to analyse.
        nouns_only: When true (default), keep only tokens tagged ``'NOUN'``
            within each chunk. When false, keep every non-stop-word token of
            the chunk, as the earlier, shadowed definition did.

    Returns:
        A list with one space-joined string of Porter stems per noun chunk.
        Chunks that end up with no tokens are skipped.
    """
    noun_phrases = []

    for chunk in nlp(text).noun_chunks:
        stems = [
            porter.stem(token.text)
            for token in chunk
            if (not nouns_only or token.pos_ == 'NOUN')
            and token.text.lower() not in stop_words
        ]
        phrase = ' '.join(stems)
        if phrase:  # Only add non-empty phrases
            noun_phrases.append(phrase)

    return noun_phrases


In [None]:
def count_sentences(text):
    """Return how many sentences the spaCy pipeline finds in ``text``."""
    sentence_total = 0
    for _ in nlp(text).sents:
        sentence_total += 1
    return sentence_total

In [None]:
def find_nearest_adjective(aspect, doc):
    """Return the adjective token closest to the first occurrence of ``aspect``.

    Args:
        aspect: Aspect word; compared case-insensitively with each token's
            ``text``, so only single-token matches are found.
        doc: Sequence of tokens exposing ``text`` and ``pos_``.

    Returns:
        The token tagged ``'ADJ'`` with the smallest index distance to the
        first matching aspect token (the earliest one on a tie), or ``None``
        when the aspect does not occur or there is no adjective.
    """
    aspect_index = next(
        (idx for idx, tok in enumerate(doc) if tok.text.lower() == aspect.lower()),
        None,
    )
    if aspect_index is None:
        return None

    adjectives = [
        (abs(idx - aspect_index), tok)
        for idx, tok in enumerate(doc)
        if tok.pos_ == 'ADJ'
    ]
    if not adjectives:
        return None

    # min() returns the first minimal entry, so the earliest adjective wins a tie.
    return min(adjectives, key=lambda pair: pair[0])[1]

In [None]:
def extract_pattern(aspect, sentence):
    """Extract the POS pattern between an aspect and its nearest adjective.

    The sentence is parsed with the module-level ``nlp`` pipeline. The segment
    runs from the aspect to the nearest adjective (inclusive, in either
    direction). Every token in it that equals the aspect (case-insensitive)
    becomes ``'_ASP'``; every other token is replaced by its ``pos_`` label.

    The previous version located the aspect without stopping at the first hit,
    so it anchored the segment on the LAST occurrence, while
    ``find_nearest_adjective`` measures distance from the FIRST one. Both now
    use the first occurrence.

    Args:
        aspect: Aspect word to look for (single token).
        sentence: Sentence text.

    Returns:
        ``(pattern, (aspect_position, adjective_position))`` where ``pattern``
        is a list of strings and the positions are token indices within the
        parsed sentence, or ``(None, None)`` when the aspect or an adjective
        cannot be found.
    """
    doc = nlp(sentence)
    nearest_adj = find_nearest_adjective(aspect, doc)

    if nearest_adj is None:
        return None, None

    aspect_lower = aspect.lower()

    # First occurrence of the aspect — the same anchor find_nearest_adjective uses.
    aspect_position = next(
        (i for i, token in enumerate(doc) if token.text.lower() == aspect_lower),
        None,
    )
    if aspect_position is None:
        return None, None

    # Token.i is the token's index within its parent document.
    adjective_position = nearest_adj.i

    start = min(aspect_position, adjective_position)
    end = max(aspect_position, adjective_position)

    # Generate POS pattern for the inclusive segment
    pattern = [
        '_ASP' if token.text.lower() == aspect_lower else token.pos_
        for token in doc[start:end + 1]
    ]

    return pattern, (aspect_position, adjective_position)

In [None]:
def mine_patterns(aspects, reviews):
    """Mine POS patterns for given aspects across reviews.

    For every sentence of every review, and for every aspect, the pattern
    returned by ``extract_pattern`` (if any) is counted as a space-joined
    string. A progress line is printed after each review.

    Args:
        aspects: Iterable of aspect words; it is iterated once per sentence,
            so it must be re-iterable (e.g. a list).
        reviews: Sequence of dicts whose ``'review'`` entry holds the text.

    Returns:
        ``collections.Counter`` mapping pattern string to occurrence count.
    """
    pattern_counter = Counter()
    total = len(reviews)

    # enumerate() gives the true position; reviews.index() was a linear scan
    # per review and returned the first match for duplicate rows.
    for review_number, review in enumerate(reviews, start=1):
        cleaned_text = remove_html_tags(review['review'])
        sentences = [sent.text for sent in nlp(cleaned_text).sents]

        for sentence in sentences:
            for aspect in aspects:
                pattern, _ = extract_pattern(aspect, sentence)
                if pattern:
                    pattern_counter[' '.join(pattern)] += 1

        print(f"Review {review_number}/{total}")

    return pattern_counter

In [None]:
def calculate_pnum(aspects, patterns, reviews):
    """Count, per aspect, how often it occurs in one of the given patterns.

    For every sentence of every review, and for every aspect, the pattern
    returned by ``extract_pattern`` is joined with spaces; if that string is
    contained in ``patterns`` the aspect's count goes up by one. A progress
    line is printed after each review.

    Args:
        aspects: Iterable of aspect words; it is iterated once per sentence,
            so it must be re-iterable (e.g. a list).
        patterns: Container of pattern strings supporting ``in``.
        reviews: Sequence of dicts whose ``'review'`` entry holds the text.

    Returns:
        ``collections.Counter`` mapping aspect to its match count; aspects
        that never match are absent.
    """
    aspect_pnum = Counter()
    total = len(reviews)

    # enumerate() gives the true position; reviews.index() was a linear scan
    # per review and returned the first match for duplicate rows.
    for review_number, review in enumerate(reviews, start=1):
        cleaned_text = remove_html_tags(review['review'])
        sentences = [sent.text for sent in nlp(cleaned_text).sents]

        for sentence in sentences:
            for aspect in aspects:
                pattern, _ = extract_pattern(aspect, sentence)
                if pattern and ' '.join(pattern) in patterns:
                    aspect_pnum[aspect] += 1

        print(f"Review {review_number}/{total}")

    return aspect_pnum

In [None]:
# Tunable parameters of the mining run.
N_MINING_REVIEWS = 1000    # how many leading reviews are used for mining
MIN_SUPPORT_RATIO = 0.01   # a pattern needs >= 1% of all counted pattern occurrences
MIN_PNUM = 2               # an aspect needs at least this many frequent-pattern matches

known_aspects = product_aspects
mining_reviews = reviews[:N_MINING_REVIEWS]

# Mining patterns from reviews
mined_patterns = mine_patterns(known_aspects, mining_reviews)

# Minimum support threshold, relative to the total number of counted occurrences
total_patterns = sum(mined_patterns.values())
min_support = total_patterns * MIN_SUPPORT_RATIO

# Filtering patterns by support
frequent_patterns = {pattern: count for pattern, count in mined_patterns.items() if count >= min_support}

# Calculate P_num and keep only the aspects with P_num >= MIN_PNUM
aspect_pnum = calculate_pnum(known_aspects, frequent_patterns, mining_reviews)
filtered_aspects = [aspect for aspect, pnum in aspect_pnum.items() if pnum >= MIN_PNUM]

print("Frequent POS Patterns with Support >= 1%:")
print(frequent_patterns)
print("Filtered Aspects (P_num >= 2):")
print(filtered_aspects)

Review 1/1000
Review 2/1000
Review 3/1000
Review 4/1000
Review 5/1000
Review 6/1000
Review 7/1000
Review 8/1000
Review 9/1000
Review 10/1000
Review 11/1000
Review 12/1000
Review 13/1000
Review 14/1000
Review 15/1000
Review 16/1000
Review 17/1000
Review 18/1000
Review 19/1000
Review 20/1000
Review 21/1000
Review 22/1000
Review 23/1000
Review 24/1000
Review 25/1000
Review 26/1000
Review 27/1000
Review 28/1000
Review 29/1000
Review 30/1000
Review 31/1000
Review 32/1000
Review 33/1000
Review 34/1000
Review 35/1000
Review 36/1000
Review 37/1000
Review 38/1000
Review 39/1000
Review 40/1000
Review 41/1000
Review 42/1000
Review 43/1000
Review 44/1000
Review 45/1000
Review 46/1000
Review 47/1000
Review 48/1000
Review 49/1000
Review 50/1000
Review 51/1000
Review 52/1000
Review 53/1000
Review 54/1000
Review 55/1000
Review 56/1000
Review 57/1000
Review 58/1000
Review 59/1000
Review 60/1000
Review 61/1000
Review 62/1000
Review 63/1000
Review 64/1000
Review 65/1000
Review 66/1000
Review 67/1000
Revi

In [None]:
import pickle

# Persist the frequent-pattern dict to frequent_patterns.pkl (relative to the
# current working directory) so it can be reloaded without re-running the mining.
with open('frequent_patterns.pkl', 'wb') as file:
    pickle.dump(frequent_patterns, file)


# Product Aspect Extraction (using Mined Patterns)

In [None]:
def extract_aspects_from_review(review, mined_patterns):
    """Extract all product aspects in the review that match the mined patterns.

    HTML tags are stripped, the text is split into sentences, and each
    sentence is parsed again on its own. For every noun chunk a pattern is
    built in which ``'NOUN'`` tokens become ``'_ASP'`` and all other tokens
    become their ``pos_`` label. If the space-joined pattern is contained in
    ``mined_patterns``, the chunk's last noun is reported as an aspect.

    Args:
        review: Review text (may contain HTML tags).
        mined_patterns: Container of pattern strings supporting ``in``.

    Returns:
        List of aspect strings in order of appearance; duplicates are kept.
        Chunks without any noun never contribute, so the list contains no
        ``None`` entries.
    """
    # Tokenize the review into sentences
    cleaned_text = remove_html_tags(review)
    sentences = [sent.text for sent in nlp(cleaned_text).sents]

    extracted_aspects = []

    for sentence in sentences:
        # Each sentence is parsed on its own, as extract_pattern does during mining.
        for chunk in nlp(sentence).noun_chunks:
            pos_pattern = []
            aspect_noun = None

            # Generate the POS pattern and identify aspect placeholder
            for token in chunk:
                if token.pos_ == 'NOUN':
                    pos_pattern.append('_ASP')
                    aspect_noun = token.text  # the last noun of the chunk wins
                else:
                    pos_pattern.append(token.pos_)

            # A chunk without a noun (e.g. a pronoun) has no aspect to report,
            # even if its pattern happens to be listed in mined_patterns.
            if aspect_noun is None:
                continue

            # Check if the pattern matches any mined pattern
            if ' '.join(pos_pattern) in mined_patterns:
                extracted_aspects.append(aspect_noun)

    return extracted_aspects

# Example usage: run the extractor on a short hand-written review.
review_text = "The game has great graphics and the race track is impressive. However, I had a problem with the controls."

# frequent_patterns is the support-filtered result of the pattern-mining cell above;
# it is passed as the function's `mined_patterns` argument.
extracted_aspects = extract_aspects_from_review(review_text, frequent_patterns)

print("Extracted Aspects:", extracted_aspects)


Extracted Aspects: ['graphics']


In [None]:
import torch
from transformers import DistilBertForQuestionAnswering, DistilBertTokenizer

# Load the pre-trained DistilBERT tokenizer and question-answering model from the
# 'distilbert-base-cased-distilled-squad' checkpoint. Both are module-level
# globals that answer_question() reads directly.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased-distilled-squad')
model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-cased-distilled-squad')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/261M [00:00<?, ?B/s]

# Review Key Point Extraction

In [None]:
def answer_question(context: str, question: str) -> str:
    """
    Answers a question given a context using the DistilBERT model.

    The question/context pair is encoded with the module-level ``tokenizer``
    and passed through the module-level ``model``. The answer is the token
    span from the highest-scoring start position to the highest-scoring end
    position (inclusive).

    Only the context is truncated when the pair would exceed the model's
    maximum input length; previously an over-long review made the forward
    pass fail.

    Args:
        context (str): The context in which the question should be answered.
        question (str): The question to be answered.

    Returns:
        str: The answer to the question. It is empty when the predicted end
        precedes the predicted start, and may contain special tokens such as
        ``[CLS]`` when the span covers them.
    """
    # Tokenize the input (question first, then context); truncate the context only.
    inputs = tokenizer(
        question,
        context,
        truncation='only_second',
        max_length=model.config.max_position_embeddings,
        return_tensors='pt',
    )

    # Forward pass through the model to get start and end logits
    with torch.no_grad():
        outputs = model(**inputs)

    # Get the most likely start and end token positions (end is made exclusive)
    start_index = int(torch.argmax(outputs.start_logits))
    end_index = int(torch.argmax(outputs.end_logits)) + 1

    # Decode the answer from token ids
    answer_ids = inputs['input_ids'][0][start_index:end_index].tolist()
    answer = tokenizer.convert_tokens_to_string(
        tokenizer.convert_ids_to_tokens(answer_ids)
    )

    return answer

In [None]:
# Ask the QA model about every extracted aspect of the first 46 reviews and
# collect the non-empty answers that contain no [CLS] token.
review_key_points = []
for review in reviews[:46]:
    review_body = review['reviewText']
    for product_aspect in extract_aspects_from_review(review_body, frequent_patterns):
        question = f"What does the reviewer say about the {product_aspect}?"
        try:
            answer = answer_question(review_body, question)
        except Exception as exc:
            # Report the failure instead of hiding it, then carry on with the
            # next aspect. KeyboardInterrupt is no longer swallowed.
            print(f"Skipping question {question!r}: {exc!r}")
            continue
        print(f"Question: {question}")
        print(f"Answer: {answer}")
        if answer != "" and "[CLS]" not in answer:
            review_key_points.append(answer)
print(review_key_points)

Question: What does the reviewer say about the game?
Answer: funny game verry good game
Question: What does the reviewer say about the artistry?
Answer: wonderful
Question: What does the reviewer say about the warmming?
Answer: flood the land and resource scarcity has sent human kind to look to the deep ocean for valuable minerals
Question: What does the reviewer say about the kind?
Answer: Trade is about the same . My main beef with this it requires an internet connection . Other than that it has wonderful artistry and graphics . It is the same as anno 1701 but set in a future world where global warmming as flood the land and resource scarcity has sent human kind to look to the deep ocean for valuable minerals . I recoment the deep ocean expansion or complete if you get this . I found the ai instructor a little corny
Question: What does the reviewer say about the minerals?
Answer: set in a future world where global warmming as flood the land and resource scarcity has sent human kind t

# Old

In [None]:
# Disabled legacy experiment: the code is wrapped in a string literal, so running
# this cell only echoes the text and executes none of it.
"""for review in reviews[:60]:
    cleaned_text = remove_html_tags(review['review'])
    tagged_text = pos_tag(cleaned_text)
    stemmed_nouns = stem_nouns(tagged_text)
    cleaned_nouns = filter_stop_words(stemmed_nouns)
    print(cleaned_nouns)"""

"for review in reviews[:60]:\n    cleaned_text = remove_html_tags(review['review'])\n    tagged_text = pos_tag(cleaned_text)\n    stemmed_nouns = stem_nouns(tagged_text)\n    cleaned_nouns = filter_stop_words(stemmed_nouns)\n    print(cleaned_nouns)"

In [None]:
# Disabled legacy experiment (frequency-based noun-phrase selection): the code is
# wrapped in a string literal, so running this cell only echoes the text.
"""total_sentences = 0
noun_phrase_counter = Counter()

for review in reviews[:46]:
    cleaned_text = remove_html_tags(review['reviewText'])
    noun_phrases = extract_noun_phrases(cleaned_text)
    cleaned_noun_phrases = [phrase for phrase in noun_phrases if phrase]
    noun_phrase_counter.update(cleaned_noun_phrases)
    total_sentences += count_sentences(cleaned_text)

threshold = total_sentences * 0.01
frequent_noun_phrases = [phrase for phrase, count in noun_phrase_counter.items() if count > threshold]

print("Total sentences:", total_sentences)
print("Frequent noun phrases:")
print(frequent_noun_phrases)"""

'total_sentences = 0\nnoun_phrase_counter = Counter()\n\nfor review in reviews[:46]:\n    cleaned_text = remove_html_tags(review[\'reviewText\'])\n    noun_phrases = extract_noun_phrases(cleaned_text)\n    cleaned_noun_phrases = [phrase for phrase in noun_phrases if phrase]\n    noun_phrase_counter.update(cleaned_noun_phrases)\n    total_sentences += count_sentences(cleaned_text)\n\nthreshold = total_sentences * 0.01\nfrequent_noun_phrases = [phrase for phrase, count in noun_phrase_counter.items() if count > threshold]\n\nprint("Total sentences:", total_sentences)\nprint("Frequent noun phrases:")\nprint(frequent_noun_phrases)'