# Project Setup

In [None]:
# Install the NLP libraries used in this notebook and fetch the small English spaCy pipeline.
!pip install spacy nltk
#!pip install transformers torch
#!pip install accelerate -U
!python -m spacy download en_core_web_sm

# Download the two input CSV files (saved as combined_dataset.csv and product_aspects.csv).
# NOTE(review): --no-check-certificate turns off TLS certificate verification for these
# downloads — confirm this is intentional.
!wget --no-check-certificate 'https://drive.usercontent.google.com/uc?id=1rQeOakOJ6xxIY-q--u3SlSE3g7qlxVTE&authuser=0&export=download' -O combined_dataset.csv
!wget --no-check-certificate 'https://drive.usercontent.google.com/uc?id=1B8zjsBl6Fh8YLgzrITkt6zIllVvbuv56&authuser=0&export=download' -O product_aspects.csv

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
#from google.colab import drive
#drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import spacy
import json
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import nltk
import re
from collections import Counter
import csv

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Fetch the NLTK resources; the 'stopwords' corpus is read by stopwords.words() below.
nltk.download('punkt')
nltk.download('stopwords')

# Shared stemmer and English stop-word set used by the helper functions further down.
porter = PorterStemmer()
stop_words = set(stopwords.words('english'))

# Input file locations. DRIVE is a path prefix prepended to both file names;
# the empty string resolves them relative to the current working directory.
# DRIVE = 'drive/MyDrive/'
DRIVE = ''
DATASET_NAME = 'combined_dataset.csv'
PRODUCT_ASPECTS_NAME = 'product_aspects.csv'
DATASET_PATH = DRIVE + DATASET_NAME
PRODUCT_ASPECTS_PATH = DRIVE + PRODUCT_ASPECTS_NAME

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Read every review row into memory; each row becomes a plain dict keyed by the CSV header.
with open(DATASET_PATH, mode='r', newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    reviews = list(map(dict, reader))

# Collect the 'product_aspect' column of the aspects file as a list of strings.
with open(PRODUCT_ASPECTS_PATH, mode='r', newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    product_aspects = list(map(lambda record: record['product_aspect'], reader))

# Pattern Mining
As a first step, we mine the part-of-speech (POS) patterns in which known product aspects occur. Using these patterns, we can then extract the product aspects that match them from every new review. For this, we'll need a sufficiently large list of known product aspects.

In [None]:
def remove_html_tags(text):
    """Return ``text`` with every ``<...>`` span removed.

    The match is non-greedy, so each tag is stripped on its own and the text
    between two tags is kept. A ``<`` with no closing ``>`` on the same line is
    left untouched because ``.`` does not match newlines.
    """
    tag_pattern = '<.*?>'
    return re.sub(tag_pattern, '', text)

In [None]:
def pos_tag(text):
    """Return ``(chunk_text, pos)`` pairs for every noun chunk found in ``text``.

    Args:
        text: Raw text to parse with the module-level spaCy pipeline ``nlp``.

    Returns:
        A list of ``(str, str)`` tuples: the noun chunk's text and the coarse
        part-of-speech tag of the chunk's root token.
    """
    doc = nlp(text)
    # Noun chunks are spaCy Span objects, which have no ``pos_`` attribute
    # (only tokens do); the tag is therefore read from the chunk's root token.
    return [(chunk.text, chunk.root.pos_) for chunk in doc.noun_chunks]

In [None]:
def stem_nouns(pos_tagged_text):
    """Stem the words tagged ``'NOUN'`` and return them in input order.

    Args:
        pos_tagged_text: Iterable of ``(word, tag)`` pairs.

    Returns:
        A list with the Porter stem of every word whose tag is exactly
        ``'NOUN'``; pairs with any other tag are dropped.
    """
    return [porter.stem(term) for term, label in pos_tagged_text if label == 'NOUN']

In [None]:
def filter_stop_words(nouns):
    """Return the entries of ``nouns`` whose lowercase form is not a stop word.

    The original casing and order of the surviving entries are preserved.
    """
    kept = []
    for candidate in nouns:
        if candidate.lower() in stop_words:
            continue
        kept.append(candidate)
    return kept

In [None]:
def extract_noun_phrases(text):
    """Return the stemmed, noun-only form of each noun chunk in ``text``.

    Every noun chunk found by the module-level spaCy pipeline is reduced to its
    tokens that are tagged ``NOUN`` and are not stop words; those tokens are
    Porter-stemmed and joined with single spaces. Chunks that end up empty
    (e.g. pronoun-only chunks) are skipped.

    Args:
        text: Raw text to parse.

    Returns:
        A list of non-empty strings, one per surviving noun chunk, in document
        order. Duplicates are kept so callers can count occurrences.
    """
    doc = nlp(text)
    noun_phrases = []

    for chunk in doc.noun_chunks:
        # Filter out non-noun tokens and stop words, then stem what is left.
        noun_only_phrase = ' '.join(
            porter.stem(token.text)
            for token in chunk
            if token.pos_ == 'NOUN' and token.text.lower() not in stop_words
        )
        if noun_only_phrase:  # Only add non-empty phrases
            noun_phrases.append(noun_only_phrase)

    return noun_phrases


In [None]:
def count_sentences(text):
    """Return how many sentences the spaCy pipeline finds in ``text``."""
    return sum(1 for _ in nlp(text).sents)

In [None]:
def find_nearest_adjective(aspect, doc):
    """Return the adjective token closest to the first occurrence of ``aspect``.

    Args:
        aspect: Aspect term; compared case-insensitively against each token's text.
        doc: Sequence of tokens exposing ``text`` and ``pos_``.

    Returns:
        The token tagged ``'ADJ'`` with the smallest index distance to the first
        token matching ``aspect`` (the earlier token wins a tie), or ``None``
        when the aspect or any adjective is missing.
    """
    anchor = next(
        (idx for idx, tok in enumerate(doc) if tok.text.lower() == aspect.lower()),
        None,
    )
    if anchor is None:
        return None

    candidates = [(abs(idx - anchor), tok) for idx, tok in enumerate(doc) if tok.pos_ == 'ADJ']
    if not candidates:
        return None

    # min() keeps the first of several equally distant adjectives.
    return min(candidates, key=lambda pair: pair[0])[1]

In [None]:
def extract_pattern(aspect, sentence):
    """Extract the POS pattern between the aspect and its nearest adjective.

    The sentence is parsed with the module-level spaCy pipeline. The pattern
    covers every token from the first occurrence of ``aspect`` to the adjective
    chosen by ``find_nearest_adjective`` (both inclusive, in sentence order).
    Tokens whose text equals ``aspect`` (case-insensitively) are written as
    ``'_ASP'``; every other token contributes its coarse POS tag.

    Args:
        aspect: Aspect term, compared against single token texts.
        sentence: Sentence text to analyse.

    Returns:
        ``(pattern, (aspect_position, adjective_position))`` where ``pattern``
        is a list of strings and the positions are token indices, or
        ``(None, None)`` when the aspect or an adjective cannot be found.
    """
    doc = nlp(sentence)
    nearest_adj = find_nearest_adjective(aspect, doc)

    if nearest_adj is None:
        return None, None

    aspect_lower = aspect.lower()
    aspect_position = None
    adjective_position = None

    for i, token in enumerate(doc):
        # Anchor on the FIRST occurrence of the aspect: that is the occurrence
        # find_nearest_adjective measured distances from, so anchoring anywhere
        # else would pair the adjective with an unrelated mention.
        if aspect_position is None and token.text.lower() == aspect_lower:
            aspect_position = i
        if token == nearest_adj:
            adjective_position = i

    if aspect_position is None or adjective_position is None:
        return None, None

    start, end = sorted((aspect_position, adjective_position))

    # Generate POS pattern for the inclusive span between the two anchors.
    pattern = [
        '_ASP' if token.text.lower() == aspect_lower else token.pos_
        for token in doc[start:end + 1]
    ]

    return pattern, (aspect_position, adjective_position)

In [None]:
def mine_patterns(aspects, reviews):
    """Mine POS patterns for given aspects across reviews.

    Each review's ``'review'`` text is stripped of HTML tags and split into
    sentences; for every (sentence, aspect) pair the pattern returned by
    ``extract_pattern`` is counted, keyed by its space-joined string form.
    A progress line is printed after each review.

    Args:
        aspects: Iterable of aspect terms to look for.
        reviews: Sequence of mappings with a ``'review'`` text entry.

    Returns:
        A ``Counter`` mapping pattern strings to their number of occurrences.
    """
    pattern_counter = Counter()
    total_reviews = len(reviews)

    # enumerate() gives the running position directly; looking it up with
    # reviews.index() is a linear scan per review and reports the wrong number
    # when two review rows compare equal.
    for review_number, review in enumerate(reviews, start=1):
        cleaned_text = remove_html_tags(review['review'])
        sentences = [sent.text for sent in nlp(cleaned_text).sents]

        for sentence in sentences:
            for aspect in aspects:
                pattern, _positions = extract_pattern(aspect, sentence)
                if pattern:
                    pattern_counter[' '.join(pattern)] += 1

        print(f"Review {review_number}/{total_reviews}")

    return pattern_counter

In [None]:
def calculate_pnum(aspects, patterns, reviews):
    """Count, per aspect, how often it occurs inside one of ``patterns``.

    Each review's ``'review'`` text is stripped of HTML tags and split into
    sentences; for every (sentence, aspect) pair the pattern returned by
    ``extract_pattern`` is space-joined and, if it is contained in
    ``patterns``, the aspect's count is incremented. A progress line is printed
    after each review.

    Args:
        aspects: Iterable of aspect terms to look for.
        patterns: Container of pattern strings supporting ``in`` (e.g. a dict
            keyed by pattern string).
        reviews: Sequence of mappings with a ``'review'`` text entry.

    Returns:
        A ``Counter`` mapping each matched aspect to its number of matches.
    """
    aspect_pnum = Counter()
    total_reviews = len(reviews)

    # enumerate() gives the running position directly; looking it up with
    # reviews.index() is a linear scan per review and reports the wrong number
    # when two review rows compare equal.
    for review_number, review in enumerate(reviews, start=1):
        cleaned_text = remove_html_tags(review['review'])
        sentences = [sent.text for sent in nlp(cleaned_text).sents]

        for sentence in sentences:
            for aspect in aspects:
                pattern, _positions = extract_pattern(aspect, sentence)
                if pattern and ' '.join(pattern) in patterns:
                    aspect_pnum[aspect] += 1

        print(f"Review {review_number}/{total_reviews}")

    return aspect_pnum

In [None]:
# The aspect list loaded from the product-aspects CSV serves as the seed vocabulary.
known_aspects = product_aspects

# Mining patterns from reviews (restricted to the first 1000 reviews)
mined_patterns = mine_patterns(known_aspects, reviews[:1000])

# Minimum support threshold: 1% of the total number of mined pattern occurrences
total_patterns = sum(mined_patterns.values())
min_support = total_patterns * 0.01

# Filtering patterns by support (keep patterns whose count reaches the threshold)
frequent_patterns = {pattern: count for pattern, count in mined_patterns.items() if count >= min_support}

# Calculate P_num per aspect on the same 1000 reviews and keep only aspects with P_num >= 2
# (i.e. aspects with P_num < 2 are filtered out)
aspect_pnum = calculate_pnum(known_aspects, frequent_patterns, reviews[:1000])
filtered_aspects = [aspect for aspect, pnum in aspect_pnum.items() if pnum >= 2]

print("Frequent POS Patterns with Support >= 1%:")
print(frequent_patterns)
print("Filtered Aspects (P_num >= 2):")
print(filtered_aspects)

Review 1/1000
Review 2/1000
Review 3/1000
Review 4/1000
Review 5/1000
Review 6/1000
Review 7/1000
Review 8/1000
Review 9/1000
Review 10/1000
Review 11/1000
Review 12/1000
Review 13/1000
Review 14/1000
Review 15/1000
Review 16/1000
Review 17/1000
Review 18/1000
Review 19/1000
Review 20/1000
Review 21/1000
Review 22/1000
Review 23/1000
Review 24/1000
Review 25/1000
Review 26/1000
Review 27/1000
Review 28/1000
Review 29/1000
Review 30/1000
Review 31/1000
Review 32/1000
Review 33/1000
Review 34/1000
Review 35/1000
Review 36/1000
Review 37/1000
Review 38/1000
Review 39/1000
Review 40/1000
Review 41/1000
Review 42/1000
Review 43/1000
Review 44/1000
Review 45/1000
Review 46/1000
Review 47/1000
Review 48/1000
Review 49/1000
Review 50/1000
Review 51/1000
Review 52/1000
Review 53/1000
Review 54/1000
Review 55/1000
Review 56/1000
Review 57/1000
Review 58/1000
Review 59/1000
Review 60/1000
Review 61/1000
Review 62/1000
Review 63/1000
Review 64/1000
Review 65/1000
Review 66/1000
Review 67/1000
Revi

In [None]:
import pickle

# Persist the frequent patterns as a pickle file in the current working directory.
with open('frequent_patterns.pkl', 'wb') as file:
    pickle.dump(frequent_patterns, file)


# Evaluation

In [None]:
import pickle
# Paths used by the evaluation section. DRIVE is rebound here, so both files
# below are resolved inside 'drive/MyDrive/'.
# NOTE(review): the pattern-mining section writes 'frequent_patterns.pkl' to the
# working directory, while PATTERN_PATH points into DRIVE, and the Drive mount
# cell near the top is commented out — confirm that Drive is mounted and the
# pickle has been copied there before running the cells below.
DRIVE = 'drive/MyDrive/'
TEST_DATASET_NAME = 'product_aspects_dataset_annotated.json'
MINED_PATTERNS_NAME = 'frequent_patterns.pkl'
PATTERN_PATH = DRIVE + MINED_PATTERNS_NAME
TEST_DATASET_PATH = DRIVE + TEST_DATASET_NAME

In [None]:
def get_frequent_nouns(reviews):
    """Return the stemmed noun phrases that occur frequently across ``reviews``.

    Args:
        reviews: Iterable of mappings with a ``'review'`` text entry.

    Returns:
        The noun phrases (as produced by ``extract_noun_phrases`` on the
        HTML-stripped text) whose total count is strictly greater than 1% of
        the total number of sentences in all reviews.
    """
    phrase_counts = Counter()
    sentence_total = 0

    for entry in reviews:
        text = remove_html_tags(entry['review'])
        # Empty phrases are never counted.
        phrase_counts.update(phrase for phrase in extract_noun_phrases(text) if phrase)
        sentence_total += count_sentences(text)

    min_count = sentence_total * 0.01
    return [phrase for phrase, seen in phrase_counts.items() if seen > min_count]

In [None]:
def unstem_nouns_in_review(frequent_nouns, review_text):
    """Map frequent stems back to the surface words used in ``review_text``.

    The review is split on whitespace and each word is Porter-stemmed. For
    every stem contained in ``frequent_nouns`` the first word that produced it
    is remembered.

    Args:
        frequent_nouns: Container of stemmed nouns supporting ``in``.
        review_text: Review text to scan.

    Returns:
        The remembered surface words, one per distinct stem, in order of first
        appearance.
    """
    first_surface_form = {}

    for token in review_text.split():
        stem = porter.stem(token)
        if stem in frequent_nouns:
            # setdefault keeps the earliest word seen for this stem.
            first_surface_form.setdefault(stem, token)

    return list(first_surface_form.values())

In [None]:
def extract_nouns_from_review(review, mined_patterns, nouns_in_review):
    """Extract aspect candidates from ``review`` whose noun chunk matches a mined pattern.

    The HTML-stripped review is split into sentences and each sentence is
    parsed again on its own. Every noun chunk is turned into a pattern string
    in which ``NOUN`` tokens appear as ``'_ASP'`` and all other tokens as their
    POS tag. When that string is contained in ``mined_patterns``, the chunk's
    NOUN tokens (space-joined) form the candidate, which is appended once for
    every entry of ``nouns_in_review`` that contains it as a whole word/phrase.

    Args:
        review: Raw review text (may contain HTML tags).
        mined_patterns: Container of pattern strings supporting ``in``.
        nouns_in_review: Iterable of noun strings to match candidates against.

    Returns:
        A list of candidate strings; a candidate can appear multiple times.
    """
    review_doc = nlp(remove_html_tags(review))
    sentence_texts = [sent.text for sent in review_doc.sents]

    matches = []
    for sentence_text in sentence_texts:
        for chunk in nlp(sentence_text).noun_chunks:
            tags = ['_ASP' if tok.pos_ == 'NOUN' else tok.pos_ for tok in chunk]
            if ' '.join(tags) not in mined_patterns:
                continue

            # Multi-word aspects are the chunk's NOUN tokens joined by spaces.
            candidate = ' '.join(tok.text for tok in chunk if tok.pos_ == 'NOUN')

            # Word-boundary search so the candidate only matches as a whole phrase.
            whole_phrase = re.compile(r'\b' + re.escape(candidate) + r'\b')
            matches.extend(candidate for noun in nouns_in_review if whole_phrase.search(noun))

    return matches

In [None]:
import pandas as pd
import pickle
from collections import Counter
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

def evaluate(review_df, mined_patterns):
    """Score pattern-based aspect extraction against annotated reviews.

    Reviews are grouped by ``'parent_asin'``; frequent nouns are computed per
    group. For every review, each surface noun returned by
    ``unstem_nouns_in_review`` becomes one binary sample: the label is 1 when
    the noun is listed in the review's ``'product_aspects'``, the prediction is
    1 when ``extract_nouns_from_review`` returned it. Samples from all groups
    are pooled and scored together.

    Args:
        review_df: DataFrame with ``'parent_asin'``, ``'review'`` and
            ``'product_aspects'`` columns.
        mined_patterns: Container of pattern strings passed on to
            ``extract_nouns_from_review``.

    Returns:
        A dict with the keys ``'accuracy'``, ``'precision'``, ``'recall'``,
        ``'f1_score'`` and ``'confusion_matrix'`` (2x2, rows are true labels
        0/1, columns are predicted labels 0/1).
    """
    # Pooled ground truths and predictions across all products
    y_true = []
    y_pred = []

    # Step 1: Process the reviews of one product ('parent_asin') at a time
    for _parent_asin, group in review_df.groupby('parent_asin'):
        product_reviews = group.to_dict('records')

        # Frequent nouns are computed on this product's reviews only
        frequent_nouns = get_frequent_nouns(product_reviews)

        # Step 2: Evaluate each review in the subset
        for review in product_reviews:
            ground_truth_aspects = review['product_aspects']

            # Handle potential NaN or None cases by defaulting to an empty list
            if not isinstance(ground_truth_aspects, list):
                ground_truth_aspects = []

            unstemmed_nouns = unstem_nouns_in_review(frequent_nouns, review['review'])
            extracted_aspects = extract_nouns_from_review(review['review'], mined_patterns, unstemmed_nouns)

            # One binary sample per candidate noun
            for aspect in unstemmed_nouns:
                y_true.append(1 if aspect in ground_truth_aspects else 0)
                y_pred.append(1 if aspect in extracted_aspects else 0)

    # Step 3: Compute the metrics over the pooled samples.
    # zero_division=0 yields the same 0.0 as the default for undefined metrics
    # but without emitting a warning; labels=[0, 1] keeps the confusion matrix
    # 2x2 even when only one class occurs in the data.
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred, zero_division=0),
        'recall': recall_score(y_true, y_pred, zero_division=0),
        'f1_score': f1_score(y_true, y_pred, zero_division=0),
        'confusion_matrix': confusion_matrix(y_true, y_pred, labels=[0, 1]),
    }


# Load the annotated test reviews and the mined patterns from disk
review_df = pd.read_json(TEST_DATASET_PATH)
# NOTE: unpickling can execute arbitrary code; only load pattern files you created yourself.
# The context manager closes the file handle once the patterns are loaded.
with open(PATTERN_PATH, 'rb') as pattern_file:
    mined_patterns = pickle.load(pattern_file)

# Run the evaluation
results = evaluate(review_df, mined_patterns)

print(f"Accuracy: {results['accuracy']:.4f}")
print(f"Precision: {results['precision']:.4f}")
print(f"Recall: {results['recall']:.4f}")
print(f"F1 Score: {results['f1_score']:.4f}")
print("Confusion Matrix:")
print(results['confusion_matrix'])
print("\n")


Accuracy: 0.6969
Precision: 0.3091
Recall: 0.4215
F1 Score: 0.3566
Confusion Matrix:
[[372 114]
 [ 70  51]]


