In [None]:
import pandas as pd
from textblob import TextBlob
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from nltk.sentiment.vader import SentimentIntensityAnalyzer


# **Sentiment Analysis**

In [None]:
!pip install nltk textblob



In [None]:
# Download the NLTK resources used below: the Punkt tokenizer data (punkt / punkt_tab)
# and the English stopword corpus.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab') # Download the punkt_tab data for English tokenization

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
# Step 1: Load the dataset
# Replace with the path to your reviews CSV file. The preview printed in the next cell
# shows one row per product: productID, overall_rating, then review1, review2, ... columns.
reviews_file_path = "/content/updated_amazon_reviews.csv"
reviews_df = pd.read_csv(reviews_file_path)

In [None]:
# Check the first few rows of the dataset (bare expression -> rich, untruncated-by-print table display)
reviews_df.head()

    productID  overall_rating  \
0  B0DSG8SNXH             4.3   
1  B0DFY3XCB6             4.0   
2  B09TVVGXWS             4.1   
3  B071Z8M4KX             4.1   
4  B0C8JB3G5W             4.2   

                                             review1  \
0  Boult newly launched Z20 truly wireless blueto...   
1  I recently purchased the Samsung Galaxy M05 fo...   
2  I have been a user of OnePlus Bullets wireless...   
3  I recently purchased the boAt BassHeads 100 in...   
4  best wireless earphones for 1600. very sleek l...   

                                             review2  \
0  Good price and These earbuds are really good. ...   
1  All I can say it's worth it considering that P...   
2  There is nothing wrong with the looks of the B...   
3  Good product at this price.\nBuild Quality was...   
4  The OnePlus Buds 2R offer impressive value at ...   

                                             review3  \
0  With their deep bass and crystal-clear highs, ...   
1  I'm thorough

In [None]:
# Step 2: Preprocess the reviews (clean the text)
# Translation table that deletes every ASCII punctuation character in one pass.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Filled on first use so the NLTK stopword corpus is read and hashed once, not once per review.
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stopwords as a frozenset, loading them on first use."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(nltk.corpus.stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def clean_text(text):
    """
    Clean a review text by:
    - Lowercasing
    - Removing punctuation
    - Tokenizing
    - Removing stopwords

    Args:
        text: The raw review. Anything that is not a ``str`` (e.g. NaN for a
            missing review) is treated as "no text".

    Returns:
        The remaining tokens joined by single spaces, or "" when the input is
        not a string or nothing is left after removing punctuation.
    """
    if not isinstance(text, str):
        return ""  # If it's not a string, return an empty string

    # Lowercase, then strip punctuation before tokenizing
    text = text.lower().translate(_PUNCTUATION_TABLE)
    if not text.strip():
        # Nothing to tokenize; same result the tokenizer path would produce
        return ""

    stop_words = _english_stop_words()
    return ' '.join(word for word in word_tokenize(text) if word not in stop_words)

In [None]:
nltk.download('vader_lexicon')

# Shared analyzer, created on first use. Constructing a SentimentIntensityAnalyzer loads
# the VADER lexicon, so building one per review repeats that work for every call.
_vader_analyzer = None


# Define the sentiment scores function using VADER
def sentiment_scores(sentence):
    """Classify ``sentence`` with VADER.

    Args:
        sentence: Text to score.

    Returns:
        A tuple ``(label, score, compound)``: ``label`` is "Positive" when the
        compound score is >= 0.05, "Negative" when it is <= -0.05, otherwise
        "Neutral"; ``score`` is the matching 'pos' / 'neg' / 'neu' value from
        VADER's result; ``compound`` is the compound score itself.
    """
    global _vader_analyzer
    if _vader_analyzer is None:
        _vader_analyzer = SentimentIntensityAnalyzer()

    sentiment_dict = _vader_analyzer.polarity_scores(sentence)
    compound = sentiment_dict['compound']

    # Classifying sentiment based on the compound score
    if compound >= 0.05:
        return "Positive", sentiment_dict['pos'], compound
    elif compound <= -0.05:
        return "Negative", sentiment_dict['neg'], compound
    else:
        return "Neutral", sentiment_dict['neu'], compound

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


In [None]:
# Step 2: Perform Sentiment Analysis for each review
def analyze_product_reviews(product_name, reviews):
    """Count Positive / Neutral / Negative reviews for one product.

    Args:
        product_name: Identifier of the product. Accepted for the caller's
            convenience; it does not influence the counts.
        reviews: Iterable of review values. Only non-blank strings are scored;
            anything else (e.g. NaN for a missing review) is skipped.

    Returns:
        dict with the keys 'Positive', 'Neutral' and 'Negative' mapped to the
        number of reviews that received that label.
    """
    sentiment_counts = {'Positive': 0, 'Neutral': 0, 'Negative': 0}
    for review in reviews:
        if not isinstance(review, str) or not review.strip():
            continue
        result = sentiment_scores(review)
        # sentiment_scores is defined twice in this notebook: the first version returns
        # (label, score, compound), the later one a bare label string. Accept both so this
        # function still works when re-run after the later definition has replaced the first.
        sentiment = result if isinstance(result, str) else result[0]
        sentiment_counts[sentiment] += 1
    return sentiment_counts


In [None]:
# Step 3: Analyze sentiment for each product
sentiment_results = {}

# Select the review columns by name (review1, review2, ...) instead of "everything from the
# third column on": later cells append helper columns to reviews_df, and a positional slice
# would score those as reviews if this cell is re-run afterwards.
review_texts = reviews_df.filter(regex=r'^review\d+$')
if review_texts.shape[1] == 0:
    # File uses a different naming scheme: fall back to the positional layout
    review_texts = reviews_df.iloc[:, 2:]

for product_name in reviews_df['productID'].unique():
    # All review cells of the current product as one flat array
    product_reviews = review_texts[reviews_df['productID'] == product_name].values.flatten()

    # Analyze sentiment for the current product
    sentiment_counts = analyze_product_reviews(product_name, product_reviews)

    # Save the results
    sentiment_results[product_name] = sentiment_counts

In [None]:
# Step 4: Display the sentiment results
# Build the per-product table in one expression: products as rows, sentiment labels as
# columns in a fixed Negative / Neutral / Positive order, missing counts shown as 0.
sentiment_column_order = ['Negative', 'Neutral', 'Positive']
sentiment_df = (
    pd.DataFrame.from_dict(sentiment_results, orient='index')[sentiment_column_order]
    .fillna(0)
)

# Heading and table, one per line
print("Sentiment distribution for each product:", sentiment_df, sep="\n")

Sentiment distribution for each product:
            Negative  Neutral  Positive
B0DSG8SNXH         8      126       366
B0DFY3XCB6        48      123       329
B09TVVGXWS        48      108       344
B071Z8M4KX        71       73       356
B0C8JB3G5W        50       91       359
B07WFPL9PB        22       58       420
B0D22QWQHR        28       70       402
B0D3R1JQ7D        62       41       397
B07WHQHNZC        24      110       366
B077BFH786        12       85       403
B098NS6PVG        40       84       376
B0DH3J6LB9        22       85       393
B07R3386PP        28      126       346
B0BDRVFDKP        19       79       402
B0CBTTCJL6        36      114       350
B0DFQ1R3W4        37      114       349
B07WHS99FG        14       61       425
B0D2R2MXXJ        28       59       408
B0DCNWN8NZ        15      100       385
B0D63CNLJ9        16      103       377
B0DSG51QM1         9      116       375
B0D5YCYS1G        48       60       392
B01DEWVZ2C        52       77       371

In [None]:
# Step 5: Save the sentiment analysis results to a CSV file
output_file_path = 'product_sentiment_results_vader.csv'
# The index (product IDs) is written as the first CSV column, which is to_csv's default
sentiment_df.to_csv(path_or_buf=output_file_path)

output_file_path

'product_sentiment_results_vader.csv'

# **Aspect-Based Opinion Mining**

In [None]:
import spacy
# Load the spaCy model for extracting aspects (noun chunks)
# NOTE(review): assumes the en_core_web_sm model is already installed in this environment — confirm.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Initialize one module-level VADER sentiment analyzer; the label-only sentiment_scores
# defined below reads it as a global.
sia = SentimentIntensityAnalyzer()

In [None]:
# Function for extracting aspects (noun chunks) from a review using spaCy
def extract_aspects(review):
    """Return the lower-cased text of every noun chunk spaCy finds in ``review``.

    The review is parsed with the module-level ``nlp`` pipeline; chunks such as
    "battery life" or "camera quality" are returned in document order.
    """
    return [noun_chunk.text.lower() for noun_chunk in nlp(review).noun_chunks]

In [None]:
# Function to perform sentiment analysis using VADER (already available sentiment scores)
def sentiment_scores(sentence):
    """Label ``sentence`` as "Positive", "Negative" or "Neutral" with the shared ``sia`` analyzer.

    The label comes from VADER's compound score: >= 0.05 is Positive,
    <= -0.05 is Negative, anything in between is Neutral.

    Note: this definition replaces the earlier ``sentiment_scores`` in this
    notebook, which returned a ``(label, score, compound)`` tuple; this one
    returns only the label string.
    """
    compound = sia.polarity_scores(sentence)['compound']
    if compound >= 0.05:
        return "Positive"
    if compound <= -0.05:
        return "Negative"
    return "Neutral"

In [None]:
# Function to perform aspect-based sentiment analysis for a list of reviews
def aspect_based_opinion_mining(reviews):
    """Count, per aspect, how often it appears in a positive / neutral / negative sentence.

    Aspects are the lower-cased noun chunks found by the module-level ``nlp``
    pipeline. Each occurrence is labelled with the sentiment of the sentence
    that contains it. Scoring the bare chunk text instead (e.g. "the sound
    quality") labels almost every aspect Neutral, because the opinion words
    sit outside the noun chunk.

    Args:
        reviews: Iterable of review values. Non-strings (e.g. NaN) and blank
            strings are skipped.

    Returns:
        dict mapping aspect text -> {'Positive': n, 'Neutral': n, 'Negative': n}.
    """
    aspect_sentiment = {}  # Dictionary to store aspect-based sentiment

    for review in reviews:
        if not isinstance(review, str) or not review.strip():
            continue  # Not a usable review (NaN, empty, whitespace only)

        for sentence in nlp(review).sents:
            sentence_label = None  # scored lazily, at most once per sentence
            for chunk in sentence.noun_chunks:
                if sentence_label is None:
                    sentence_label = sentiment_scores(sentence.text)
                    # The notebook's earlier sentiment_scores returns a tuple whose
                    # first item is the label; accept that form as well.
                    if not isinstance(sentence_label, str):
                        sentence_label = sentence_label[0]

                aspect = chunk.text.lower()  # Lowercase for uniform keys
                counts = aspect_sentiment.setdefault(
                    aspect, {'Positive': 0, 'Neutral': 0, 'Negative': 0}
                )
                counts[sentence_label] += 1

    return aspect_sentiment

In [None]:
# Step 1: Perform Aspect-Based Opinion Mining for each product
aspect_sentiment_results = {}

# Select the review columns by name (review1, review2, ...) instead of "everything from the
# third column on": later cells append helper columns to reviews_df, and a positional slice
# would mine those as reviews if this cell is re-run afterwards.
review_texts = reviews_df.filter(regex=r'^review\d+$')
if review_texts.shape[1] == 0:
    # File uses a different naming scheme: fall back to the positional layout
    review_texts = reviews_df.iloc[:, 2:]

for product_name in reviews_df['productID'].unique():
    # All review cells of the current product as one flat array
    product_reviews = review_texts[reviews_df['productID'] == product_name].values.flatten()

    # Perform Aspect-Based Opinion Mining
    sentiment_counts = aspect_based_opinion_mining(product_reviews)

    # Save the results
    aspect_sentiment_results[product_name] = sentiment_counts


In [None]:
# Step 2: Convert results into a DataFrame for better visualization
# Flatten the nested {product: {aspect: counts}} mapping into one row per (product, aspect).
# Building the frame straight from the nested dict leaves whole count dicts inside the cells
# (and products as columns), which is written to CSV as Python dict text.
aspect_records = [
    {'productID': product_id, 'aspect': aspect, **counts}
    for product_id, aspects in aspect_sentiment_results.items()
    for aspect, counts in aspects.items()
]

# Aspects (per product) as rows and the sentiment categories as integer columns;
# a missing category counts as 0.
aspect_sentiment_df = (
    pd.DataFrame(aspect_records, columns=['productID', 'aspect', 'Positive', 'Neutral', 'Negative'])
    .set_index(['productID', 'aspect'])
    .fillna(0)
    .astype(int)
)

In [None]:
# Step 3: Save the aspect-based sentiment analysis results to a CSV file
output_file_path = 'aspect_based_sentiment_results.csv'
aspect_sentiment_df.to_csv(output_file_path, index=True)

# Print one report per product: a header line followed by one line per aspect
for product_name, sentiment_counts in aspect_sentiment_results.items():
    report_lines = [f"\nProduct: {product_name}"]
    report_lines.extend(
        f"  Aspect: {aspect.capitalize()} - Positive: {counts['Positive']}, Neutral: {counts['Neutral']}, Negative: {counts['Negative']}"
        for aspect, counts in sentiment_counts.items()
    )
    print("\n".join(report_lines))

output_file_path  # Return the path of the saved file for downloading



Product: B0DSG8SNXH
  Aspect: Boult - Positive: 0, Neutral: 9, Negative: 0
  Aspect: Z20 - Positive: 0, Neutral: 11, Negative: 0
  Aspect: I - Positive: 0, Neutral: 254, Negative: 0
  Aspect: These earbuds - Positive: 0, Neutral: 47, Negative: 0
  Aspect: The design - Positive: 0, Neutral: 15, Negative: 0
  Aspect: Good & very compact full , very light weight - Positive: 1, Neutral: 0, Negative: 0
  Aspect: Ears - Positive: 0, Neutral: 5, Negative: 0
  Aspect: A long time - Positive: 0, Neutral: 3, Negative: 0
  Aspect: The sound quality - Positive: 0, Neutral: 30, Negative: 0
  Aspect: Movies - Positive: 0, Neutral: 6, Negative: 0
  Aspect: Series - Positive: 0, Neutral: 1, Negative: 0
  Aspect: Noise cancellation - Positive: 0, Neutral: 4, Negative: 0
  Aspect: Mic quality - Positive: 0, Neutral: 2, Negative: 0
  Aspect: Battery life - Positive: 0, Neutral: 24, Negative: 0
  Aspect: Fast charging speed - Positive: 0, Neutral: 1, Negative: 0
  Aspect: Bluetooth connectivity - Positiv

'aspect_based_sentiment_results.csv'

In [None]:
import spacy
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk

# Download necessary NLTK resources
# (earlier cells already request both; the captured output of this cell reports them as up to date)
nltk.download('vader_lexicon')
nltk.download('punkt')

# Load the spaCy model; apply_extraction below reads its dependency labels and POS tags
nlp = spacy.load("en_core_web_sm")

# Function to extract aspects and descriptions from reviews
# Subject words that stand for the reviewed product itself; they are reported as "product".
_PRODUCT_PRONOUNS = ('it', 'this', 'they', 'these')


def _make_pair(noun, modifier, rule):
    """Build one aspect record, mapping product pronouns to the generic noun "product".

    NOTE(review): rules 2-7 discard stop-word subjects before calling this; if the
    pipeline's stop list contains these pronouns, the mapping can only ever trigger
    for rule 1 — confirm that is intended.
    """
    if noun in _PRODUCT_PRONOUNS:
        noun = "product"
    return {"noun": noun, "adj": modifier, "rule": rule}


def _amod_pairs(doc):
    """Rule 1: adjectival modifiers (dep ``amod``), paired with the noun they modify."""
    pairs = []
    for token in doc:
        if token.dep_ != "amod" or token.is_stop:
            continue
        modifier = token.text

        # Prepend the adjective's first adverbial modifier (e.g. 'most comfortable headphones')
        for child in token.children:
            if child.dep_ == "advmod":
                modifier = child.text + " " + modifier
                break

        # A literal 'no' determiner on the noun negates the adjective (e.g. 'no interesting characters')
        for sibling in token.head.children:
            if sibling.dep_ == "det" and sibling.text == 'no':
                modifier = "not " + modifier
                break

        pairs.append(_make_pair(token.head.text, modifier, 1))
    return pairs


def _subject_child_pairs(doc, rule, modifier_dep, subject_deps=("nsubj",),
                         modifier_pos=None, modal_negates=False):
    """Rules 2, 3, 4 and 7: pair a token's subject child with one of its other children.

    For every token, the last non-stop-word child whose dep is in ``subject_deps``
    becomes the noun and the last non-stop-word child with dep ``modifier_dep``
    (and POS ``modifier_pos`` when given) becomes the description. A ``neg`` child
    prefixes the description with its own text; with ``modal_negates`` a modal
    auxiliary (dep ``aux``, tag ``MD``) prefixes it with "not". When both occur,
    the one that comes later among the children decides the prefix.
    """
    pairs = []
    for token in doc:
        noun = modifier = neg_prefix = None
        for child in token.children:
            if child.dep_ in subject_deps and not child.is_stop:
                noun = child.text

            if (child.dep_ == modifier_dep
                    and (modifier_pos is None or child.pos_ == modifier_pos)
                    and not child.is_stop):
                modifier = child.text

            if modal_negates and child.dep_ == "aux" and child.tag_ == "MD":
                neg_prefix = "not"

            if child.dep_ == "neg":
                neg_prefix = child.text

        if noun is None or modifier is None:
            continue
        if neg_prefix is not None:
            modifier = neg_prefix + " " + modifier
        pairs.append(_make_pair(noun, modifier, rule))
    return pairs


def _copula_pairs(doc):
    """Rule 5: a token with both a subject child and a ``cop`` child describes that subject.

    NOTE(review): confirm the loaded model actually emits a ``cop`` dependency label;
    if it does not, this rule never produces pairs.
    """
    pairs = []
    for token in doc:
        noun = None
        has_copula = False
        for child in token.children:
            if child.dep_ == "nsubj" and not child.is_stop:
                noun = child.text
            if child.dep_ == "cop" and not child.is_stop:
                has_copula = True
        if noun is not None and has_copula:
            pairs.append(_make_pair(noun, token.text, 5))
    return pairs


def _interjection_pairs(doc):
    """Rule 6: a non-stop-word interjection (POS ``INTJ``) describes its subject child."""
    pairs = []
    for token in doc:
        if token.pos_ != "INTJ" or token.is_stop:
            continue
        noun = None
        for child in token.children:
            if child.dep_ == "nsubj" and not child.is_stop:
                noun = child.text
        if noun is not None:
            pairs.append(_make_pair(noun, token.text, 6))
    return pairs


# Function to extract aspects and descriptions from reviews
def apply_extraction(row, nlp):
    """
    Extract (aspect, description) pairs from one review by applying 7 rules
    based on dependency labels and POS tags.

    Args:
        row: Mapping-like object (dict or DataFrame row) with the review text
            under the key 'Review'.
        nlp: Callable that turns the review text into a parsed document whose
            tokens expose ``dep_``, ``pos_``, ``tag_``, ``is_stop``, ``text``,
            ``head`` and ``children`` (a loaded spaCy pipeline in this notebook).

    Returns:
        ``{"aspect_pairs": [...]}`` where each item is
        ``{"noun": aspect, "adj": description, "rule": rule_number}``. Pairs are
        ordered by rule number, then by token position. A missing, non-string
        or blank review yields an empty list instead of raising inside ``nlp``.
    """
    review_body = row['Review']
    if not isinstance(review_body, str) or not review_body.strip():
        return {"aspect_pairs": []}

    doc = nlp(review_body)

    aspects = (
        _amod_pairs(doc)                                                                # Rule 1: AMOD
        + _subject_child_pairs(doc, 2, "dobj", modifier_pos="ADJ")                      # Rule 2: DOBJ
        + _subject_child_pairs(doc, 3, "acomp", modal_negates=True)                     # Rule 3: ACOMP
        + _subject_child_pairs(doc, 4, "advmod", subject_deps=("nsubjpass", "nsubj"))   # Rule 4: passive / adverb
        + _copula_pairs(doc)                                                            # Rule 5: COP
        + _interjection_pairs(doc)                                                      # Rule 6: INTJ
        + _subject_child_pairs(doc, 7, "attr")                                          # Rule 7: ATTR
    )
    dic = {"aspect_pairs": aspects}
    return dic

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Function to apply the aspect extraction to all reviews in the dataset
def extract_aspects(reviews, nlp):
    """
    Run ``apply_extraction`` on every review and collect the per-review results.

    ``reviews`` is wrapped in a one-column frame (column 'Review') and each row
    is handed to ``apply_extraction`` together with ``nlp``. The value returned
    is whatever the row-wise ``DataFrame.apply`` produces for those results, in
    the same order as ``reviews``.

    Note: this replaces the earlier single-argument ``extract_aspects(review)``
    defined in this notebook.
    """
    print("Entering Apply function!")

    def extract_row(row):
        # One review row in, one apply_extraction result out
        return apply_extraction(row, nlp)

    review_frame = pd.DataFrame({'Review': reviews})
    return review_frame.apply(extract_row, axis=1)

In [None]:
# Load your dataset (replace with the actual path to your CSV file)
reviews_file_path = "/content/updated_amazon_reviews.csv"  # Update this path
reviews_df = pd.read_csv(reviews_file_path)

# Use whichever reviewN columns the file actually has, in numeric order, instead of
# assuming exactly review1 ... review500 (a file with fewer columns raised a KeyError).
reviews_columns = sorted(
    reviews_df.filter(regex=r'^review\d+$').columns,
    key=lambda name: int(name[len('review'):]),
)
if not reviews_columns:
    raise ValueError(f"No review1, review2, ... columns found in {reviews_file_path}")

# Combine all reviews of a product into a single text. Missing reviews are dropped first;
# converting them with astype(str) would insert the literal word "nan" into the text.
reviews_df['combined_reviews'] = reviews_df[reviews_columns].apply(
    lambda row: ' '.join(row.dropna().astype(str)), axis=1
)

# Apply aspect extraction on the combined reviews column
aspect_extraction_results = extract_aspects(reviews_df['combined_reviews'], nlp)

# Convert results into a DataFrame for better visualization
aspect_extraction_df = pd.DataFrame(aspect_extraction_results)

# Save the result to a CSV file
output_file_path = 'extracted_aspects.csv'
aspect_extraction_df.to_csv(output_file_path, index=False)

output_file_path  # Return the path for downloading


Entering Apply function!


'extracted_aspects.csv'

In [None]:
def add_data(data, aspect_list):
    """
    Expand the extraction results into one row per (review, aspect, description).

    ``aspect_list`` holds one dict per review, in the same order as ``data``;
    the first value of each dict is that review's list of pairs with the keys
    "noun" and "adj". A review with several pairs is repeated once per pair.
    A review without any pair gets a single row whose aspect and description
    are both 'neutral'.

    Returns a DataFrame with the columns "Review", "Aspect" and "Description".
    """
    review_col = []       # review text, repeated once per extracted pair
    aspect_col = []       # the "noun" of each pair
    description_col = []  # the "adj" of each pair

    for position, extraction in enumerate(aspect_list):
        pairs = list(extraction.values())[0]
        review = data.iloc[position]  # positional lookup: data and aspect_list are aligned by order

        if len(pairs) == 0:
            # No aspect found for this review
            review_col.append(review)
            aspect_col.append('neutral')
            description_col.append('neutral')
            continue

        for pair in pairs:
            review_col.append(review)
            aspect_col.append(pair["noun"])
            description_col.append(pair["adj"])

    return pd.DataFrame({"Review": review_col, "Aspect": aspect_col, "Description": description_col})

In [None]:
# Apply the add_data function
# Destination of the flattened aspect table
output_file_path = '/content/final_aspect_data.csv'  # Update this path if needed

# One row per (review, aspect, description), written without the row index
final_data = add_data(reviews_df['combined_reviews'], aspect_extraction_results)
final_data.to_csv(output_file_path, index=False)

output_file_path  # Return the path for downloading

'/content/final_aspect_data.csv'

In [None]:
final_data

Unnamed: 0,Review,Aspect,Description
0,Boult newly launched Z20 truly wireless blueto...,earbuds,truly wireless
1,Boult newly launched Z20 truly wireless blueto...,weight,good
2,Boult newly launched Z20 truly wireless blueto...,weight,very light
3,Boult newly launched Z20 truly wireless blueto...,weight,durable
4,Boult newly launched Z20 truly wireless blueto...,time,long
...,...,...,...
38373,I recently purchased the Portronics Type A to ...,quality,Value
38374,I recently purchased the Portronics Type A to ...,quality,item
38375,I recently purchased the Portronics Type A to ...,OTG,product
38376,I recently purchased the Portronics Type A to ...,quality,item


In [None]:
# Add a new column 'Usefulness' based on the 'overall_rating' column
# Products rated strictly above this overall rating are labelled 'Useful'
USEFULNESS_RATING_THRESHOLD = 4.2

# Vectorised comparison instead of a per-row Python lambda; a missing rating compares
# as False and is therefore labelled 'Not Useful', exactly as before.
reviews_df['Usefulness'] = (
    reviews_df['overall_rating']
    .gt(USEFULNESS_RATING_THRESHOLD)
    .map({True: 'Useful', False: 'Not Useful'})
)

# Display the updated DataFrame with the 'Usefulness' column
reviews_df[['productID', 'overall_rating', 'Usefulness']].head()  # Display a sample of the result


Unnamed: 0,productID,overall_rating,Usefulness
0,B0DSG8SNXH,4.3,Useful
1,B0DFY3XCB6,4.0,Not Useful
2,B09TVVGXWS,4.1,Not Useful
3,B071Z8M4KX,4.1,Not Useful
4,B0C8JB3G5W,4.2,Not Useful


In [None]:
# Save the result to a CSV file
ground_truth_columns = ['productID', 'overall_rating', 'Usefulness']
output_file_path = '/content/final_amazon_ground_truth_dataa.csv'  # Update this path if needed

# Write only the ground-truth columns, without the row index
ground_truth_df = reviews_df[ground_truth_columns]
ground_truth_df.to_csv(output_file_path, index=False)