In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Cell 1: Libraries and Packages
# Install required packages
!pip install huggingface_hub[hf_xet] rouge-score pandas numpy matplotlib seaborn wordcloud spacy nltk scikit-learn transformers torch tensorflow

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import spacy
from spacy import displacy
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import warnings
warnings.filterwarnings('ignore')

# Transformers and deep learning libraries
import torch
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                         pipeline, RobertaTokenizer, RobertaForQuestionAnswering)
from transformers import logging as transformers_logging
from huggingface_hub import hf_hub_download, login  # Added hf_xet functionality
transformers_logging.set_verbosity_error()

# For summarization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import (Embedding, GRU, Dense, Attention,
                                   Input, Bidirectional, Concatenate)
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

# Import rouge_score
from rouge_score import rouge_scorer

# Download NLTK resources
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

# Load spaCy model
try:
    nlp = spacy.load("en_core_web_lg")
except:
    !python -m spacy download en_core_web_lg
    nlp = spacy.load("en_core_web_lg")

# Optional: Login to Hugging Face Hub if needed
# from huggingface_hub import notebook_login
# notebook_login()

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting hf-xet>=0.1.4 (from huggingface_hub[hf_xet])
  Downloading hf_xet-1.0.3-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (494 bytes)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-non

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Collecting en-core-web-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.7/400.7 MB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [3]:
# Cell 2: Data Processing
# Load dataset
df = pd.read_csv('/content/drive/MyDrive/special project/Reviews.csv')
print(f"Dataset shape: {df.shape}")
print(df.head())

# Basic preprocessing
df = df.dropna(subset=['Text', 'Score'])  # Remove rows with missing text or score
df = df[df['Text'].str.len() > 20]  # Remove very short reviews
df = df.sample(5000, random_state=42)

# Text preprocessing function
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = ''.join([char for char in text if char.isalpha() or char == ' '])
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

# Apply preprocessing
df['Cleaned_Text'] = df['Text'].apply(preprocess_text)
print("Text preprocessing completed.")

Dataset shape: (568454, 10)
   Id   ProductId          UserId                      ProfileName  \
0   1  B001E4KFG0  A3SGXH7AUHU8GW                       delmartian   
1   2  B00813GRG4  A1D87F6ZCVE5NK                           dll pa   
2   3  B000LQOCH0   ABXLMWJIXXAIN  Natalia Corres "Natalia Corres"   
3   4  B000UA0QIQ  A395BORC6FGVXV                             Karl   
4   5  B006K2ZZ7K  A1UQRSCLF8GW1T    Michael D. Bigham "M. Wassir"   

   HelpfulnessNumerator  HelpfulnessDenominator  Score        Time  \
0                     1                       1      5  1303862400   
1                     0                       0      1  1346976000   
2                     1                       1      4  1219017600   
3                     3                       3      2  1307923200   
4                     0                       0      5  1350777600   

                 Summary                                               Text  
0  Good Quality Dog Food  I have bought several of t

In [4]:
# Cell 3: Sentiment Analysis
from tqdm import tqdm
tqdm.pandas()

# Initialize DistilBERT for sentiment analysis
sentiment_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
sentiment_model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
sentiment_pipeline = pipeline("sentiment-analysis", model=sentiment_model, tokenizer=sentiment_tokenizer)

# Function to get sentiment using DistilBERT
def get_sentiment_distilbert(text):
    try:
        result = sentiment_pipeline(text[:512])[0]  # Truncate to 512 tokens
        if result['label'] == 'POSITIVE':
            return result['score']
        else:
            return -result['score']
    except:
        return 0

# Apply sentiment analysis (sample for speed)
sample_size = 2000  # Reduce for faster processing
sentiment_sample = df.sample(sample_size, random_state=42)
sentiment_sample['Sentiment_DistilBERT'] = sentiment_sample['Text'].progress_apply(get_sentiment_distilbert)

# Classify sentiment based on DistilBERT output
sentiment_sample['Sentiment_Label'] = sentiment_sample['Sentiment_DistilBERT'].apply(
    lambda x: 'Positive' if x > 0.33 else ('Negative' if x < -0.33 else 'Neutral'))

# Plot sentiment distribution
plt.figure(figsize=(8, 6))
sns.countplot(data=sentiment_sample, x='Sentiment_Label', order=['Positive', 'Neutral', 'Negative'])
plt.title('Distribution of Sentiment Labels (DistilBERT)')
plt.savefig('sentiment_distribution_distilbert.png')
plt.close()
print("Sentiment analysis completed with DistilBERT.")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

100%|██████████| 2000/2000 [07:07<00:00,  4.68it/s]


Sentiment analysis completed with DistilBERT.


In [5]:
from transformers import pipeline
from collections import defaultdict
import plotly.graph_objects as go
import threading

# Load sentiment analysis pipeline
sentiment_analyzer = pipeline("sentiment-analysis",
                              model="distilbert-base-uncased-finetuned-sst-2-english")

def analyze_review_sentiment(review_text):
    try:
        result = sentiment_analyzer(review_text[:512])[0]

        if result['label'] == 'POSITIVE':
            if result['score'] > 0.66:
                sentiment = "Positive"
            elif result['score'] > 0.33:
                sentiment = "Slightly Positive"
            else:
                sentiment = "Neutral"
        else:  # NEGATIVE
            if result['score'] > 0.66:
                sentiment = "Negative"
            elif result['score'] > 0.33:
                sentiment = "Slightly Negative"
            else:
                sentiment = "Neutral"

        return {
            "review": review_text,
            "sentiment": sentiment,
            "confidence": result['score'],
            "raw_label": result['label']
        }

    except Exception as e:
        return {
            "error": str(e),
            "review": review_text,
            "sentiment": "Analysis Failed"
        }

def plot_sentiment_distribution(reviews_data):
    sentiment_counts = defaultdict(int)

    for review in reviews_data:
        if 'sentiment' in review:
            sentiment_counts[review['sentiment']] += 1

    if not sentiment_counts:
        print("No valid sentiment data to plot.")
        return

    labels = list(sentiment_counts.keys())
    values = list(sentiment_counts.values())

    color_map = {
        'Positive': 'yellow',
        'Slightly Positive': 'limegreen',
        'Neutral': 'gold',
        'Slightly Negative': 'lightcoral',
        'Negative': 'violet',
    }
    colors = [color_map.get(label, 'silver') for label in labels]

    fig = go.Figure(data=[go.Pie(
        labels=labels,
        values=values,
        marker=dict(colors=colors),
        hoverinfo='label+value+percent',
        textinfo='percent',
        insidetextorientation='radial'
    )])

    fig.update_layout(
        title='Sentiment Distribution of Reviews',
        showlegend=True,
        width=400,   # 👈 Adjust width
        height=400
      # 👈 Adjust height
    )


    fig.show()

def threaded_plot(reviews_data):
    thread = threading.Thread(target=plot_sentiment_distribution, args=(reviews_data,))
    thread.start()

# Interactive loop
if __name__ == "__main__":
    print("Sentiment Analyzer is ready! Type 'exit' to quit or 'plot' to view distribution.\n")

    all_reviews = []

    while True:
        user_review = input("Enter the customer review to analyze: ").strip()

        if user_review.lower() in ["exit", "quit"]:
            print("Exiting Sentiment Analyzer. Goodbye!")
            break
        elif user_review.lower() == "plot":
            if all_reviews:
                plot_sentiment_distribution(all_reviews)
                plt.show(block=False)  # Non-blocking show
                plt.pause(0.1)  # Brief pause to let the window appear
            else:
                print("No reviews analyzed yet. Please enter some reviews first.")
            continue

        analysis = analyze_review_sentiment(user_review)
        all_reviews.append(analysis)

        print("\nSentiment Analysis Results:")
        print(f"Review: {analysis['review']}")
        if 'error' in analysis:
            print(f"Error: {analysis['error']}")
        else:
            print(f"Sentiment: {analysis['sentiment']}")
            print(f"Confidence: {analysis['confidence']:.2f}")
            print(f"Raw Label: {analysis['raw_label']}")
        print("-" * 50 + "\n")


Sentiment Analyzer is ready! Type 'exit' to quit or 'plot' to view distribution.

Enter the customer review to analyze: exit
Exiting Sentiment Analyzer. Goodbye!


In [6]:
# Cell 4: Information Extraction
# Information Extraction with spaCy
def extract_entities(text):
    doc = nlp(text)
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    return entities

# Apply NER to a sample of reviews
ner_sample = df.sample(500, random_state=42)
ner_sample['Entities'] = ner_sample['Text'].progress_apply(extract_entities)

# Analyze most common entities
all_entities = [item for sublist in ner_sample['Entities'] for item in sublist]
entity_df = pd.DataFrame(all_entities, columns=['Entity', 'Type'])
top_entities = entity_df['Type'].value_counts().head(5)

plt.figure(figsize=(10, 6))
top_entities.plot(kind='bar')
plt.title('Top 5 Entity Types Extracted')
plt.savefig('top_entity_types.png')
plt.close()

# Initialize RoBERTa for question answering
qa_tokenizer = RobertaTokenizer.from_pretrained("deepset/roberta-base-squad2")
qa_model = RobertaForQuestionAnswering.from_pretrained("deepset/roberta-base-squad2")

def answer_question(context, question):
    inputs = qa_tokenizer(question, context, return_tensors="pt", truncation=True)
    outputs = qa_model(**inputs)
    answer_start = torch.argmax(outputs.start_logits)
    answer_end = torch.argmax(outputs.end_logits) + 1
    answer = qa_tokenizer.convert_tokens_to_string(
        qa_tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][answer_start:answer_end]))
    return answer

# Example QA on reviews
sample_review = df.iloc[0]['Text']
question = "What is the product quality like?"
answer = answer_question(sample_review, question)
print(f"\nQuestion: {question}")
print(f"Answer: {answer}")

100%|██████████| 500/500 [00:13<00:00, 36.32it/s]


tokenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]


Question: What is the product quality like?
Answer: <s>


In [7]:
# # 1. FIRST RUN THIS CELL TO SETUP ENVIRONMENT
# from google.colab import output
# output.enable_custom_widget_manager()
# !pip install -q wordcloud spacy ipywidgets
# !python -m spacy download en_core_web_sm
# print("Environment setup complete! Proceed to the next cell.")

spaCy's default entities focus on:

People (PERSON)

Organizations (ORG)

Locations (LOC)

Dates (DATE)

Numbers (CARDINAL)

Not generic product terms

In [8]:
# word cloud generator
import spacy
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import pandas as pd
from collections import Counter
from IPython.display import display, clear_output
import ipywidgets as widgets

# Load spaCy's English model
nlp = spacy.load("en_core_web_lg")

# Create widgets
review_input = widgets.Textarea(
    placeholder='Enter your review here...',
    description='Review:',
    layout=widgets.Layout(width='80%', height='100px')
)

analyze_btn = widgets.Button(description="Analyze")
clear_btn = widgets.Button(description="Clear")
output_area = widgets.Output()

def analyze_review(b):
    with output_area:
        clear_output(wait=True)
        review_text = review_input.value.strip()

        if not review_text:
            print("Please enter a valid review.")
            return

        print(f"Analyzing review: {review_text[:50]}...\n")

        doc = nlp(review_text)
        entities = [(ent.text, ent.label_) for ent in doc.ents]

        if not entities:
            print("No entities found in this review.")
            return

        # Create visualizations
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 6))

        # Pie chart
        entity_df = pd.DataFrame(entities, columns=['Entity', 'Type'])
        top_entities = entity_df['Type'].value_counts()
        ax1.pie(top_entities, labels=top_entities.index, autopct='%1.1f%%', startangle=90)
        ax1.set_title('Distribution of Entity Types')

        # Word cloud
        wordcloud = WordCloud(width=600, height=400,
                            background_color='white',
                            colormap='viridis').generate(' '.join([ent[0] for ent in entities]))
        ax2.imshow(wordcloud, interpolation='bilinear')
        ax2.axis('off')
        ax2.set_title('Entity Word Cloud')

        plt.tight_layout()
        plt.show()

        # Display entity table
        print("\nExtracted Entities:")
        display(entity_df.style.set_caption("Detailed Entity Information").set_table_styles([
            {'selector': 'caption', 'props': [('font-size', '16px'), ('font-weight', 'bold')]}
        ]))

def clear_all(b):
    review_input.value = ""
    with output_area:
        clear_output()

# Set up event handlers
analyze_btn.on_click(analyze_review)
clear_btn.on_click(clear_all)

# Display the interface
display(widgets.VBox([
    widgets.HBox([review_input]),
    widgets.HBox([analyze_btn, clear_btn]),
    output_area
]))

print("Enter a review and click 'Analyze'. Click 'Clear' to start over.")





VBox(children=(HBox(children=(Textarea(value='', description='Review:', layout=Layout(height='100px', width='8…

Enter a review and click 'Analyze'. Click 'Clear' to start over.


In [9]:
# Cell 5: Text Summarization
# Text Summarization with GRU and Attention
def prepare_summarization_data(texts, max_vocab_size=10000, max_length=100):
    tokenizer = Tokenizer(num_words=max_vocab_size)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')
    return tokenizer, padded_sequences

# Prepare sample data for summarization
summarization_sample = df.sample(100, random_state=42)['Text'].tolist()
tokenizer, X_summary = prepare_summarization_data(summarization_sample)

# Define GRU with Attention model
def build_summarization_model(vocab_size, embedding_dim=256, gru_units=128):
    # Encoder
    encoder_inputs = Input(shape=(None,))
    encoder_embedding = Embedding(vocab_size, embedding_dim)(encoder_inputs)
    encoder_gru = Bidirectional(GRU(gru_units, return_sequences=True, return_state=True))
    encoder_outputs, state_h, state_c = encoder_gru(encoder_embedding)

    encoder_states = [state_h, state_c]

    # Decoder with attention
    decoder_inputs = Input(shape=(None,))
    decoder_embedding = Embedding(vocab_size, embedding_dim)(decoder_inputs)
    decoder_gru = GRU(gru_units * 2, return_sequences=True)
    decoder_outputs = decoder_gru(decoder_embedding, initial_state=encoder_states[0])

    attention = Attention()([decoder_outputs, encoder_outputs])
    decoder_concat = Concatenate()([decoder_outputs, attention])

    decoder_dense = Dense(vocab_size, activation='softmax')
    decoder_outputs = decoder_dense(decoder_concat)

    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
    model.compile(optimizer=Adam(), loss='sparse_categorical_crossentropy')
    return model

# Build and train summarization model (simplified example)
vocab_size = len(tokenizer.word_index) + 1
summary_model = build_summarization_model(vocab_size)

print("Summarization model architecture created.")

Summarization model architecture created.


In [10]:
# Cell 6: Topic Modeling and Clustering
# TF-IDF Vectorization for traditional analysis
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=1000)
tfidf = tfidf_vectorizer.fit_transform(df['Cleaned_Text'])

# LDA for Topic Modeling
lda = LatentDirichletAllocation(n_components=5, random_state=42)
lda.fit(tfidf)

# Display topics
def display_topics(model, feature_names, no_top_words):
    topics = {}
    for topic_idx, topic in enumerate(model.components_):
        topics[f"Topic {topic_idx+1}"] = [feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]
    return topics

feature_names = tfidf_vectorizer.get_feature_names_out()
topics = display_topics(lda, feature_names, 10)

# Print topics
print("\nDiscovered Topics:")
for topic, words in topics.items():
    print(f"{topic}: {', '.join(words)}")

# Assign dominant topic to each review
topic_results = lda.transform(tfidf)
df['Dominant_Topic'] = topic_results.argmax(axis=1)

# K-means clustering
kmeans = KMeans(n_clusters=3, random_state=42)
clusters = kmeans.fit_predict(tfidf)
df['Cluster'] = clusters


Discovered Topics:
Topic 1: cooky, store, popcorn, amazon, find, chip, great, love, product, best
Topic 2: dog, food, cat, treat, love, product, one, br, like, get
Topic 3: coffee, tea, cup, flavor, taste, like, good, one, drink, strong
Topic 4: taste, like, br, tea, flavor, good, sugar, great, bar, love
Topic 5: product, great, amazon, price, shipping, candy, order, gift, good, arrived


In [13]:
# Cell 7: Simplified Power BI Dashboard Preparation
def prepare_powerbi_data(df, sentiment_sample):
    # Select essential columns and merge sentiment data
    dashboard_data = df[['Id', 'ProductId', 'UserId', 'Score', 'Time', 'Text','Cleaned_Text' ]].copy()

    # Convert Unix timestamp to datetime
    dashboard_data['ReviewDate'] = pd.to_datetime(dashboard_data['Time'], unit='s')

    # Add key metrics
    dashboard_data['ReviewLength'] = dashboard_data['Text'].str.len()
    dashboard_data['WordCount'] = dashboard_data['Text'].str.split().str.len()

    # Merge sentiment analysis results
    sentiment_sample = sentiment_sample[['Sentiment_Label', 'Sentiment_DistilBERT']]
    dashboard_data = dashboard_data.merge(
        sentiment_sample,
        left_index=True,
        right_index=True,
        how='left'
    )

    # Add topic information
    dashboard_data['DominantTopic'] = df['Dominant_Topic']
    topic_names = {
        0: "Snacks & Grocery",
        1: "Pet Food & Supplies",
        2: "Coffee & Tea",
        3: "Food Taste & Flavor",
        4: "Product Quality"
    }
    dashboard_data['Topic'] = dashboard_data['DominantTopic'].map(topic_names)

    # Add cluster information
    dashboard_data['Cluster'] = df['Cluster']
    cluster_names = {
        0: "Positive Experience",
        1: "Mixed Reviews",
        2: "Negative Feedback"
    }
    dashboard_data['ReviewType'] = dashboard_data['Cluster'].map(cluster_names)

    # Calculate helpfulness ratio (modified)
    if 'HelpfulnessNumerator' in df.columns and 'HelpfulnessDenominator' in df.columns:
        dashboard_data['HelpfulnessRatio'] = (
            df['HelpfulnessNumerator'] / df['HelpfulnessDenominator']
        ).replace([np.inf, np.nan], 0)
    else:
        # If the columns are not present, create a placeholder 'HelpfulnessRatio' column with default value 0
        dashboard_data['HelpfulnessRatio'] = 0

    # Add time-based features
    dashboard_data['ReviewYear'] = dashboard_data['ReviewDate'].dt.year
    dashboard_data['ReviewMonth'] = dashboard_data['ReviewDate'].dt.month_name()

    return dashboard_data[['Id', 'ProductId', 'UserId', 'Score', 'ReviewDate',
                         'ReviewLength', 'WordCount', 'Sentiment_Label',
                         'Sentiment_DistilBERT', 'Topic', 'ReviewType','Cluster','Cleaned_Text',
                         'HelpfulnessRatio', 'ReviewYear', 'ReviewMonth']]

# Generate dashboard data
powerbi_data = prepare_powerbi_data(df, sentiment_sample)

# Save to CSV
powerbi_data.to_csv('amazon_reviews_powerbi.csv', index=False)
print("Power BI data prepared with columns:", powerbi_data.columns.tolist())

Power BI data prepared with columns: ['Id', 'ProductId', 'UserId', 'Score', 'ReviewDate', 'ReviewLength', 'WordCount', 'Sentiment_Label', 'Sentiment_DistilBERT', 'Topic', 'ReviewType', 'Cluster', 'Cleaned_Text', 'HelpfulnessRatio', 'ReviewYear', 'ReviewMonth']
