In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Download required NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/anithasmac/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/anithasmac/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/anithasmac/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Initialize stopwords and lemmatizer once
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [4]:
def clean_text(text):
    """
    Clean text by removing URLs, emails, phone numbers, special characters, and digits.
    """
    if pd.isna(text) or not isinstance(text, str):
        return ""
    text = re.sub(r'http\S+|www\S+|\S+@\S+|\d{3}[-.\s]??\d{3}[-.\s]??\d{4}', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text).lower().strip()
    return text

In [5]:
def preprocess_text(text):
    """
    Tokenize, remove stopwords, and lemmatize the text.
    """
    text = clean_text(text)
    tokens = word_tokenize(text)
    return ' '.join(lemmatizer.lemmatize(token) for token in tokens if token not in stop_words)

In [6]:
def vader_sentiment_analysis(texts):
    """
    Apply VADER sentiment analysis to texts.
    """
    analyzer = SentimentIntensityAnalyzer()
    return pd.DataFrame([analyzer.polarity_scores(str(text)) for text in texts])

In [7]:
def bert_sentiment_analysis(texts, batch_size=32):
    """
    Apply BERT sentiment analysis to texts.
    """
    tokenizer = AutoTokenizer.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")
    model = AutoModelForSequenceClassification.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment").to("cuda" if torch.cuda.is_available() else "cpu")
    
    cleaned_texts = [clean_text(text) for text in texts if clean_text(text)]
    results = []
    for i in tqdm(range(0, len(cleaned_texts), batch_size), desc="BERT Analysis"):
        inputs = tokenizer(cleaned_texts[i:i + batch_size], padding=True, truncation=True, return_tensors="pt", max_length=512)
        inputs = {k: v.to(model.device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
        results.extend(torch.softmax(outputs.logits, dim=1).cpu().numpy())
    
    return pd.DataFrame(results, columns=['score_1', 'score_2', 'score_3', 'score_4', 'score_5'])

In [8]:
def main(df):
    """
    Main function to preprocess text and perform sentiment analysis.
    """
    print("0. Initial data cleaning...")
    df = df.dropna(subset=['review_content'])
    
    print("\n1. Preprocessing text...")
    df['normalized_review'] = df['review_content'].apply(preprocess_text)
    
    print("\n2. Performing VADER sentiment analysis...")
    vader_results = vader_sentiment_analysis(df['review_content'])
    
    print("\n3. Performing BERT sentiment analysis...")
    bert_results = bert_sentiment_analysis(df['review_content'])
    
    return vader_results, bert_results

In [9]:
# Load DataFrame
df = pd.read_csv('/Users/anithasmac/Projects/CustomerJourneyMapping/Featured_Amazon_Data.csv')

In [10]:
# Run the analysis
vader_results, bert_results = main(df)

0. Initial data cleaning...

1. Preprocessing text...

2. Performing VADER sentiment analysis...

3. Performing BERT sentiment analysis...


BERT Analysis: 100%|████████████████████████████| 46/46 [16:45<00:00, 21.85s/it]


In [11]:
vader_results

Unnamed: 0,neg,neu,pos,compound
0,0.031,0.754,0.215,0.8974
1,0.010,0.830,0.160,0.9853
2,0.000,0.651,0.349,0.7089
3,0.027,0.807,0.165,0.8316
4,0.084,0.725,0.191,0.9955
...,...,...,...,...
1460,0.044,0.742,0.214,0.9390
1461,0.090,0.736,0.174,0.9473
1462,0.081,0.727,0.192,0.9682
1463,0.060,0.884,0.056,-0.2960


In [12]:
bert_results

Unnamed: 0,score_1,score_2,score_3,score_4,score_5
0,0.009876,0.031780,0.178236,0.496324,0.283783
1,0.054104,0.167403,0.299809,0.345163,0.133522
2,0.006395,0.039326,0.354691,0.526653,0.072936
3,0.050870,0.163608,0.407641,0.312520,0.065362
4,0.084407,0.286777,0.384614,0.201704,0.042498
...,...,...,...,...,...
1460,0.029028,0.214520,0.611890,0.133024,0.011538
1461,0.057944,0.260241,0.534737,0.133147,0.013931
1462,0.118826,0.333181,0.372364,0.146828,0.028802
1463,0.021912,0.143006,0.553718,0.254252,0.027111
