In [34]:
import pandas as pd
import swifter
import numpy as np
import time

import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn.compose import make_column_selector
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC

from sklearn.metrics import confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.preprocessing import MinMaxScaler


from transformers import pipeline
from transformers import BertTokenizer





from sklearn.ensemble import RandomForestClassifier


# Notebook-wide plotting defaults: seaborn white-grid theme, then a wide default figure.
sns.set_theme(style="whitegrid")
plt.rcParams.update({'figure.figsize': [12, 5]})


In [11]:
# Read the review spreadsheet (path is relative to the notebook's working directory).
df = pd.read_excel(io="./Data/YelpReviews.xlsx")

# Text processing

In [12]:
# Drop the double quotes wrapping each review, then derive simple length features
# from the cleaned text.
cleaned_text = df['review_text'].str.strip('"')
df['review_text'] = cleaned_text

# Number of characters per review.
df['text_character_length'] = cleaned_text.str.len()

# Number of whitespace-separated words per review.
df['text_word_count'] = cleaned_text.str.split().str.len()

In [44]:
# Load the sentiment classifier once; every later cell reuses this pipeline object.
model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"
sentiment_pipeline = pipeline("sentiment-analysis", model=model_name)

# Count tokens with the tokenizer that ships with the model itself.  A separate
# 'bert-base-uncased' tokenizer splits text differently from this RoBERTa model,
# so its counts cannot be compared against the model's input-length limit.
tokenizer = sentiment_pipeline.tokenizer

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use mps:0


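Tokens are the smallest units of text that the model can process. They can be whole words, parts of words, or punctuation marks. For example, the word "tokenization" might be split into the pieces "token" and "ization" (the exact pieces, and any marker characters attached to them, depend on the tokenizer). Tokenization is how a language model reads text.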


In [36]:
def count_tokens(text):
    """Return how many tokens the notebook-level ``tokenizer`` produces for ``text``.

    Args:
        text: The review text. Anything that is not a string (for example the
            NaN pandas stores for an empty spreadsheet cell) is treated as
            having no tokens instead of raising inside the tokenizer.

    Returns:
        int: Number of tokens, or 0 for non-string input.
    """
    if not isinstance(text, str):
        return 0
    return len(tokenizer.tokenize(text))

# Token count per review, computed element-wise with count_tokens.
df['token_count'] = df['review_text'].map(count_tokens)

In [81]:
def analyze_long_text(text, chunk_size=512):
    """Classify the sentiment of a text of any length by scoring it chunk by chunk.

    The text is tokenized with the pipeline's own tokenizer, cut into
    consecutive token windows, and each window is classified separately by
    ``sentiment_pipeline``.  The per-chunk winners are then combined into a
    single label and score.

    Args:
        text: The text to classify.  Non-string or blank input yields the
            neutral fallback without calling the model.
        chunk_size: Maximum number of tokens the model receives per chunk,
            counting the special tokens the tokenizer adds around the text.

    Returns:
        dict: ``{"label": <"positive" | "negative" | "neutral">, "score": <float>}``.
        The label is the one with the largest total confidence across chunks,
        so a label that wins many chunks outweighs a single confident outlier.
        The score is the mean confidence of the chunks that predicted that
        label.  ``{"label": "neutral", "score": 0}`` is returned when there is
        nothing to classify.
    """
    # Missing or blank reviews carry no sentiment signal; skip the model entirely.
    if not isinstance(text, str) or not text.strip():
        return {"label": "neutral", "score": 0}

    pipe_tokenizer = sentiment_pipeline.tokenizer
    tokens = pipe_tokenizer.tokenize(text)

    # The pipeline re-tokenizes every chunk and wraps it in special tokens, so
    # leave room for them; otherwise a full chunk overflows the input limit.
    special_count = pipe_tokenizer.num_special_tokens_to_add()
    window = max(1, chunk_size - special_count)
    max_length = window + special_count

    chunks = [tokens[start:start + window] for start in range(0, len(tokens), window)]

    # "neutral" is listed first so that it wins when every total is equal
    # (including the case where no chunk produced a known label).
    scores_by_label = {"neutral": [], "positive": [], "negative": []}

    for chunk in chunks:
        chunk_text = pipe_tokenizer.convert_tokens_to_string(chunk)
        # Truncation is a safety net: decoding and re-encoding a chunk can
        # produce a slightly different token count than the original slice.
        result = sentiment_pipeline(chunk_text, truncation=True, max_length=max_length)[0]
        if result['label'] in scores_by_label:
            scores_by_label[result['label']].append(result['score'])

    # Pick the label by total confidence so the number of chunks matters.
    label_totals = {label: sum(scores) for label, scores in scores_by_label.items()}
    best_label = max(label_totals, key=label_totals.get)
    best_scores = scores_by_label[best_label]

    if not best_scores:
        return {"label": "neutral", "score": 0}

    return {"label": best_label, "score": sum(best_scores) / len(best_scores)}

In [85]:
# Classify only the first 100 reviews; each result is a {'label', 'score'} dict.
sentiment_results = df['review_text'].head(100).apply(analyze_long_text)

# Keep just the confidence value of each result.
sentiment_scores = [entry['score'] for entry in sentiment_results]

# Stretch the confidences linearly onto the 0-5 range.
# NOTE(review): this rescales the confidence of whichever label won, so a very
# confident "negative" lands near 5 as well - confirm that is intended before
# treating the value as a rating.
scaler = MinMaxScaler(feature_range=(0, 5))
scaled_scores = scaler.fit_transform(np.array(sentiment_scores)[:, np.newaxis]).flatten()


In [86]:
# Display the confidence scores after rescaling to the 0-5 range.
scaled_scores

array([4.95190664, 3.85735126, 1.51922227, 4.96481957, 2.06838219,
       4.76908126, 4.82740587, 1.80018865, 4.91906629, 4.58837951,
       4.991626  , 4.5708714 , 4.84628669, 4.47354908, 2.90531688,
       4.30898103, 4.80167933, 4.97980848, 4.94622006, 4.73906554,
       4.95177919, 2.77305239, 4.80755595, 4.80813121, 4.67604412,
       2.87974708, 2.33921147, 4.98449324, 2.78692121, 2.91878555,
       4.87393133, 4.9300944 , 2.74927956, 4.95803471, 2.42187326,
       4.90947747, 2.06683209, 4.90559475, 4.93129659, 4.62973395,
       4.63044412, 4.84849587, 1.05208631, 4.8886395 , 4.67932115,
       4.82213782, 4.73737937, 2.44178289, 5.        , 3.38818925,
       4.59366422, 4.78785702, 4.91370351, 4.94624016, 4.87841801,
       4.09943705, 1.41233124, 4.93303386, 2.21821852, 4.87527073,
       4.83094412, 3.98005104, 1.9813405 , 0.28253744, 2.17975245,
       4.69444728, 4.74267613, 4.4788688 , 4.83164855, 3.9661627 ,
       4.49953339, 4.28522543, 4.59744417, 1.69211236, 4.26376

In [87]:
# Display the label/score dict that analyze_long_text returned for each review.
sentiment_results

0     {'label': 'positive', 'score': 0.9849315881729...
1     {'label': 'positive', 'score': 0.8712942004203...
2      {'label': 'neutral', 'score': 0.628548264503479}
3     {'label': 'positive', 'score': 0.9862722158432...
4     {'label': 'neutral', 'score': 0.6855623722076416}
                            ...                        
95    {'label': 'positive', 'score': 0.9829940795898...
96    {'label': 'positive', 'score': 0.9536482095718...
97    {'label': 'negative', 'score': 0.5795754194259...
98    {'label': 'positive', 'score': 0.7495265603065...
99    {'label': 'positive', 'score': 0.9830553531646...
Name: review_text, Length: 100, dtype: object