In [1]:
import pandas as pd
import numpy as np
import re
import nltk
#Preprocessing: tokenization and lemmatization
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tokenize import PunktSentenceTokenizer
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
sent_tokenizer = PunktSentenceTokenizer()

#Sentiment Analysis with VADER
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\abdar\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
reviews_df = pd.read_csv('./data/reviews.csv')

In [3]:
def sentiment_preprocessor(raw_text, lowercase=True, leave_punctuation = False, lemmatization=True, tokenized_output=True, sentence_output=True):
    

    # Convert to lowercase if specified
    if lowercase:
        clean_text = raw_text.lower()
    else:
        clean_text = raw_text
    
    # Remove newline characters
    clean_text = re.sub(r'(\*|\\n|\\r|\\t|</?ul>|</?li>)', ' ', clean_text)

    # Remove punctuation if specified
    if not leave_punctuation:
        clean_text = re.sub(r'(\W)', ' ', clean_text)

    # Remove URLs
    clean_text = re.sub(r'(http\S+|www\S+)', ' ', clean_text)

    # Remove isolated consonants
    clean_text = re.sub(r'\b([^aeiou\s])\b', ' ', clean_text)

    # Tokenize
    clean_text = word_tokenize(clean_text)

    # Lemmatize if specified
    if lemmatization:
        clean_text = [lemmatizer.lemmatize(token, pos='v') for token in clean_text]

    # Re-join if tokenized output is not requested
    if not tokenized_output:
        clean_text = " ".join(clean_text)
        # Remove space before punctuation
        clean_text = re.sub(r'(\s)(?!\w)', '', clean_text)

    # Join sentences into a single string if specified
    if sentence_output and not tokenized_output:
        clean_text = " ".join(sent_tokenize(clean_text))

    return clean_text

In [4]:
reviews_df['CleanReview'] = reviews_df['Review'].apply(lambda review: sentiment_preprocessor(
    review, lowercase = False, 
    leave_punctuation = True, 
    lemmatization=False, 
    tokenized_output=False))

In [5]:
vader = SentimentIntensityAnalyzer()

In [6]:
# Analyse polarity and add results to dataframe
reviews_df['Vader'] = reviews_df['CleanReview'].apply(lambda x: vader.polarity_scores(x))
reviews_df['Negative_vader'] = reviews_df['Vader'].apply(lambda x: x['neg'])
reviews_df['Neutral_vader'] = reviews_df['Vader'].apply(lambda x: x['neu'])
reviews_df['Positive_vader'] = reviews_df['Vader'].apply(lambda x: x['pos'])
reviews_df['Compound_vader'] = reviews_df['Vader'].apply(lambda x: x['compound'])

In [7]:
# Drop column with polarity scores
reviews_df.drop('Vader', axis=1, inplace=True)

In [8]:
# Name of the columns related with vader
vader_cols = ['Negative_vader', 'Neutral_vader', 'Positive_vader', 'Compound_vader']

In [9]:
reviews_df[vader_cols].describe()

Unnamed: 0,Negative_vader,Neutral_vader,Positive_vader,Compound_vader
count,587.0,587.0,587.0,587.0
mean,0.008245,0.859063,0.131,0.469345
std,0.01445,0.152677,0.141421,0.481055
min,0.0,0.0,0.0,-0.5267
25%,0.0,0.719,0.0,0.0
50%,0.0,0.947,0.0,0.0
75%,0.018,1.0,0.2605,0.96875
max,0.094,1.0,0.677,0.9968


In [10]:
# Define features and target variable
X = reviews_df[['OverallRating', 'CleanReview'] + vader_cols]
y = reviews_df['Rating']

In [11]:
# Split the data into training, validation, and testing sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [12]:
# Create a ColumnTransformer to handle different feature types
# In this case, we'll use TF-IDF for text (customer reviews) and passthrough for other variables
preprocessor = ColumnTransformer(
    transformers=[
        ('text', TfidfVectorizer(), 'CleanReview'),
        ('numeric', 'passthrough', ['OverallRating', 'Negative_vader', 'Neutral_vader', 'Positive_vader', 'Compound_vader'])
    ])

In [13]:
# Combine the preprocessor with a regressor (RandomForestRegressor in this case)
model = RandomForestRegressor(n_estimators=100, random_state=42)

In [14]:
# Create a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model)
])

In [15]:
# Fit the model on training data
pipeline.fit(X_train, y_train)

# Make predictions on validation set
y_val_pred = pipeline.predict(X_val)

# Evaluate the model on validation set
rmse_val = np.sqrt(mean_squared_error(y_val, y_val_pred))
print(f'Root Mean Squared Error on Validation Set: {rmse_val}')

# Make predictions on test set
y_test_pred = pipeline.predict(X_test)

# Evaluate the model on test set
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
print(f'Root Mean Squared Error on Test Set: {rmse_test}')