In [1]:
import pandas as pd
import numpy as np
import nltk
# Project-local cleaner; later cells use it to build the 'CleanReview' column.
from data_utils import sentiment_preprocessor
# Preprocessing helpers: tokenization and lemmatization.
# NOTE(review): lemmatizer and sent_tokenizer are instantiated here but are not
# referenced by any cell visible in this notebook — confirm whether they are still needed.
from nltk.tokenize import PunktSentenceTokenizer
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
sent_tokenizer = PunktSentenceTokenizer()

# Sentiment analysis with VADER: download the 'vader_lexicon' NLTK package
# (the cell output reports it as already up-to-date when it is present).
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# scikit-learn pieces used for splitting, feature extraction, modelling and scoring.
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\abdar\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
# Location of the raw reviews table, relative to the notebook's working directory.
REVIEWS_CSV = './data/reviews.csv'
reviews_df = pd.read_csv(REVIEWS_CSV)

In [3]:
def _clean_review(review):
    """Run one raw review through the project cleaner with this notebook's settings.

    The meaning of each flag is defined in data_utils.sentiment_preprocessor,
    which is not visible from this notebook.
    """
    return sentiment_preprocessor(
        review,
        lowercase=False,
        leave_punctuation=True,
        lemmatization=False,
        tokenized_output=False,
    )


# Cleaned text lives alongside the original 'Review' column.
reviews_df['CleanReview'] = reviews_df['Review'].apply(_clean_review)

In [4]:
# Single VADER analyzer instance; the next cell calls vader.polarity_scores on every cleaned review.
vader = SentimentIntensityAnalyzer()

In [5]:
# Score each cleaned review with VADER, keep the per-review score dict in 'Vader',
# then copy every entry of that dict into its own column.
reviews_df['Vader'] = reviews_df['CleanReview'].apply(vader.polarity_scores)
for score_key, column_name in (
    ('neg', 'Negative_vader'),
    ('neu', 'Neutral_vader'),
    ('pos', 'Positive_vader'),
    ('compound', 'Compound_vader'),
):
    # key=score_key binds the current key at definition time (avoids late binding).
    reviews_df[column_name] = reviews_df['Vader'].apply(lambda scores, key=score_key: scores[key])

In [6]:
# Drop the intermediate column of polarity-score dicts now that each score has
# its own column. Written without inplace=True and with errors='ignore' so that
# re-running this cell (after 'Vader' is already gone) is a no-op instead of a KeyError.
reviews_df = reviews_df.drop(columns='Vader', errors='ignore')

In [7]:
# Names of the four VADER-derived columns, in the order they were added to the frame.
vader_cols = [f'{label}_vader' for label in ('Negative', 'Neutral', 'Positive', 'Compound')]

In [8]:
# Summary statistics for the VADER score columns only.
reviews_df.loc[:, vader_cols].describe()

Unnamed: 0,Negative_vader,Neutral_vader,Positive_vader,Compound_vader
count,587.0,587.0,587.0,587.0
mean,0.008245,0.859063,0.131,0.469345
std,0.01445,0.152677,0.141421,0.481055
min,0.0,0.0,0.0,-0.5267
25%,0.0,0.719,0.0,0.0
50%,0.0,0.947,0.0,0.0
75%,0.018,1.0,0.2605,0.96875
max,0.094,1.0,0.677,0.9968


In [9]:
# Model inputs: the cleaned review text plus the four VADER scores; target: the rating.
feature_columns = ['CleanReview', *vader_cols]
X = reviews_df[feature_columns]
y = reviews_df['Rating']

In [10]:
# Two-stage split: hold out 30% of the rows, then halve that hold-out into
# validation and test parts (70% / 15% / 15% overall). Both stages share one seed.
SPLIT_SEED = 42
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=SPLIT_SEED
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=SPLIT_SEED
)

In [11]:
# Per-column feature handling:
#   - 'text':    TF-IDF over the cleaned review text (single column name, not a list)
#   - 'numeric': the VADER score columns, forwarded unchanged
text_branch = ('text', TfidfVectorizer(), 'CleanReview')
numeric_branch = (
    'numeric',
    'passthrough',
    ['Negative_vader', 'Neutral_vader', 'Positive_vader', 'Compound_vader'],
)
preprocessor = ColumnTransformer(transformers=[text_branch, numeric_branch])

In [12]:
# Regressor that will sit behind the preprocessor: a 100-tree random forest with a fixed seed.
forest_settings = {'n_estimators': 100, 'random_state': 42}
model = RandomForestRegressor(**forest_settings)

In [13]:
# Chain feature extraction and the regressor so both are fit and applied together.
pipeline_steps = [('preprocessor', preprocessor), ('model', model)]
pipeline = Pipeline(steps=pipeline_steps)

In [14]:
# Train on the training split only.
pipeline.fit(X_train, y_train)

# Predict the held-out splits (validation first, then test).
y_val_pred, y_test_pred = (pipeline.predict(split) for split in (X_val, X_test))

# RMSE = square root of the mean squared error, computed per split.
rmse_val = np.sqrt(mean_squared_error(y_val, y_val_pred))
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))

print(f'Root Mean Squared Error on Validation Set: {rmse_val}')
print(f'Root Mean Squared Error on Test Set: {rmse_test}')

Root Mean Squared Error on Validation Set: 0.8783403882905056
Root Mean Squared Error on Test Set: 1.0111079653124897


# Tests for chatbot

In [16]:
import pickle
from pathlib import Path

path = "./models/rating_rf_model.pkl"

# open(..., 'wb') does not create missing directories, so make sure the target
# folder exists first; otherwise a fresh checkout fails with FileNotFoundError.
Path(path).parent.mkdir(parents=True, exist_ok=True)

# Serialize the whole fitted pipeline (preprocessor + regressor) in one file.
# NOTE: pickle files execute code on load — only unpickle this file from a trusted source.
with open(path, 'wb') as file:
    pickle.dump(pipeline, file)

print(f"✅ File {path} was saved successfully")

✅ File ./models/rating_rf_model.pkl was saved successfully


In [1]:
import pandas as pd
import numpy as np
import nltk
from data_utils import sentiment_preprocessor
from nltk.tokenize import PunktSentenceTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Preprocessing helpers: lemmatizer and sentence tokenizer instances
lemmatizer = WordNetLemmatizer()
sent_tokenizer = PunktSentenceTokenizer()

# VADER lexicon download (needed by the sentiment analyzer created below)
nltk.download('vader_lexicon')

# Load the reviews and derive the cleaned text column
cleaner_options = {
    'lowercase': False,
    'leave_punctuation': True,
    'lemmatization': False,
    'tokenized_output': False,
}
reviews_df = pd.read_csv('./data/reviews.csv')
reviews_df['CleanReview'] = reviews_df['Review'].apply(
    lambda review: sentiment_preprocessor(review, **cleaner_options)
)

# Score every cleaned review once, fan the score dict out into one column per
# entry, then remove the temporary dict column again.
vader = SentimentIntensityAnalyzer()
score_columns = {
    'Negative_vader': 'neg',
    'Neutral_vader': 'neu',
    'Positive_vader': 'pos',
    'Compound_vader': 'compound',
}
reviews_df['Vader'] = reviews_df['CleanReview'].apply(vader.polarity_scores)
for column_name, score_key in score_columns.items():
    # key=score_key binds the current key at definition time (avoids late binding).
    reviews_df[column_name] = reviews_df['Vader'].apply(lambda scores, key=score_key: scores[key])
del reviews_df['Vader']

# Names of the VADER-derived columns, in insertion order
vader_cols = list(score_columns)

# Features (cleaned text + VADER scores) and target (rating)
feature_columns = ['CleanReview', *vader_cols]
X = reviews_df[feature_columns]
y = reviews_df['Rating']

# 70% train; the remaining 30% is halved into validation and test
split_seed = 42
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=split_seed
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=split_seed
)

# TF-IDF for the text column, pass-through for the numeric VADER columns
text_branch = ('text', TfidfVectorizer(), 'CleanReview')
numeric_branch = (
    'numeric',
    'passthrough',
    ['Negative_vader', 'Neutral_vader', 'Positive_vader', 'Compound_vader'],
)
preprocessor = ColumnTransformer(transformers=[text_branch, numeric_branch])

# Random forest regressor with a fixed seed
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Preprocessing and regression chained into a single estimator
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])

# Fit on the training split only
pipeline.fit(X_train, y_train)

# RegressorWrapper for the RandomForestRegressor model
class RegressorWrapper:
    """Prediction facade around a fitted estimator that accepts a DataFrame.

    Parameters
    ----------
    model:
        Fitted estimator exposing ``predict(DataFrame)``. In this notebook it is
        the full Pipeline (ColumnTransformer + RandomForestRegressor), so it does
        its own feature extraction and must be given the untransformed columns.
    features:
        Names of the feature columns. May or may not include ``text_column``;
        it is de-duplicated when the prediction frame is assembled.
    text_column:
        Name of the column holding the review text.
    """

    def __init__(self, model, features, text_column):
        self.model = model
        self.features = features
        self.text_column = text_column

    def predict(self, x_observation: pd.DataFrame) -> float:
        """Return the prediction for the first row of ``x_observation``.

        The previous implementation vectorized the text by hand, dropped the
        first entry of ``features`` via ``features[1:]`` and handed a raw array
        to ``self.model``. Because ``self.model`` is the whole pipeline, that
        array would have been pushed through the name-based ColumnTransformer a
        second time. The pipeline already knows how to transform its inputs, so
        the correct call is to pass it the named columns directly.

        Raises
        ------
        KeyError
            If ``x_observation`` lacks the text column or any feature column.
        """
        # Text column first, then every other feature exactly once.
        columns = [self.text_column] + [
            name for name in self.features if name != self.text_column
        ]
        result = self.model.predict(x_observation[columns])
        return result[0]

    def _preprocess_text(self, text_data):
        """Vectorize ``text_data`` with the pipeline's fitted 'text' transformer.

        Not used by ``predict``; kept for callers that want the text features
        on their own. Requires ``self.model`` to be a fitted Pipeline with a
        'preprocessor' step containing a 'text' transformer.
        """
        return self.model.named_steps['preprocessor'].named_transformers_['text'].transform(text_data)

    def prediction_needs(self, verbosity=True):
        """Describe the required inputs: a sentence if ``verbosity``, else the feature list."""
        if verbosity:
            return f"You need to provide the values of {self.features} to get a prediction."
        else:
            return self.features

# Wrap the fitted pipeline for single-observation predictions
wrapper_settings = {
    'model': pipeline,
    'features': vader_cols,
    'text_column': 'CleanReview',
}
regressor_wrapper = RegressorWrapper(**wrapper_settings)

# Score the validation split: predict, then take the root of the mean squared error
y_val_pred = pipeline.predict(X_val)
mse_val = mean_squared_error(y_val, y_val_pred)
rmse_val = np.sqrt(mse_val)
print(f'Root Mean Squared Error on Validation Set: {rmse_val}')

# Same procedure for the test split
y_test_pred = pipeline.predict(X_test)
mse_test = mean_squared_error(y_test, y_test_pred)
rmse_test = np.sqrt(mse_test)
print(f'Root Mean Squared Error on Test Set: {rmse_test}')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\abdar\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Root Mean Squared Error on Validation Set: 0.8783403882905056
Root Mean Squared Error on Test Set: 1.0111079653124897


In [1]:
import pandas as pd
import numpy as np
import nltk
from data_utils import sentiment_preprocessor
from nltk.tokenize import PunktSentenceTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Preprocessing helpers: lemmatizer and sentence tokenizer instances
lemmatizer = WordNetLemmatizer()
sent_tokenizer = PunktSentenceTokenizer()

# VADER lexicon download (needed by the sentiment analyzer created below)
nltk.download('vader_lexicon')


def _clean(review):
    """Apply the project cleaner to one review with this notebook's settings."""
    return sentiment_preprocessor(
        review,
        lowercase=False,
        leave_punctuation=True,
        lemmatization=False,
        tokenized_output=False,
    )


# Load the reviews and derive the cleaned text column
reviews_df = pd.read_csv('./data/reviews.csv')
reviews_df['CleanReview'] = reviews_df['Review'].apply(_clean)

# VADER polarity: one dict per cleaned review, expanded into four numeric columns;
# the temporary dict column is removed afterwards.
vader = SentimentIntensityAnalyzer()
reviews_df['Vader'] = reviews_df['CleanReview'].apply(vader.polarity_scores)
for score_key, column_name in (
    ('neg', 'Negative_vader'),
    ('neu', 'Neutral_vader'),
    ('pos', 'Positive_vader'),
    ('compound', 'Compound_vader'),
):
    # key=score_key binds the current key at definition time (avoids late binding).
    reviews_df[column_name] = reviews_df['Vader'].apply(lambda scores, key=score_key: scores[key])
del reviews_df['Vader']

# Names of the VADER-derived columns
vader_cols = ['Negative_vader', 'Neutral_vader', 'Positive_vader', 'Compound_vader']

# Features and target.
# NOTE(review): the text feature here is the raw 'Review' column, while the VADER
# scores above were computed from 'CleanReview' — confirm this mix is intended.
feature_columns = ['Review', *vader_cols]
X = reviews_df[feature_columns]
y = reviews_df['Rating']

# 70% train; the remaining 30% is halved into validation and test
split_seed = 42
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=split_seed
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=split_seed
)

# TF-IDF for the text column, pass-through for the numeric VADER columns
text_branch = ('text', TfidfVectorizer(), 'Review')
numeric_branch = (
    'numeric',
    'passthrough',
    ['Negative_vader', 'Neutral_vader', 'Positive_vader', 'Compound_vader'],
)
preprocessor = ColumnTransformer(transformers=[text_branch, numeric_branch])

# Random forest regressor with a fixed seed
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Preprocessing and regression chained into a single estimator
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])

# Fit on the training split only
pipeline.fit(X_train, y_train)

# Predict the held-out splits (validation first, then test)
y_val_pred = pipeline.predict(X_val)
y_test_pred = pipeline.predict(X_test)

# RMSE per split
rmse_val = np.sqrt(mean_squared_error(y_val, y_val_pred))
print(f'Root Mean Squared Error on Validation Set: {rmse_val}')
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
print(f'Root Mean Squared Error on Test Set: {rmse_test}')


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\abdar\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Root Mean Squared Error on Validation Set: 0.8764622968070616
Root Mean Squared Error on Test Set: 1.0115388809825356


In [2]:
# RegressorWrapper for the RandomForestRegressor model
class RegressorWrapper:
    """Predict a rating from raw review text using a fitted pipeline.

    Parameters
    ----------
    model:
        Fitted estimator exposing ``predict(DataFrame)``; expected to take a
        text column plus the four ``*_vader`` score columns.
    text_column:
        Name of the text column the model was fit on (``'Review'`` in this
        notebook). Previously stored but ignored in favour of a hard-coded name.
    preprocess:
        Optional callable ``str -> str`` used to clean the review before VADER
        scoring. Defaults to the notebook's ``sentiment_preprocessor`` with the
        same flags used when building the training data.
    analyzer:
        Optional object exposing ``polarity_scores(str) -> dict`` with keys
        'neg', 'neu', 'pos' and 'compound'. Defaults to the notebook-level
        ``vader`` analyzer, looked up when ``predict`` is called.
    """

    def __init__(self, model, text_column, preprocess=None, analyzer=None):
        self.model = model
        self.text_column = text_column
        self._preprocess = preprocess
        self._analyzer = analyzer

    def predict(self, raw_review: str) -> float:
        """Return the predicted rating for a single raw review string."""
        # Clean the text exactly as the training data was cleaned.
        if self._preprocess is not None:
            clean_review = self._preprocess(raw_review)
        else:
            clean_review = sentiment_preprocessor(
                raw_review, lowercase=False, leave_punctuation=True, lemmatization=False, tokenized_output=False
            )

        # Score once and reuse the dict (previously recomputed for every column).
        analyzer = self._analyzer if self._analyzer is not None else vader
        scores = analyzer.polarity_scores(clean_review)

        # Mirror the training frame: the pipeline in this notebook is fit with
        # TF-IDF over the *raw* 'Review' text while the VADER scores come from
        # the cleaned text, so the text column carries the raw review here too.
        review_df = pd.DataFrame({
            self.text_column: [raw_review],
            'Negative_vader': [scores['neg']],
            'Neutral_vader': [scores['neu']],
            'Positive_vader': [scores['pos']],
            'Compound_vader': [scores['compound']],
        })

        # Make predictions using the model
        result = self.model.predict(review_df)
        return result[0]

    def prediction_needs(self, verbosity=True):
        """Describe the required input.

        ``verbosity`` is accepted for interface compatibility and does not
        change the returned message.
        """
        return "You only need to provide the raw review text to get a prediction."

# Create the RegressorWrapper around the fitted pipeline.
# 'Review' is the text column the pipeline's TF-IDF step was configured with in the training cell.
regressor_wrapper = RegressorWrapper(model=pipeline, text_column='Review')

Test

In [3]:
# Example input: one raw review string to be rated
raw_review_text = "This product is amazing! I love it."

# Ask the wrapper for a rating, then round to two decimals for display only
predicted_rating = regressor_wrapper.predict(raw_review_text)
rating_for_display = round(predicted_rating, 2)

print(f'Predicted Rating for the Review: {rating_for_display}')


Predicted Rating for the Review: 3.99
