In [None]:
import pandas as pd
import re
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfVectorizer


In [None]:
# Location of the training reviews, relative to the notebook's working directory
TRAIN_CSV_PATH = './train_new.csv'

train_data = pd.read_csv(TRAIN_CSV_PATH)
train_data.head()

Unnamed: 0,category,brand,sub_category,product_description,market,review_title,review_text,review_rating
0,Fabric Care,Tide,Laundry,"Tide Pods He Turbo Laundry Detergent Pacs Tub,...",US,The best in a very crowded market,The best general wash detergent. Convenient co...,5.0
1,Fabric Care,Tide,Laundry,"Tide Washing Machine Cleaner, 5 Count",US,First time,This helped to clean our washing machine after...,5.0
2,Home Care,Cascade,Auto Dishwashing,Cascade Platinum ActionPacs Dishwasher Deterge...,US,I've been using another well known brand and d...,I've been using another well known brand and d...,5.0
3,Fabric Care,Tide,Laundry,"Tide Purclean Liquid Laundry Detergent, Honey ...",US,Great laundry detergent,Smell is great and clothes are always clean. G...,5.0
4,Home Care,Mr Clean,Surface Care,"Mr. Clean Magic Eraser Cleaning Pads, 8-Count Box",US,Five Stars,"Good product, works well.",5.0


In [None]:
# pd.read_csv already yields a DataFrame, so no conversion is needed here;
# the assertion documents (and checks) that expectation before previewing.
assert isinstance(train_data, pd.DataFrame), "train_data should be a DataFrame"
train_data.head()

Unnamed: 0,category,brand,sub_category,product_description,market,review_title,review_text,review_rating
0,Fabric Care,Tide,Laundry,"Tide Pods He Turbo Laundry Detergent Pacs Tub,...",US,The best in a very crowded market,The best general wash detergent. Convenient co...,5.0
1,Fabric Care,Tide,Laundry,"Tide Washing Machine Cleaner, 5 Count",US,First time,This helped to clean our washing machine after...,5.0
2,Home Care,Cascade,Auto Dishwashing,Cascade Platinum ActionPacs Dishwasher Deterge...,US,I've been using another well known brand and d...,I've been using another well known brand and d...,5.0
3,Fabric Care,Tide,Laundry,"Tide Purclean Liquid Laundry Detergent, Honey ...",US,Great laundry detergent,Smell is great and clothes are always clean. G...,5.0
4,Home Care,Mr Clean,Surface Care,"Mr. Clean Magic Eraser Cleaning Pads, 8-Count Box",US,Five Stars,"Good product, works well.",5.0


In [None]:
# Stop-word list: project-specific words first, followed by scikit-learn's
# built-in English stop words. It stays a plain list because the same object
# is also handed to TfidfVectorizer(stop_words=...) in this notebook.
stop_words = ["product", *feature_extraction.text.ENGLISH_STOP_WORDS]

In [None]:
# Preprocess function as defined by the user
def preprocess(text):
    """Normalize one review into a space-separated string of content words.

    Steps: lowercase, drop punctuation and digits, split on whitespace, then
    remove every token found in the notebook-level ``stop_words`` list plus
    the task-specific words "dr" and "doctor".

    Parameters
    ----------
    text : object
        Raw review. Non-string values are converted with ``str``; missing
        values (``None`` or a float NaN) are treated as an empty review.

    Returns
    -------
    str
        Cleaned tokens joined by single spaces; ``""`` when nothing is left.
    """
    # Missing values show up as float NaN (presumably the "floats" that made
    # str() necessary); str(NaN) would inject the literal token "nan" into the
    # corpus. NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text).lower()
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation (keeps word chars and whitespace)
    text = re.sub(r'\d+', '', text)  # Remove numbers
    task_stop_words = ("dr", "doctor")  # Task-specific stopwords
    tokens = [
        tok for tok in text.split()  # split() with no args also collapses whitespace runs
        if tok not in stop_words and tok not in task_stop_words
    ]
    return " ".join(tokens)

# Clean every review and store the result in a new 'text' column
train_data['text'] = train_data['review_text'].apply(preprocess)
train_data

Unnamed: 0,category,brand,sub_category,product_description,market,review_title,review_text,review_rating,text
0,Fabric Care,Tide,Laundry,"Tide Pods He Turbo Laundry Detergent Pacs Tub,...",US,The best in a very crowded market,The best general wash detergent. Convenient co...,5.0,best general wash detergent convenient contain...
1,Fabric Care,Tide,Laundry,"Tide Washing Machine Cleaner, 5 Count",US,First time,This helped to clean our washing machine after...,5.0,helped clean washing machine getting review co...
2,Home Care,Cascade,Auto Dishwashing,Cascade Platinum ActionPacs Dishwasher Deterge...,US,I've been using another well known brand and d...,I've been using another well known brand and d...,5.0,ive using known brand didnt expect difference ...
3,Fabric Care,Tide,Laundry,"Tide Purclean Liquid Laundry Detergent, Honey ...",US,Great laundry detergent,Smell is great and clothes are always clean. G...,5.0,smell great clothes clean great im happy
4,Home Care,Mr Clean,Surface Care,"Mr. Clean Magic Eraser Cleaning Pads, 8-Count Box",US,Five Stars,"Good product, works well.",5.0,good works
...,...,...,...,...,...,...,...,...,...
42196,Home Care,Cascade,Auto Dishwashing,"Cascade ActionPacs Dishwasher Detergent, Origi...",CA,A bit pricey - but I prefer them,I''''m a coupon shopper and I watch prices ver...,5.0,im coupon shopper watch prices closely purchas...
42197,Fabric Care,Downy,Fabric Enhancer,DOWNY 40 L AF,CA,Great product,It has been a while since I used a liquid fabr...,5.0,used liquid fabric softener forgotten great do...
42198,Fabric Care,Gain,Laundry,"Gain Botanicals Plant Based Laundry Detergent,...",CA,Gain mostly unscented - hooray!,I've always found Gain's regular scent a littl...,3.0,ive gains regular scent little overly strong d...
42199,Home Care,Cascade,Auto Dishwashing,"Cascade ActionPacs Dishwasher Detergent, Origi...",CA,Amazing!!,I used to use traditional powder soap in my di...,5.0,used use traditional powder soap dishwasher tr...


In [None]:
# Function to calculate TF-IDF
def calTFIDF(texts, max_features=None):
    """Fit a TF-IDF vectorizer on ``texts`` and return the scores as a DataFrame.

    Parameters
    ----------
    texts : iterable of str
        Documents to vectorize.
    max_features : int or None, default None
        Forwarded to ``TfidfVectorizer``; ``None`` keeps the full vocabulary.

    Returns
    -------
    pandas.DataFrame
        One row per document and one column per vocabulary term. When
        ``texts`` is a pandas Series its index is reused as the row index, so
        the result lines up with the source frame in an index-aligned
        ``pd.concat(..., axis=1)``; otherwise a default RangeIndex is used.

    Notes
    -----
    The sparse result is converted to a dense array, so memory grows as
    ``n_documents * n_terms``; pass ``max_features`` for large corpora.
    """
    vectorizer = TfidfVectorizer(max_features=max_features, lowercase=True, stop_words=stop_words)
    scores = vectorizer.fit_transform(texts)
    # Carry the caller's row labels through: a fresh 0..n-1 index would
    # silently misalign with a filtered or re-indexed source frame.
    row_index = texts.index if isinstance(texts, pd.Series) else None
    return pd.DataFrame(
        scores.toarray(),
        index=row_index,
        columns=vectorizer.get_feature_names_out(),
    )

In [None]:
# Calculate the TF-IDF, keeping only the 25 most frequent terms as features
# (max_features ranks vocabulary terms by their frequency across the corpus)
MAX_TFIDF_FEATURES = 25
TFIDF = calTFIDF(train_data['text'], max_features=MAX_TFIDF_FEATURES)

In [None]:
# Preview the TF-IDF matrix: one row per review, one column per kept term
TFIDF

Unnamed: 0,cascade,clean,clothes,collected,detergent,dishes,dishwasher,fresh,good,great,...,really,review,scent,smell,smells,time,use,used,using,works
0,0.000000,0.000000,0.000000,0.000000,1.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.00000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000
1,0.000000,0.482674,0.000000,0.508371,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.00000,0.500116,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000
2,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.00000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,1.000000,0.000000
3,0.000000,0.383292,0.431624,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.713271,...,0.00000,0.000000,0.000000,0.397532,0.0,0.0,0.000000,0.000000,0.000000,0.000000
4,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.658622,0.000000,...,0.00000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.752474
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42196,0.828433,0.000000,0.000000,0.325112,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.00000,0.319832,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000
42197,0.000000,0.000000,0.000000,0.215245,0.0,0.000000,0.000000,0.266633,0.000000,0.380304,...,0.53196,0.211750,0.000000,0.423915,0.0,0.0,0.207013,0.255904,0.000000,0.000000
42198,0.000000,0.195745,0.220427,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.00000,0.000000,0.918480,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000
42199,0.000000,0.000000,0.000000,0.309633,0.0,0.342095,0.391026,0.000000,0.000000,0.000000,...,0.00000,0.304605,0.000000,0.304904,0.0,0.0,0.297791,0.368122,0.000000,0.000000


In [None]:
# Append the TF-IDF features to the review table column-wise (rows are matched
# on the index). Columns left over from a previous run of this cell are dropped
# first, so re-running replaces the features instead of appending duplicates.
# NOTE: a TF-IDF term that shares its name with an original column would
# replace that column - check TFIDF.columns if the vocabulary changes.
train_data = pd.concat(
    [train_data.drop(columns=TFIDF.columns, errors="ignore"), TFIDF],
    axis=1,
)
train_data.head()

Unnamed: 0,category,brand,sub_category,product_description,market,review_title,review_text,review_rating,text,cascade,...,really,review,scent,smell,smells,time,use,used,using,works
0,Fabric Care,Tide,Laundry,"Tide Pods He Turbo Laundry Detergent Pacs Tub,...",US,The best in a very crowded market,The best general wash detergent. Convenient co...,5.0,best general wash detergent convenient contain...,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Fabric Care,Tide,Laundry,"Tide Washing Machine Cleaner, 5 Count",US,First time,This helped to clean our washing machine after...,5.0,helped clean washing machine getting review co...,0.0,...,0.0,0.500116,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Home Care,Cascade,Auto Dishwashing,Cascade Platinum ActionPacs Dishwasher Deterge...,US,I've been using another well known brand and d...,I've been using another well known brand and d...,5.0,ive using known brand didnt expect difference ...,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,Fabric Care,Tide,Laundry,"Tide Purclean Liquid Laundry Detergent, Honey ...",US,Great laundry detergent,Smell is great and clothes are always clean. G...,5.0,smell great clothes clean great im happy,0.0,...,0.0,0.0,0.0,0.397532,0.0,0.0,0.0,0.0,0.0,0.0
4,Home Care,Mr Clean,Surface Care,"Mr. Clean Magic Eraser Cleaning Pads, 8-Count Box",US,Five Stars,"Good product, works well.",5.0,good works,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.752474


In [None]:
# One-time setup: uncomment and run the line below, then restart the kernel.
# %pip install vaderSentiment

Collecting vaderSentiment
  Obtaining dependency information for vaderSentiment from https://files.pythonhosted.org/packages/76/fc/310e16254683c1ed35eeb97386986d6c00bc29df17ce280aed64d55537e9/vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
   ---------------------------------------- 0.0/126.0 kB ? eta -:--:--
   ------ -------------------------------- 20.5/126.0 kB 320.0 kB/s eta 0:00:01
   ------------------- ------------------- 61.4/126.0 kB 648.1 kB/s eta 0:00:01
   -------------------------------------- 126.0/126.0 kB 925.7 kB/s eta 0:00:00
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2
Note: you may need to restart the kernel to use updated packages.


In [None]:

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Create a single VADER analyzer instance; get_vader_sentiment reuses it for every review
analyzer = SentimentIntensityAnalyzer()

# Example function using VADER for sentiment analysis
def get_vader_sentiment(review):
    """Return VADER's compound sentiment score for one review.

    The review is passed through ``str`` first, so non-string values are
    scored as their text form. Only the ``'compound'`` entry of the analyzer's
    score dictionary is returned.
    """
    return analyzer.polarity_scores(str(review))['compound']

# Score the raw review text (not the cleaned 'text' column) with VADER and
# keep the compound score as a new feature column
review_sentiment = train_data['review_text'].apply(get_vader_sentiment)
train_data['vader_sentiment'] = review_sentiment

train_data.head()

Unnamed: 0,category,brand,sub_category,product_description,market,review_title,review_text,review_rating,text,cascade,...,review,scent,smell,smells,time,use,used,using,works,vader_sentiment
0,Fabric Care,Tide,Laundry,"Tide Pods He Turbo Laundry Detergent Pacs Tub,...",US,The best in a very crowded market,The best general wash detergent. Convenient co...,5.0,best general wash detergent convenient contain...,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6369
1,Fabric Care,Tide,Laundry,"Tide Washing Machine Cleaner, 5 Count",US,First time,This helped to clean our washing machine after...,5.0,helped clean washing machine getting review co...,0.0,...,0.500116,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.4019
2,Home Care,Cascade,Auto Dishwashing,Cascade Platinum ActionPacs Dishwasher Deterge...,US,I've been using another well known brand and d...,I've been using another well known brand and d...,5.0,ive using known brand didnt expect difference ...,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.5267
3,Fabric Care,Tide,Laundry,"Tide Purclean Liquid Laundry Detergent, Honey ...",US,Great laundry detergent,Smell is great and clothes are always clean. G...,5.0,smell great clothes clean great im happy,0.0,...,0.0,0.0,0.397532,0.0,0.0,0.0,0.0,0.0,0.0,0.9594
4,Home Care,Mr Clean,Surface Care,"Mr. Clean Magic Eraser Cleaning Pads, 8-Count Box",US,Five Stars,"Good product, works well.",5.0,good works,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.752474,0.6124
