In [17]:
import pandas as pd

df = pd.read_csv("../data/clean/cleaned_combined_reviews.csv")

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98165 entries, 0 to 98164
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   user_name    98165 non-null  object
 1   review_text  98165 non-null  object
 2   rating       98165 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 2.2+ MB


In [18]:
df["user_name"] = df["user_name"].astype(str)
df["review_text"] = df["review_text"].astype(str)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98165 entries, 0 to 98164
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   user_name    98165 non-null  object
 1   review_text  98165 non-null  object
 2   rating       98165 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 2.2+ MB


### Review length of text

The review length would let us categorise very short reviews is irrelevant. Long reviews could potentially be rants.

In [19]:
# extract lengths of each review
df["review_length"] = df["review_text"].apply(len)
df.head(10)

Unnamed: 0,user_name,review_text,rating,review_length
0,Amber Thibeault,Andrea is amazing. Our dog loves her and she a...,5,95
1,Esther,Andrea does a wonderful job with our wild Pr...,5,90
2,Bob Barrett,Never called back,1,17
3,Luz Quiles,They don't answer the phones,3,28
4,Tim Sanderson,Limited information on the website,3,34
5,Ellen Nastir,Leigh-Ann is an incredibly creative facilitato...,5,251
6,Jinnie Lee Schmid,Leigh Ann Rodgers is THE undisputed expert in ...,5,649
7,Wanda Walker,Leigh Ann Rodgers is a great collaborator and ...,5,251
8,Heather Clarke,I really appreciate all the wisdom and experie...,5,188
9,Cheryle Maurer,Leigh Ann’s masterful facilitation and engagem...,5,171


### Tokenize review text (with smaller data)
This is for training the ML model.

In [None]:
import spacy

nlp = spacy.load("en_core_web_sm")

# made a small dataset first
df2 = df.copy().head(50)

tokens_list = []
for text in df2["review_text"]:
    doc = nlp(text)
    tokens_list.append([token.text for token in doc])

df2["tokens"] = tokens_list
df2.head(10)

Unnamed: 0,user_name,review_text,rating,review_length,tokens
0,Amber Thibeault,Andrea is amazing. Our dog loves her and she a...,5,95,"[Andrea, is, amazing, ., Our, dog, loves, her,..."
1,Esther,Andrea does a wonderful job with our wild Pr...,5,90,"[Andrea, does, a, wonderful, , job, , with, ..."
2,Bob Barrett,Never called back,1,17,"[Never, called, back]"
3,Luz Quiles,They don't answer the phones,3,28,"[They, do, n't, answer, the, phones]"
4,Tim Sanderson,Limited information on the website,3,34,"[Limited, information, on, the, website]"
5,Ellen Nastir,Leigh-Ann is an incredibly creative facilitato...,5,251,"[Leigh, -, Ann, is, an, incredibly, creative, ..."
6,Jinnie Lee Schmid,Leigh Ann Rodgers is THE undisputed expert in ...,5,649,"[Leigh, Ann, Rodgers, is, THE, undisputed, exp..."
7,Wanda Walker,Leigh Ann Rodgers is a great collaborator and ...,5,251,"[Leigh, Ann, Rodgers, is, a, great, collaborat..."
8,Heather Clarke,I really appreciate all the wisdom and experie...,5,188,"[I, really, appreciate, all, the, wisdom, and,..."
9,Cheryle Maurer,Leigh Ann’s masterful facilitation and engagem...,5,171,"[Leigh, Ann, ’s, masterful, facilitation, and,..."


### Sentiment Analysis (with smaller data)
Analyse the emotions in each review. Very negative reviews could potentially be rants.

In [None]:
from transformers import pipeline


#Load the pre-trained sentiment analysis pipeline
sentiment_analyzer = pipeline("sentiment-analysis")
labels = []
scores = []

for text in df2["review_text"]:
    result = sentiment_analyzer(text)[0]  # returns list of dicts
    label = result['label']
    score = result['score']
    labels.append(label)
    scores.append(score)

df2["sentiment"] = labels
df2["confidence score"] = scores
df2.head(10)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use mps:0


Unnamed: 0,user_name,review_text,rating,review_length,tokens,sentiment,confidence score
0,Amber Thibeault,Andrea is amazing. Our dog loves her and she a...,5,95,"[Andrea, is, amazing, ., Our, dog, loves, her,...",POSITIVE,0.999887
1,Esther,Andrea does a wonderful job with our wild Pr...,5,90,"[Andrea, does, a, wonderful, , job, , with, ...",POSITIVE,0.999869
2,Bob Barrett,Never called back,1,17,"[Never, called, back]",NEGATIVE,0.974816
3,Luz Quiles,They don't answer the phones,3,28,"[They, do, n't, answer, the, phones]",NEGATIVE,0.99945
4,Tim Sanderson,Limited information on the website,3,34,"[Limited, information, on, the, website]",NEGATIVE,0.909106
5,Ellen Nastir,Leigh-Ann is an incredibly creative facilitato...,5,251,"[Leigh, -, Ann, is, an, incredibly, creative, ...",POSITIVE,0.999858
6,Jinnie Lee Schmid,Leigh Ann Rodgers is THE undisputed expert in ...,5,649,"[Leigh, Ann, Rodgers, is, THE, undisputed, exp...",POSITIVE,0.999863
7,Wanda Walker,Leigh Ann Rodgers is a great collaborator and ...,5,251,"[Leigh, Ann, Rodgers, is, a, great, collaborat...",POSITIVE,0.999833
8,Heather Clarke,I really appreciate all the wisdom and experie...,5,188,"[I, really, appreciate, all, the, wisdom, and,...",POSITIVE,0.999849
9,Cheryle Maurer,Leigh Ann’s masterful facilitation and engagem...,5,171,"[Leigh, Ann, ’s, masterful, facilitation, and,...",POSITIVE,0.99985


### All caps ratio
Some may use lots of capitalisation to indicate frustration in their reviews and they may be rants.

In [20]:
def all_caps_ratio(text):
    caps = 0
    for c in text:
        if c.isalpha() and c.isupper():
            caps += 1
    if len(text) == 0:
        return 0
    return (caps / len(text))

ratios = []

for text in df["review_text"]:
    ratios.append(all_caps_ratio(text))

df["all caps ratio"] = ratios

df.head(10)

Unnamed: 0,user_name,review_text,rating,review_length,all caps ratio
0,Amber Thibeault,Andrea is amazing. Our dog loves her and she a...,5,95,0.031579
1,Esther,Andrea does a wonderful job with our wild Pr...,5,90,0.044444
2,Bob Barrett,Never called back,1,17,0.058824
3,Luz Quiles,They don't answer the phones,3,28,0.035714
4,Tim Sanderson,Limited information on the website,3,34,0.029412
5,Ellen Nastir,Leigh-Ann is an incredibly creative facilitato...,5,251,0.015936
6,Jinnie Lee Schmid,Leigh Ann Rodgers is THE undisputed expert in ...,5,649,0.023112
7,Wanda Walker,Leigh Ann Rodgers is a great collaborator and ...,5,251,0.035857
8,Heather Clarke,I really appreciate all the wisdom and experie...,5,188,0.047872
9,Cheryle Maurer,Leigh Ann’s masterful facilitation and engagem...,5,171,0.017544


### Extract keywords
Find keywords that are usually found in advertisements to mark the reviews as advertisements.

In [44]:
import re

ad_keywords = [
    "promo", "discount", "offer", "buy now", "free", "click here", "visit",
    "limited time", "sale", "deal", "coupon", "special offer", "subscribe",
    "register now", "sign up", "exclusive", "order now", "save big",
    "hot deal", "shop now", "get it now", "today only", "claim your", "bonus"
]

def contains_ad_keywords(text):
    text_lower = text.lower()
    # ads will be marked with 1, the rest as 0
    return int(any(re.search(r'\b'+kw+r'\b', text_lower) for kw in ad_keywords))


ads = []
for text in df["review_text"]:
    ad = contains_ad_keywords(text)
    ads.append(ad)

df["ad"] = ads

ad_rows = df[df["ad"] == 1]
