### Import Required Packagaes

In [10]:
import spacy
import pandas as pd
import re
from transformers import pipeline

### Clean Review Texts

In [11]:
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\d+', '', text)  # Remove digits
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text

### Keywords

In [12]:
ad_keywords = [
    "promo", "discount", "offer", "buy now", "free", "click here", "visit",
    "limited time", "sale", "deal", "coupon", "special offer", "subscribe",
    "register now", "sign up", "exclusive", "order now", "save big",
    "hot deal", "shop now", "get it now", "today only", "claim your", "bonus"
]

rant_keywords = [
    "worst", "terrible", "horrible", "disappointed", "never again", "awful", 
    "hate", "angry", "scam", "frustrated", "rage", "don't bother", "unacceptable"
]

spam_keywords = [
    "check out", "visit us", "see more", "learn more", "follow us", "join now", 
    "click here", "buy now", "limited time offer", "hurry up", "call us", 
    "guaranteed", "satisfaction", "money back", "don't miss", "one-time offer"
]

irrelevant_keywords = [
    "weather", "politics", "sports", "personal opinion", "life advice", 
    "not related", "unrelated", "not about the place", "miscellaneous"
]


emotional_keywords = [
    "love", "hate", "best", "worst", "amazing", "terrible", "perfect", 
    "disgusted", "impressed", "thrilled", "excited", "devastated", "infuriating"
]

### Check Keywords

In [13]:
def contains_keywords(text, keywords):
    return any(keyword in text for keyword in keywords)

### Flag Reviews based on Keywords

In [14]:
def flag_policy_violations(text):
    if contains_keywords(text, ad_keywords):
        return 'Ad'
    elif contains_keywords(text, rant_keywords):
        return 'Rant'
    elif contains_keywords(text, spam_keywords):
        return 'Spam'
    elif contains_keywords(text, irrelevant_keywords):
        return 'Irrelevant'
    elif contains_keywords(text, emotional_keywords):
        return 'Emotional'
    else:
        return 'Valid'

### Tokenization
This is for training the ML model.

In [15]:
nlp = spacy.load("en_core_web_sm")

def tokenize(text):
    doc = nlp(text)
    return [token.text for token in doc if not token.is_stop and not token.is_punct]

### All Caps Ratio
Some reviews may contain lots of capitalisation, indicating frustration. These could be rants or spam.

In [16]:
def all_caps_ratio(text):
    words = text.split()
    caps_words = [word for word in words if word.isupper()]
    return len(caps_words) / len(words) if len(words) > 0 else 0

### Sentiment Analysis - Relevancy Score
Analyse the emotions in each review. Very negative reviews could potentially be rants.

In [17]:
sentiment_analyzer = pipeline("sentiment-analysis")

def calculate_relevancy_score(text):
    # Can adjust this score based on sentiment, length, and keyword presence
    # For example, the higher the sentiment score and review length, the more relevant it is
    sentiment_score = sentiment_analyzer(text)[0]['score']
    review_length = len(text.split())
    caps_ratio = all_caps_ratio(text)
    
    # Simple heuristic: higher sentiment, longer review, lower caps_ratio = more trustworthy
    relevancy_score = (sentiment_score * 40) + (review_length * 0.5) - (caps_ratio * 20)
    return min(max(int(relevancy_score), 0), 100)  # Ensure the score is between 0 and 100

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use mps:0


### Feature Engineering

In [19]:
data_path = '../data/clean/cleaned_combined_reviews.csv'
df = pd.read_csv(data_path).head(1000)

df['cleaned_review_text'] = df['review_text'].apply(clean_text)
df['tokens'] = df['cleaned_review_text'].apply(tokenize)
df['review_length'] = df['review_text'].apply(len)
df['sentiment'] = df['review_text'].apply(lambda x: sentiment_analyzer(x)[0]['label'])
df['all_caps_ratio'] = df['cleaned_review_text'].apply(all_caps_ratio)
df['relevancy_score'] = df['cleaned_review_text'].apply(calculate_relevancy_score)
df['policy_label'] = df['cleaned_review_text'].apply(flag_policy_violations)

# Save the results to a CSV file
data_path = '../data/with_features/cleaned_combined_reviews_with_features.csv'
df.to_csv(data_path, index=False)

df.head(10)

Unnamed: 0,user_name,review_text,rating,cleaned_review_text,tokens,review_length,sentiment,all_caps_ratio,relevancy_score,policy_label
0,Amber Thibeault,Andrea is amazing. Our dog loves her and she a...,5,andrea is amazing our dog loves her and she al...,"[andrea, amazing, dog, loves, looks, amazing, ...",95,POSITIVE,0.0,48,Emotional
1,Esther,Andrea does a wonderful job with our wild Pr...,5,andrea does a wonderful job with our wild pr...,"[andrea, wonderful, , job, , wild, princess,...",90,POSITIVE,0.0,47,Emotional
2,Bob Barrett,Never called back,1,never called back,[called],17,POSITIVE,0.0,41,Valid
3,Luz Quiles,They don't answer the phones,3,they dont answer the phones,"[nt, answer, phones]",28,NEGATIVE,0.0,42,Valid
4,Tim Sanderson,Limited information on the website,3,limited information on the website,"[limited, information, website]",34,NEGATIVE,0.0,42,Valid
5,Ellen Nastir,Leigh-Ann is an incredibly creative facilitato...,5,leighann is an incredibly creative facilitator...,"[leighann, incredibly, creative, facilitator, ...",251,POSITIVE,0.0,61,Ad
6,Jinnie Lee Schmid,Leigh Ann Rodgers is THE undisputed expert in ...,5,leigh ann rodgers is the undisputed expert in ...,"[leigh, ann, rodgers, undisputed, expert, team...",649,POSITIVE,0.0,89,Emotional
7,Wanda Walker,Leigh Ann Rodgers is a great collaborator and ...,5,leigh ann rodgers is a great collaborator and ...,"[leigh, ann, rodgers, great, collaborator, ste...",251,POSITIVE,0.0,59,Emotional
8,Heather Clarke,I really appreciate all the wisdom and experie...,5,i really appreciate all the wisdom and experie...,"[appreciate, wisdom, experience, leigh, ann, r...",188,POSITIVE,0.0,54,Valid
9,Cheryle Maurer,Leigh Ann’s masterful facilitation and engagem...,5,leigh anns masterful facilitation and engageme...,"[leigh, anns, masterful, facilitation, engagem...",171,POSITIVE,0.0,52,Valid
