In [None]:
import pandas as pd

# Load the cleaned, combined review dataset and print a summary of its
# columns, dtypes and non-null counts.
cleaned_reviews_path = "../data/clean/cleaned_combined_reviews.csv"
df = pd.read_csv(cleaned_reviews_path)
df.info()

In [None]:
# Convert 'user_name' and 'review_text' to string type.
# Missing values are replaced with "" first: astype(str) on its own can turn
# a missing value into the literal text "nan", which the later cells would
# then measure, tokenise and score as if it were a real review.
for text_column in ("user_name", "review_text"):
    df[text_column] = df[text_column].fillna("").astype(str)

### Review Length
Review length lets us flag very short reviews as potentially irrelevant, while very long reviews could potentially be rants.

In [None]:
# Character count of every review, stored as a new feature column.
review_lengths = df["review_text"].map(len)
df["review_length"] = review_lengths
df.head(10)

### Tokenize Review Text
Split each review into tokens with spaCy. The tokens are intended as input for training the ML model.

In [None]:
import spacy

# Small English spaCy pipeline; this cell only reads the token texts it produces.
nlp = spacy.load("en_core_web_sm")

# Using a smaller subset for demonstration.
# Slice first, then copy: only the sampled rows are duplicated (the original
# copied the whole frame before discarding most of it), and df2 is still an
# independent frame that can take new columns without touching df.
SAMPLE_SIZE = 100
df2 = df.head(SAMPLE_SIZE).copy()

# nlp.pipe processes the texts as a batched stream and yields one Doc per
# input, in order, instead of invoking the pipeline once per review.
tokens_list = [
    [token.text for token in doc]
    for doc in nlp.pipe(df2["review_text"].tolist())
]

df2["tokens"] = tokens_list
df2.head(10)

### Sentiment Analysis
Classify the sentiment of each review and record the model's confidence in that label. Very negative reviews could potentially be rants.

In [None]:
from transformers import pipeline


# Load the pre-trained sentiment analysis pipeline.
# NOTE(review): no model is named, so transformers falls back to its default
# checkpoint for this task; consider passing model=... so the labels stay
# reproducible across library versions.
sentiment_analyzer = pipeline("sentiment-analysis")

# Score every sampled review in a single call; the pipeline returns one
# {"label": ..., "score": ...} dict per input text, in order.
# truncation=True clips reviews that exceed the model's maximum input length;
# without it an over-long review makes the call fail.
results = sentiment_analyzer(df2["review_text"].tolist(), truncation=True)

labels = [result["label"] for result in results]
scores = [result["score"] for result in results]

df2["sentiment"] = labels
df2["confidence score"] = scores
df2.head(10)

### All Caps Ratio
Some reviews contain a lot of capitalisation, which may indicate frustration. These could be rants.

In [None]:
def all_caps_ratio(text):
    """Return the share of characters in ``text`` that are upper-case letters.

    The denominator is the total character count (spaces, digits and
    punctuation included), not just the letters. An empty ``text`` yields 0.
    """
    total_chars = len(text)
    if total_chars == 0:
        return 0
    upper_letters = sum(1 for ch in text if ch.isalpha() and ch.isupper())
    return upper_letters / total_chars

# Upper-case ratio of every sampled review, stored as a feature column.
ratios = [all_caps_ratio(review) for review in df2["review_text"]]
df2["all caps ratio"] = ratios

df2.head(10)

### Extract keywords
Flag reviews that contain keywords commonly found in advertisements, so they can be marked as likely advertisements.

In [None]:
import re

# Words and phrases used to flag a review as a likely advertisement.
# NOTE(review): several entries ("free", "visit", "offer", "deal", "sale") are
# also ordinary review vocabulary, so some false positives should be expected.
ad_keywords = [
    "promo", "discount", "offer", "buy now", "free", "click here", "visit",
    "limited time", "sale", "deal", "coupon", "special offer", "subscribe",
    "register now", "sign up", "exclusive", "order now", "save big",
    "hot deal", "shop now", "get it now", "today only", "claim your", "bonus"
]

# One alternation compiled once, instead of building and searching a separate
# pattern for every keyword on every call. re.escape keeps any keyword that
# contains regex metacharacters literal; the \b anchors restrict hits to whole
# words/phrases (so "free" does not match inside "freedom").
# The pattern is a snapshot: rebuild it if ad_keywords is edited afterwards.
_AD_KEYWORD_PATTERN = re.compile(
    r"\b(?:" + "|".join(re.escape(kw) for kw in ad_keywords) + r")\b"
)


def contains_ad_keywords(text):
    """Return 1 if ``text`` contains any advertisement keyword, otherwise 0.

    Matching is case-insensitive (the text is lower-cased before searching)
    and only whole words/phrases count.

    Args:
        text: The review text to inspect (a string).

    Returns:
        int: 1 when at least one keyword is present, 0 when none is.
    """
    # ads will be marked with 1, the rest as 0
    return int(_AD_KEYWORD_PATTERN.search(text.lower()) is not None)


# Flag each sampled review: 1 when it contains an ad keyword, otherwise 0.
ads = [contains_ad_keywords(review) for review in df2["review_text"]]
df2["ad"] = ads

# Keep only the reviews that were flagged as advertisements.
is_ad = df2["ad"] == 1
ad_rows = df2[is_ad]


In [None]:
# Display the rows of df2 whose "ad" flag is 1.
ad_rows

In [None]:
import os

with_features_path = "../data/with_features/with_features.csv"

# to_csv does not create missing folders, so make sure the target directory
# exists before writing.
os.makedirs(os.path.dirname(with_features_path), exist_ok=True)

# NOTE(review): this writes df, which only gained "review_length" above. The
# tokens / sentiment / all-caps-ratio / ad columns were added to the df2 sample
# and are not saved here; confirm this is intended.
df.to_csv(with_features_path, index=False)