### Import Required Packagaes

In [8]:
import spacy
import pandas as pd
import re
from transformers import pipeline

### Clean Review Texts

In [9]:
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\d+', '', text)  # Remove digits
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text

### Tokenization
This is for training the ML model.

In [10]:
nlp = spacy.load("en_core_web_sm")

def tokenize(text):
    doc = nlp(text)
    return [token.text for token in doc if not token.is_stop and not token.is_punct]

### All Caps Ratio
Some reviews may contain lots of capitalisation, indicating frustration. These could be rants or spam.

In [11]:
def all_caps_ratio(text):
    words = text.split()
    caps_words = [word for word in words if word.isupper()]
    return len(caps_words) / len(words) if len(words) > 0 else 0

### Sentiment Analysis - Relevancy Score
Analyse the emotions in each review. Very negative reviews could potentially be rants.

In [12]:
sentiment_analyzer = pipeline("sentiment-analysis")

def calculate_relevancy_score(text):
    # Can adjust this score based on sentiment, length, and keyword presence
    # For example, the higher the sentiment score and review length, the more relevant it is
    sentiment_score = sentiment_analyzer(text)[0]['score']
    review_length = len(text.split())
    caps_ratio = all_caps_ratio(text)
    
    # Simple heuristic: higher sentiment, longer review, lower caps_ratio = more trustworthy
    relevancy_score = (sentiment_score * 40) + (review_length * 0.5) - (caps_ratio * 20)
    return min(max(int(relevancy_score), 0), 100)  # Ensure the score is between 0 and 100

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use mps:0


### Feature Engineering

In [13]:
def engineer_features(df):
    df['cleaned_review_text'] = df['review_text'].apply(clean_text)
    df['tokenized_review'] = df['cleaned_review_text'].apply(tokenize)
    df['review_length'] = df['tokenized_review'].apply(len)
    df['all_caps_ratio'] = df['review_text'].apply(all_caps_ratio)
    df['sentiment'] = df['cleaned_review_text'].apply(lambda x: sentiment_analyzer(x)[0]['label'])
    df['relevancy_score'] = df['review_text'].apply(calculate_relevancy_score)
    return df

### Extract Features from Qwen and Hand-labelled Data

In [14]:
qwen_data_path = '../data/label/qwen_labelled_combined_reviews.csv'
qwen_df = pd.read_csv(qwen_data_path).copy()
qwen_feature_df = engineer_features(qwen_df)

hand_data_path = '../data/label/hand_labelled_combined_reviews.csv'
hand_df = pd.read_csv(hand_data_path).copy()
hand_feature_df = engineer_features(hand_df)


# Save the results to CSV files
qwen2_data_path = '../data/with_features/qwen_labelled_combined_reviews_with_features.csv'
qwen_feature_df.to_csv(qwen2_data_path, index=False)

hand2_data_path = '../data/with_features/hand_labelled_combined_reviews_with_features.csv'
hand_feature_df.to_csv(hand2_data_path, index=False)