In [1]:
import pandas as pd
import re
import emoji
import string
import nltk
from bs4 import BeautifulSoup
from autocorrect import Speller
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk import pos_tag

In [2]:
# Read the raw UNITEN review export into a DataFrame.
df = pd.read_csv(filepath_or_buffer="UNITENReview.csv")

In [3]:
#Setup Tools & Dictionaries
spell = Speller(lang='en')

# The stopword corpus is a separate download from the NLTK package itself.
# Fetch it once when it is missing so the notebook still runs top-to-bottom
# on a fresh environment instead of stopping with a LookupError.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords', quiet=True)
    stop_words = set(stopwords.words('english'))

lemmatizer = WordNetLemmatizer()

# Custom Slang Dictionary (Adding 'w' for the UNITEN dataset)
# Keys are matched as whole lowercase words and replaced by their values.
slang_dict = {
    "tbh": "to be honest", "imo": "in my opinion", "w": "good",
    "idk": "i do not know", "src": "student representative council"
}

In [4]:
#Define the Cleaning Functions
def clean_text(text):
    """Normalise one raw review into a space-separated string of content words.

    Steps, in order: lowercase, strip URLs and HTML markup, drop emojis,
    expand the slang listed in ``slang_dict``, remove punctuation and digits,
    spell-correct with ``spell``, and filter out ``stop_words``.

    Relies on the notebook-level ``slang_dict``, ``spell`` and ``stop_words``
    objects being defined before it is called.

    Args:
        text: The raw review. Anything that is not a ``str`` (for example the
            NaN pandas uses for an empty cell) is treated as an empty review.

    Returns:
        The cleaned review with tokens joined by single spaces, or ``""`` when
        the input is not a string or nothing survives cleaning.
    """
    if not isinstance(text, str):
        return ""

    #Lowercasing
    text = text.lower()

    #Remove HTML/URLs
    # URLs, tags, emojis and punctuation are replaced by a space rather than
    # deleted so that words separated only by them are not fused into one
    # token (e.g. "good,bad" or "nice<br>place"); the final split() collapses
    # the extra whitespace. \bwww\. keeps the pattern from biting into
    # ordinary words that merely contain "www" (e.g. "awwww").
    text = re.sub(r'http\S+|\bwww\.\S+', ' ', text)
    text = BeautifulSoup(text, "html.parser").get_text(separator=" ")

    #Remove Emojis
    text = emoji.replace_emoji(text, replace=' ')

    #Replace Slang
    # re.escape keeps a key containing regex metacharacters from being read
    # as a pattern; \b restricts the match to whole words.
    for slang, formal in slang_dict.items():
        text = re.sub(r'\b' + re.escape(slang) + r'\b', formal, text)

    #Remove Punctuation and Numbers
    text = text.translate(str.maketrans(dict.fromkeys(string.punctuation, ' ')))
    text = re.sub(r'\d+', '', text)

    #Spell Check and Stopwords
    text = spell(text)
    words = [w for w in text.split() if w not in stop_words]

    return " ".join(words)

# Clean every review, store the result in a new column, and write the frame to disk.
df["Cleaned_Review"] = [clean_text(raw_review) for raw_review in df["Review"]]
df.to_csv(path_or_buf="Processed_UNITEN_Reviews.csv", index=False)

print("Pre-processing complete. File saved!")

Pre-processing complete. File saved!
