In [7]:
import pandas as pd
import re
import emoji
import string
import nltk
from bs4 import BeautifulSoup
from autocorrect import Speller
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk import pos_tag

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('punkt_tab')

spell = Speller(lang='en')
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

slang_dict = {
    "w": "good", "tbh": "to be honest", "imo": "in my opinion",
    "yg": "yang", "tpi": "tapi", "tak": "tidak"}

def preprocess_uniten(text):
    if not isinstance(text, str) or text == "#NAME?": 
     return "" # Fix for #NAME?
    
    text = text.encode('ascii', 'ignore').decode('utf-8', 'ignore') 
    text = text.lower() 

    text = re.sub(r'http\S+|www\S+', '', text) 

    text = BeautifulSoup(text, "html.parser").get_text() 

    text = emoji.replace_emoji(text, replace='') 

    words = text.split()
    text = " ".join([slang_dict.get(w, w) for w in words])
    
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    text = re.sub(r'\d+', '', text)
    
    words = text.split()
    corrected_words = [spell(w) if len(w) > 3 else w for w in words]
    text = " ".join(corrected_words)

    words = text.split()
    text = " ".join([w for w in words if w not in stop_words])

    tokens = word_tokenize(text)
    pos_tags = pos_tag(tokens)
    def get_wordnet_pos(tag):
        if tag.startswith('J'): return wordnet.ADJ
        elif tag.startswith('V'): return wordnet.VERB
        elif tag.startswith('N'): return wordnet.NOUN
        elif tag.startswith('R'): return wordnet.ADV
        return wordnet.NOUN
    
    lemmatized = [lemmatizer.lemmatize(w, get_wordnet_pos(t)) for w, t in pos_tags]
    return lemmatized


df = pd.read_csv("UNITENReview.csv")
df["processed"] = df["Review"].apply(preprocess_uniten)
df.to_csv("Processed_UNITENReviews.csv", index=False)
print(df[["Review", "processed"]].head())

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/0ef5607c-92a3-4202-a7d3-
[nltk_data]     326ceacd22cf/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/0ef5607c-92a3-4202-a7d3-
[nltk_data]     326ceacd22cf/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/0ef5607c-92a3-4202-a7d3-
[nltk_data]     326ceacd22cf/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/0ef5607c-92a3-4202-a7d3-
[nltk_data]     326ceacd22cf/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


                                              Review  \
0  Im happy with uniten actually, even the people...   
1  Iâ€™m having a pretty good time here, happy to m...   
2        a very neutral place in terms of everything   
3  I would say Uniten it's  a good university  bu...   
4   UNITEN is well-regarded, particularly for its...   

                                           processed  
0   [im, happy, unite, actually, even, people, good]  
1  [im, pretty, good, time, happy, meet, good, pe...  
2                 [neutral, place, term, everything]  
3  [would, say, united, good, university, issue, ...  
4  [united, wellregarded, particularly, strong, e...  
