### Key functions for cleaning and exploring data

In [None]:
import re

import nltk
import numpy as np
import numpy.typing as npt
import pandas as pd

# nltk.download("wordnet")

In [None]:
## lowercase


def convert_lowercase(arr: npt.ArrayLike) -> npt.ArrayLike:
    """Return ``arr`` with every string element lower-cased.

    The work is delegated to the ``.str`` accessor, so ``arr`` must expose
    one (e.g. a pandas Series of strings).
    """
    lowered = arr.str.lower()
    return lowered


# remove html tags


def remove_html_tags(arr: npt.ArrayLike) -> npt.ArrayLike:
    """Strip every ``<...>`` span from the strings in ``arr``.

    The pattern is non-greedy, so each tag is removed on its own and the text
    between tags is kept. No regex flags are passed, so ``.`` does not match
    a newline and a tag split across lines is left untouched.
    """
    tag_pattern = r"<.*?>"
    without_tags = arr.str.replace(pat=tag_pattern, repl="", regex=True)
    return without_tags


import string

# The punctuation character set that remove_punctuation strips.
string.punctuation

# remove punctuations
def remove_punctuation(arr: npt.ArrayLike) -> npt.ArrayLike:
    """Delete every character listed in ``string.punctuation`` from ``arr``.

    Args:
        arr: Either a single ``str`` or an object exposing the pandas ``.str``
            accessor (e.g. a ``pd.Series`` of strings).

    Returns:
        A ``str`` when given a ``str``; otherwise the result of
        ``arr.str.translate``.
    """
    # Mapping each punctuation character to None makes translate() drop it.
    punctuations = str.maketrans(dict.fromkeys(string.punctuation))
    if isinstance(arr, str):
        # A plain string has no ``.str`` accessor; translate it directly.
        return arr.translate(punctuations)
    return arr.str.translate(punctuations)


# Sample Series mixing upper-case text, tag-like spans and punctuation runs.
test_arr = pd.Series(
    [
        "this SIUH siu<b><sflsn>.,,,,,,,,,,",
        "ashiuf,,,,, asSIUHF",
    ]
)

In [None]:
# Try the tag stripper on the sample Series defined earlier.
remove_html_tags(test_arr)


def strip():
    """Placeholder with no implementation yet; always returns ``None``."""

In [None]:
# Sample string containing brackets and symbols to strip.
test_text = "sdfhousd() *((#@"

# Try the punctuation remover on the sample string.
remove_punctuation(test_text)

In [None]:
# Lookup table mapping lower-case chat abbreviations to their expanded
# phrases; read by convert_abbreviations.
acronyms = {
    "asap": "as soon as possible",
    "btw": "by the way",
    "diy": "do it yourself",
    "fb": "facebook",
    "fomo": "fear of missing out",
    "fyi": "for your information",
    "g2g": "got to go",
    "idk": "i don't know",
    "imo": "in my opinion",
    "irl": "in real life",
    "lmao": "laughing my ass off",
    "lmk": "let me know",
    "lol": "laugh out loud",
    "msg": "message",
    "noyb": "none of your business",
    "omg": "oh my god",
    "rofl": "rolling on the floor laughing",
    "smh": "shaking my head",
    "tmi": "too much information",
    "ttyl": "talk to you later",
    "wth": "what the hell",
    "yolo": "you only live once",
}


# expand abbreviations using the acronyms dictionary
def convert_abbreviations(text: str) -> str:
    """Expand known abbreviations in ``text`` to their full phrases.

    The text is split on whitespace and each token is looked up verbatim
    (case-sensitively) in the module-level ``acronyms`` table; tokens without
    an entry are kept unchanged. Tokens are re-joined with single spaces, so
    runs of whitespace in the input are collapsed.

    Args:
        text: Input string.

    Returns:
        The text with every matching token replaced by its expansion.
    """
    # Using the token as its own default needs only one lookup per token.
    return " ".join(acronyms.get(word, word) for word in text.split())

In [None]:
# remove stopwords
from nltk.corpus import stopwords


from functools import lru_cache


@lru_cache(maxsize=1)
def _english_stopwords() -> frozenset:
    """Load NLTK's English stopword list once and reuse it on later calls."""
    return frozenset(stopwords.words("english"))


def remove_stopwords(text: str) -> str:
    """Drop English stopwords from ``text``.

    The text is split on whitespace and each token is compared verbatim
    (case-sensitively) against NLTK's English stopword list. Surviving tokens
    are re-joined with single spaces.

    Args:
        text: Input string.

    Returns:
        The text without the tokens found in the stopword list.
    """
    # The stopword set is cached: this function is typically applied row by
    # row, and reloading the corpus list on every call is wasted work.
    stopwords_english = _english_stopwords()
    return " ".join(word for word in text.split() if word not in stopwords_english)

In [None]:
### tokenization
from nltk.tokenize import word_tokenize


def tokenize_words(text: str) -> npt.ArrayLike:
    """Split ``text`` into tokens with NLTK's ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens

In [None]:
## stemming

from nltk.stem.porter import PorterStemmer


def porter_stemming(word_arr: npt.ArrayLike) -> npt.ArrayLike:
    """Return a list holding the Porter stem of each word in ``word_arr``.

    A fresh ``PorterStemmer`` is created on every call.
    """
    stem = PorterStemmer().stem
    return list(map(stem, word_arr))

In [None]:
## lemmatization function

from nltk.stem import WordNetLemmatizer


def lemmatize(word_arr: npt.ArrayLike) -> npt.ArrayLike:
    """Return a list holding the WordNet lemma of each word in ``word_arr``.

    A fresh ``WordNetLemmatizer`` is created on every call and each word is
    passed to it with no extra arguments.
    """
    to_lemma = WordNetLemmatizer().lemmatize
    return list(map(to_lemma, word_arr))

In [None]:
# Sample tokens for comparing the lemmatizer with the stemmer.
word_arr = [
    "snowing",
    "fiery",
    "this",
    "movie",
    "lacking",
]

In [None]:
# Show the lemmatized form of the sample tokens.
print(lemmatize(word_arr))

# Show the Porter-stemmed form of the same tokens for comparison.
print(porter_stemming(word_arr))