In [3]:
# Import the libraries used to clean the data
import spacy
import langdetect
from tqdm.notebook import tqdm

In [2]:
# Load the spaCy pipeline named "en_core_web_sm" once, at notebook level.
# `clean_data` below reads this `nlp` global to tokenize every review, so this
# cell has to run before `clean_data` is called.
nlp = spacy.load("en_core_web_sm")

In [4]:
# Helpers and entry point used to clean the review texts

# Substrings that mark a token as dataset-specific noise; a token containing
# any of them is discarded.
_NOISE_SUBSTRINGS = (
    "--",  # also matches "---"
    "/2",
    "/1",
    "20feb",
    "c17",
    "\x92",
    "&",
    "%",
    "i.e.",
    "b+",
    "w/",
    "02:33:05",
)

# One-off repairs for lemmas that keep a stray punctuation character.
_LEMMA_FIXES = {"orangy/": "orangy", ".fruity": "fruity"}


def _is_informative(token):
    """Return True when a spaCy token passes every cleaning filter."""
    text = token.text
    return (
        not token.is_stop  # remove stop-words
        and not token.is_punct  # remove punctuation
        and not token.like_num  # remove numbers
        # NOTE(review): this KEEPS only tokens flagged out-of-vocabulary, which
        # is the opposite of the previous comment ("remove words that don't
        # have a word vector"). Pipelines shipped without word vectors
        # presumably flag every token as OOV, in which case inverting this
        # check would drop everything -- confirm the intent before changing it.
        and token.is_oov
        and not token.is_space  # remove whitespaces
        and len(text) > 1  # remove single-letter words
        # Remove tokens that look weird & are not useful
        and not text.startswith("-")
        and not text.endswith(("-", "."))
        and not any(noise in text for noise in _NOISE_SUBSTRINGS)
    )


def _clean_tokens(text):
    """Tokenize `text` with the global `nlp` pipeline and return the kept lemmas, lowercased."""
    kept = []
    for token in nlp(text):
        if not _is_informative(token):
            continue

        # Get the lemma & lowercase the token
        lemma = token.lemma_.lower()
        if "(" in lemma:
            # Split on "(" and keep every non-empty fragment. The fragments are
            # added as individual strings; the split list itself must never be
            # appended, or the token list ends up containing a nested list.
            kept.extend(part for part in lemma.split("(") if part)
        else:
            kept.append(_LEMMA_FIXES.get(lemma, lemma))
    return kept


def clean_data(df):
    """Tokenize and filter the review texts of `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a "label" column and a "text" column. Any index is
        accepted; rows are visited in positional order.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh default index and two columns: "label" (the
        label of each processed row) and "text" (the list of cleaned, lemmatized,
        lowercased tokens of that row). Rows whose text could not be processed
        are left out.

    Raises
    ------
    KeyError
        If `df` lacks the "label" or "text" column.
    """
    # Create a dictionary to store the values
    cleaned = {"label": [], "text": []}

    # Pair labels with texts positionally instead of looking rows up with
    # `df.loc[position, ...]`, which only works for a default 0..n-1 index.
    rows = zip(df["label"], df["text"])
    for label, text in tqdm(rows, total=len(df)):
        try:
            tokens = _clean_tokens(text)
        except Exception:
            # Best effort: skip rows the pipeline cannot handle (e.g. a missing
            # value instead of a string). KeyboardInterrupt/SystemExit are not
            # caught, so the loop can still be interrupted.
            continue

        cleaned["label"].append(label)
        cleaned["text"].append(tokens)

    # Return the new dataframe
    return pd.DataFrame(cleaned)