# Preprocessing
## Load the data

In [13]:
import os
import pandas as pd
import kagglehub

# Download latest version
path = kagglehub.dataset_download("datafiniti/consumer-reviews-of-amazon-products")

# Source column -> short name used in the rest of the notebook.
# Renaming by name (instead of assigning df.columns positionally) stays
# correct even if a file stores these columns in a different order:
# read_csv returns the selected columns in file order, not usecols order.
column_names = {
    "reviews.rating": "rating",
    "reviews.text": "text",
    "reviews.title": "title",
}

frames = []

# Loop through each CSV file. sorted() fixes the row order, which os.listdir
# does not guarantee, so the later seeded train/test split is reproducible.
for file in sorted(os.listdir(path)):
    # Skip anything in the download directory that is not a CSV file
    if not file.lower().endswith(".csv"):
        continue
    csv_file = os.path.join(path, file)
    df = pd.read_csv(csv_file, usecols=list(column_names))
    # Rename by name and enforce one consistent column order across files
    df = df.rename(columns=column_names)[list(column_names.values())]
    frames.append(df)

# Fail with a clear message instead of pd.concat's generic error
if not frames:
    raise FileNotFoundError(f"No CSV files found in {path}")

# Concatenate the data of all CSV files into one dataframe; ignore_index gives
# every row a unique label instead of repeating 0..n-1 once per file
csv_data = pd.concat(frames, ignore_index=True)

csv_data.head()

Unnamed: 0,rating,text,title
0,3.0,I order 3 of them and one of the item is bad q...,... 3 of them and one of the item is bad quali...
1,4.0,Bulk is always the less expensive way to go fo...,... always the less expensive way to go for pr...
2,5.0,Well they are not Duracell but for the price i...,... are not Duracell but for the price i am ha...
3,5.0,Seem to work as well as name brand batteries a...,... as well as name brand batteries at a much ...
4,5.0,These batteries are very long lasting the pric...,... batteries are very long lasting the price ...


In [17]:
# Remove duplicates: rows sharing the same review text are collapsed to the
# first occurrence. A new frame is produced rather than mutating in place, so
# the frame shown by the loading cell is not changed behind the reader's back.
reviews_deduplicated = csv_data.drop_duplicates(subset=["text"])

# Check for empty rows (missing values per column, counted after de-duplication)
print(reviews_deduplicated.isnull().sum())

# Remove rows with any missing value, then combine the title and text columns.
# astype(str) guards against non-string cells (e.g. a purely numeric title),
# which would otherwise make the string concatenation raise.
csv_data = (
    reviews_deduplicated
    .dropna()
    .assign(
        title_text=lambda frame: frame["title"].astype(str) + " " + frame["text"].astype(str)
    )
)

csv_data.head()

rating    33
text       1
title     17
dtype: int64


Unnamed: 0,rating,text,title,title_text
0,3.0,I order 3 of them and one of the item is bad q...,... 3 of them and one of the item is bad quali...,... 3 of them and one of the item is bad quali...
1,4.0,Bulk is always the less expensive way to go fo...,... always the less expensive way to go for pr...,... always the less expensive way to go for pr...
2,5.0,Well they are not Duracell but for the price i...,... are not Duracell but for the price i am ha...,... are not Duracell but for the price i am ha...
3,5.0,Seem to work as well as name brand batteries a...,... as well as name brand batteries at a much ...,... as well as name brand batteries at a much ...
4,5.0,These batteries are very long lasting the pric...,... batteries are very long lasting the price ...,... batteries are very long lasting the price ...


## Text preprocessing

In [26]:
import string
import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

lemmatizer = WordNetLemmatizer()

# Built once here rather than on every call: the pipeline is applied to each
# row of the dataframe, and neither set depends on the input text.
ENGLISH_STOP_WORDS = frozenset(stopwords.words("english"))
PUNCTUATION_CHARS = frozenset(string.punctuation)


def _is_punctuation(token):
    """Return True if the token consists solely of ASCII punctuation characters."""
    return all(char in PUNCTUATION_CHARS for char in token)


def text_preprocessing_pipeline(text):
    """
    Preprocesses a text string by applying standard NLP cleaning steps:
    HTML tag removal, lower-casing, tokenization, stop word removal,
    punctuation removal, and lemmatization.

    Parameters:
        text (str): The input text string to preprocess. None or NaN is
            treated as empty text; any other non-string value is converted
            with str() first.

    Returns:
        str: A cleaned and lemmatized string with tokens joined by spaces
            (an empty string if nothing remains).
    """

    # Treat missing values as empty text instead of failing inside re.sub
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text)

    # Remove HTML
    text_without_html = re.sub(r'<[^<>]*>', '', text)

    # Tokenize the text (lower-cased so the stop word lookup below matches)
    tokenized_text = word_tokenize(text_without_html.lower())

    # Remove stop words and punctuation in a single pass. Punctuation is
    # checked per character: a plain `token in string.punctuation` is a
    # substring test and would let multi-character tokens such as "...",
    # "``" or "''" through.
    filtered_tokens = [
        token
        for token in tokenized_text
        if token not in ENGLISH_STOP_WORDS and not _is_punctuation(token)
    ]

    # Apply lemmatization
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]

    return " ".join(lemmatized_tokens)

# Run the cleaning pipeline on every combined title/text string, replacing the
# raw strings in the same column.
csv_data["title_text"] = csv_data["title_text"].map(text_preprocessing_pipeline)

## Train / Test Split

In [27]:
from sklearn.model_selection import train_test_split

# Features are the preprocessed title/text strings; the target is the rating.
X, y = csv_data["title_text"], csv_data["rating"]

# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)