In [2]:
# Import Libraries
import json
import pandas as pd
import re
import emoji
from textblob import TextBlob
from nltk.corpus import stopwords
import nltk

In [3]:
# Download the NLTK "stopwords" corpus that stopwords.words("english") reads later on
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /home/argha/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [20]:
# Paths used by this notebook: the raw customer reviews JSON (read) and the
# cleaned reviews JSON (written by the final cell)
file_path = "../data/customer_reviews.json"
cleaned_file_path = "../data/cleaned_customer_reviews.json"

In [7]:
# Read the raw reviews file as UTF-8 text and parse its JSON content
with open(file_path, mode="r", encoding="utf-8") as file:
    reviews = json.loads(file.read())

In [8]:
# Build a pandas DataFrame from the parsed reviews
# (the preview below shows the columns customer_id, comment and rating)
df = pd.DataFrame(reviews)

In [9]:
# Preview the first five reviews
df.head(n=5)

Unnamed: 0,customer_id,comment,rating
0,1484,Too expensive for what it offers.,3
1,2641,Too expensive for what it offers.,2
2,4920,Stopped working after a few days. Not durable.,1
3,2505,Perfect for my needs. Will order again soon.,5
4,4203,I received a different product than what I ord...,2


In [10]:
# Dimensions of the DataFrame as (number of rows, number of columns)
df.shape

(300, 3)

In [11]:
# Count missing values in each column
df.isna().sum()

customer_id    0
comment        0
rating         0
dtype: int64

In [12]:
# Count rows that repeat an earlier row in every column.
# Rows that share only the comment text (e.g. the first two previewed reviews,
# which differ in customer_id and rating) are not counted as duplicates.
df.duplicated().sum()

0

In [13]:
# Ensure ratings are numeric and within the valid 1-5 range.
rating_min, rating_max = 1, 5
rating_numeric = pd.to_numeric(df["rating"], errors="coerce")  # unparseable values become NaN

# errors="coerce" and clip() both alter data silently, and this cell runs after the
# missing-value check above, so report anything they touched instead of hiding it.
# (Comparisons against NaN are False, so NaN is never counted as out of range.)
unparseable_count = int((rating_numeric.isna() & df["rating"].notna()).sum())
out_of_range_count = int(((rating_numeric < rating_min) | (rating_numeric > rating_max)).sum())
if unparseable_count or out_of_range_count:
    print(
        f"rating cleanup: {unparseable_count} unparseable value(s) set to NaN, "
        f"{out_of_range_count} out-of-range value(s) clipped to [{rating_min}, {rating_max}]"
    )

# Final column is the same as before: coerced to numeric, then clipped to [1, 5].
df["rating"] = rating_numeric.clip(rating_min, rating_max)


In [14]:
# English stop words from NLTK, stored as a set for the membership test in clean_text
stop_words = set(stopwords.words("english"))

In [15]:
def clean_text(text):
    """Normalise one review comment.

    Steps, in order:
      1. Replace emojis with their textual names.
      2. Remove URLs.
      3. Drop stop words (case-insensitive, using the notebook-level ``stop_words``).
      4. Remove remaining special characters; words are re-joined with single spaces.

    Args:
        text: The raw comment string.

    Returns:
        The cleaned comment as a single-space-separated string (empty if no
        words remain).

    NOTE(review): stop-word removal also drops negations; the notebook's
    sanity-check output shows "Not durable" losing its "Not". Confirm this is
    acceptable for whatever consumes the cleaned comments.
    """
    # Space delimiters keep an emoji name from fusing with neighbouring text (or a
    # second emoji) once the default ":" delimiters are stripped as punctuation.
    text = emoji.demojize(text, delimiters=(" ", " "))

    # Require a real URL prefix at a word boundary so ordinary words that merely
    # contain "www" or "http" (e.g. "awwww") are left intact.
    text = re.sub(r"\b(?:https?://|www\.)\S+", " ", text)

    # Typographic apostrophe -> ASCII so curly-quoted contractions match the stop list.
    text = text.replace("\u2019", "'")

    kept_words = []
    for token in text.split():
        # Test for stop words BEFORE deleting inner punctuation: contractions such as
        # "don't" are on the stop list, but their stripped form "dont" is not.
        trimmed = re.sub(r"^\W+|\W+$", "", token)  # drop surrounding punctuation only
        if trimmed.lower() in stop_words:
            continue
        word = re.sub(r"[^\w\s]", "", token)  # remove special characters
        # Second check keeps the original behaviour for tokens that only become a
        # stop word after punctuation removal.
        if word and word.lower() not in stop_words:
            kept_words.append(word)
    return " ".join(kept_words)

In [16]:
# Coerce every comment to str, then run each one through clean_text
comments_as_text = df["comment"].astype(str)
df["comment"] = comments_as_text.apply(clean_text)

In [17]:
def correct_text(text):
    """Return ``text`` after running TextBlob's ``correct()`` on it.

    Args:
        text: The comment string to correct.

    Returns:
        The corrected text converted to a plain ``str``.

    NOTE(review): the correction can replace valid words. In this notebook's
    sanity-check output, "durable" from the original review comes back as
    "unable". Consider whether this step should be kept or restricted.
    """
    corrected_blob = TextBlob(text).correct()
    return str(corrected_blob)

In [18]:
# Run correct_text over every comment and store the result back in the same column
df["comment"] = df["comment"].apply(correct_text)

In [19]:
# Sanity check: preview the first five comments after cleaning and correction
df.head(n=5)

Unnamed: 0,customer_id,comment,rating
0,1484,expensive offers,3
1,2641,expensive offers,2
2,4920,Stopped working days unable,1
3,2505,Perfect needs order soon,5
4,4203,received different product ordered,2


In [21]:
# Save the cleaned DataFrame as a JSON array with one object per row
with open(cleaned_file_path, mode="w", encoding="utf-8") as file:
    cleaned_records = df.to_dict(orient="records")
    json.dump(cleaned_records, file, ensure_ascii=False, indent=4)