## Data Cleaning

### End Goal
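The goal of this notebook is to clean the data and prepare it for analysis. This includes combining the two datasets, replacing missing (N/A) reviews with a placeholder, converting emojis to text, translating non-English reviews to English, and dropping reviews that are still not in English after translation. Every row in the combined data already has a rating, so no rows need to be filtered out for a missing rating.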

### Import Dependencies

In [24]:
import re

import numpy as np
import pandas as pd
# for translation purposes
from deep_translator import GoogleTranslator
# for handling emojis
import emoji
# for detection of none english text
from langdetect import detect, DetectorFactory
# raised by detect() when a text has no features it can classify
from langdetect.lang_detect_exception import LangDetectException

# For consistency in language detection: a fixed seed is set so that repeated
# detect() calls on the same text are expected to give the same language
# (langdetect documents its algorithm as non-deterministic without a seed)
DetectorFactory.seed = 42

### Import the datasets

In [12]:
# Load the two raw review exports; paths are relative to the notebook's folder
df_kaggle = pd.read_csv("../data/kaggle_reviews_raw.csv")
df_apify = pd.read_csv("../data/google_reviews_apify_dataset.csv")

In [13]:
# Dataset statistics: row counts first, then the column names of each frame
print(f"Kaggle: {df_kaggle.shape[0]} reviews, Apify: {df_apify.shape[0]} reviews")
for source_label, source_frame in (("Kaggle", df_kaggle), ("Apify", df_apify)):
    print(f"{source_label} columns:", list(source_frame.columns))

Kaggle: 1100 reviews, Apify: 1000 reviews
Kaggle columns: ['business_name', 'author_name', 'text', 'photo', 'rating', 'rating_category']
Apify columns: ['title', 'url', 'stars', 'name', 'reviewUrl', 'text']


### Standardize columns

In [14]:
# Source column -> shared column name. The dict order is also the column order
# of the resulting frames, so both frames end up with the same layout.
apify_column_map = {
    "title": "store_name",
    "stars": "rating",
    "text": "review",
    "name": "reviewer_name",
}
kaggle_column_map = {
    "business_name": "store_name",
    "rating": "rating",
    "text": "review",
    "author_name": "reviewer_name",
}

# Keep only the mapped columns, then rename them to the shared schema
df_apify_filtered = df_apify[list(apify_column_map)].rename(columns=apify_column_map)
df_kaggle_filtered = df_kaggle[list(kaggle_column_map)].rename(columns=kaggle_column_map)


In [15]:
# Show that both frames now expose the same column names
for frame_name, renamed_frame in (
    ("df_apify", df_apify_filtered),
    ("df_kaggle", df_kaggle_filtered),
):
    print(f"Updated column headers for {frame_name}:", list(renamed_frame.columns))

Updated column headers for df_apify: ['store_name', 'rating', 'review', 'reviewer_name']
Updated column headers for df_kaggle: ['store_name', 'rating', 'review', 'reviewer_name']


### Combining the datasets

In [16]:
# Stack the Apify rows on top of the Kaggle rows and renumber the index from 0
df_combined = pd.concat(
    (df_apify_filtered, df_kaggle_filtered),
    ignore_index=True,
)

In [17]:
# Summarise the combined frame: row count, per-column non-null counts and dtypes
df_combined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2100 entries, 0 to 2099
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   store_name     2100 non-null   object
 1   rating         2100 non-null   int64 
 2   review         1845 non-null   object
 3   reviewer_name  2100 non-null   object
dtypes: int64(1), object(3)
memory usage: 65.8+ KB


### Replacing missing reviews with the placeholder "No reviews"

In [19]:
# Treat NaN, empty, and whitespace-only reviews as missing and replace them
# with the "No reviews" placeholder. Whitespace-only text has to be caught here
# too: it carries no features, so langdetect.detect() would raise on it later.
review_text = df_combined["review"]
is_blank = review_text.isna() | review_text.astype(str).str.strip().eq("")
df_combined["review"] = review_text.mask(is_blank, "No reviews")

In [None]:
# Sanity check: evaluates to False when no review is null any more
df_combined["review"].isnull().any()

np.False_

### Handling of emojis in reviews

In [None]:
def replace_emojis(review):
    """Replace every emoji in ``review`` with its text name.

    Args:
        review: The review text. Non-string values (e.g. NaN) are returned
            unchanged instead of raising.

    Returns:
        The review with each emoji swapped for its name as produced by
        ``emoji.demojize`` and separated from the surrounding text by single
        spaces. Reviews without emojis are returned untouched.
    """
    if not isinstance(review, str) or emoji.emoji_count(review) == 0:
        return review
    # Space delimiters keep an emoji name from being glued onto the neighbouring
    # word or onto the next emoji name, which empty delimiters would do.
    spaced = emoji.demojize(review, delimiters=(" ", " "))
    # The delimiters can leave doubled or leading/trailing spaces; tidy them up.
    return re.sub(r" {2,}", " ", spaced).strip(" ")

df_combined["review"] = df_combined["review"].apply(replace_emojis)

In [26]:
# Total number of emojis left across all reviews
df_combined["review"].map(emoji.emoji_count).sum()

np.int64(0)

### Translating non-English reviews

In [42]:
# Turn newlines and slashes into spaces in a single pass over each review
separator_table = str.maketrans({"\n": " ", "/": " "})
df_combined["review"] = df_combined["review"].str.translate(separator_table)

def detect_and_translate(review):
    """Return ``review`` in English, translating it when it is in another language.

    Args:
        review: The review text. Non-string or blank values are returned
            unchanged, because there is nothing to detect or translate.

    Returns:
        The original text when it is detected as English, when its language
        cannot be detected, or when translation fails or yields nothing;
        otherwise the translated text.
    """
    if not isinstance(review, str) or not review.strip():
        return review
    try:
        language = detect(review)
    except LangDetectException:
        # detect() raises when the text has no usable features (for example
        # digits or punctuation only). One such row must not abort the whole
        # .apply, so the text is passed through unchanged.
        return review
    if language == 'en':
        return review
    try:
        translated_review = GoogleTranslator(source='auto', target='en').translate(review)
    except Exception as e:
        # Best effort: report the failure and keep the untranslated text
        print(f"Translation error for review: {review}. Error: {e}")
        return review
    # Guard against an empty/None result so the column never loses the
    # original text — TODO confirm whether translate() can actually return None
    return translated_review if translated_review else review

# Build a new frame whose review column holds the English version of each
# review; df_combined itself is left untouched
df_combined_translated = df_combined.assign(
    review=df_combined["review"].apply(detect_and_translate)
)

### Remove entries that are still not in English after translation

In [None]:
def _is_english(text):
    """Return True only when langdetect identifies ``text`` as English.

    Text whose language cannot be detected counts as not English, so the row
    is dropped instead of detect() raising and aborting the filter.
    """
    try:
        return detect(str(text)) == "en"
    except LangDetectException:
        return False

# Keep only the rows whose review is detected as English, then renumber the index
df_combined_translated = df_combined_translated[
    df_combined_translated["review"].apply(_is_english)
].reset_index(drop=True)


In [45]:
# Re-run detection on every remaining review and report any that are not English
count = 0
for remaining_review in df_combined_translated["review"]:
    if detect(remaining_review) == 'en':
        continue
    print("Non-English review found after translation:", remaining_review)
    count += 1
print("Total non-English reviews found:", count)

Total non-English reviews found: 0


### Generate cleaned and updated file

In [46]:
# Write the cleaned reviews to disk without the index column
df_combined_translated.to_csv("../data/cleaned_reviews.csv", index=False)

In [48]:
# Final size of the cleaned dataset as (rows, columns)
df_combined_translated.shape

(2037, 4)