## Import Libraries

In [1]:
# import data analysis-related libraries
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns
import pandas as pd
import numpy as np
import math
from wordcloud import WordCloud

# text data processing
import re
from nltk.corpus import stopwords
import spacy
from collections import Counter


## Data Loading and Cleaning

In [14]:
# Load the cleaned product listings from disk and preview the first five rows
cleaned_dataset = pd.read_csv(filepath_or_buffer="final_cleaned_data.csv")
cleaned_dataset.head(n=5)

Unnamed: 0,link,name,price,rating,total_sold,store_name,store_location,product_description,customer_satisfaction,IsRated,total_rating,num_reviews,review_rating_ratio,brand_name
0,https://www.tokopedia.com/samsung/samsung-gala...,Samsung Galaxy A05s 6/128GB,1999000.0,2.5,8000,Samsung Official Store,Jakarta,"""awesome offers! - free travel adapter 25w sen...",99.0,False,4155.0,1475.0,0.354994,Samsung
1,https://www.tokopedia.com/samsung/samsung-gala...,Samsung Galaxy A25 5G 8/256GB,4049000.0,2.5,750,Samsung Official Store,Jakarta,spesifikasi -processor : octa-core -size : 6.5...,98.0,False,438.0,169.0,0.385845,Samsung
2,https://www.tokopedia.com/distriponsel/xiaomi-...,Xiaomi Redmi 14C 8/256 GB 6/128 GB Redmi 14 C ...,1385000.0,2.5,500,Distributor Ponsel,Jakarta,"untuk produk xiaomi, vivo, realme, oppo, samsu...",98.0,False,206.0,78.0,0.378641,VIVO
3,https://www.tokopedia.com/tecnoofficialstore/t...,"TECNO POVA 6 - 12+12GB*+256GB, 70W Ultra Charg...",2769000.0,2.5,500,Tecno Official Store,Jakarta,keunggulan: mediatek helio g99 ultimate 6nm 60...,98.0,False,371.0,169.0,0.455526,TECNO
4,https://www.tokopedia.com/tecnoofficialstore/t...,"TECNO SPARK 30C – 6+6GB*+128GB, 120Hz Display,...",1429000.0,2.5,250,Tecno Official Store,Jakarta,keunggulan: 48m main camera sony imx582 sensor...,97.0,False,150.0,77.0,0.513333,TECNO


#### Insight:
- The `IsRated` column still contains incorrect values: some products have received ratings but are still flagged as `False`, because their rating data were not scraped and the missing rating was then filled with the placeholder value 2.5.
- To mitigate this issue, we need to replace the `IsRated` column with the correct values and replace the placeholder 2.5 rating with:
```equation
estimated rating = 4.5 * customer_satisfaction/100 + 2.5 * (1 - customer_satisfaction/100)
```
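It is known that `customer_satisfaction` is the percentage (0–100) of people who rated the product 4–5; the formula assigns that share of buyers a rating of 4.5 and the remaining share a rating of 2.5.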

In [16]:
# The IsRated flag and the placeholder ratings were filled incorrectly, so correct both here.

# A product counts as rated only when total_rating is present and non-zero.
# A bare `!= 0` comparison would also flag missing values as rated, because NaN != 0 is True.
total_rating = cleaned_dataset['total_rating']
cleaned_dataset['IsRated'] = total_rating.notna() & total_rating.ne(0)

# Ratings that were not scraped were filled with 2.5; estimate them from customer_satisfaction instead.
# customer_satisfaction is on a 0-100 scale: that share of buyers is assigned 4.5, the rest 2.5.
# NOTE(review): a product whose real rating is exactly 2.5 cannot be told apart from the
# placeholder and is re-estimated as well.
PLACEHOLDER_RATING = 2.5
satisfaction = cleaned_dataset['customer_satisfaction']
estimated_rating = (4.5 * satisfaction / 100) + (2.5 * (1 - satisfaction / 100))

# Vectorised replacement for the row-wise apply: rows holding the placeholder get the estimate
# (NaN when customer_satisfaction is missing), every other row keeps its scraped rating.
cleaned_dataset['new_rating'] = cleaned_dataset['rating'].mask(
    cleaned_dataset['rating'] == PLACEHOLDER_RATING,
    estimated_rating,
)

# show result
cleaned_dataset.head()


Unnamed: 0,link,name,price,rating,total_sold,store_name,store_location,product_description,customer_satisfaction,IsRated,total_rating,num_reviews,review_rating_ratio,brand_name,new_rating
0,https://www.tokopedia.com/samsung/samsung-gala...,Samsung Galaxy A05s 6/128GB,1999000.0,2.5,8000,Samsung Official Store,Jakarta,"""awesome offers! - free travel adapter 25w sen...",99.0,True,4155.0,1475.0,0.354994,Samsung,4.48
1,https://www.tokopedia.com/samsung/samsung-gala...,Samsung Galaxy A25 5G 8/256GB,4049000.0,2.5,750,Samsung Official Store,Jakarta,spesifikasi -processor : octa-core -size : 6.5...,98.0,True,438.0,169.0,0.385845,Samsung,4.46
2,https://www.tokopedia.com/distriponsel/xiaomi-...,Xiaomi Redmi 14C 8/256 GB 6/128 GB Redmi 14 C ...,1385000.0,2.5,500,Distributor Ponsel,Jakarta,"untuk produk xiaomi, vivo, realme, oppo, samsu...",98.0,True,206.0,78.0,0.378641,VIVO,4.46
3,https://www.tokopedia.com/tecnoofficialstore/t...,"TECNO POVA 6 - 12+12GB*+256GB, 70W Ultra Charg...",2769000.0,2.5,500,Tecno Official Store,Jakarta,keunggulan: mediatek helio g99 ultimate 6nm 60...,98.0,True,371.0,169.0,0.455526,TECNO,4.46
4,https://www.tokopedia.com/tecnoofficialstore/t...,"TECNO SPARK 30C – 6+6GB*+128GB, 120Hz Display,...",1429000.0,2.5,250,Tecno Official Store,Jakarta,keunggulan: 48m main camera sony imx582 sensor...,97.0,True,150.0,77.0,0.513333,TECNO,4.44


In [18]:
# save second version of cleaned data (with the corrected IsRated flag and the new_rating column)
# NOTE(review): the export below is commented out, so running this cell writes nothing;
# uncomment it to persist the file.
# cleaned_dataset.to_csv("final_cleaned_data_v2.csv",index=False)

## Text Data Processing

In [None]:
# 

## Other Exploratory Data Analysis