# Initial Path and libraries

In [2]:
import pandas as pd
from pathlib import Path


PROJECT_ROOT = Path.cwd()
PROCESSED = PROJECT_ROOT / "data" / "processed"


# Reviews_enriched 

## Dataset

In [3]:
path = PROCESSED / "reviews_enriched.parquet"
reviews_enriched_df = pd.read_parquet(path, engine="pyarrow")
print(reviews_enriched_df.shape)
reviews_enriched_df.describe()

(98410, 22)


Unnamed: 0,review_score,bad_review,review_creation_date,review_answer_timestamp,response_time_days,text_length,order_delivered_customer_date,order_estimated_delivery_date,delivery_delay_days,payment_installments,payment_value,response_time_negative,is_delivered,has_delivery_delay
count,98410.0,98410.0,98410,98410,98410.0,98410.0,95601,98410,95601.0,98409.0,98409.0,98410.0,98410.0,98410.0
mean,4.088802,0.146286,2018-01-13 06:50:35.984147968,2018-01-16 10:26:43.416786944,3.150086,28.350401,2018-01-14 18:28:20.021411840,2018-01-24 08:57:59.939030784,-11.212855,2.927253,160.719938,0.0,0.971476,0.971456
min,1.0,0.0,2016-10-02 00:00:00,2016-10-07 18:32:28,0.089225,0.0,2016-10-11 13:46:32,2016-09-30 00:00:00,-146.016123,0.0,0.0,0.0,0.0,0.0
25%,4.0,0.0,2017-09-24 00:00:00,2017-09-27 19:43:43.500000,1.004499,0.0,2017-09-26 16:48:01,2017-10-04 00:00:00,-16.247558,1.0,61.96,0.0,1.0,1.0
50%,5.0,0.0,2018-02-02 00:00:00,2018-02-05 11:08:52,1.673993,0.0,2018-02-02 21:33:45,2018-02-15 00:00:00,-11.963137,2.0,105.28,0.0,1.0,1.0
75%,5.0,0.0,2018-05-16 00:00:00,2018-05-20 14:08:15.750000128,3.102049,42.0,2018-05-16 13:45:48,2018-05-28 00:00:00,-6.405116,4.0,176.83,0.0,1.0,1.0
max,5.0,1.0,2018-08-31 00:00:00,2018-10-29 12:27:35,518.699213,208.0,2018-10-17 13:22:46,2018-11-12 00:00:00,188.975081,24.0,13664.08,0.0,1.0,1.0
std,1.345758,0.353394,,,9.914724,48.3541,,,10.108965,2.713002,220.426067,0.0,0.166464,0.166521


In [4]:
cols_needed = [
    "review_id",
    "order_id",
    "review_score",
    "bad_review",
    "has_text",
    "text_length",
    "text_clean",
    "review_creation_date",
]
print("review_id unique:", reviews_enriched_df["review_id"].is_unique)
print(
    reviews_enriched_df[cols_needed].isna().mean().sort_values(ascending=False).head(10)
)

review_id unique: True
review_id               0.0
order_id                0.0
review_score            0.0
bad_review              0.0
has_text                0.0
text_length             0.0
text_clean              0.0
review_creation_date    0.0
dtype: float64


## % of has_text

In [None]:
has_text_porcentage = (reviews_enriched_df["has_text"]).mean() * 100
no_text_porcentage = 100 - has_text_porcentage
print("% of reviews without comments", no_text_porcentage)
print("% of reviews with comments", has_text_porcentage)
print("Count:", reviews_enriched_df["has_text"].value_counts(dropna=False))


% of reviews without comments 58.70236764556447
% of reviews with comments 41.29763235443553
Count: has_text
False    57769
True     40641
Name: count, dtype: int64


Approximately 4 out of 10 reviews contain text, so any sentiment/text analysis is perfomed on this subset 

## Usable text: is there text in all rows or is there an empty string?

In [6]:
reviews_enriched_df["text_clean"] = reviews_enriched_df["text_clean"].fillna("")
reviews_enriched_df["text_clean_len"] = reviews_enriched_df["text_clean"].str.len()

usable_text = reviews_enriched_df["text_clean_len"] > 0
print("Usable text %:", (usable_text.mean() * 100))
usable_text

Usable text %: 41.29763235443553


0        False
1        False
2         True
3        False
4        False
         ...  
98405    False
98406    False
98407     True
98408    False
98409    False
Name: text_clean_len, Length: 98410, dtype: bool

## Distribution of text_length

In [7]:
print(reviews_enriched_df["text_length"].describe())

count    98410.000000
mean        28.350401
std         48.354100
min          0.000000
25%          0.000000
50%          0.000000
75%         42.000000
max        208.000000
Name: text_length, dtype: float64


In [8]:
df_text = reviews_enriched_df[reviews_enriched_df["has_text"]].copy()
print("\ntext_length describe (has_text=True):")
print(df_text["text_length"].describe())

# short text proportions
for k in [1, 3, 5, 10, 20, 30]:
    pct = (df_text["text_length"] <= k).mean() * 100
    print(f"<= {k} chars: {pct:.2f}%")


text_length describe (has_text=True):
count    40641.000000
mean        68.647277
std         53.808996
min          1.000000
25%         28.000000
50%         53.000000
75%         95.000000
max        208.000000
Name: text_length, dtype: float64
<= 1 chars: 0.23%
<= 3 chars: 1.85%
<= 5 chars: 3.25%
<= 10 chars: 8.58%
<= 20 chars: 17.45%
<= 30 chars: 27.57%


## Bad reviews:

In [9]:
print("Bad review overall:")
print(reviews_enriched_df["bad_review"].value_counts(normalize=True).mul(100).round(2))

print("\nBad review with has_text=True:")
print(df_text["bad_review"].value_counts(normalize=True).mul(100).round(2))


Bad review overall:
bad_review
0    85.37
1    14.63
Name: proportion, dtype: float64

Bad review with has_text=True:
bad_review
0    73.53
1    26.47
Name: proportion, dtype: float64


## Crosstable: review_score x has_text

In [None]:
import pandas as pd

tab = (
    pd.crosstab(
        reviews_enriched_df["review_score"],
        reviews_enriched_df["has_text"],
        normalize="index",
    )
    * 100
)
tab = tab.round(2)
tab


has_text,False,True
review_score,Unnamed: 1_level_1,Unnamed: 2_level_1
1,23.42,76.58
2,31.98,68.02
3,56.43,43.57
4,68.77,31.23
5,64.12,35.88


## Example large/short texts

In [None]:
short_text = df_text.sort_values("text_length").head(10)[
    [
        "review_score",
        "bad_review",
        "text_length",
        "review_comment_message",
        "text_clean",
    ]
]
short_text

Unnamed: 0,review_score,bad_review,text_length,review_comment_message,text_clean
32183,5,0,1,*,*
24692,5,0,1,s,s
72418,4,0,1,S,s
11264,5,0,1,.,.
90053,4,0,1,.,.
51705,4,0,1,s,s
70690,5,0,1,K,k
75115,5,0,1,üòÄ,üòÄ
66892,4,0,1,-,-
79154,4,0,1,?,?


In [None]:
long_text = df_text.sort_values("text_length", ascending=False).head(10)[
    [
        "review_score",
        "bad_review",
        "text_length",
        "review_comment_message",
        "text_clean",
    ]
]
long_text


Unnamed: 0,review_score,bad_review,text_length,review_comment_message,text_clean
6889,1,1,208,"N√ÉO RECEBI O PRODUTO, O PRODUTO CONSTA COMO EN...","n√£o recebi o produto, o produto consta como en..."
23188,4,0,207,Adquiri 2 mochilas pelo site lannister de e pa...,adquiri 2 mochilas pelo site lannister de e pa...
76645,1,1,206,Comprei o produto confiando nas lojas lanniste...,comprei o produto confiando nas lojas lanniste...
78379,1,1,205,fiz essa compra pelo baratheon comprei um conj...,fiz essa compra pelo baratheon comprei um conj...
21242,1,1,204,Na minha compra realizada em 20/09/17 a loja t...,na minha compra realizada em 20/09/17 a loja t...
16417,5,0,204,"Parab√©ns ,o produto tira rico da empresa targa...","parab√©ns ,o produto tira rico da empresa targa..."
19979,1,1,204,A targaryen n√£o √© de confian√ßa n√£o entregou a ...,a targaryen n√£o √© de confian√ßa n√£o entregou a ...
30448,4,0,204,O produto cumpre totalmente com o que o fabric...,o produto cumpre totalmente com o que o fabric...
20056,3,0,204,N√£o Montei ele ainda mais pelo que conferi est...,n√£o montei ele ainda mais pelo que conferi est...
40739,1,1,204,A EMPRESA FAZ PROPAGANDA ENGANOSA EM SEU SITE....,a empresa faz propaganda enganosa em seu site....


## top words

### Import nltk and CountVectorizer


In [None]:
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

nltk.download("stopwords")
portuguese_stopword = stopwords.words("portuguese")

### frequency and vocabulary without stopwords

In [None]:
sample = reviews_enriched_df["text_clean"].fillna("")
# words that appear at least 20 times(inside reviews)
cv = CountVectorizer(min_df=20, stop_words=portuguese_stopword)
X = cv.fit_transform(sample)

frequency = X.sum(axis=0).A1
vocabulary = cv.get_feature_names_out()

word_frequency = pd.Series(frequency, index=vocabulary).sort_values(ascending=False)
word_frequency.head(10)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jesus\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


produto      18289
prazo         8431
entrega       6482
antes         5605
chegou        5523
recebi        5210
bom           4573
recomendo     4314
entregue      3753
veio          3250
dtype: int64