# Preprocessing

In [1]:
import pandas as pd
import numpy as np
import nltk
import re
import string

# Show up to 500 characters per cell so long review texts are readable in displayed frames.
pd.set_option("display.max_colwidth", 500)

In [2]:
# NLTK resources required by the cells below:
#   stopwords                   -> stop-word list used by remove_stopwords
#   words, brown                -> English vocabulary used by remove_non_english_words
#   punkt                       -> tokenizer models behind nltk.tokenize.word_tokenize
#   wordnet                     -> dictionary used by the WordNet lemmatizer
#   averaged_perceptron_tagger  -> model behind nltk.pos_tag
NLTK_RESOURCES = (
    'stopwords',
    'words',
    'brown',
    'punkt',
    'wordnet',
    'averaged_perceptron_tagger',
)

# nltk.download() reports failure by returning False instead of raising, so
# collect the failures and stop here rather than hitting a LookupError
# several cells later.
failed_downloads = [name for name in NLTK_RESOURCES if not nltk.download(name)]
if failed_downloads:
    raise RuntimeError(
        "Could not download NLTK resources: " + ", ".join(failed_downloads)
    )

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/aloysius/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to /home/aloysius/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package brown to /home/aloysius/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to /home/aloysius/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/aloysius/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/aloysius/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
# Load the three input files: the labelled training set, an additional set of
# Shopee reviews, and the unlabelled test set.
shopee_train_df = pd.read_csv("./data/train.csv")
# The additional reviews file has stray header rows inside its data (the next
# cell filters rows whose label equals the string 'label'), so its label column
# mixes numbers and text. low_memory=False makes pandas infer each column's
# dtype from the whole file in one pass instead of chunk by chunk.
# NOTE(review): this file is presumably the source of the mixed-type warning
# seen on load -- confirm.
add_train_df = pd.read_csv("./data/shopee_reviews.csv", low_memory=False)
test_df = pd.read_csv("./data/test.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# Tidy the additional reviews in one chain:
#   1. drop rows that are repeated header lines (label == 'label'),
#   2. cast the label to a 32-bit integer,
#   3. rename the columns to the 'rating' / 'review' names used from here on.
add_train_df = (
    add_train_df
    .loc[lambda frame: frame['label'] != 'label']
    .astype({"label": "int32"})
    .rename(columns={"label": "rating", "text": "review"})
)
add_train_df

Unnamed: 0,rating,review
0,5,Looks ok. Not like so durable. Will hv to use a while to recommend others of its worth.
1,5,"Tried, the current can be very powerful depending on the setting, i don't dare to go higher but if go higher sure muscle will become sore and can see the effect faster."
2,5,"Item received after a week. Looks smaller than expected, can’t wait to try!"
3,5,Thanks!!! Works as describe no complaints. Not really expecting any life changing results but thanks!
4,5,Fast delivery considering it’s from overseas and only tried once. Not sure about the results yet.
...,...,...
1502570,5,Tried and it fit nicely will buy \nFast and good will buy again.
1502571,5,It's a fantastic product. Very reasonable price and very good quality. The seller replies to all queries and is very professional. The pacaging was done very nicely and delivered within 10days of purchase.
1502572,5,"Nice pair of shoes for a gd price. Note the cutting is a bit smaller than usual, I bought size 43, but is a bit tight compared to my usual size 43. So maybe recommended to buy 0.5 size bigger."
1502573,1,Leather? Rubbish....moulded plastic. What a a fake...how can shoppee allow such products to be sold.


In [5]:
# Stack the original training data (without its review_id column) on top of the
# additional reviews, then look at how many reviews each rating has.
# The original row labels of both frames are kept (no ignore_index).
tdf = pd.concat(
    [
        shopee_train_df.drop(columns='review_id'),
        add_train_df,
    ]
)
tdf['rating'].value_counts()

5    1345447
4     143994
3      85024
1      43139
2      31781
Name: rating, dtype: int64

In [6]:
# Balance the classes: keep at most MAX_REVIEWS_PER_RATING reviews per rating.
# Ratings with fewer reviews than the cap are kept whole (sampled at full size).
MAX_REVIEWS_PER_RATING = 60000

# Build every per-rating sample first and concatenate once. Appending to an
# initially empty frame inside the loop re-copies the accumulated rows on every
# iteration and starts from object-dtype columns.
train_df = pd.concat([
    group.sample(n=min(len(group), MAX_REVIEWS_PER_RATING), random_state=42)
    for _, group in tdf.groupby("rating")
])
train_df

Unnamed: 0,review,rating
1144147,Expiry date was 6mths away. Not informed on website when bought the product. Ended up buying 2 but returned 1 as there is no way to use 2 in 6 months time. Quite disappointed with the expiry date and having to go through the hassle of return etc,1
1466803,Misleading post. Does not state correctly what is being sold.,1
646361,its thin and doesnt fit properly,1
9024,"FAKE redmi AIRDOTS, I WILL REFUND MA !!!!! From 11:11 Shopee sale I sold 599 peso",1
1171820,"Delivered earlier than expected... however, size was way too small than what i ordered ( i ordered M). Cloth quality also seemed different from the one on the picture...",1
...,...,...
1150409,Delivery was prompt and goods received in good condition.,5
1015100,Item looks like picture and came well packaged. Delivery was also pretty quick!,5
1494378,Received with thanks. Cute n small bag.,5
1069508,Received but there a white stain on the internal cover else overall is good.,5


In [7]:
# Inspect the raw test set (columns: review_id, review) before preprocessing.
test_df

Unnamed: 0,review_id,review
0,1,"Great danger, cool, motif and cantik2 jg models. Delivery cepet. Tp packing less okay krn only wear clear plastic nerawang klihtan contents jd"
1,2,One of the shades don't fit well
2,3,Very comfortable
3,4,Fast delivery. Product expiry is on Dec 2022. Product wrap properly. No damage on the item.
4,5,it's sooooo cute! i like playing with the glitters better than browsing on my phone now. item was also deliered earlier than i expected. thank you seller! may you have more buyers to come. 😊😊😊
...,...,...
60422,60423,"Product has been succesfully ordered and shipped very quickly, get the goods on time use, product quality, value for money, save time."
60423,60424,Opening time a little scared. Fear dalemnya destroyed there yangg eh was not broken at all 😍
60424,60425,The product quality is excellent. The original product. The product price is very good. Delivery speed is very good. The response is very good seller.\nDelivery cepet bangettt makasihhh
60425,60426,They 're holding up REALLY well also .


In [8]:
# Translation table that deletes every character in string.punctuation.
punc_table = str.maketrans('', '', string.punctuation)

def process_text(text):
    """Turn a raw review string into a list of lowercase, punctuation-free tokens.

    Steps, in order:
      1. drop every character whose code point is 128 or above (non-ASCII),
      2. lowercase,
      3. strip leading and trailing whitespace,
      4. collapse each run of whitespace into a single space,
      5. tokenize with nltk.tokenize.word_tokenize,
      6. delete punctuation characters from each token,
      7. discard tokens that became empty.

    Parameters
    ----------
    text : str
        The raw review text.

    Returns
    -------
    list of str
        The cleaned tokens, in their original order.
    """
    ascii_only = ''.join(ch for ch in text if ord(ch) < 128)
    normalised = re.sub(r'\s+', ' ', ascii_only.lower().strip())
    stripped = (
        token.translate(punc_table)
        for token in nltk.tokenize.word_tokenize(normalised)
    )
    # Tokens made only of punctuation are now empty strings; leave them out.
    return [token for token in stripped if token]

In [9]:
# English stop words from NLTK, as a set for fast membership tests.
stop_words = {word for word in nltk.corpus.stopwords.words("english")}
# "not" is taken out of the stop-word set so that remove_stopwords keeps it
# (presumably because negation matters for the rating -- confirm intent).
stop_words.remove("not")

def remove_stopwords(text):
    """Return the tokens of `text` that are not in the module-level `stop_words` set.

    Parameters
    ----------
    text : iterable of str
        Tokens of a single review.

    Returns
    -------
    list of str
        The tokens that are not stop words, in their original order.
    """
    return [word for word in text if word not in stop_words]

In [10]:
# Single WordNet lemmatizer instance, reused for every token by lemmatize_tokens below.
wordnet_lemmatizer = nltk.stem.WordNetLemmatizer()
def get_wordnet_pos(word):
    """Map a single word to the WordNet part-of-speech constant for lemmatize().

    The word is tagged on its own (as a one-element list) with nltk.pos_tag, and
    the first letter of the resulting tag selects the WordNet category:
    J -> adjective, N -> noun, V -> verb, R -> adverb. Any other tag falls back
    to noun.

    Parameters
    ----------
    word : str
        The token to tag.

    Returns
    -------
    The matching nltk.corpus.wordnet POS constant (ADJ, NOUN, VERB or ADV).
    """
    wordnet = nltk.corpus.wordnet
    _, pos_tag = nltk.pos_tag([word])[0]
    first_letter = pos_tag[0].upper()
    by_first_letter = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }
    return by_first_letter.get(first_letter, wordnet.NOUN)

def lemmatize_tokens(tokens):
    """Lemmatize each token, using the part of speech from get_wordnet_pos.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of a single review.

    Returns
    -------
    list of str
        One lemma per input token, in the same order.
    """
    return [
        wordnet_lemmatizer.lemmatize(token, get_wordnet_pos(token))
        for token in tokens
    ]

In [11]:
# Vocabulary of accepted English words: the lowercased, purely alphabetic entries
# of NLTK's `words` corpus combined with those of the Brown corpus.
words = {entry.lower() for entry in nltk.corpus.words.words() if entry.isalpha()}
brown = {entry.lower() for entry in nltk.corpus.brown.words() if entry.isalpha()}
english_words = words | brown
# "ga" is deliberately excluded from the vocabulary
# (presumably because it shows up as a non-English word in the reviews -- confirm).
english_words.remove("ga")

def remove_non_english_words(text):
    """Keep only the tokens found in the module-level `english_words` vocabulary.

    Each token is converted with str() before the lookup.

    Parameters
    ----------
    text : iterable
        Tokens of a single review.

    Returns
    -------
    list
        The tokens present in `english_words`, in their original order.
    """
    return [word for word in text if str(word) in english_words]

In [12]:
def preprocess(column):
    """Run the full token pipeline over a Series of raw review strings.

    The stages are applied element-wise, in this order, with a progress message
    printed before each one: process_text, remove_stopwords, lemmatize_tokens,
    remove_non_english_words. The input Series is copied first and is not
    modified.

    Parameters
    ----------
    column : pandas.Series
        Raw review texts.

    Returns
    -------
    pandas.Series
        A Series with the same index holding one token list per review.
    """
    stages = (
        ("Processing Text", process_text),
        ("Removing Stop Words", remove_stopwords),
        ("Lemmatizing Tokens", lemmatize_tokens),
        ("Removing Non English Words", remove_non_english_words),
    )

    result = column.copy()
    for message, stage in stages:
        print(message)
        result = result.map(stage)
    return result

In [13]:
# Tokenise the training reviews into a new frame (train_df itself is left as is),
# then drop the reviews that have no tokens left after preprocessing.
train_df_tok = (
    train_df
    .assign(review=preprocess(train_df['review']))
    .loc[lambda frame: frame['review'].map(len) != 0]
)
train_df_tok

Processing Text
Removing Stop Words
Lemmatizing Tokens
Removing Non English Words


Unnamed: 0,review,rating
1144147,"[expiry, date, away, not, inform, bought, product, end, buying, return, way, use, month, time, quite, disappointed, expiry, date, go, hassle, return]",1
1466803,"[mislead, post, not, state, correctly, sell]",1
646361,"[thin, doesnt, fit, properly]",1
9024,"[fake, refund, sale, sell, peso]",1
1171820,"[deliver, earlier, expect, however, size, way, small, order, order, cloth, quality, also, seem, different, one, picture]",1
...,...,...
1150409,"[delivery, prompt, good, receive, good, condition]",5
1015100,"[item, look, like, picture, come, well, package, delivery, also, pretty, quick]",5
1494378,"[receive, thanks, cute, n, small, bag]",5
1069508,"[receive, white, stain, internal, cover, else, overall, good]",5


In [14]:
def limit_duplicated_words(df, column, ratio):
    """Drop rows whose token list repeats its words too often.

    A row is kept when len(tokens) / len(set(tokens)) <= ratio, i.e. when each
    distinct word occurs at most `ratio` times on average. Rows with an empty
    token list contain no repetition and are kept (previously they raised
    ZeroDivisionError).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the token lists; it is not modified.
    column : str
        Name of the column whose values are token lists.
    ratio : int or float
        Maximum allowed ratio of total tokens to distinct tokens.

    Returns
    -------
    pandas.DataFrame
        An independent copy containing only the rows that pass the check.
    """
    def _within_ratio(tokens):
        # Guard the empty case so the division below never sees a zero denominator.
        if len(tokens) == 0:
            return True
        return (len(tokens) / len(set(tokens))) <= ratio

    keep = df[column].map(_within_ratio).astype(bool)
    # Copy after filtering so callers can assign columns on the result without
    # touching `df` and without pandas treating the result as a slice of it.
    return df[keep].copy()

In [15]:
print("Removing lines with many duplicated words")
# Keep reviews whose total-to-distinct token ratio is at most 2, then turn each
# token list back into a single space-separated string.
train_df_clean = limit_duplicated_words(train_df_tok.copy(), "review", 2)
train_df_clean["review"] = train_df_clean["review"].map(" ".join)

print("Removing duplicated entries")
# Collapse identical cleaned reviews into one row each; its rating is the mean
# rating of the duplicates, rounded with Python's built-in round().
dedup = [
    [review, round(float(group['rating'].mean()))]
    for review, group in train_df_clean.groupby('review')
]
train_df_clean = pd.DataFrame(dedup, columns = ["review", "rating"])

train_df_clean

Removing lines with many duplicated words
Removing duplicated entries


Unnamed: 0,review,rating
0,aa aba,3
1,aa advertised functional value money great delivery service prompt,4
2,aa advertised repeat order really super affordable good stable surface worth buying,5
3,aa item receive described good condition,5
4,aaa battery power source k lap dc,2
...,...,...
226838,zipper spoil,3
226839,zipper work pure nonsense fall da trap like buy guy,1
226840,zo garment hood milwaukee milwaukee star liver dc k new leg cut fade cast zo dc,4
226841,zoo lion mitten booty set right size newborn hot air balloon set big least month old fit hole sew quality not satisfy seller respond ask refund,1


In [16]:
# Number of reviews per rating after cleaning and de-duplication.
train_df_clean['rating'].value_counts()

5    55182
4    52981
3    50801
1    39166
2    28713
Name: rating, dtype: int64

In [17]:
# Apply the same token pipeline to the test reviews and join each token list back
# into a space-separated string. Unlike the training set, no rows are dropped here,
# so reviews with no remaining tokens become empty strings.
test_df_clean = test_df.assign(
    review=preprocess(test_df['review']).map(" ".join)
)
test_df_clean

Processing Text
Removing Stop Words
Lemmatizing Tokens
Removing Non English Words


Unnamed: 0,review_id,review
0,1,great danger cool motif model delivery pack less okay wear clear plastic content
1,2,one shade fit well
2,3,comfortable
3,4,fast delivery product expiry product wrap properly damage item
4,5,cute like play glitter well browsing phone item also earlier expect thank seller may buyer come
...,...,...
60422,60423,product order ship quickly get good time use product quality value money save time
60423,60424,opening time little scar fear destroyed eh not broken
60424,60425,product quality excellent original product product price good delivery speed good response good seller delivery
60425,60426,hold really well also


In [18]:
# Write both cleaned frames to CSV without the index column.
for _frame, _path in (
    (train_df_clean, "./data/train_clean.csv"),
    (test_df_clean, "./data/test_clean.csv"),
):
    _frame.to_csv(_path, index=False)