In [41]:
import pandas as pd
import numpy as np
import re
import html
import difflib
from textblob import TextBlob
from collections import Counter
import nltk
from nltk.corpus import words as nltk_words
import emoji
from tqdm import tqdm
import wordninja

In [42]:
pd.set_option('display.max_colwidth', None)

nltk.download('words')

tqdm.pandas()

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\windown\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


# Load dữ liệu 

In [43]:
df_train = pd.read_csv('amazon_shoe_train.csv', encoding= 'utf-8', engine='python', encoding_errors='replace')
df_validation = pd.read_csv('amazon_shoe_test.csv', encoding= 'utf-8', engine='python', encoding_errors= 'replace')

In [None]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90000 entries, 0 to 89999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   labels  90000 non-null  object
 1   text    89966 non-null  object
dtypes: object(2)
memory usage: 1.4+ MB


In [45]:
df_validation.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   labels  10000 non-null  object
 1   text    9998 non-null   object
dtypes: object(2)
memory usage: 156.4+ KB


# Xóa các dòng dữ liệu trùng lặp, giữ lại dòng đầu tiên

In [46]:
df_train = df_train.drop_duplicates(subset=['text'], keep='first').reset_index(drop=True)
df_validation = df_validation.drop_duplicates(subset=['text'], keep='first').reset_index(drop=True)

In [47]:
df_train

Unnamed: 0,labels,text
0,Positive,Good shoe for office work. They will scuff very easy so be aware.
1,Negative,"I have had the Patricia II wedge in black for about 1 year & wore them regularly in season. When I saw the Patricia at a good price in navy (from 6pm), I purchased them because I thought they would fit just like my Patricia IIs. I was wrong, and paid the price with return shipping that 6pm doesn't pay.<br /><br />The crocs website says that crocs aren't suppose to fit like other sandals - they are suppose to be looser & thus more comfortable - I normally wear an 8-1/2, so have now tried both an 8 & a 9 in the Patricia shoe (I have an 8 in the Patricia II). The Patricia 9 swims on my feet & they would be a hazard to walk around in. The size 8 fits my left foot (which is my wider foot) but is too narrow on my right foot. When I placed the shoes sole to sole, I did notice a slight difference in the width, which, apparently, my foot notices too. I can only conclude a manufacturing defect. But, it is this shoe specifically or the form for this shoe? (others have written the shoe is narrow).<br /><br />Consequently, if you have a wider foot, order the Patricia II instead of this one & if you are a 1/2 size, order down, not up."
2,Negative,Width not right and size too small if width had been just little wider and ordered size larger would have been good. Loved the shoe look
3,Negative,"I received these shoes and they weren't the same as the picture described them, they were a different color. When i tried to return them, the shipping wasn't paid for. So i had to pay $20 for shipping. A waste of time and money. I dont recommend anyone to buy from TheSmartBuy."
4,Neutral,They began to split alone the mesh material after a month but loved the shoe and the feel of it
...,...,...
85072,Neutral,I wear a size 7 in all my shoes but this one was too large. Hurt the back of my feet because they were too big.
85073,Positive,"Love the sunglasses. Love the look. Love the Polarized lenses. Love the price.<br />However, the lenses have popped out twice. Once from a 5 foot fall from my head on a carpeted floor. Another sitting in the case in my gym bag. I emailed customer support and never received a response. Would have been a 5 star but loses a star for poor customer service."
85074,Negative,"Were comfortable the 1st time, but seem to get tighter everytime I wore them 😣 recd many compliments on them but can no longer wear them"
85075,Negative,"Hurts my feet. Like wearing razor blades. The material is too scratchy and I can't even wear them. Even tried with hose on but they are aweful. I wore them to work one day, I stay at a desk mostly and couldn't even tolerate wearing them sitting down. They are attractive but OUCH!!"


In [48]:
print(df_train.info())
print('=' * 40)
print(df_validation.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85077 entries, 0 to 85076
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   labels  85077 non-null  object
 1   text    85076 non-null  object
dtypes: object(2)
memory usage: 1.3+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9739 entries, 0 to 9738
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   labels  9739 non-null   object
 1   text    9738 non-null   object
dtypes: object(2)
memory usage: 152.3+ KB
None


# Xóa các dòng không có đánh giá

In [49]:
df_train = df_train.dropna(subset= ['text']).reset_index(drop = True)
df_validation = df_validation.dropna(subset = ['text']).reset_index(drop = True)

In [50]:
print(df_train.info())
print('=' * 40)
print(df_validation.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85076 entries, 0 to 85075
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   labels  85076 non-null  object
 1   text    85076 non-null  object
dtypes: object(2)
memory usage: 1.3+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9738 entries, 0 to 9737
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   labels  9738 non-null   object
 1   text    9738 non-null   object
dtypes: object(2)
memory usage: 152.3+ KB
None


# Trích xuất các đánh giá có chứa url

In [None]:
url_pattern = r'https?://\S+|www\.\S+'

df_with_links = df_train[df_train['text'].str.contains(url_pattern, case=False, na=False)]

df_with_links[['text']]

Unnamed: 0,text
1538,Tortoise shell sunglasses look more like cheetah or leopard spots. They're kind of ridiculous looking. There are much better sunglasses for the same price point. I've had them for a year and am only order more because I lost one pair (which is why I buy such cheap glasses to start with) http://amzn.to/1Cmorhr
5565,http://www.amazon.com/gp/product/B00I3AJRVM?redirect=true&ref_=cm_cr_ryp_prd_ttl_sol_0
6445,My husband is a custodian and has that are company bought uncomfortable. He was always complaining about his feet hurting. He is a prior owner of these shoes and never complained about his feet when wearing them. I bought these shoes to replace them and he is all good! Thanks for fast delivery and a great price AMAZON PRIME!! https://www.amazon.com/gp/product/B00SDS2WBG/ref=cm_cr_ryp_prd_ttl_sol_0<br />https://images-na.ssl-images-amazon.com/images/I/412H45YLeGL._SS300_.jpg
6481,I bought these to decorate for my best friend's wedding and bachelorette party. What they don't show is on inside of the glasses the words &#34;PUNCH GP3&#34; are on one arm and &#34;www.grinderpunch.com&#34; is on the other. It's very distracting and looks very unprofessional. I definitely would not have purchased these if I knew that beforehand.
7928,"In the original image of the product, the only one that Amazon shows in its page [ http://www.amazon.com/gp/product/B005BIR13A?ref_=pe_1196280_123950170 ], you see the shoes all in the same tones and colors: brown-ish.<br /><br />But when I receive the product I was in shock with the rubber that cover the tip and the sides of the shoes, it is GREY! You can't see that on the image. It was unpleasant. The shoes are fine, they are comfortable, but the colors aren't."
8301,I am returning the Bag because it is simply too large. Other than that it<br />is a pretty bag.https://images-na.ssl-images-amazon.com/images/I/51nKKjW1mhL._SS300_.jpg
8579,"I bought these shoes to use as &#34;tennis&#34; shoes (actually playing tennis) and they just won't work for that, at least for me.<br /><br />The main problem is the back of the shoe is *very* low -- I don't wear high-tops or even mid range, but I need a bit of rise on the end so that as I move the shoe stays attached. These allow my heel to rise out the back even tightly laced, and so just won't work for any kind of strenuous movement (again, for me).<br /><br />I ended up buying the NB 623 ( https://www.amazon.com/gp/product/B007JTXJX0/ref=oh_aui_detailpage_o01_s00?ie=UTF8&psc=1 ) which DO have a rise at the back and fit me much better. They also are cheaper, a nice bonus.<br /><br />Everyone's foot is different, and everyone's shoe's needs are, so these might work for you (and Amazon's return policy allows you to try them out). But overall I'd just pass on these and get the 623."
10515,Bought these for my daughter and she likes them but after a couple of weeks of use they are very beat up. I had purchased her a pair of black dance shoes http://www.amazon.com/gp/product/B0041IXFSQ?psc=1&redirect=true&ref_=oh_aui_detailpage_o00_s00 and she immediately commented that they are a better shoe.
10535,"The sole eroded very quickly, I will probably have to throw it out in soon. Worst part was that I didn't use it everyday. I wear this maybe twice a week at most. I would usually wear a Kenneth Cole Boating Shoe (http://www.amazon.com/gp/product/B00C9QC3MQ?colid=Q24VI7B5IUHV&coliid=IPFRNQS5JW9RW&psc=1&ref_=wl_it_dp_o_pC_nS_ttl), most of the time. The Kenneth Cole shoe has been able to sustain the wear I put on to it for a longer period of time."
12900,Great slippers... I wear them out on my ranch doing casual chores... seams never separate... soles never come off... never come apart before I have literally worn them out.<br /><br />I use foam/gel insoles to make them feel comfy till they are done. About 2 years for me.<br /><br />https://www.amazon.com/gp/product/B003B40NWM/ref=cm_cr_ryp_prd_ttl_sol_61


# Trích xuất các đánh giá có chứa kí tự @

In [52]:
mention_pattern = r'@[A-Za-z0-9_]+'

df_with_mentions = df_train[df_train['text'].str.contains(mention_pattern, na=False)]

df_with_mentions[['text']]

Unnamed: 0,text
2173,"Unbelievable performance, however, be careful about build-up. It may cause your gear to be ruined with too much thickness,,, then it will begin to peel, resulting in the possibility of you having to replace gear. Much needed research on my part to ascertain if there is a stripper to take it off and start over. If there is a removal, five stars coming. If someone knows of a process to remove this product, please let me know. Hurstsa@live.com I would really appreciate a shout-out. Thank you!!"
5942,Loved them but the 8 was too small - tight @toes & length. Disappointed that no size 8.5 available to re-order.
10933,"I adore these shoes, but after one wearing, several rhinestones are missing!! @heartbroken!"
14223,"Been wearing these for years. They're uglier than Crocks (I take offense at people asking...&#34; are those Crocks&#34;? F#@K NO! What do you take me for?) which ups their appeal, and they're incredibly comfortable and kind of form to your foot, rather than sloshing around on them as per Crocks (though I've never worn them but have seen them hanging loosely on people's feet like a falling diaper). I was quite surprised at how easy they were for every day wear in all conditions. Surprisingly they don't stink nor do my feet sweat while wearing them. Hard to find at times, and the Japanese love these things, though I'm not Japanese nor have any affiliation with nor have I been paid for any type of endorsement; to either."
14462,"These shoes are super cute and comfortable! I'm a Nurse and can easily wear them for 10-12 hours a day while doing a whole lot of walking and my feet never hurt. What I liked the Best about them is that they are mostly &#34;mesh&#34; on top which makes for a lightweight shoe.<br />DONOT PURCHASE THEM because after you wear them for approx. 3-4 weeks the mesh rips on Both shoes in two places. A rip/hole will appear on shoe where your big toe begins and on the same place (next to little toe) on other side of shoe😞. I hoped maybe I had simply received a &#34;defective&#34; pair if shoes, so I called Amazon and they promptly sent me a new pair of the exact same shoe.(Just as I requested!) Wore them for a couple weeks, love them so much that when i saw they had the exact Ascic shoe with a &#34;melon color&#34; (the first ones i ordered were black, silver&pink) i HAD to have them too!!(our scrubs@work are black and MELON color) Got the 2nd pair of shoes and switched back and forth, wearing the black/pink pair one day..and wearing the melon colored ones the next day!<br />Was SO VERY DISAPPOINTED when the first hole/tear in the mesh on the black ones appeared. It starts on the big toe side of shoe, right where you 'bend your foot'....then a couple days later, i all of a sudden had FOUR RIPS in the mesh in the EXACT SAME PLACE ON BOTH shoes, as the pair i had first returned to Amazon. Stuck them in their original box and went on to wearing the melon colored shoes with Great Hopes of this NOT occuring on this (3rd) Pair!! Well in about 3 more weeks(that is wearing them 10hrs per day-5 days a week) the mesh starting tearing on the left shoe by my left little toe. Within 3 more wearings, they were ripped/torn IN THE EXACT SAME PLACES on both shoes, AS THE TWO black/pink pairs (one pair of which i returned and other pair that Amazon sent me as the 'Replacement Pair!!'<br />Gotta say that i am super picky about the buying athletic shoes. Ie: No thick rubber bottoms, no heavyness in the shoe, shape on toe has to be right and on and on and on!! So....Needless to say, i was crushed about these ASIC running shoes that turned out to be Defective. They are so very cute, stylish and comfy! Almost every day i wore them, someone at work would say &#34;Cute Shoes, where did you get them!?&#34;<br />To make such a long story short....the last two pairs were sent back to Amazon via UPS, last week and im waiting on my money (paid approx. eighty something for the black pair& approx.$100 for the gray/melon colored ones)to be refunded back to me. Amazon said that they will be doing &#34;an investigation&#34; in to this defect in their ASIC RUNNING SHOES! Hopefully they WILL FIND A WAY TO CORRECT THIS WITHOUT CHANGING THE STYLE/LOOK of these super cute shoes! Also would like to say that Amazon was very good about PROMPTLY REPLACING the first pair with a new pair, and im sure i will receive my money back on the 2nd two pairs (that were sent back) very soon.<br />Im just Sad cause athletic shoes are what i have to wear 5 days out of every 7 AND I LOVED THE LOOK AND COMFORT of those Asic's. But...the mesh just kept ripping on each new pair that i ordered.<br />Hoping that ASIC WILL FIGURE IT OUT AND FIX THIS DEFECT because I SURE WOULD ORDER THESE PARTICULAR SHOES IF I WAS ASSURED THAT THE PROBLEM WAS SOLVED!! Sincerely, Dissapointed Asic Athletic Shoe Buyer"
15657,This are some fake @ss locs. I was excited about them but when I got the they looked like they been painted over and crappy quality. These are faker than your Jordan's
22280,E-mail to dwatson@probation.nyc.gov
32746,"These shoes are grippy and look good. They are comfortable enough during break in. HOWEVER, I never really got to break them in without the back tearing. I've been on only two mtb rides with these. Now I'm sitting in an airport looking like a fool with a ripped shoe. 510, I'd like my money back - trevordunn87@gmail.com"
34763,The Velcro keeps opening. It is very annoying. I feel very insecure when wearing them. think I will have to return them.<br /><br />Leanne King<br />Kinglc57@yahoo.com
42551,"I bought my usual size 6; they fit perfect, looked cute when I tried them on at home. After walking 50 feet (maybe less) from my car to work, the soles are wrecked! Every tiny piece of gravel from the (normal asphalt) street is now embedded in the cheap rubber-like soles. Tried picking out the dozens of pebbles but now there are dozens of holes with cracks around them, spidering out along the soles where each pebble had been. I wish these shoes hadn't fit and I had returned them because they are complete rubbish! Can't wear them into the house without taking them off and scraping out all the pebbles first or they'll scratch the wooden floors. Perhaps they'll last another wear or two but I expect to toss them within 30 days of purchase. They're not even worth the $20 I paid. Shame on you &#34;Spirit Moda&#34; for making such a cr@p product and shame on me for buying it. Lesson learned."


# Trích xuất các đánh giá chỉ gồm các emoji

In [53]:
def is_only_emoji(text):
    if pd.isna(text) or not isinstance(text, str):
        return False
    stripped = text.strip()
    if not stripped:
        return False
    return all(char in emoji.EMOJI_DATA for char in stripped)

In [54]:
emoji_only_df_train = df_train[df_train['text'].apply(is_only_emoji)].copy()
emoji_only_df_train

Unnamed: 0,labels,text
2549,Positive,👌🏻
6623,Positive,👍🏼
7735,Neutral,👍
10315,Positive,😊
13231,Positive,👍🏻
14118,Positive,💘
16529,Positive,👌
27153,Neutral,👍🏿
28473,Positive,👍👍👍👍👍👍👍
32662,Positive,😍


# Kiểm tra các từ viết tắt hoặc tiếng lóng (Slang)

In [55]:
english_vocab = set(w.lower() for w in nltk_words.words())

all_words = []

for text in df_train['text']:
    words_in_text = str(text).lower().split()
    all_words.extend(words_in_text)

tokens = [word for word in all_words if word.isalpha()]

word_freq = Counter(tokens)

In [56]:
suspected_slang = [
    word for word in word_freq
    if word not in english_vocab and word_freq[word] > 10
]

suspected_slang = sorted(suspected_slang, key=lambda x: -word_freq[x])
print(len(suspected_slang))

1291


In [57]:
for word in suspected_slang[:2000]:
        print(f"{word}: {word_freq[word]}")

shoes: 22123
feet: 7145
has: 4236
looks: 2663
sandals: 2312
toes: 2217
fits: 2127
purchased: 2121
larger: 1941
expected: 1921
straps: 1856
months: 1847
loved: 1842
started: 1602
pairs: 1588
wanted: 1573
looked: 1568
loves: 1511
years: 1493
seems: 1391
hours: 1364
returning: 1340
makes: 1337
reviews: 1317
buying: 1251
ones: 1167
feels: 1106
runs: 1087
liked: 1041
needed: 1016
stars: 997
weeks: 994
sunglasses: 993
socks: 992
wears: 970
arrived: 951
having: 884
lenses: 873
hoping: 871
heels: 830
wider: 826
seemed: 791
lasted: 763
laces: 755
using: 744
flops: 732
ordering: 706
blisters: 679
velcro: 647
paid: 606
slippers: 601
says: 589
compliments: 589
insoles: 565
things: 556
others: 541
sneakers: 537
noticed: 528
asics: 524
hiking: 510
compared: 503
owned: 469
crocs: 444
minutes: 437
online: 416
gets: 407
pleased: 405
held: 393
ripped: 384
walked: 380
footbed: 380
expecting: 370
problems: 370
inserts: 360
surprised: 358
clarks: 353
likes: 353
issues: 350
okay: 349
stretched: 344
nike: 33

# Định nghĩa các dictionaries cho ánh xạ

## Mapping cho các từ ngữ thô tục

In [58]:
profanity_map = {
    r'\bf[\W_]*[u@*#!]+[\W_]*c[\W_]*k[\w]*\b': 'fuck',
    r'\bf[\W_]*[@]+[\W_]*c[\W_]*h[\w]*\b': 'fuck',    
    r'\bs[\W_]*h[\W_]*[i1!*]+[\W_]*t[\w]*\b': 'shit',
    r'\bb[\W_]*[i1l!*@/]?[\W_]*t[\W_]*c[\W_]*h[\w]*\b': 'bitch',
    r'\ba[\W_]*s[\W_]*s[\W_]*h[\W_]*o[\W_]*l[\W_]*e[\w]*\b': 'asshole',
    r'\bp[\W_]*i[\W_]*s[\W_]*s[\w]*\b': 'piss',
    r'\b[a@]+[\W_]*[s$]+[\W_]*[s$]+[\w]*\b': 'ass',
    r'\bc[\W_]*r[\W_]*[@]+[\W_]*p[\w]*\b': 'crap',
    r'\bb[\W_]*[i1l!*@/]+[\W_]*t[\W_]*c[\W_]*h[\W_]*e[\W_]*s[\w]*\b': 'bitch',
}

## Mapping cho các emoji

In [59]:
emoji_adjective_map = {
    # Positive emotions
    "👍": "satisfied", "👍🏻": "satisfied", "👍🏼": "satisfied", "👍🏽": "satisfied", "👍🏾": "satisfied", "👍🏿": "satisfied",
    "😊": "happy", "😃": "joyful", "😀": "cheerful", "😄": "delighted",
    "😍": "in_love", "😘": "affectionate", "💘": "romantic",
    "👌": "perfect", "👌🏻": "perfect", "👌🏼": "perfect", "👌🏾": "perfect",
    "💯": "excellent", "🙌🏾": "excited", "😆": 'excited',

    # Negative emotions
    "👎": "disappointed", "👎🏻": "disappointed",
    "😔": "sad", "😞": "discouraged",
}

## Mapping cho các từ viết tắt, tiếng lóng

In [60]:
slang_dict = {
    "bc": "because",
    "ive": "i have",
    "lol": "laughing out loud",
    "lil": "little",
    "cuz": "because",
    "uv": "ultraviolet",
    "nyc": "new york city",
    "ur": "your",
    "luv": "love",
    "idk": "i do not know",
    "iam": "i am",
    "gf": "girlfriend",
    "ny": "new york",
    "omg": "oh my god",
    "fav": "favorite",
    "wtf": "what the fuck",
    "br": "but really",
    "ck": "check",
    "dr": "doctor",
    "btw": "by the way",
    "f@ch": "fuck",
    # Additional
    "ft": "feet",
    "qc": "quality control",
    "hrs": "hours",
    "vs": "versus",
    "sz": "size",
    "def": "definitely",
    "info": "information",
    "xl": "extra large",
    "ww": "wide width",
    "pros": "advantages",
    "cons": "disadvantages",
    
    # "wth": "what the hell", "wth": "with" # cái này t không biết làm sao để phân loại tốt 

    "dont": "do not", "don't": "do not",
    "doesnt": "does not", "doesn't": "does not",
    "didnt": "did not", "didn't": "did not",
    "cant": "cannot", "can't": "cannot",
    "couldnt": "could not", "couldn't": "could not",
    "shouldnt": "should not", "shouldn't": "should not",
    "wouldnt": "would not", "wouldn't": "would not",
    "wasnt": "was not", "wasn't": "was not",
    "isnt": "is not", "isn't": "is not",
    "arent": "are not", "aren't": "are not",
    "wont": "will not", "won't": "will not",
    "havent": "have not", "haven't": "have not",
    "hasnt": "has not", "hasn't": "has not",
    "hadnt": "had not", "hadn't": "had not",
    "neednt": "need not", "needn't": "need not",
    "mightnt": "might not", "mightn't": "might not",
    "mustnt": "must not", "mustn't": "must not",
    "shant": "shall not", "shan't": "shall not",
    "there's": "there is", "theres": "there is",
    "that's": "that is", "thats": "that is",
    
    "im": "i am", "i'm": "i am",
    "youre": "you are", "you're": "you are",
    "hes": "he is", "he's": "he is",
    "shes": "she is", "she's": "she is",
    "theyre": "they are", "they're": "they are",
    "its": "it is", "it's": "it is",
    "we're": "we are", "were": "we are",
    "thats": "that is", "that's": "that is",
    "whos": "who is", "who's": "who is",
}

# Định nghĩa các hàm cho các bước tiền xử lý

## Hàm sửa lỗi chính tả

In [61]:
ignore_words = {'crap', 'crappy'}

def correct_spelling(text):
    corrected_words = []
    for word in text.split():
        if word in ignore_words:
            corrected_words.append(word)  # giữ nguyên
        else:
            corrected_words.append(str(TextBlob(word).correct()))
    return " ".join(corrected_words)

## Hàm chuẩn hóa các dấu nháy về cùng một dạng

* Chuyển các dấu ’, `, ‘ về cùng một dấu là '

In [62]:
def normalize_quotes(text):
    return text.replace("’", "'").replace("`", "'").replace("‘", "'")

## Hàm chỉnh sửa giá trị phân số về thập phân

* Trong các đánh giá về giày thường có chỉ số về size, và đánh giá thường để chỉ số 1/2 thay vì 0.5
* Hàm này dùng để chuyển các giá trị phân số về giá trị thập phân

In [63]:
def convert_fractional_sizes(text):
    text = re.sub(r'(\d+)\s*1/2', lambda m: str(float(m.group(1)) + 0.5), text)
    text = re.sub(r'\b1/2\b', '0.5', text)
    return text

## Hàm loại bỏ các thành phần chưa xử lý từ HTML

* Các giá trị vừa gồm số và kí tự như @#34;... không có giá trị cho phân tích sẽ bị loại bỏ
* Các thẻ của HTML cũng được loại bỏ $\text{<br/>}$, $\text{<p>}$

In [64]:
def remove_html_tags_and_entities(text):
    text = re.sub(r'<.*?>', ' ', text)
    text = re.sub(r'&#\d{2,4};', '', text)
    return text

## Hàm loại bỏ các chuỗi dài vừa có số vừa có kí tự

* Xóa các chuỗi dài, chứa cả số và kí tự
* Các chuỗi này thường là mã video, mã sản phẩm

In [65]:
def remove_custom_tags(text):
    return re.sub(r'\[\[.*?\]\]', '', text, flags=re.IGNORECASE)

## Hàm chuyển thành chữ thường và phân tách

In [66]:
def lowercase_and_strip(text):
    return text.lower().strip()

## Hàm hiển thị các từ ngữ thô tục thay vì các dấu *** 

In [67]:
def uncensor_profanity(text):
    for pattern, replacement in profanity_map.items():
        text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)
    return text

## Hàm thay thế emoji bằng các tính từ tương ứng

In [68]:
def replace_emoji(text):
    for emoji, adjective in emoji_adjective_map.items():
        text = text.replace(emoji, f" {adjective} ")
    return re.sub(r'\s+', ' ', text).strip()

## Hàm xóa các email

In [69]:
def remove_emails(text):
    return re.sub(r'\b[\w\.-]+@[\w\.-]+\.\w{2,4}\b', '', text)

## Hàm chuẩn hóa năm

* 1 yr old -> 1 year old
* yr -> year

In [70]:
def normalize_age(text):
    text = re.sub(r'(\d+)\s*yr[\.\- ]?old', r'\1 year old', text, flags=re.IGNORECASE)
    return re.sub(r'\byr\b', 'year', text)

## Hàm thay thế các từ viết tắt, từ lóng

In [71]:
def expand_slang_fuzzy(text):
    words = re.findall(r"\b\w+'\w+|\w+\b", text.lower())
    new_words = []

    for i, word in enumerate(words):
        if word == "u":
            if i > 0 and words[i - 1] == "the":
                new_words.append("u")
            else:
                new_words.append("you")
        elif word in slang_dict:
            new_words.append(slang_dict[word])
        else:
            # Fuzzy match if not in dictionary
            close_matches = difflib.get_close_matches(word, slang_dict.keys(), n=1, cutoff=0.8)
            if close_matches:
                new_words.append(slang_dict[close_matches[0]])
            else:
                new_words.append(word)

    return " ".join(new_words)

## Hàm lọc và giữ lại các kí tự là chữ, số và dấu chấm câu

In [72]:
def keep_only_text_numbers_dots(text):
    return re.sub(r"[^a-zA-Z0-9\.]", " ", text)

## Hàm chuẩn hóa khoảng trắng, xóa khoảng trắng và nối lại các từ tạo thành câu gọn gàng

In [73]:
def clean_whitespace(text):
    return " ".join(text.split())

## Hàm giảm các chữ lặp nhiều trong từ

In [74]:
def reduce_repeated_chars(text):
    return re.sub(r'(.)\1{2,}+', r'\1', text) # nếu chữ nào từ 3 lần lặp trở lên thì sẽ giảm xuống còn 1

## Hàm loại bỏ mã tracking / chuỗi khả nghi (dài >6, chứa cả số và chữ, không có khoảng trắng)

In [75]:
def remove_long_word(text):
    return re.sub(r'\b(?=\w*[a-zA-Z])(?=\w*\d)\w{7,}\b', '', text)

In [76]:
text = "this it is the thanking 1zrx2711yw12180891 hello gentlemen have not a problem with a package that is sent me it is with the direction i was not i am to send me this new direction please was not hello gentlemen have not a problem with speakers that is i bought my intesan much and would not like to call up and send it is to me this new direction mimi 4406 n w 74 ave mimi florida 33166"
print(remove_long_word(text))

this it is the thanking  hello gentlemen have not a problem with a package that is sent me it is with the direction i was not i am to send me this new direction please was not hello gentlemen have not a problem with speakers that is i bought my intesan much and would not like to call up and send it is to me this new direction mimi 4406 n w 74 ave mimi florida 33166


## Hàm loại bỏ số điện thoại

In [77]:
def remove_phone(text):
    return re.sub(r'(\+?\d{1,2}\s?)?(\(?\d{3}\)?[\s\-\.]?)?\d{3}[\s\-\.]?\d{4}\b', '', text)

# Module tổng tiền xử lý

In [None]:
def clean_text(text):
    text = html.unescape(text)

    text = normalize_quotes(text)

    text = remove_custom_tags(text)

    text = lowercase_and_strip(text)

    text = uncensor_profanity(text)

    text = expand_slang_fuzzy(text)

    text = remove_html_tags_and_entities(text)

    text = convert_fractional_sizes(text) 

    text = remove_emails(text)

    text = normalize_age(text)
    
    text = replace_emoji(text)

    text = reduce_repeated_chars(text)

    text = remove_long_word(text)

    text = remove_phone(text)

    text = keep_only_text_numbers_dots(text)

    text = clean_whitespace(text)
    
    text = correct_spelling(text)

    return text

In [79]:
df_train['cleaned_text'] = df_train['text'].progress_apply(clean_text)

  0%|          | 0/85076 [00:00<?, ?it/s]

100%|██████████| 85076/85076 [2:54:44<00:00,  8.11it/s]   


In [80]:
df_validation['cleaned_text'] = df_validation['text'].progress_apply(clean_text)

100%|██████████| 9738/9738 [15:53<00:00, 10.21it/s]  


In [81]:
df_train = df_train.drop(columns=['text'])
df_validation = df_validation.drop(columns= ['text'])

In [82]:
pd.DataFrame(df_train).to_csv('after_clean_train.csv', index= False)
pd.DataFrame(df_validation).to_csv('after_clean_test.csv', index= False)

# Tìm kiếm các từ có độ dài bất thường, lớn hơn threshold

## Threshold = 18

In [103]:
def contains_long_word(text, threshold=12):
    if pd.isna(text):
        return False
    # Tách các từ, kiểm tra độ dài
    return any(len(word) > threshold for word in text.split())

In [104]:
df_long_words = df_train[df_train['cleaned_text'].apply(lambda x: contains_long_word(x, threshold=18))]

In [105]:
df_long_words[['cleaned_text']].head(50)

Unnamed: 0,cleaned_text
53566,i saw this specific pair of balcony women s adhesion 8 running shoe advertised on facebook two weeks ago i had been searching for a running shoe that is had arch support and heel adding because i have not a heel spur that is just started bothering me anyway my mother in law got them for me for my with birthday i love how they are look and especially love how they are feel on my feet myfirstpairofsauconyshoes


### Sửa mẫu có độ dài token lớn hơn 18

* Index: 53566
* Sample trước khi chỉnh sửa: i saw this specific pair of balcony women s adhesion 8 running shoe advertised on facebook two weeks ago i had been searching for a running shoe that is had arch support and heel adding because i have not a heel spur that is just started bothering me anyway my mother in law got them for me for my with birthday i love how they are look and especially love how they are feel on my feet myfirstpairofsauconyshoes
* Sample sau khi chỉnh sửa: i saw this specific pair of balcony women s adhesion 8 running shoe advertised on facebook two weeks ago i had been searching for a running shoe that is had arch support and heel adding because i have not a heel spur that is just started bothering me anyway my mother in law got them for me for my with birthday i love how they are look and especially love how they are feel on my feet my first pair of saucony shoes

In [111]:
index_to_fix = 53566
df_train.at[index_to_fix, 'cleaned_text'] = df_train.at[index_to_fix, 'cleaned_text'].replace(
    'myfirstpairofsauconyshoes', 'my first pair of saucony shoes'
)

In [112]:
df_train.loc[index_to_fix]

labels                                                                                                                                                                                                                                                                                                                                                                                                                                  Positive
cleaned_text    i saw this specific pair of balcony women s adhesion 8 running shoe advertised on facebook two weeks ago i had been searching for a running shoe that is had arch support and heel adding because i have not a heel spur that is just started bothering me anyway my mother in law got them for me for my with birthday i love how they are look and especially love how they are feel on my feet my first pair of saucony shoes
Name: 53566, dtype: object

## Threshold = 15

In [98]:
df_long_words = df_train[df_train['cleaned_text'].apply(lambda x: contains_long_word(x, threshold=15))]

In [None]:
index_to_fix_15 = []

In [99]:
df_long_words

Unnamed: 0,labels,cleaned_text
140,Negative,i love rainbow but these are not them i have not purchased authentic rainbow in the past and the log on these does not not match nor does not the material the log in these pictures does not not even match the ones i received on top of that is i received a very aggressive email from the seller saying that is i am a liar and i should not getmyfactsstraight i am sorry but that is completely inappropriate if i wanted knock off i would not buy them in china town for 5 not on amazon we are i expect quality items only
1093,Negative,this she is are totally disproportionate
2866,Negative,i ordered a w the box said w the boots said 9 no w and they are are actually narrowed than a normal feet so very disappointed with that is bit i am not sure if the boot just does not show the w or if the wrong pair we are put in the box other than being narrow and the extra wide calf opening no longer being extra wide they are are a nice looking boot so mostly the feet and the misrepresentation it is my have not
4997,Positive,i really really did not was not to buy these at full price but i finally broke down and i am glad i plunged i have ended up wearing these several times a week and they are supercute and supercomfortable and they are pretty enough to pass for dress in my world i also got a pair in black at half price but the red ones make me so much happier but really but really i walk about i have miles per day and have not widest feet with middling arches i usually wear a size 10 medium in non dans she is 41 in lanskoy
5497,Positive,i bought these to replace a similar pair i had bought from costo for 26 00 that is we are lost these are indistinguishable form the lost pair comfortable and an excellent value
8684,Positive,i used the shoe dog on roadrunnersports to find the pair of running she is that is would not be best for me and then purchased them from amazon they are suggested buying these 1 2 size larger glad i did they are do run a little tight i have been wearing them everyday for a week and they are very comfortable i am not a runner they are just my everyday shoe
10057,Negative,this was the worst shoe i cannot even walk few step also so hard and too narrow though i wear 7 still it is seems like no 5 it is completlydifferent from the picture
20110,Negative,i could not begin to get my 9 d foot in this 9 d shoe probably because they are we are made in china an adult chinese 9 d foot it is apparentlysomewhat smaller that is the the adult us 9 d foot and the shoe was believe it is or not about a half size too long it is looked like a chinese gun boat never again
20271,Neutral,these boots look fantastic but as the saying goes you get what you pay for i tried on a different make of boot that is looked identical but cost 350 and they are feet so well and felt amazing on my feet i cannot already tell the left boot it is different than the right and has a spot that is it is rubbing into the ankle i am a tall lean guy but the tops of the boots are very wide even if you had calves like schwarzenegger they are would not be too big i tried wearing them with straight leg parts not jeans and they are bulge out making the leg look disproportionate all this being said i ll just have not to wear the right parts and only wear them for a few hours at a time no line dancing and if i follow that is routine i ll get my 140 out of them
20777,Positive,helped with my plantarfasciitis good bit it is it is also comfortable in glimmer styled she is too


### Các cụm cần sửa

* Index: 140  | getmyfactsstraight -> get my facts straight 
* Index: 4997 | supercute -> super cute, supercomfortable -> super comfortable
* Index: 10057| completlydifferent -> completely different
* Index: 20110| apparentlysomewhat -> apparently somewhat
* Index: 20777| plantarfasciitis -> plantar fasciitis
* Index: 34777| uncomfortability -> uncomfortableness
* Index: 62935| http wide width amazon com up bag details red bag m is i have utf sin isamazonfulfilled 0 scab -> Delete
* Index: 69713| itemswerenotshoes -> items were not shoes
* Index: 71246| notificacionescps -> notifications
* Index: 75824| quitecomfortable -> quite comfortable
* Index: 81603| consumerwarranty -> consumer warranty

In [117]:
replacements = {
    140: "getmyfactsstraight -> get my facts straight",
    4997: "supercute -> super cute, supercomfortable -> super comfortable",
    10057: "completlydifferent -> completely different",
    20110: "apparentlysomewhat -> apparently somewhat",
    20777: "plantarfasciitis -> plantar fasciitis",
    34777: "uncomfortability -> uncomfortableness",
    62935: "http wide width amazon com up bag details red bag m is i have utf sin isamazonfulfilled 0 scab -> Delete",
    69713: "itemswerenotshoes -> items were not shoes",
    71246: "notificacionescps -> notifications",
    75824: "quitecomfortable -> quite comfortable",
    81603: "consumerwarranty -> consumer warranty"
}

In [118]:
def apply_replacement(df, replacements):
    for idx, repl in replacements.items():
        if pd.isna(df.at[idx, 'cleaned_text']):
            continue  # bỏ qua nếu dòng đó rỗng

        pairs = [s.strip() for s in repl.split(',')]
        text = df.at[idx, 'cleaned_text']
        for pair in pairs:
            if '->' in pair:
                old, new = [s.strip() for s in pair.split('->')]
                if new.lower() == 'delete':
                    text = text.replace(old, '')  # Xóa cụm từ
                else:
                    text = text.replace(old, new)
        df.at[idx, 'cleaned_text'] = text

    return df

In [119]:
df_train = apply_replacement(df_train, replacements)

In [120]:
df_long_words = df_train[df_train['cleaned_text'].apply(lambda x: contains_long_word(x, threshold=15))]
df_long_words

Unnamed: 0,labels,cleaned_text
1093,Negative,this she is are totally disproportionate
2866,Negative,i ordered a w the box said w the boots said 9 no w and they are are actually narrowed than a normal feet so very disappointed with that is bit i am not sure if the boot just does not show the w or if the wrong pair we are put in the box other than being narrow and the extra wide calf opening no longer being extra wide they are are a nice looking boot so mostly the feet and the misrepresentation it is my have not
5497,Positive,i bought these to replace a similar pair i had bought from costo for 26 00 that is we are lost these are indistinguishable form the lost pair comfortable and an excellent value
8684,Positive,i used the shoe dog on roadrunnersports to find the pair of running she is that is would not be best for me and then purchased them from amazon they are suggested buying these 1 2 size larger glad i did they are do run a little tight i have been wearing them everyday for a week and they are very comfortable i am not a runner they are just my everyday shoe
20271,Neutral,these boots look fantastic but as the saying goes you get what you pay for i tried on a different make of boot that is looked identical but cost 350 and they are feet so well and felt amazing on my feet i cannot already tell the left boot it is different than the right and has a spot that is it is rubbing into the ankle i am a tall lean guy but the tops of the boots are very wide even if you had calves like schwarzenegger they are would not be too big i tried wearing them with straight leg parts not jeans and they are bulge out making the leg look disproportionate all this being said i ll just have not to wear the right parts and only wear them for a few hours at a time no line dancing and if i follow that is routine i ll get my 140 out of them
23244,Neutral,i have not tried out these boots for two months nonconsecutively these boots are good but not great the first thing it is that is they are it is lots of coverage material with these boots but i have not not walked in heavy rain yet the flaps that is connect the young might not experience problems with folding so adjust them when you wear the boots the boots have not damage resistant parts in the front and back of the boot they are are not air circulation friendly they are do not provide good arch support this needs to be reiterated even if you use custom involves they are only i have a foam sole and nothing more they are it is no boot stabilization either the boot it is walker friendly as the sole curves with you are steps the top hooks might not move not this it is not a big issue these factors make the boot good but not great i would not recommend buying a hard shoe sole such as power step if arch support it is needed please let me know if this was helpful
23368,Neutral,not true feet and it is was a little disproportionate both she is look like they are are going a little to the left overall conan color looks great without capered or modern feet slack or parts
24281,Neutral,i liked how spacious it is it is and how compartmentalized it is it is but i was not so fond of the blue color so i gave it is to my mon who is absolutely loved the color amount of space and all the pockets
24536,Negative,the biggest problem i had was working out what the codes for fitting width actually meant why are they are so incomprehensible
30826,Positive,this product changed my life i have not a lot of shallow scars a few boxer scars and a lot of hyperpigmentation from old scars i have not tried dermaneedling for two years and did not see much change and was going try laser resurfacing it is but sounded too risky and expensive since i did not was not to damage my the good parts of my skin i did not think chemical feels would not make a huge difference so i never thought much of it is until i stumbled upon reading about different feels online i bought and used the glycolic acid 40 two weeks earlier and that is did not do anything but make my face more smooth the next day i decided to try purchase the tea 20 and and see if it is would not make a difference i was resistant after reading stories of people having bad experiences with tea feels but i finally got up the nerve to try it is i have not asia skin and read i should not be more cautious which it is why i stuck with 20 and nothing higher it is was not bad at all compared to the burn with the glycolic acid 40 for some reason maybe it is propped my skin but really but really i applied it is after meticulously cleaning my face with diluted rubbing alcohol to make the alcohol fumes more tolerable i then applied this product for about 3 minutes and only got to really light crossing and did not continue with another layer since for my first time i wanted to be on the safe side my skin got really tight feeling but was not red the next morning it is did not really look at all different than before i did the feel since i only had one week before work started i did not have not much time to wait it is out a week or so and see the results which i felt weren t going to be dramatic because i did not really get a good frost i decided to apply the feel again the next afternoon i researches and was convinced it is was ok since i know my skin well and knew i could not handle it is not everyone cannot handle this so this it is not suggested i left it is on for the full 5 minutes and it is burned more than the previous night i also frosted morethan the previous night but not completely white and could not see my scars turning a little red so i neutralised and rinsed with water surprisingly the second time it is did not feel as tight i applied aquaphor kept it is moist for at least 8 hours at time for a week while it is peeled and did not have not new york discomfort my skin turned darker during the healing process but looked fine otherwise the fun part was to see it is feel and see the fresh skin underneath but really but really the edges of my shallow scars have not softened and the hyperpigmentation it is still they are but has lightened in some areas my skin looks so much more refreshed and softer this it is the first time in the past fifteen years that is i feel confident not wearing make i am only using some tinted sunscreen as a moisturizer and concealed in two or three spots i plan to do this feel in another two months cannot wait to the see the results the scars have not completely disappeared but it is gave me a second chance and most of all taught me to not pick at my face so i do not ruin my results so happy with the results so far life changes


## Thực hiện tương tự trên tập test

In [124]:
df_long_words = df_validation[df_validation['cleaned_text'].apply(lambda x: contains_long_word(x, threshold=15))]
df_long_words

Unnamed: 0,labels,cleaned_text
362,Negative,not worth the headache of working with shoegistic i would not avoid them in the future and beware of their unprofessionalism
5170,Negative,my first pair came from macysamazontfitcwas awesome i purchased this pair on a daily deal from amazon these she is do feet narrow which i like since i have not narrow feet but the she is i purchased from amazon are defective the right shoe it is overlay tight and has a knot inside the shoe that is it is not able to be removed this it is the third pair of defective she is that is i have not purchased from amazon definitely another return
7904,Positive,they are really decent for the price not super top end nor would not you expect it is to be for the price but cheaper than anything on discountdancesupply and certainly cheaper than anything in person if you need not a basic class shoe that is does not run way too small and has free shipping this it is solid


In [125]:
index_to_fix = 5170

df_validation.at[5170, 'cleaned_text'] = df_validation.at[5170, 'cleaned_text'].replace(
    'macysamazontfitcwas', 'macys amazon fit was'
)

In [126]:
df_validation.loc[5170]

labels                                                                                                                                                                                                                                                                                                                                                                                                                                                           Negative
cleaned_text    my first pair came from macys amazon fit was awesome i purchased this pair on a daily deal from amazon these she is do feet narrow which i like since i have not narrow feet but the she is i purchased from amazon are defective the right shoe it is overlay tight and has a knot inside the shoe that is it is not able to be removed this it is the third pair of defective she is that is i have not purchased from amazon definitely another return
Name: 5170, dtype: object

# Loại bỏ các stopword theo đề xuất của nhóm

In [137]:
custom_stopwords = {
    # Đại từ cá nhân và sở hữu
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves',
    'you', 'your', 'yours', 'yourself', 'yourselves',
    'he', 'him', 'his', 'himself',
    'she', 'her', 'hers', 'herself',
    'it', 'its', 'itself',
    'they', 'them', 'their', 'theirs', 'themselves',

    # Đại từ nghi vấn
    'what', 'which', 'who', 'whom', 'whose',

    # Mạo từ, giới từ
    'a', 'an', 'the', 'of', 'in', 'on', 'at', 'to', 'for', 'from', 'by',
    'with', 'about', 'as', 'into', 'through', 'over', 'under', 'above', 'below',
    'between', 'during', 'before', 'after', 'up', 'out', 

    # Từ nối, trạng từ không mang cảm xúc
    'and', 'or', 'but', 'if', 'while',
    'also', 'either', 'both', 'each', 'every', 'some', 'few', 'any',
    'this', 'that', 'these', 'those', 

    # Trợ động từ trung tính
    'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
    'do', 'does', 'did', 'doing',
    'have', 'has', 'had', 'having',
    'can', 'could', 'should', 'would', 'may', 'might', 'will',

    # Từ phụ trợ không mang giá trị nội dung
    'just', 'like', 'get', 'got', 'use', 'used', 'using',
    'still', 'ever',
    
    # Một số từ phụ thường lặp nhiều lần
    'one', 'two', 'three', 'also', 'etc'
}


In [138]:
def remove_custom_stopwords(text, stopwords_set):
    return ' '.join([word for word in text.split() if word.lower() not in stopwords_set])

In [139]:
df_train['no_stopwords'] = df_train['cleaned_text'].apply(lambda x: remove_custom_stopwords(x, custom_stopwords))
df_validation['no_stopwords'] = df_validation['cleaned_text'].apply(lambda x: remove_custom_stopwords(x, custom_stopwords))

In [140]:
df_train

Unnamed: 0,labels,cleaned_text,no_stopwords
0,Positive,good shoe for office work they are will stuff very easy so be aware,good shoe office work stuff very easy so aware
1,Negative,i have not had the patrick ii wedge in black for about 1 year wore them regularly in season when i saw the patrick at a good price in navy from pm i purchased them because i thought they are would not feet just like my patrick his i was wrong and paid the price with return shipping that is pm does not pay but really but really the cross webster says that is cross are not suppose to feet like other scandals they are are suppose to be loose thus more comfortable i normally wear an 8 1 2 so have not now tried both an 8 a 9 in the patrick shoe i have not an 8 in the patrick ii the patrick 9 swims on my feet they are would not be a hazard to walk around in the size 8 it is my left foot which it is my wider foot but it is too narrow on my right foot when i placed the she is sole to sole i did notice a slight difference in the width which apparently my foot notices too i cannot only conclude a manufacturing defect but it is it is this shoe specifically or the form for this shoe there is have not written the shoe it is narrow but really but really consequently if you have not a wider foot order the patrick ii instead of this one if you are a 1 2 size order down not up,not patrick ii wedge black 1 year wore regularly season when saw patrick good price navy pm purchased because thought not feet patrick wrong paid price return shipping pm not pay really really cross webster says cross not suppose feet other scandals suppose loose thus more comfortable normally wear 8 1 2 so not now tried 8 9 patrick shoe not 8 patrick ii patrick 9 swims feet not hazard walk around size 8 left foot wider foot too narrow right foot when placed sole sole notice slight difference width apparently foot notices too cannot only conclude manufacturing defect shoe specifically form shoe there not written shoe narrow really really consequently not wider foot order patrick ii instead 1 2 size order down not
2,Negative,width not right and size too small if width had been just little wider and ordered size larger would not have not been good loved the shoe look,width not right size too small width little wider ordered size larger not not good loved shoe look
3,Negative,i received these she is and they are weren t the same as the picture described them they are we are a different color when i tried to return them the shipping was not paid for so i had to pay 20 for shipping a was not of time and money i do not recommend anyone to buy from thesmartbuy,received weren t same picture described different color when tried return shipping not paid so pay 20 shipping not time money not recommend anyone buy thesmartbuy
4,Neutral,they are began to split alone the mesh material after a month but loved the shoe and the feel of it is,began split alone mesh material month loved shoe feel
...,...,...,...
85071,Neutral,i wear a size 7 in all my she is but this one was too large hurt the back of my feet because they are we are too big,wear size 7 all too large hurt back feet because too big
85072,Positive,love the sunglasses love the look love the polarized lenses love the price but really however the lenses have not popped out twice once from a 5 foot fall from my head on a carpeted floor another sitting in the case in my grm bag i remained customer support and never received a response would not have not been a 5 star but loses a star for poor customer service,love sunglasses love look love polarized lenses love price really however lenses not popped twice once 5 foot fall head carpeted floor another sitting case grm bag remained customer support never received response not not 5 star loses star poor customer service
85073,Negative,we are comfortable the st time but seem to get tighter everytime i wore them red many compliments on them but cannot no longer wear them,comfortable st time seem tighter everytime wore red many compliments cannot no longer wear
85074,Negative,hurts my feet like wearing razor blades the material it is too scratch and i cannot even wear them even tried with those on but they are are awful i wore them to work one day i stay at a desk mostly and could not even tolerate wearing them sitting down they are are attractive but such,hurts feet wearing razor blades material too scratch cannot even wear even tried awful wore work day stay desk mostly not even tolerate wearing sitting down attractive such


In [141]:
df_train.to_csv('after_clean_train.csv', index=False)
df_validation.to_csv('after_clean_test.csv', index=False)