## 1.Import Library

In [162]:
import pandas as pd
import matplotlib.pyplot as plt
import string
import re
from pyvi import ViTokenizer

## 2.Explore Data Analysis

In [163]:
df_train = pd.read_csv('./DataPhone/Train.csv')
df_test=pd.read_csv('./DataPhone/Test.csv')

In [164]:
df_train.columns

Index(['index', 'comment', 'n_star', 'date_time', 'label'], dtype='object')

In [165]:
df_train.head(5)

Unnamed: 0,index,comment,n_star,date_time,label
0,0,M·ªõi mua m√°y n√†y T·∫°i thegioididong th·ªët n·ªët c·∫£m...,5,2 tu·∫ßn tr∆∞·ªõc,{CAMERA#Positive};{FEATURES#Positive};{BATTERY...
1,1,Pin k√©m c√≤n l·∫°i mi·ªÖn ch√™ mua 8/3/2019 t√¨nh tr·∫°...,5,14/09/2019,{BATTERY#Negative};{GENERAL#Positive};{OTHERS};
2,2,Sao l√∫c g·ªçi ƒëi·ªán tho·∫°i m√†n h√¨nh b·ªã ch·∫•m nh·ªè nh...,3,17/08/2020,{FEATURES#Negative};
3,3,"M·ªçi ng∆∞·ªùi c·∫≠p nh·∫≠t ph·∫ßn m·ªÅm l·∫°i , n√≥ s·∫Ω b·ªõt t·ªë...",3,29/02/2020,{FEATURES#Negative};{BATTERY#Neutral};{GENERAL...
4,4,"M·ªõi mua S√†i ƒë∆∞·ª£c 1 th√°ng th·∫•y pin r·∫•t tr√¢u, S√†...",5,4/6/2020,{BATTERY#Positive};{PERFORMANCE#Positive};{SER...


### 2.1.Overview of phone dataset

> The dataset consists of 11,122 comments, including of four features: 

>comment: Commentary content.
 n_star: The user evaluates the smartphone's star.

>data_time: The date and time the comment was posted.

>label: Label of comment.

>All samples are in text format. No tokenization has been applied. Users of this dataset are free to use whatever sentence representation they choose.

### 2.2 Structure of dataset

In [166]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7786 entries, 0 to 7785
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   index      7786 non-null   int64 
 1   comment    7786 non-null   object
 2   n_star     7786 non-null   int64 
 3   date_time  7786 non-null   object
 4   label      7786 non-null   object
dtypes: int64(2), object(3)
memory usage: 304.3+ KB


### 2.3 Summary of statistic

In [167]:
df_train.describe()

Unnamed: 0,index,n_star
count,7786.0,7786.0
mean,3892.5,3.706011
std,2247.768931,1.508531
min,0.0,1.0
25%,1946.25,3.0
50%,3892.5,4.0
75%,5838.75,5.0
max,7785.0,5.0


### 2.4. Preprocessing and Visualization of the dataset

In [168]:
print(df_train['comment'].values[95:98])

['ƒêi·ªán tho·∫°i kh√° t·ªët pin tr√¢u kh√° m∆∞·ª£t b·∫Øt wifi c·ª±c t·ªët ch·ªâ l√† th·ªânh tho·∫£ng m√°y b·ªã ƒë∆° ph·∫£i tho√°t ra v√†o l·∫°i v√† m√°y ko c·∫≠p nh·∫≠t l√™n miul 12'
 'L√∫c tr∆∞·ªõc nghe b·∫£o ƒëi·ªán tho·∫°i ch∆°i game n√≥ng, nh∆∞ng mua v·ªÅ chi·∫øn li√™n qu√¢n 3 4h li√™n t·ª•c th√¨ ch·ªâ th·∫•y ·∫•m kh√¥ng n√≥ng nh∆∞ l·ªùi ƒë·ªìn, nh·ªØng th·ª© kh√°c th√¨ v≈©ng ch·∫≥ng c√≥ g√¨ ƒë·ªÉ n√≥i v√¨ qu√° ngon r·ªìi'
 '1. M√°y n·∫øu kh√¥ng ch∆°i game, l∆∞·ªõt web..., √≠t s·ª≠ d·ª•ng th√¨ gi·ªØ Pin cao nh·∫•t ƒë∆∞·ª£c kho·∫£ng 1 ng√†y 16 ti·∫øng ( T√≠nh lu√¥n c·∫£ l√∫c ng·ªß nh√© v√¨ khi ng·ªß th√¨ ch·∫≥ng ƒë·ªông g√¨ t·ªõi ) . Kh√¥ng ƒë∆∞·ª£c 2 ng√†y nh∆∞ mn n√≥i ƒë√¢u \n2. Nh·∫°c nghe c·ª© c√† gi·∫≠t c√† gi·∫≠t r·∫•t kh√≥ ch·ªãu lu√¥n\n3. Quay video b·ªã m·ªù ch√°n \nC√≤n l·∫°i okie üôÜ']


> Handle punctuation, handle whitespace, handle icons in strings

In [169]:
def remove_pucntuation(comment):
  # Create a translation table
  translator = str.maketrans('', '', string.punctuation)
  # Remove punctuation
  new_string = comment.translate(translator)
  # Remove redudant space and break sign
  new_string = re.sub('[\n ]+', ' ', new_string)
  # Remove emoji icon
  emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # chinese char
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642"
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # dingbats
        u"\u3030"
                           "]+", flags=re.UNICODE)
  new_string = re.sub(emoji_pattern, '', new_string)

  return new_string

>Read  stop words file 

In [170]:
def read_filestopwords():
    with open('./DataPhone/vietnamese-stopwords.txt', 'r', encoding='utf-8') as file:
        lines = file.readlines()
        words = [line.split('\n')[0] for line in lines]
    return words

> Remove stop words

In [171]:
def remove_stopword(comment):
  stop_words = read_filestopwords()
  filtered = [word for word in comment.split() if word not in stop_words]
  return ' '.join(filtered)

> Execute function

In [172]:
df_train['comment'] = df_train['comment'].apply(lambda x: x.lower())
df_train['comment'] = df_train['comment'].apply(remove_pucntuation)
df_train['comment'] = df_train['comment'].apply(remove_stopword)
df_test['comment'] = df_test['comment'].apply(lambda x: x.lower())
df_test['comment']= df_test['comment'].apply(remove_pucntuation)
df_test['comment']= df_test['comment'].apply(remove_stopword)

> Tokenizer 

In [173]:
df_train['comment'] = df_train['comment'].apply(lambda x: ViTokenizer.tokenize(x))
df_test['comment']= df_test['comment'].apply(lambda x: ViTokenizer.tokenize(x))

In [174]:
df_train['comment'].values[95:98]

array(['ƒëi·ªán_tho·∫°i kh√° t·ªët pin tr√¢u kh√° m∆∞·ª£t b·∫Øt wifi c·ª±c t·ªët th·ªânh_tho·∫£ng m√°y ƒë∆° tho√°t m√°y ko c·∫≠p_nh·∫≠t miul 12',
       'nghe b·∫£o ƒëi·ªán_tho·∫°i ch∆°i game n√≥ng mua v·ªÅ chi·∫øn li√™n_qu√¢n 3 4h li√™n_t·ª•c th·∫•y ·∫•m n√≥ng l·ªùi ƒë·ªìn th·ª© kh√°c v≈©ng ch·∫≥ng n√≥i qu√° ngon',
       '1 m√°y ch∆°i game l∆∞·ªõt web √≠t s·ª≠_d·ª•ng gi·ªØ pin cao nh·∫•t kho·∫£ng 1 ng√†y 16 ti·∫øng t√≠nh lu√¥n c·∫£ ng·ªß nh√© ng·ªß ch·∫≥ng ƒë·ªông t·ªõi 2 ng√†y mn n√≥i ƒë√¢u 2 nh·∫°c nghe c√†_gi·∫≠t c√†_gi·∫≠t kh√≥_ch·ªãu lu√¥n 3 quay video m·ªù ch√°n c√≤n okie'],
      dtype=object)

> Observing, we can see that there are 3 types of labels: neutral, positive and negative

In [175]:
df_train.head(5)

Unnamed: 0,index,comment,n_star,date_time,label
0,0,m·ªõi mua m√°y thegioididong th·ªët_n·ªët c·∫£m_th·∫•y ok...,5,2 tu·∫ßn tr∆∞·ªõc,{CAMERA#Positive};{FEATURES#Positive};{BATTERY...
1,1,pin k√©m c√≤n mi·ªÖn ch√™ mua 832019 t√¨nh_tr·∫°ng pin...,5,14/09/2019,{BATTERY#Negative};{GENERAL#Positive};{OTHERS};
2,2,sao g·ªçi ƒëi·ªán_tho·∫°i m√†n_h√¨nh ch·∫•m nh·ªè nh√°y g·∫ßn ...,3,17/08/2020,{FEATURES#Negative};
3,3,m·ªçi ng∆∞·ªùi c·∫≠p_nh·∫≠t ph·∫ßn_m·ªÅm n√≥ b·ªõt t·ªën pin m√¨n...,3,29/02/2020,{FEATURES#Negative};{BATTERY#Neutral};{GENERAL...
4,4,m·ªõi mua s√†i 1 th√°ng th·∫•y pin tr√¢u s√†i bao m∆∞·ª£t...,5,4/6/2020,{BATTERY#Positive};{PERFORMANCE#Positive};{SER...


> Function count how many characteristics belong to the labels positive, negative, neutral

In [176]:
def count_positive_labels(label):
    return label.count("Positive")
def count_neural_labels(label):
    return label.count("Neutral")
def count_negative_labels(label):
    return label.count("Negative")

> Execute function

In [177]:
df_train['positive_count']=df_train['label'].apply(count_positive_labels)
df_train['neutral_count']=df_train['label'].apply(count_neural_labels)
df_train['negative_count']=df_train['label'].apply(count_negative_labels)
df_test['positive_count']=df_test['label'].apply(count_positive_labels)
df_test['neutral_count']=df_test['label'].apply(count_neural_labels)
df_test['negative_count']=df_test['label'].apply(count_negative_labels)

> My labeling idea is that if there are many features that fall into positive then assign positive, if the most features fall into negative then assign negative, if negative is equal to neutral then assign negative, if negative is equal to positve is assigned positive, the remaining cases are assigned neutral

In [178]:
def assign_label(row):
    if row['positive_count'] > row['neutral_count'] and row['positive_count'] > row['negative_count']:
        return 'Positive'
    elif row['negative_count'] >row['neutral_count'] and row['negative_count'] > row['positive_count']:
        return 'Negative'
    elif row['negative_count'] == row['neutral_count'] :
        return 'Negative'
    elif row['neutral_count']== row ['positive_count']:
        return "Positive"
    else :
        return "Neutral"

>Execute function

In [179]:
df_train['label'] = df_train.apply(assign_label, axis=1)
df_test['label'] =df_test.apply(assign_label,axis=1)

In [180]:
df_train.head(8)

Unnamed: 0,index,comment,n_star,date_time,label,positive_count,neutral_count,negative_count
0,0,m·ªõi mua m√°y thegioididong th·ªët_n·ªët c·∫£m_th·∫•y ok...,5,2 tu·∫ßn tr∆∞·ªõc,Positive,6,0,0
1,1,pin k√©m c√≤n mi·ªÖn ch√™ mua 832019 t√¨nh_tr·∫°ng pin...,5,14/09/2019,Neutral,1,0,1
2,2,sao g·ªçi ƒëi·ªán_tho·∫°i m√†n_h√¨nh ch·∫•m nh·ªè nh√°y g·∫ßn ...,3,17/08/2020,Negative,0,0,1
3,3,m·ªçi ng∆∞·ªùi c·∫≠p_nh·∫≠t ph·∫ßn_m·ªÅm n√≥ b·ªõt t·ªën pin m√¨n...,3,29/02/2020,Neutral,0,2,1
4,4,m·ªõi mua s√†i 1 th√°ng th·∫•y pin tr√¢u s√†i bao m∆∞·ª£t...,5,4/6/2020,Positive,2,0,1
5,5,x√†i t·ªët m∆∞·ª£t pin tr√¢u b·∫°n ƒë·ªô s√°ng ƒë·ªß nh√¢n_vi√™n...,5,20/06/2019,Positive,3,2,0
6,6,m√¨nh m·ªõi x√†i 7 th√°ng xu·ªëng 7 pin ch·∫£ hi·ªÉu m√°y ...,1,1 tu·∫ßn tr∆∞·ªõc,Negative,0,0,1
7,7,h√¥m ng√†y 2362020 e th·∫ø_gi·ªõi di_ƒë·ªông mua dthoai...,2,23/06/2020,Negative,0,0,1


> 

> Remove unnecessary columns after labeling

In [181]:
list_remove=['positive_count','neutral_count','negative_count']
df_train.drop(list_remove,axis=1,inplace=True)
df_test.drop(list_remove,axis=1,inplace=True)

In [182]:
df_train.columns

Index(['index', 'comment', 'n_star', 'date_time', 'label'], dtype='object')

In [183]:
df_train.shape

(7786, 5)

>observed that there is asynchronous data in the datetime column

In [184]:
df_train.head(5)

Unnamed: 0,index,comment,n_star,date_time,label
0,0,m·ªõi mua m√°y thegioididong th·ªët_n·ªët c·∫£m_th·∫•y ok...,5,2 tu·∫ßn tr∆∞·ªõc,Positive
1,1,pin k√©m c√≤n mi·ªÖn ch√™ mua 832019 t√¨nh_tr·∫°ng pin...,5,14/09/2019,Neutral
2,2,sao g·ªçi ƒëi·ªán_tho·∫°i m√†n_h√¨nh ch·∫•m nh·ªè nh√°y g·∫ßn ...,3,17/08/2020,Negative
3,3,m·ªçi ng∆∞·ªùi c·∫≠p_nh·∫≠t ph·∫ßn_m·ªÅm n√≥ b·ªõt t·ªën pin m√¨n...,3,29/02/2020,Neutral
4,4,m·ªõi mua s√†i 1 th√°ng th·∫•y pin tr√¢u s√†i bao m∆∞·ª£t...,5,4/6/2020,Positive


> Function Check datetime

In [185]:
def is_valid_date(date_str):
    # Bi·ªÉu th·ª©c ch√≠nh quy ƒë·ªÉ ki·ªÉm tra ƒë·ªãnh d·∫°ng ng√†y th√°ng
    date_pattern = r'\d{1,2}/\d{1,2}/\d{4}'
    return bool(re.match(date_pattern, date_str))

> Execute function

In [186]:
valid_dates = df_train[df_train['date_time'].apply(is_valid_date)]
valid_dates_test=df_test[df_test['date_time'].apply(is_valid_date)]

> Observing we see that there are 6930 valid values

In [187]:
valid_dates.shape

(6930, 5)

In [188]:
valid_dates.head(5)

Unnamed: 0,index,comment,n_star,date_time,label
1,1,pin k√©m c√≤n mi·ªÖn ch√™ mua 832019 t√¨nh_tr·∫°ng pin...,5,14/09/2019,Neutral
2,2,sao g·ªçi ƒëi·ªán_tho·∫°i m√†n_h√¨nh ch·∫•m nh·ªè nh√°y g·∫ßn ...,3,17/08/2020,Negative
3,3,m·ªçi ng∆∞·ªùi c·∫≠p_nh·∫≠t ph·∫ßn_m·ªÅm n√≥ b·ªõt t·ªën pin m√¨n...,3,29/02/2020,Neutral
4,4,m·ªõi mua s√†i 1 th√°ng th·∫•y pin tr√¢u s√†i bao m∆∞·ª£t...,5,4/6/2020,Positive
5,5,x√†i t·ªët m∆∞·ª£t pin tr√¢u b·∫°n ƒë·ªô s√°ng ƒë·ªß nh√¢n_vi√™n...,5,20/06/2019,Positive


In [189]:
invalid_dates=df_train[~df_train['date_time'].apply(is_valid_date)]
invalid_dates_test=df_test[~df_test['date_time'].apply(is_valid_date)]

> There are 856 values in the datetime column out of a total of 6930 that have inconsistent formatting

In [190]:
invalid_dates.shape

(856, 5)

In [191]:
invalid_dates.head(5)

Unnamed: 0,index,comment,n_star,date_time,label
0,0,m·ªõi mua m√°y thegioididong th·ªët_n·ªët c·∫£m_th·∫•y ok...,5,2 tu·∫ßn tr∆∞·ªõc,Positive
6,6,m√¨nh m·ªõi x√†i 7 th√°ng xu·ªëng 7 pin ch·∫£ hi·ªÉu m√°y ...,1,1 tu·∫ßn tr∆∞·ªõc,Negative
11,11,kh√° ·ªïn trong t·∫ßm gi√° cam ƒë·∫πp s·∫°c nhanh m√†n_h√¨n...,5,3 tu·∫ßn tr∆∞·ªõc,Positive
14,14,m√¨nh mua dc 1 tu·∫ßn m√°y ph√°t tr·ª±c_ti·∫øp t·ª±_nhi√™n...,1,6 ng√†y tr∆∞·ªõc,Negative
18,18,d√πng ƒë∆° m√°y b·∫•m m√£i m·ªõi kh·ªüi_ƒë·ªông song n√≥ng ra...,1,5 ng√†y tr∆∞·ªõc,Negative


> Fill in the common value for that attribute

In [192]:
common_value=valid_dates['date_time'].mode()[0]
common_value_test=valid_dates_test['date_time'].mode()[0]
invalid_dates.loc[:, 'date_time'] = common_value
invalid_dates_test.loc[:,'date_time']=common_value

In [193]:
invalid_dates.head(5)

Unnamed: 0,index,comment,n_star,date_time,label
0,0,m·ªõi mua m√°y thegioididong th·ªët_n·ªët c·∫£m_th·∫•y ok...,5,21/04/2020,Positive
6,6,m√¨nh m·ªõi x√†i 7 th√°ng xu·ªëng 7 pin ch·∫£ hi·ªÉu m√°y ...,1,21/04/2020,Negative
11,11,kh√° ·ªïn trong t·∫ßm gi√° cam ƒë·∫πp s·∫°c nhanh m√†n_h√¨n...,5,21/04/2020,Positive
14,14,m√¨nh mua dc 1 tu·∫ßn m√°y ph√°t tr·ª±c_ti·∫øp t·ª±_nhi√™n...,1,21/04/2020,Negative
18,18,d√πng ƒë∆° m√°y b·∫•m m√£i m·ªõi kh·ªüi_ƒë·ªông song n√≥ng ra...,1,21/04/2020,Negative


> combine invalid_dates and valid_dates

In [194]:
df_train=pd.concat([valid_dates,invalid_dates])
df_test=pd.concat ([valid_dates_test,invalid_dates_test])

In [195]:
df_train.shape

(7786, 5)

In [196]:
df_train.head(5)

Unnamed: 0,index,comment,n_star,date_time,label
1,1,pin k√©m c√≤n mi·ªÖn ch√™ mua 832019 t√¨nh_tr·∫°ng pin...,5,14/09/2019,Neutral
2,2,sao g·ªçi ƒëi·ªán_tho·∫°i m√†n_h√¨nh ch·∫•m nh·ªè nh√°y g·∫ßn ...,3,17/08/2020,Negative
3,3,m·ªçi ng∆∞·ªùi c·∫≠p_nh·∫≠t ph·∫ßn_m·ªÅm n√≥ b·ªõt t·ªën pin m√¨n...,3,29/02/2020,Neutral
4,4,m·ªõi mua s√†i 1 th√°ng th·∫•y pin tr√¢u s√†i bao m∆∞·ª£t...,5,4/6/2020,Positive
5,5,x√†i t·ªët m∆∞·ª£t pin tr√¢u b·∫°n ƒë·ªô s√°ng ƒë·ªß nh√¢n_vi√™n...,5,20/06/2019,Positive


>Remove index column

In [197]:
df_train.drop("index",axis=1,inplace=True)
df_test.drop("index",axis=1,inplace=True)

In [198]:
df_train.shape

(7786, 4)

In [199]:
df_train.head(5)

Unnamed: 0,comment,n_star,date_time,label
1,pin k√©m c√≤n mi·ªÖn ch√™ mua 832019 t√¨nh_tr·∫°ng pin...,5,14/09/2019,Neutral
2,sao g·ªçi ƒëi·ªán_tho·∫°i m√†n_h√¨nh ch·∫•m nh·ªè nh√°y g·∫ßn ...,3,17/08/2020,Negative
3,m·ªçi ng∆∞·ªùi c·∫≠p_nh·∫≠t ph·∫ßn_m·ªÅm n√≥ b·ªõt t·ªën pin m√¨n...,3,29/02/2020,Neutral
4,m·ªõi mua s√†i 1 th√°ng th·∫•y pin tr√¢u s√†i bao m∆∞·ª£t...,5,4/6/2020,Positive
5,x√†i t·ªët m∆∞·ª£t pin tr√¢u b·∫°n ƒë·ªô s√°ng ƒë·ªß nh√¢n_vi√™n...,5,20/06/2019,Positive


In [200]:
df_train['comment'][0]

'm·ªõi mua m√°y thegioididong th·ªët_n·ªët c·∫£m_th·∫•y ok bin tr√¢u ch·ª•p ·∫£nh ƒë·∫πp loa nghe to b·∫Øt wf kh·ªèe_s√≥ng ·ªïn_ƒë·ªãnh gi√°_th√†nh t√∫i_ti·ªÅn nh√¢n_vi√™n t∆∞_v·∫•n nhi·ªát_t√¨nh'

> Dump file CSV after data processing

In [201]:
df_train.to_csv("./DataPhone/trainprocessed.csv", index=False)
df_test.to_csv("./DataPhone/testprocesssed.csv",index=False)