### Sentiment Analysis - Version 02 (TFIDF-VADER)

+ Data source: MyVIB1, MyVIB2 in Android and iOS

Data Processing:
+ Vietnamese: correct typo by looking up the dictionary
+ Vietnamese: count the negative sentiment word in review sentence
+ Vietnamese to English: Google translate
+ English:    clean english text data (remove stopwords, etc.)

Feature engineer:
+ TF-IDF : get 1000 features
+ VADER  : get vader compound score
+ Negative Word: get potential mark of negative sentiment 


TRAIN:
+ Feature: TF-IDF, VADER, NEGATIVE_WORD | LABEL (RATING<4|NEGATIVE_WORD>0)




In [25]:
import json
import pandas as pd
import numpy as np

from tqdm import tqdm
from string import punctuation
import seaborn as sns
import matplotlib.pyplot as plt
from itertools import groupby
import sys

import regex as re
from deep_translator import GoogleTranslator

from collections import Counter

import textacy.preprocessing.normalize as tprep
from textacy.preprocessing.remove import accents
import string

import nltk
from nltk.corpus import opinion_lexicon,wordnet,stopwords
from nltk.tokenize import word_tokenize, WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer
import os
from datetime import date
import datetime
import time
import shutil
import os

import enchant
dict_english = enchant.Dict("en_US")

#from TopicModeling import *
import pickle

# Android
from google_play_scraper import Sort, reviews, app, reviews_all

# iOS
from app_store_scraper import AppStore

In [26]:
# included code
import sys
sys.path.append('./mylib')
import global_materials
from load import *
from sentiment_process import *

stopwords = global_materials.stopwords

#### FOLDER STRUCTURE

In [27]:
today = date.today()
TRAIN_ROOT = './data/train'

if not os.path.exists("model"):
    os.mkdir("model")
if not os.path.exists("data"):
    os.mkdir("data")
if not os.path.exists(TRAIN_ROOT):
    os.mkdir(TRAIN_ROOT)
if not os.path.exists(TRAIN_ROOT+"/translate_"+str(today)):
    os.mkdir(TRAIN_ROOT+"/translate_"+str(today))

#### DATA COLLECTION
**1) ANDROID**

In [6]:

rw_android_src_dict = {'vib1':['com.vn.vib.mobileapp','vi','vn'],
                       'vib2':['com.vib.myvib2','vi','vn'],
                       'tcb':['vn.com.techcombank.bb.app','vi','vn'],
                       'tpb':['com.tpb.mb.gprsandroid','vi','vn'],
                       'bid':['vn.com.bidv.ibank','vi','vn'],
                       'vcb':['com.VCB','vi','vn'],
                       'mb':['com.mbcorp','vi','vn']
                       }
#----------------------------------------------------------------------------------------------------------
# get Android review
column = ['reviewId','userName','userImage','content','score','thumbsUpCount','reviewCreatedVersion','at','replyContent','repliedAt','bankapp']
df_android_rw = pd.DataFrame(columns = column)

get_review_in = ['vib1','vib2'] #rw_android_src_dict.keys()

for bank_name in get_review_in:
    Andrw_df = get_rwdata_android(rw_android_src_dict, bank_name, save_path=TRAIN_ROOT)   
    Andrw_df['bankapp'] = bank_name
    df_android_rw = pd.concat([df_android_rw, Andrw_df])
    time.sleep(30)
    
df_android_rw.reset_index(drop=True, inplace=True)
df_android_rw.to_csv(TRAIN_ROOT+'/1_AndRaw_'+str(today)+'vibx.csv', index=False)

print(df_android_rw.shape)
df_android_rw.head(1)

#----------------------------------------------------------------------------------------------------------
df_android_rw['month'] = pd.DatetimeIndex(df_android_rw['at']).month
df_android_rw['year'] = pd.DatetimeIndex(df_android_rw['at']).year
df_android_rw['m_y'] = df_android_rw['month'].astype(str)+df_android_rw['year'].astype(str)

#JUST RAW
df_android_used = df_android_rw[['userName','content','at','m_y','score']]
df_android_used.rename(columns = {'userName':'username','content':'review','at':'date','score':'rating'}, inplace=True)

# Convert timetamps to string
df_android_used['date'] = df_android_used['date'].apply(lambda x: x.strftime("%Y-%m-%d %H:%M:%S"))
print(df_android_used.shape)
df_android_used.head(1)


vib1: (4677, 10)
vib2: (1675, 10)
(6352, 11)
(6352, 5)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_android_used.rename(columns = {'userName':'username','content':'review','at':'date','score':'rating'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_android_used['date'] = df_android_used['date'].apply(lambda x: x.strftime("%Y-%m-%d %H:%M:%S"))


Unnamed: 0,username,review,date,m_y,rating
0,Hoài Ly,Vô trang 𝐕𝐚𝐲𝐓𝐨𝐭𝐍𝐡𝐚𝐭.𝐂𝟎𝐌 để vay 0% và nhận 790k,2022-09-22 10:33:27,92022,1


**2) iOS**

In [9]:

rw_ios_src_dict = {'vib1':['MyVIB','vn','949371011'],
                   'vib2':['MyVIB 2.0','vn','1626624790'],
                   'tcb':['Techcombank Mobile','vn','1548623362'],
                   'tpb':['TPBank Mobile','vn','450464147'],
                   'bid':['SmartBanking','vn','1061867449'],
                   'vcb':['Vietcombank','vn','561433133'],
                   'mb':['BIZ MBBANK','vn','1424005630']
                  }

#----------------------------------------------------------------------------------------------------------
column = ['title','userName','isEdited','review','date','rating','developerResponse','bankapp']
df_ios_rw = pd.DataFrame(columns = column)

get_review_in = ['vib1','vib2'] #rw_ios_src_dict.keys()

for bankapp in get_review_in:
    iosrw_df = get_rwdata_ios(rw_ios_src_dict, bankapp, save_path=TRAIN_ROOT)  
    iosrw_df['bankapp'] = bankapp
    df_ios_rw = pd.concat([df_ios_rw, iosrw_df])  
    time.sleep(30)
    
today = date.today()
df_ios_rw.reset_index(drop=True, inplace=True)
df_ios_rw.to_csv(TRAIN_ROOT+'/1_iOSRaw_'+str(today)+'vibx.csv', index=False)

#----------------------------------------------------------------------------------------------------------
df_ios_rw['month'] = pd.DatetimeIndex(df_ios_rw['date']).month
df_ios_rw['year'] = pd.DatetimeIndex(df_ios_rw['date']).year
df_ios_rw['m_y'] = df_ios_rw['month'].astype(str)+df_ios_rw['year'].astype(str)

#JUST RAW
df_ios_used = df_ios_rw[['userName','review','date','m_y','rating']]
df_ios_used.rename(columns = {'userName':'username'}, inplace=True)

# Convert timetamps to string
df_ios_used['date'] = df_ios_used['date'].apply(lambda x: x.strftime("%Y-%m-%d %H:%M:%S"))

# Note the latest datetime of review API download to file
file_name = TRAIN_ROOT+'/lastest_review_query.txt'
with open(file_name, 'w') as file:
    file.write(datetime.datetime.now().strftime("%d-%b-%Y (%H:%M:%S.%f)"))   
    
with open(file_name, 'r') as file:
    lastest_review_query = datetime.datetime.strptime(file.read(), "%d-%b-%Y (%H:%M:%S.%f)")
    
lastest_review_query

2022-09-23 13:54:56,027 [INFO] Base - Initialised: AppStore('vn', 'myvib', 949371011)
2022-09-23 13:54:56,029 [INFO] Base - Ready to fetch reviews from: https://apps.apple.com/vn/app/myvib/id949371011


vib1: AppStore(country='vn', app_name='MyVIB', app_id='949371011')


2022-09-23 13:55:01,243 [INFO] Base - [id:949371011] Fetched 280 reviews (280 fetched in total)
2022-09-23 13:55:06,914 [INFO] Base - [id:949371011] Fetched 560 reviews (560 fetched in total)
2022-09-23 13:55:12,347 [INFO] Base - [id:949371011] Fetched 860 reviews (860 fetched in total)
2022-09-23 13:55:18,307 [INFO] Base - [id:949371011] Fetched 1120 reviews (1120 fetched in total)
2022-09-23 13:55:20,612 [INFO] Base - [id:949371011] Fetched 1247 reviews (1247 fetched in total)


vib1: (1247, 7)


2022-09-23 13:55:51,292 [INFO] Base - Initialised: AppStore('vn', 'myvib-2-0', 1626624790)
2022-09-23 13:55:51,292 [INFO] Base - Ready to fetch reviews from: https://apps.apple.com/vn/app/myvib-2-0/id1626624790


vib2: AppStore(country='vn', app_name='MyVIB 2.0', app_id='1626624790')


2022-09-23 13:55:56,444 [INFO] Base - [id:1626624790] Fetched 280 reviews (280 fetched in total)
2022-09-23 13:55:58,896 [INFO] Base - [id:1626624790] Fetched 392 reviews (392 fetched in total)


vib2: (392, 7)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ios_used.rename(columns = {'userName':'username'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ios_used['date'] = df_ios_used['date'].apply(lambda x: x.strftime("%Y-%m-%d %H:%M:%S"))


datetime.datetime(2022, 9, 23, 13, 56, 28, 980783)

**3) Combine Android and iOS data**

In [10]:
df_used = pd.concat([df_android_used, df_ios_used])
df_used['duplicate'] = df_used.duplicated(subset=['username', 'm_y', 'review'], keep='first')
print('BEFORE: df_used.shape = ',df_used.shape[0])
df_used = df_used[df_used['duplicate']==False] #remove duplicate
df_used.reset_index(inplace=True, drop=True)
print('AFTER DUPLICATE PROCESSING: df_used.shape = ',df_used.shape[0])
# process for date column
df_used['date'] = df_used['date'].apply(lambda x: ('20'+x) if (len(x)==17) else x ) # add '20' before datetime string for right format yyyy-mm-dd 
df_used['date'] = df_used['date'].apply(lambda x: datetime.datetime.strptime(x,'%Y-%m-%d %H:%M:%S'))

df_used.to_csv(TRAIN_ROOT+'/2_dataApp_combine.csv',index=False,sep='\t')


BEFORE: df_used.shape =  7991
AFTER DUPLICATE PROCESSING: df_used.shape =  7742


#### Text Processing 
**1) Vietnamese - Typo correction**

In [11]:
# remove null review
df_used = df_used[~(df_used['review'].isnull()|df_used['review'].isna())]

# typo correction by looking up dict
df_used['review_adj'] = df_used['review'].apply(lambda x: review_vietnamese_adj(x.lower()))
df_used['ymdhms'] = pd.to_datetime(df_used['date']).dt.strftime("%Y_%m_%d_%H_%M_%S")
df_used.to_csv(TRAIN_ROOT+'/3_dataApp_typo_corrected_'+str(today)+'.csv',encoding='utf-16',index=False,sep='\t')

# Count Vietnamese negative words in the sentence
df_used['negative_word'] = df_used['review_adj'].apply(lambda x: review_vietnamese_match_wordlist(x, global_materials.negative_word))
df_used['positive_word'] = df_used['review_adj'].apply(lambda x: review_vietnamese_match_wordlist(x, global_materials.positive_word))

#df_used = pd.read_csv('df_used_vn_correct_bf_trans.csv',index_col=False,sep='\t')
print(df_used.shape)
df_used.head(1)


vô trang 𝐕𝐚𝐲𝐓𝐨𝐭𝐍𝐡𝐚𝐭 để vay và nhận
vô ktien vn để nhận
vô trang 𝐕𝐚𝐲𝐓𝐨𝐭𝐍𝐡𝐚𝐭 để vay và nhận
app sử dụng rất khó chịu thanh toán hóa đơn tháng nào cũng gặp vấn đề nhấn thanh toán quay liên tục xong báo lỗi trong khi wifi căng đét
vô địa chỉ ktien vn để kiếm triệu nha
tôi mở thẻ tín dụng mà vào chụp ảnh chứng minh nhân dân mãi không được quá kém
vào trang 𝐕𝐚𝐲𝐓𝐨𝐭𝐍𝐡𝐚𝐭 vay và free
lưu ý vô trang 𝑽𝒂𝒚𝑻𝒐𝒕𝑵𝒉𝒂𝒕 để vay và nhận
vào đúng trang 𝐕𝐚𝐲𝐓𝐨𝐭𝐍𝐡𝐚𝐭 vay và free
chán quá không tải được app về
vô ktien vn để vay và nhận free
tốt nhất cho các bạn trẻ đang làm việc trong môi trường
muốn nhận triệu thì truy cập ktien vn
cty may thiên sơn xã tam đa huyện phù cừ tỉnh hưng yên dữ lương không chi trả cho tôi
tài khoản số dư không thấy mở thẻ
ứng dụng kém chức năng chuyển tiền quá chậm bây giờ mà vẫn còn lệnh chuyển nhanh hay chậm mới ghê trong khi các ngân hàng khác đã chuyển nhanh hết chán phèo gọi tổng đài để xúc tiến giao dịch nhanh lên dùm thì bảo không được bó tay thời đại rồi nếu không thay đổi thì

Unnamed: 0,username,review,date,m_y,rating,duplicate,review_adj,ymdhms,negative_word
0,Hoài Ly,Vô trang 𝐕𝐚𝐲𝐓𝐨𝐭𝐍𝐡𝐚𝐭.𝐂𝟎𝐌 để vay 0% và nhận 790k,2022-09-22 10:33:27,92022,1,False,vô trang 𝐕𝐚𝐲𝐓𝐨𝐭𝐍𝐡𝐚𝐭 để vay và nhận,2022_09_22_10_33_27,0


**2) Vietnamese to English translation**

In [14]:
# Run Google Translation
filenameF = TRAIN_ROOT+'/4_dataApp_EN_'+str(today)+'.csv'

response = False
cur_i = 0
#df_trans = pd.DataFrame()
if not os.path.exists(filenameF):
    while (response==False):
        cur_i, response = google_translate(df_used,TRAIN_ROOT+'/translate_'+str(today), cur_i, 1000)
        
    df_train = pd.DataFrame()
    for fl in os.listdir(TRAIN_ROOT+'/translate_'+str(today)): 
        df_temp = pd.read_csv(TRAIN_ROOT+'/translate_'+str(today)+'/'+fl, index_col=False, sep='\t')
        df_train = pd.concat([df_train,df_temp])
    print(df_train.shape)
    df_train.to_csv(filenameF, index=False, sep='\t')
    
else:
    print(f'{filenameF} is Existed! Please check if you want to retranslate.')
#del df_used

Content: out length:7740, in length: 7740


**3) English - Clean text**

In [28]:
df_train = pd.read_csv(TRAIN_ROOT+'/4_dataApp_EN_2022-09-23.csv', index_col=False, sep='\t')

# df_train = pd.DataFrame()
# for fl in os.listdir(temp_dir): 
#     df_temp = pd.read_csv(temp_dir+'/'+fl, index_col=False, sep='\t')
#     df_train = pd.concat([df_train,df_temp])
# print(df_train.shape)

# Clean English 
df_train = review_english_clean(df_train)


In [29]:
df_train['negative_word'] = df_train['review_adj'].apply(lambda x: review_vietnamese_match_wordlist(x, global_materials.negative_word))
df_train['positive_word'] = df_train['review_adj'].apply(lambda x: review_vietnamese_match_wordlist(x, global_materials.positive_word))

#### EMBEDDING METHODS

**1) VADER**

In [30]:
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sentiment = SentimentIntensityAnalyzer()
#sentiment.polarity_scores('I love VIB')

df_train['vader_pos']      = df_train['review_clean'].apply(lambda x: sentiment.polarity_scores(x)['pos'])
df_train['vader_neg']      = df_train['review_clean'].apply(lambda x: sentiment.polarity_scores(x)['neg'])
df_train['vader_neu']      = df_train['review_clean'].apply(lambda x: sentiment.polarity_scores(x)['neu'])
df_train['vader_compound'] = df_train['review_clean'].apply(lambda x: sentiment.polarity_scores(x)['compound'])

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\nhan.ngothanh1\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [34]:
# Labeling
df_train['label_rating']  = df_train['rating'].apply(lambda x: 1 if x<=3 else 0)
df_train['flag_posword']  = df_train['positive_word'].apply(lambda x: 1 if x>=1 else 0)

df_train['label_negword_pre'] = df_train['negative_word'].apply(lambda x: 1 if x>=1 else 0)
df_train['label_negword'] = np.where(df_train['label_negword_pre']>df_train['flag_posword'],1,0)

df_train['label_vader']   = df_train['vader_compound'].apply(lambda x: 1 if x<-0.05 else 0)

df_train['label_sum']     = df_train[['label_rating','label_negword','label_vader']].sum(axis=1)
df_train['label']         = df_train['label_sum'].apply(lambda x: 1 if x>=2 else 0)

print(df_train.shape)
df_train[['review','review_adj','review_en','review_clean','label_negword_pre','flag_posword','label_rating','label_negword','label_vader','label_sum','label']].to_csv(TRAIN_ROOT+'/6_dataApp_lable_check.csv', encoding='utf-16', sep='\t', index=False)

(6414, 23)


In [35]:
df_train.to_csv(TRAIN_ROOT+'/5_dataApp_EN_clean_vader.csv', index=False)
df_train.shape

(6414, 23)

In [36]:
df_train = pd.read_csv(TRAIN_ROOT+'/5_dataApp_EN_clean_vader.csv', index_col=False)
#df_train[['review','label_rating','label_negword','label_vader','label']].to_csv(TRAIN_ROOT+'/6_dataApp_lable_check.csv', encoding='utf-16', sep='\t', index=False)

In [44]:
df_train['label'].value_counts()

0    3882
1    2532
Name: label, dtype: int64

#### 2. TF-IDF Vertorizer

In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = df_train['review_clean'].values
#vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7, min_df=3, max_features = 1000, ngram_range=(1,5))
vectorizer = TfidfVectorizer(stop_words=stopwords, max_df=0.7, min_df=3, max_features = 1000, ngram_range=(1,5))
X = vectorizer.fit_transform(corpus)
#print(vectorizer.get_feature_names_out())
#print(X)

pickle.dump(vectorizer, open("./model/TFIDF_VECTORIZER_app.pk", "wb"))

col_list = [i.lstrip() for i in vectorizer.get_feature_names_out()]
col_list = np.array(col_list)

df_tfidf = pd.DataFrame(data=X.toarray(), columns = col_list)
df_tfidf['vader_compound'] = df_train['vader_compound'].values
df_tfidf['vader_compound'] = df_tfidf['vader_compound'].apply(lambda x: 1 if x<-0.05 else 0) # VADER < 0.05: negative (instead of -0.05)
df_tfidf['label_negword'] = df_train['label_negword'].values
#df_tfidf = df_tfidf[(df_tfidf['vader_compound']<-0.05)|(df_tfidf['vader_compound']>0.05)]
#df_tfidf['vader_compound'] = df_tfidf['vader_compound'].apply(lambda x: 0 if x<-0.05 else 1 if x>0.05 else 0.5 )

df_tfidf['label'] = df_train['label'].values
df_tfidf.head()


Unnamed: 0,able,accept,access,accord,account,account balance,account lock,account log,account not,account number,...,write,wrong,wrong password,year,yesterday,yet,young,vader_compound,label_negword,label
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,1


#### MODELING 
**0) Train Test dataset**

In [39]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

FEA_COL = [col for col in df_tfidf.columns.values if col not in ['label']]
print(f'FEA_COL:{len(FEA_COL)}')

X = df_tfidf[FEA_COL].values
y = df_tfidf['label'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

FEA_COL:1002


**1) Multinormial Naive Bayes**

In [41]:
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB()
clf = clf.fit(X_train, y_train)

# Result of model with tfidf
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))
confusion_matrix(y_test, y_pred)

# Fit with full data and save model
clf = clf.fit(X, y)
pickle.dump(clf, open("./model/SENTIMENT_MODEL_mnb_app.sav", "wb"))

              precision    recall  f1-score   support

           0       0.99      0.91      0.95       777
           1       0.87      0.98      0.92       506

    accuracy                           0.94      1283
   macro avg       0.93      0.94      0.94      1283
weighted avg       0.94      0.94      0.94      1283



**2) Random Forest**

In [42]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(max_depth=8, min_samples_split = 10, min_samples_leaf=2, max_features='sqrt',random_state=42)
clf.fit(X_train, y_train)

# Result of model with tfidf
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))
confusion_matrix(y_test, y_pred)

# Fit with full data and save model
clf = clf.fit(X, y)
pickle.dump(clf, open("./model/SENTIMENT_MODEL_rf_app.sav", "wb"))


              precision    recall  f1-score   support

           0       0.92      0.95      0.94       777
           1       0.92      0.88      0.90       506

    accuracy                           0.92      1283
   macro avg       0.92      0.91      0.92      1283
weighted avg       0.92      0.92      0.92      1283



**3) SVC**

In [43]:
from sklearn.svm import SVC

#clf = SVC(C=0.3, gamma='auto', random_state=42)
clf = SVC(C=3, gamma=1, random_state=42)
clf.fit(X_train, y_train)

# Result of model with tfidf
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))
confusion_matrix(y_test, y_pred)

# Fit with full data and save model
clf = clf.fit(X, y)
pickle.dump(clf, open("./model/SENTIMENT_MODEL_svc_app.sav", "wb"))


              precision    recall  f1-score   support

           0       0.98      0.95      0.96       777
           1       0.93      0.96      0.94       506

    accuracy                           0.96      1283
   macro avg       0.95      0.96      0.95      1283
weighted avg       0.96      0.96      0.96      1283



##############################################################
#### ---> RESULT: Choosing SVC Model
##############################################################

#### END ------------------------------------------------------------------------------------------------

#### PREDICTION 

In [27]:
# Load REVIEW OF VIB and CHECK SENTIMENT
today = date.today()
column = ['reviewId','userName','userImage','content','score','thumbsUpCount','reviewCreatedVersion','at','replyContent','repliedAt','bankapp']
df_android_rw_pred = pd.DataFrame(columns = column)

for bank_name in ['vib1','vib2']:
    Andrw_df = get_rwdata_android(rw_android_src_dict, bank_name)   
    Andrw_df['bankapp'] = bank_name
    df_android_rw_pred = pd.concat([df_android_rw_pred, Andrw_df])
    time.sleep(30)
    
df_android_rw_pred.reset_index(drop=True, inplace=True)

reviews = df_android_rw_pred['content'].values

vib1: (4676, 10)
vib2: (1673, 10)


In [30]:
reviews = df_android_rw_pred['content'].values
reviews = reviews[100:130] 

In [22]:
reviews = ['Tôi thấy app của VIB cũng được. Xài cũng ok, nói chung ổn.',
           'dịch vụ chán, lỗi hoài',
           'cũng tạm ah, hôm được hôm không',
           'có cải thiện đó, chạy mượt hơn',
           'hài lòng với dịch vụ ở đây',
           'không thấy gì để chê với phí của vib',
           'tệ quá trời',
           'nhân viên chậm lắm',
           'hỗ trợ lâu không hài lòng',
           'càng nhiều kỷ niệm thì chia tay càng đau đớn',
           'tiếc cho họ nhìn mặt có nét phu thê quá trời',
           'tao còn mới thấy page nào đó bê cả câu bà của chúng tôi luôn cơ đấy',
           'chúng đã sai khi năm xưa mỉa mai chị đấy',
           'hóa ra năm xưa em đã trách lầm chị rồi',
           'bỏ tiền tỷ đầu tư này nọ xong giờ làm xấu mặt']

#reviews = ['hỗ trợ lâu không hài lòng']

df_pred = pd.DataFrame(data=reviews, columns=['review'])
# Vietnamese processing
df_pred['review_adj'] = df_pred['review'].apply(lambda x: review_vietnamese_adj(x).lower())

df_pred['negative_word'] = df_pred['review_adj'].apply(lambda x: review_vietnamese_match_wordlist(x, global_materials.negative_word))
df_pred['label_negword'] = df_pred['negative_word'].apply(lambda x: 1 if x>0 else 0)
df_pred['positive_word'] = df_pred['review_adj'].apply(lambda x: review_vietnamese_match_wordlist(x, global_materials.positive_word))
df_pred['flag_posword']  = df_pred['positive_word'].apply(lambda x: 1 if x>0 else 0)
                                                          
# English translation
df_pred['review_en'] = df_pred['review_adj'].apply(lambda x: GoogleTranslator(source = 'vi', target = 'en').translate(x))
if 'username' not in df_pred.columns.values:
    df_pred['username'] = np.zeros(df_pred.shape[0])
    
df_pred = review_english_clean(df_pred)

df_pred['vader_compound'] = df_pred['review_clean'].apply(lambda x: sentiment.polarity_scores(x)['compound'])
#df_pred['vader_compound'] = df_pred['vader_compound'].apply(lambda x: 0 if x<-0.05 else 1 if x>0.05 else 0.5 )
df_pred['vader_compound'] = df_pred['vader_compound'].apply(lambda x: 1 if x<-0.05 else 0)

tfidf_vectorizer = pickle.load(open('./model/TFIDF_VECTORIZER_app.pk','rb'))
#tfidf_vectorizer.get_feature_names_out()

df_tfidfx = pd.DataFrame(data=tfidf_vectorizer.transform(df_pred['review_clean'].values).toarray(), columns=col_list)
df_tfidfx['vader_compound'] = df_pred['vader_compound'].values
df_tfidfx['label_negword'] = df_pred['label_negword'].values
df_tfidfx['flag_posword'] = df_pred['flag_posword'].values
#df_pred = pd.concat([df_pred, df_tfidfx],axis=1,ignore_index=True)

# Load model
sentiment_MODEL = pickle.load(open('./model/SENTIMENT_MODEL_svc_app.sav','rb'))

y_pred = sentiment_MODEL.predict(df_tfidfx[FEA_COL].values)
df_pred['sentiment'] = y_pred
df_pred[['review','review_en','review_clean','vader_compound','sentiment']]


Unnamed: 0,review,review_en,review_clean,vader_compound,sentiment
0,"Tôi thấy app của VIB cũng được. Xài cũng ok, n...",I think vib's app is also okay to use in general,think okay use general,0,0
1,"dịch vụ chán, lỗi hoài",boring service forever,bore service forever,1,1
2,"cũng tạm ah, hôm được hôm không",Good day too,good day,0,0
3,"có cải thiện đó, chạy mượt hơn","There is an improvement, it runs smoother",improvement run smoother,0,0
4,hài lòng với dịch vụ ở đây,Satisfied with the service here,satisfied service,0,0
5,không thấy gì để chê với phí của vib,I don't see anything to complain about with vi...,do not see anything complain fee,0,1
6,tệ quá trời,so bad,bad,1,1
7,nhân viên chậm lắm,very slow staff,slow staff,0,1
8,hỗ trợ lâu không hài lòng,unsatisfied long support,unsatisfied long support,0,1
9,càng nhiều kỷ niệm thì chia tay càng đau đớn,"The more memories, the more painful parting is",memory painful part,1,1


In [24]:
avg_sentiment_score = df_pred['sentiment'].sum()/df_pred.shape[0]
sentiment_score = -(avg_sentiment_score-0.5)
print(f"Neg/Pos ratio: {df_pred['sentiment'].sum()}/{df_pred.shape[0]-df_pred['sentiment'].sum()}")
print(f"Sentiment Score: {sentiment_score:.5f}")

Neg/Pos ratio: 8/7
Sentiment Score: -0.03333


##### END SENTIMENT PREDICTION
####################################################################################