In [4]:
# 2 pyvi 
# Libraries
import pandas as pd 
import numpy as np
import pickle

# load training data
df_train=pd.read_csv("training_data_sentiment.csv")
train=df_train.copy()

# encode target column: N=0, Y=1
train["is_unsatisfied"]=train["is_unsatisfied"].replace({"Y": 1, "N":0})

# upsampling
from sklearn.utils import resample
train_majority=train[train["is_unsatisfied"]==0]
train_minority=train[train["is_unsatisfied"]==1]

train_minority_upsampled=resample(train_minority, replace=True, n_samples=len(train_majority)//2, random_state=1)
train_upsampled=pd.concat([train_majority,train_minority_upsampled])


# cleaning (stop words and special characters removal)
def  clean_text(df, text_field):
    import re
    file=open("vietnamese-stopwords.txt")
    stp=file.read().split("\n")
    pat = r'\b(?:{})\b'.format('|'.join(stp))
    for i in text_field:
        df[i] = df[i].apply(lambda elem: re.sub(r"[\|]", "", elem))  
        df[i] = df[i].str.replace(pat, '')
    return df
clean_text(train_upsampled, ["question", "answer"])

# POS tagging: extracting verb adjective and noun words
def pos(df, text_field, col):
    from pyvi import ViTokenizer, ViPosTagger
    m=[]
    for i in range(len(df)):
        q_pos=ViPosTagger.postagging(ViTokenizer.tokenize(df[text_field].iloc[i]))
        dict_q_pos=dict(zip(q_pos[0], q_pos[1]))
        lst=[x for x, y in dict_q_pos.items() if y in ["N", "Ny", "Np", "Nu", "Nc", "A", "V"]]
        l=[["".join(x) for x in lst]]
        m.append([" ".join(x) for x in l])
    return pd.DataFrame(m, columns=[col])
train_upsampled["q_pos"]=pos(train_upsampled,"question", "q_pos")
train_upsampled["a_pos"]=pos(train_upsampled,"answer", "a_pos")


# training 
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.feature_extraction.text import TfidfTransformer
# from sklearn.linear_model import SGDClassifier
# from sklearn.linear_model import LogisticRegression
# from sklearn.svm import SVC
from xgboost import XGBClassifier

# pipeline for vectorization and classification
pipeline_xgb = Pipeline([ 
    ('vect', TfidfVectorizer()),
    (('classifier', XGBClassifier()))
])

labels=train_upsampled['is_unsatisfied']
train_upsampled=train_upsampled.drop('is_unsatisfied', 1)

# train-test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_upsampled['q_pos']+train_upsampled['a_pos'], labels, random_state = 1)

score = 'roc_auc'
param = {"classifier__max_depth": list(range(1,20))}

pipeline_xgb.fit(X_train, y_train)
from sklearn.model_selection import RandomizedSearchCV
gsearch = RandomizedSearchCV(estimator =pipeline_xgb, param_distributions=param , scoring= score)
model=gsearch.fit(X_train, y_train)
pickle.dump(model, open('xgb_model.pckl', 'wb'))

# validation
xgb_model = pickle.load(open('xgb_model.pckl', 'rb'))
y_predict = xgb_model.predict(X_test)
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print(confusion_matrix(y_test,y_predict))
print(classification_report(y_test,y_predict))
print(accuracy_score(y_test, y_predict))


[[4532   11]
 [   0 2228]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4543
           1       1.00      1.00      1.00      2228

    accuracy                           1.00      6771
   macro avg       1.00      1.00      1.00      6771
weighted avg       1.00      1.00      1.00      6771

0.9983754246049328
