In [1]:
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import Perceptron

# Read data

In [2]:
import pandas as pd

def readData():
    """Load the train and test spreadsheets and return comments and labels.

    Reads sheet ``dataTrain`` of ``data/dataTrain.xlsx`` and sheet
    ``dataTest`` of ``data/dataTest.xlsx``, then pulls the ``comment`` and
    ``label`` columns out of each as plain Python lists.

    Returns:
        list: ``[comTrain, y_train, comTest, y_test]``.
    """
    sources = (
        ('data/dataTrain.xlsx', 'dataTrain'),
        ('data/dataTest.xlsx', 'dataTest'),
    )
    # Read both workbooks first, then extract columns in train/test order.
    frames = [pd.read_excel(path, sheet_name=sheet) for path, sheet in sources]
    columns = []
    for frame in frames:
        columns.append(frame['comment'].values.tolist())
        columns.append(frame['label'].values.tolist())
    return columns

In [3]:
import re
import pyvi
from pyvi import ViTokenizer 

def preprocess_reviews(reviews):
    """Normalise raw comments and tokenize them with ``ViTokenizer``.

    For every item in ``reviews`` (converted with ``str()``), in order:

    1. lower-case the text;
    2. delete the characters ``. ; : ' , " ( ) [ ]`` (``!`` and ``?`` are
       left in place);
    3. replace ``<br/><br/>`` sequences, ``-`` and ``/`` with a space;
    4. strip leading whitespace;
    5. run ``ViTokenizer.tokenize`` on the result.

    Args:
        reviews: iterable of comments; non-string items are stringified.

    Returns:
        list[str]: one processed string per input item, in input order.
    """
    # Raw strings: "\s", "\-", "\/" and "\[" are invalid escapes in a normal
    # string literal and trigger warnings on recent Python versions.
    REPLACE_NO_SPACE = re.compile(r"[.;:',\"()\[\]]")
    REPLACE_WITH_SPACE = re.compile(r"(<br\s*/><br\s*/>)|(\-)|(\/)")

    processed = []
    for line in reviews:
        text = REPLACE_NO_SPACE.sub("", str(line).lower())
        text = REPLACE_WITH_SPACE.sub(" ", text).lstrip()
        processed.append(ViTokenizer.tokenize(text))
    return processed


# Vectorization

In [4]:

def vectorizer(comTrain,comTest):
    """Encode both corpora as binary bag-of-words feature matrices.

    A ``CountVectorizer(binary=True)`` is fitted on ``comTrain`` only, and
    that same fitted vectorizer then transforms ``comTrain`` and ``comTest``.

    Returns:
        list: ``[X_train, X_test]``.
    """
    bow = CountVectorizer(binary=True)
    bow.fit(comTrain)
    return [bow.transform(corpus) for corpus in (comTrain, comTest)]

# Training with models

In [5]:
def SVM():
    """Fit a linear-kernel SVC on the training split and print test accuracy.

    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``. Both feature matrices are densified with ``.toarray()``
    before being handed to the classifier. Prints the score and returns
    ``None``.
    """
    clf = SVC(kernel='linear', C=1e5)
    clf.fit(X_train.toarray(), y_train)
    test_predictions = clf.predict(X_test.toarray())
    print ("Final Accuracy SVM: %s"% accuracy_score(y_test, test_predictions))

In [6]:
def multiNB():
    """Fit a MultinomialNB on the training split and print test accuracy.

    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``. Prints the score and returns ``None``.
    """
    clf = MultinomialNB()
    clf.fit(X_train, y_train)

    # Only the test-set score is reported, so the training set is not scored.
    y_pred_test = clf.predict(X_test)
    # accuracy_score takes (y_true, y_pred), matching the other model cells.
    acc_test = accuracy_score(y_test, y_pred_test)
    print('Final Accuracy multi Naive Bayes: ', acc_test)

In [7]:
def perceptron():
    """Fit a Perceptron on the training split and print test accuracy.

    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``. ``random_state=0`` is fixed for the estimator. Prints the
    score and returns ``None``.
    """
    clf = Perceptron(tol=1e-3, random_state=0)
    clf.fit(X_train, y_train)
    # Only the test-set score is reported, so the training set is not scored.
    acc_test = clf.score(X_test, y_test)
    print('Final Accuracy perceptron: ', acc_test)

In [8]:

def logistic():
    """Fit a LogisticRegression (C=0.5) and print its test accuracy.

    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``. Prints the score and returns ``None``.
    """
    # C=0.5 is one of the candidates (0.01, 0.05, 0.25, 0.5, 1) from an
    # earlier manual sweep over the regularisation strength.
    classifier = LogisticRegression(C=0.5).fit(X_train, y_train)
    test_accuracy = accuracy_score(y_test, classifier.predict(X_test))
    print ("Final Accuracy logistic: %s"% test_accuracy)

In [13]:
# Load the raw splits, preprocess the comment text of each split, then turn
# both corpora into feature matrices. These names are read as globals by the
# model functions and by predict().
comTrain, y_train, comTest, y_test = readData()
comTrain, comTest = preprocess_reviews(comTrain), preprocess_reviews(comTest)
X_train, X_test = vectorizer(comTrain, comTest)



In [14]:
# Train and score each classifier in turn; every function prints its own
# "Final Accuracy ..." line.
for run_model in (multiNB, logistic, perceptron, SVM):
    run_model()

Final Accuracy multi Naive Bayes:  0.8498727735368957




Final Accuracy logistic: 0.8829516539440203
Final Accuracy perceptron:  0.816793893129771
Final Accuracy SVM: 0.8498727735368957


# Predict a sentence

In [17]:
def multiNBPre(comment):
    """Train a MultinomialNB on the global training split and classify ``comment``.

    ``comment`` is handed straight to ``predict``; presumably it is an
    already-vectorized matrix like the one ``vectorizer`` returns (confirm
    against callers).

    Returns:
        The first element of the prediction array.
    """
    # NOTE(review): a new model is fitted on every call.
    model = MultinomialNB().fit(X_train, y_train)
    return model.predict(comment)[0]

def logisticPre(comment):
    """Train a LogisticRegression (C=0.5) on the global training split and classify ``comment``.

    ``comment`` is handed straight to the model's ``predict``; ``predict()``
    in this notebook passes it the vectorized form of a single comment.

    Returns:
        The first element of the prediction array.
    """
    # NOTE(review): a new model is fitted on every call.
    model = LogisticRegression(C=0.5).fit(X_train, y_train)
    return model.predict(comment)[0]

def predict(comment):
    """Classify one raw comment string with the logistic-regression model.

    The comment is first run through ``preprocess_reviews`` so it receives
    the same normalisation and ``ViTokenizer`` tokenization as the training
    corpus (``comTrain`` holds preprocessed text). Vectorizing the raw string
    would feed the model tokens produced by a different pipeline than the one
    the vocabulary was built from.

    Args:
        comment: the raw comment text.

    Returns:
        The predicted label for the comment (first element of the model's
        prediction array).
    """
    [cleaned] = preprocess_reviews([comment])
    # NOTE(review): vectorizer() refits on comTrain and logisticPre() retrains
    # the model on every call.
    _, comment_features = vectorizer(comTrain, [cleaned])
    return logisticPre(comment_features)
predict('máy tốt')



1

# Web API

In [22]:
from flask import Flask,url_for,request,redirect,render_template
# Flask application object; the @app.route decorators below register their
# view functions on it.
app = Flask(__name__)

class User:
    """A login account: numeric id, username and password.

    NOTE(review): the password is stored and compared as plain text.
    """

    # A second, zero-argument __init__ (declared without ``self``) used to
    # precede this one; Python keeps only the last definition, so it was dead
    # code and has been removed.
    def __init__(self, id, username, password):
        """Store the given ``id``, ``username`` and ``password`` on the instance."""
        self.id = id
        self.username = username
        self.password = password

    def __repr__(self):
        """Return ``<User: username>``; the password is not included."""
        return f'<User: {self.username}>'

# Account list consulted by the login view.
# NOTE(review): credentials are hard-coded here in plain text; move them to
# configuration or a secret store before sharing this notebook.
users = [
    User(id=1, username='admin1', password='000000'),
    User(id=2, username='admin2', password='123456'),
]


@app.route('/')
def welcome():
    """Redirect the site root to the login page."""
    # Resolve the URL from the endpoint name, as login() does for 'home',
    # instead of hard-coding the path.
    return redirect(url_for('login'))

@app.route('/home',methods=['GET','POST'])
def home():
    """Render the prediction page; on POST, classify the submitted comment.

    Template variables passed to ``home.html``:
        result: ``-1`` until a prediction has been made, otherwise the label
            returned by ``predict()``.
        error: empty unless the POSTed ``comment`` field was missing or blank.

    NOTE(review): this view does not check that the visitor has passed the
    login form.
    """
    label = -1
    err = ""
    if request.method == 'POST':
        # .get() avoids an automatic 400 when the field is absent; a blank
        # comment is reported to the user instead of being classified.
        comment = request.form.get('comment', '')
        if comment.strip():
            label = predict(comment)
        else:
            err = "Please enter a comment to classify."
    return render_template('home.html',result = label, error = err)

@app.route('/login',methods =['GET','POST'])
def login():
    """Show the login form and, on POST, check the submitted credentials.

    A matching username/password pair in ``users`` redirects to the ``home``
    view. Any other POST re-renders ``login.html`` with one generic error
    message, whether the username is unknown or the password is wrong, so the
    response does not reveal which usernames exist. (The old wrong-password
    message, "Account is not defined", was also misleading.)
    """
    error = ''
    if request.method == 'POST':
        # Missing fields fall through to the generic error rather than a 400.
        username = request.form.get('username', '')
        password = request.form.get('password', '')
        authenticated = any(
            user.username == username and user.password == password
            for user in users
        )
        if authenticated:
            return redirect(url_for('home'))
        error = ' The username and password you entered did not match our records. Please double-check and try again.'
    return render_template('login.html',error=error)
# Start Flask's built-in server on localhost:7500 when this code is executed
# directly. app.run() keeps serving until it is interrupted (CTRL+C), so
# nothing after this point runs while the server is up.
if __name__=='__main__':
    app.run(host='localhost', port=7500, debug=False)

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off


 * Running on http://localhost:7500/ (Press CTRL+C to quit)
127.0.0.1 - - [09/May/2020 18:00:36] "GET / HTTP/1.1" 302 -
127.0.0.1 - - [09/May/2020 18:00:36] "GET /login HTTP/1.1" 200 -
127.0.0.1 - - [09/May/2020 18:00:41] "POST /login HTTP/1.1" 302 -
127.0.0.1 - - [09/May/2020 18:00:41] "GET /home HTTP/1.1" 200 -
127.0.0.1 - - [09/May/2020 18:00:50] "POST /home HTTP/1.1" 200 -
127.0.0.1 - - [09/May/2020 18:00:57] "POST /home HTTP/1.1" 200 -
