In [92]:
import re
import os
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [93]:
data_dir = 'data/aclImdb'
data = {}
for split in ["train", "test"]:
    data[split] = []
    for sentiment in ["neg", "pos"]:
        score = 1 if sentiment == "pos" else 0
        path = os.path.join(data_dir, split, sentiment)
        file_names = os.listdir(path)
        for f_name in file_names:
            with open(os.path.join(path, f_name), "r", encoding="utf-8") as f:
                review = f.read()
                data[split].append([review, score])

# Shuffle the training data
np.random.seed(42)
np.random.shuffle(data["train"])
data["train"] = pd.DataFrame(data["train"], columns=["text", "sentiment"])

# Shuffle the test data
np.random.seed(42)
np.random.shuffle(data["test"])
data["test"] = pd.DataFrame(data["test"], columns=["text", "sentiment"])

In [94]:
data["train"].head()

Unnamed: 0,text,sentiment
0,This movie is another Christian propaganda fil...,0
1,A woman who hates cats (Alice Krige) and her s...,1
2,"Beast Wars is a show that is over-hyped, overp...",0
3,"An excellent example of ""cowboy noir"", as it's...",1
4,"Ok, basically this is a popcorn sci-fi movie, ...",1


In [95]:
data["test"].head()

Unnamed: 0,text,sentiment
0,"Now, I loved ""Lethal Weapon"" and ""Kiss Kiss Ba...",0
1,"First of all, I should point out that I really...",1
2,It's been said that some directors make small ...,0
3,"""The Seven-Ups"" seems like a replay of ""The Fr...",1
4,This timeless summer love story is a classic a...,1


In [96]:
train_data = data["train"]
test_data = data["test"]
train_data.head()

Unnamed: 0,text,sentiment
0,This movie is another Christian propaganda fil...,0
1,A woman who hates cats (Alice Krige) and her s...,1
2,"Beast Wars is a show that is over-hyped, overp...",0
3,"An excellent example of ""cowboy noir"", as it's...",1
4,"Ok, basically this is a popcorn sci-fi movie, ...",1


In [97]:
train_data.shape

(25000, 2)

In [98]:
test_data.shape

(25000, 2)

In [99]:
# Test clean text
def clean_text(text):
    # remove html tags
    text = re.sub(r'<.*?>', '', text)
    #print('html tags removed: ', text)
    # remove [\], ['] and ["]
    text = re.sub(r'[\[\]\'\"]', '', text)
    #print('[\], [], [\'] removed: ', text)
    text = text.strip().lower()
    # thay dấu chấm bằng dấu cách
    filters='!"\'#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
    translate_dict = dict((c, " ") for c in filters)
    translate_map = str.maketrans(translate_dict)
    text = text.translate(translate_map)
    return text


clean_text("<html>This is not '\ [] \" a sentence.<\html>").split()

['this', 'is', 'not', 'a', 'sentence']

Biểu diễn bình luận dưới dạng vector

In [100]:
from sklearn.feature_extraction.text import CountVectorizer

# Exammple for testing
training_texts = [
    "This is a good cat cat",
    "This is a bad day"
]

test_texts = [
    "This day is a good day"
]

In [101]:
# Create the vectorizer
# stop_words: là những từ mà được build sẵn trong thư viện sklearn bao gồm những từ không có ý nghĩa như: is, a, an, the...
vectorizer = CountVectorizer(stop_words="english", preprocessor=clean_text)
vectorizer.fit(training_texts)
vectorizer

CountVectorizer(preprocessor=<function clean_text at 0x1552d5b80>,
                stop_words='english')

In [102]:
vectorizer.vocabulary_

{'good': 3, 'cat': 1, 'bad': 0, 'day': 2}

In [103]:
vectorizer = CountVectorizer(stop_words="english", preprocessor=clean_text)
test_train_feature = vectorizer.fit_transform(training_texts)
vectorizer.vocabulary_

{'good': 3, 'cat': 1, 'bad': 0, 'day': 2}

In [104]:
type(test_train_feature)

scipy.sparse._csr.csr_matrix

In [105]:
test_train_feature.todense()

matrix([[0, 2, 0, 1],
        [1, 0, 1, 0]])

In [106]:
inv_vocab = {v: k for k, v in vectorizer.vocabulary_.items()}
inv_vocab

{3: 'good', 1: 'cat', 0: 'bad', 2: 'day'}

In [107]:
vocabulary = [inv_vocab[i] for i in range(len(inv_vocab))]
vocabulary

['bad', 'cat', 'day', 'good']

In [108]:
list(vectorizer.vocabulary_.keys())

['good', 'cat', 'bad', 'day']

### Tạo CountVectorizer
Vectorier được dùng để vectơ hóa các bình luận.

In [109]:
vectorizer = CountVectorizer(stop_words="english", preprocessor=clean_text)

### Biến đổi bình luận thành vectơ

In [110]:
train_data.head()

Unnamed: 0,text,sentiment
0,This movie is another Christian propaganda fil...,0
1,A woman who hates cats (Alice Krige) and her s...,1
2,"Beast Wars is a show that is over-hyped, overp...",0
3,"An excellent example of ""cowboy noir"", as it's...",1
4,"Ok, basically this is a popcorn sci-fi movie, ...",1


In [111]:
test_data.head()

Unnamed: 0,text,sentiment
0,"Now, I loved ""Lethal Weapon"" and ""Kiss Kiss Ba...",0
1,"First of all, I should point out that I really...",1
2,It's been said that some directors make small ...,0
3,"""The Seven-Ups"" seems like a replay of ""The Fr...",1
4,This timeless summer love story is a classic a...,1


In [112]:
test_data.shape

(25000, 2)

In [113]:
training_features = vectorizer.fit_transform(train_data["text"])
test_features = vectorizer.transform(test_data["text"])

In [114]:
training_features.shape

(25000, 80000)

### Tạo và huấn luyện mô hình

In [115]:
from sklearn.svm import LinearSVC

model = LinearSVC()
model.fit(training_features, train_data["sentiment"])



LinearSVC()

### Đánh giá mô hình

In [116]:
from sklearn.metrics import accuracy_score

y_test_pred = model.predict(test_features)

In [117]:
acc = accuracy_score(test_data["sentiment"], y_test_pred)
print("Accuracy on the IMDB dataset: {:.2f}%".format(acc*100))

Accuracy on the IMDB dataset: 83.69%


### (Tùy chọn) Đánh giá mô hình sử dụng f1 score

In [118]:
from sklearn.metrics import f1_score
f1 = f1_score(test_data["sentiment"], y_test_pred, average="macro")
print("F1 on the IMDB dataset: {:.2f}".format(f1))

F1 on the IMDB dataset: 0.84


### Cải thiện mô hình

Dùng [tf-idf](https://en.wikipedia.org/wiki/Tf%E2%80%93idf) với [n-gram](https://en.wikipedia.org/wiki/N-gram)

In [120]:
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(ngram_range=(1, 2), preprocessor=clean_text, stop_words="english") # n-gram (1,2) là 1-gram và 2-gram (1,2)
training_features = vectorizer.fit_transform(train_data["text"])
test_features = vectorizer.transform(test_data["text"])

model = LinearSVC()
model.fit(training_features, train_data["sentiment"])

y_test_pred = model.predict(test_features)
acc = accuracy_score(test_data["sentiment"], y_test_pred)
print("Accuracy on the IMDB dataset: {:.2f}".format(acc*100))

Accuracy on the IMDB dataset: 88.65


In [121]:
from sklearn.metrics import f1_score
f1 = f1_score(test_data["sentiment"], y_test_pred, average="macro")
print("F1 on the IMDB dataset: {:.2f}".format(f1))

F1 on the IMDB dataset: 0.89
