In [1]:
import os
import pandas as pd
import tensorflow as tf

In [2]:
def load_data(dir):
    data = list()
    for folder in os.listdir(dir):
        for file in os.listdir(os.path.join(dir, folder)):
            with open(os.path.join(dir, folder, file), 'r', encoding="utf16") as text:
                words = text.read()
                data.append([words, folder])
    df = pd.DataFrame(data, columns = ['text', 'label'])
    return df

In [3]:
df = load_data('./dataset/train')
df_test = load_data('./dataset/test')

In [4]:
summaries = df['text'].values

In [6]:
def no_accent_vietnamese(s):
    s = re.sub(r'[àáạảãâầấậẩẫăằắặẳẵ]', 'a', s)
    s = re.sub(r'[ÀÁẠẢÃĂẰẮẶẲẴÂẦẤẬẨẪ]', 'A', s)
    s = re.sub(r'[èéẹẻẽêềếệểễ]', 'e', s)
    s = re.sub(r'[ÈÉẸẺẼÊỀẾỆỂỄ]', 'E', s)
    s = re.sub(r'[òóọỏõôồốộổỗơờớợởỡ]', 'o', s)
    s = re.sub(r'[ÒÓỌỎÕÔỒỐỘỔỖƠỜỚỢỞỠ]', 'O', s)
    s = re.sub(r'[ìíịỉĩ]', 'i', s)
    s = re.sub(r'[ÌÍỊỈĨ]', 'I', s)
    s = re.sub(r'[ùúụủũưừứựửữ]', 'u', s)
    s = re.sub(r'[ƯỪỨỰỬỮÙÚỤỦŨ]', 'U', s)
    s = re.sub(r'[ỳýỵỷỹ]', 'y', s)
    s = re.sub(r'[ỲÝỴỶỸ]', 'Y', s)
    s = re.sub(r'[Đ]', 'D', s)
    s = re.sub(r'[đ]', 'd', s)
    return s

In [7]:
import re
from multiprocessing import Pool

def text_cleaner(summary):
    a_text = no_accent_vietnamese(summary)
    a_text = a_text.lower()
    a_text = a_text.replace('\n', ' ')
    a_text = re.sub(r"[^a-zA-Z0-9 ]","",a_text)
    # a_text = re.sub(r'[^\s\wáàảãạăắằẳẵặâấầẩẫậéèẻẽẹêếềểễệóòỏõọôốồổỗộơớờởỡợíìỉĩịúùủũụưứừửữựýỳỷỹỵđ_0-9]', ' ', a_text)
    a_text = re.sub(r'\s+', ' ', a_text).strip()
    return a_text

p = Pool(4)
copus = p.map(text_cleaner, summaries)

In [9]:
label = df['label'].values

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [11]:
X_train, X_test, y_train, y_test = train_test_split(copus, label, test_size=0.2, random_state=42)

In [12]:
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
print(list(label_encoder.classes_), '\n')
y_train = label_encoder.transform(y_train)
y_test = label_encoder.transform(y_test)

['Am nhac', 'Am thuc', 'Bat dong san', 'Bong da', 'Chung khoan', 'Cum ga', 'Cuoc song do day', 'Du hoc', 'Du lich', 'Duong vao WTO', 'Gia dinh', 'Giai tri tin hoc', 'Giao duc', 'Gioi tinh', 'Hackers va Virus', 'Hinh su', 'Khong gian song', 'Kinh doanh quoc te', 'Lam dep', 'Loi song', 'Mua sam', 'My thuat', 'San khau dien anh', 'San pham tin hoc moi', 'Tennis', 'The gioi tre', 'Thoi trang'] 



In [13]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

In [18]:
text_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,1),
                                             max_df=0.8,
                                             max_features=None)), 
                     ('tfidf', TfidfTransformer()), 
                     ('clf', RandomForestClassifier(random_state=0))
                    ])
text_clf = text_clf.fit(X_train, y_train)
print("Done")

Done


In [19]:
y_pred = text_clf.predict(X_test)

In [20]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred, target_names=list(label_encoder.classes_)))

                      precision    recall  f1-score   support

             Am nhac       0.67      0.96      0.79       177
             Am thuc       0.95      0.77      0.85        47
        Bat dong san       0.95      0.88      0.91        42
             Bong da       0.87      1.00      0.93       363
         Chung khoan       0.97      0.93      0.95        69
              Cum ga       0.94      0.98      0.96        89
    Cuoc song do day       0.58      0.64      0.61       146
              Du hoc       0.83      0.92      0.87       142
             Du lich       0.77      0.70      0.73       130
       Duong vao WTO       0.88      0.76      0.82        38
            Gia dinh       0.83      0.14      0.24        35
    Giai tri tin hoc       0.73      0.62      0.67       177
            Giao duc       0.71      0.83      0.76       182
           Gioi tinh       0.88      0.64      0.74        66
    Hackers va Virus       0.89      0.81      0.85        79
       

In [21]:
df_test.head()

Unnamed: 0,text,label
0,Trại cai nghiện 18 m2\nTrong lúc Hà Tĩnh chưa ...,Loi song
1,Nghệ thuật tạo thiện cảm trong giao tiếp\nBạn...,Loi song
2,Có khi nào anh nhớ đến em!\nAi bảo có người v...,Loi song
3,Ông Tám 'Khùng'\nÔng Tám Hiệu (thị xã Long Khá...,Loi song
4,"Những buổi hẹn hò, những cuộc chơi đêm của các...",Loi song


In [23]:
copus_test = p.map(text_cleaner, df_test['text'].values)

In [24]:
label_test = label_encoder.transform(df_test['label'])

In [25]:
label_pred = text_clf.predict(copus_test)

In [31]:
print(classification_report(label_test, label_pred, target_names=list(label_encoder.classes_)))

                      precision    recall  f1-score   support

             Am nhac       0.63      0.90      0.74       813
             Am thuc       0.96      0.80      0.87       400
        Bat dong san       0.96      0.73      0.83       282
             Bong da       0.85      1.00      0.92      1464
         Chung khoan       0.92      0.93      0.92       320
              Cum ga       0.92      0.99      0.95       381
    Cuoc song do day       0.37      0.51      0.43       405
              Du hoc       0.83      0.91      0.87       394
             Du lich       0.84      0.76      0.80       565
       Duong vao WTO       0.97      0.58      0.73       191
            Gia dinh       0.93      0.14      0.24       280
    Giai tri tin hoc       0.55      0.61      0.58       707
            Giao duc       0.70      0.89      0.78       707
           Gioi tinh       0.77      0.51      0.61       268
    Hackers va Virus       0.93      0.75      0.83       319
       