In [1]:
import numpy as np
import matplotlib.pyplot as plt

In [2]:
import re

In [3]:
import os

# Location of the labelled training corpus.
# Set the TRAIN_DATA_PATH environment variable to point at the file on another
# machine; the original hard-coded path is kept as the fallback so existing
# setups keep working unchanged.
filepath = os.environ.get(
    'TRAIN_DATA_PATH',
    '/home/anhduc/Downloads/Telegram Desktop/traindatatopic.txt',
)

In [4]:
# Read the whole corpus into memory and release the file handle as soon as the
# read is finished (previously the handle was opened and never closed).
# NOTE: `str` shadows the built-in type; the name is kept because the
# extraction cell that follows reads the raw text from it.
with open(filepath, encoding='utf8') as f:
    str = f.read()

# Preprocessing

In [6]:
# Pull the payload of every <label>...</label> and <content>...</content>
# element out of the raw text. The content pattern skips leading whitespace
# and is allowed to span several lines.
label_pattern = re.compile(r'<label>(.*?)</label>')
content_pattern = re.compile(r'<content>\s*((?:.|\n)*?)</content>')

label = label_pattern.findall(str)
content = content_pattern.findall(str)

# The two counts are expected to match: items are paired by position later on.
for extracted in (label, content):
    print(len(extracted))

4965
4965


In [7]:
# Drop every character that is neither a word character nor whitespace
# (punctuation and symbols). Slice assignment updates the list in place.
content[:] = [re.sub(r'[^\w\s]', '', text) for text in content]

In [8]:
from underthesea import word_tokenize

In [9]:
# Word-segment each document with underthesea; format="text" asks for the
# segmented sentence back as a single string. The list is updated in place.
content[:] = [word_tokenize(text, format="text") for text in content]

In [10]:
import pandas as pd

In [11]:
# One row per document: the cleaned, tokenised text and its topic label,
# paired by position.
df = pd.DataFrame({
    'content': content,
    'label': label,
})

In [12]:
# Summary of the frame: row count, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4965 entries, 0 to 4964
Data columns (total 2 columns):
content    4965 non-null object
label      4965 non-null object
dtypes: object(2)
memory usage: 77.7+ KB


In [13]:
# Class distribution of the raw labels, most frequent first.
# NOTE(review): the recorded output lists a single 'oreign language' row, which
# looks like a typo of 'foreign language' in the source data - confirm and
# normalise upstream if so.
labels = df.label.value_counts()
labels

sentiment           2552
advertisement       1337
other topics         393
purchase             309
recruit              189
foreign language     184
oreign language        1
Name: label, dtype: int64

# Resampling

In [14]:
from sklearn.utils import resample

In [15]:
# Rows of the largest class; the minority classes are oversampled to its size.
df_majority = df.loc[df['label'] == 'sentiment']

In [16]:
# One sub-frame per minority class, in the order they are oversampled below.
# Labels not listed here (e.g. 'foreign language') are not selected and so do
# not take part in the resampled data set.
df_imbalanced1, df_imbalanced2, df_imbalanced3, df_imbalanced4 = (
    df[df.label == minority_label]
    for minority_label in ('advertisement', 'other topics', 'purchase', 'recruit')
)

In [17]:
# Oversample every minority class (sampling with replacement) up to the size
# of the majority class so that all classes end up equally represented.
# - The seed is fixed so that repeated runs draw exactly the same rows.
# - The target size is read from df_majority instead of repeating its row
#   count as a literal, so it stays correct if the data changes.
RESAMPLE_SEED = 42
n_majority = len(df_majority)

df_unsampled1 = resample(df_imbalanced1, replace=True, n_samples=n_majority, random_state=RESAMPLE_SEED)
df_unsampled2 = resample(df_imbalanced2, replace=True, n_samples=n_majority, random_state=RESAMPLE_SEED)
df_unsampled3 = resample(df_imbalanced3, replace=True, n_samples=n_majority, random_state=RESAMPLE_SEED)
df_unsampled4 = resample(df_imbalanced4, replace=True, n_samples=n_majority, random_state=RESAMPLE_SEED)

In [18]:
# Stack the majority class and the four oversampled minority classes into one
# balanced frame (same row order as concatenating them one after another).
df_unsampled = pd.concat(
    [df_majority, df_unsampled1, df_unsampled2, df_unsampled3, df_unsampled4]
)

In [19]:
# Sanity check: per-class row counts after oversampling.
df_unsampled.label.value_counts()

advertisement    2552
purchase         2552
recruit          2552
other topics     2552
sentiment        2552
Name: label, dtype: int64

In [20]:
def split_data(data, test_ratio, random_state=None):
    """Randomly split ``data`` into a training part and a test part.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to split; selected positionally through ``.iloc``.
    test_ratio : float
        Fraction of rows, between 0 and 1 inclusive, that goes to the test
        part. The test size is ``int(len(data) * test_ratio)`` (truncated).
    random_state : int or None, optional
        Seed for the shuffle. With ``None`` (the default) NumPy's global
        random state is used, exactly as before; pass an integer to get the
        same split on every call.

    Returns
    -------
    tuple
        ``(train, test)`` - two disjoint selections that together contain
        every row of ``data`` exactly once.

    Raises
    ------
    ValueError
        If ``test_ratio`` is outside ``[0, 1]``.
    """
    # A negative or >1 ratio would slice the shuffled ids in a way that no
    # longer corresponds to a "fraction of the data"; reject it up front.
    if not 0 <= test_ratio <= 1:
        raise ValueError(
            "test_ratio must be between 0 and 1, got {!r}".format(test_ratio)
        )

    n_rows = len(data)
    if random_state is None:
        shuffled_id = np.random.permutation(n_rows)
    else:
        # Private generator: reproducible without touching the global state.
        shuffled_id = np.random.RandomState(random_state).permutation(n_rows)

    test_set_size = int(n_rows * test_ratio)
    test_ids = shuffled_id[:test_set_size]
    train_ids = shuffled_id[test_set_size:]
    return data.iloc[train_ids], data.iloc[test_ids]

# Model

In [21]:
# Hold out 20% of the balanced frame for testing.
# NOTE(review): df_unsampled was oversampled with replacement before this
# split, so copies of the same row can land in both train_set and test_set,
# which can make the test scores look better than they are. Consider
# splitting first and oversampling only the training part.
train_set, test_set = split_data(data=df_unsampled, test_ratio=0.2)

In [22]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline

In [23]:
# Text classification pipeline:
#   raw text -> token counts -> TF-IDF weights -> linear classifier fit by SGD.
text_clf = Pipeline([
    ('cv', CountVectorizer(stop_words=None, tokenizer=None,
                          preprocessor=None)),
    ('tfidf', TfidfTransformer()),
    # loss='hinge' with an L2 penalty gives a linear SVM. tol=None disables
    # early stopping, so exactly max_iter passes over the data are made.
    # random_state is pinned because SGD shuffles the samples each epoch;
    # without a seed the fitted model and every score change between runs.
    ('clf', SGDClassifier(loss='hinge', penalty='l2',
                         alpha=0.01, random_state=42,
                         max_iter=5, tol=None))
])

In [24]:
# Fit the vectoriser, the TF-IDF weights and the classifier on the training split.
text_clf.fit(train_set['content'], train_set['label'])



Pipeline(memory=None,
     steps=[('cv', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_a...m_state=None, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False))])

In [25]:
# Predicted label for every document in the held-out split.
pred = text_clf.predict(test_set['content'])

In [26]:
# Overall accuracy on the held-out split: share of predictions that equal the true label.
np.mean(pred==test_set.label)

0.8761755485893417

In [27]:
from sklearn.model_selection import cross_val_score

In [28]:
# Cross-validate the whole pipeline on the training split; cv=None lets
# scikit-learn use its default number of folds. The cell shows the mean score.
# NOTE(review): train_set comes from the oversampled frame, so copies of one
# row may fall into different folds - treat the score as optimistic.
scores = cross_val_score(
    estimator=text_clf,
    X=train_set.content,
    y=train_set.label,
    cv=None,
)
scores.mean()



0.8693176505298602

In [29]:
from sklearn.metrics import f1_score

In [30]:
# F1 on the held-out split; average='weighted' averages the per-class F1
# scores weighted by the number of true samples of each class.
f1_score(test_set.label, pred, average='weighted')

0.8734612468407512

# Evaluate

In [31]:
def evaluate(labels, test, model=None):
    """Print, for each label, how many of that label's rows are predicted correctly.

    For every entry of ``labels`` the rows of ``test`` carrying that label are
    selected, their ``content`` is passed to the model, and the fraction of
    predictions equal to the true label is printed as ``Test <label>: <score>``.

    Parameters
    ----------
    labels : iterable
        Label values to report on, in print order.
    test : pandas.DataFrame
        Frame with a ``content`` column (model input) and a ``label`` column
        (ground truth).
    model : object with ``predict``, optional
        Fitted estimator to evaluate. Defaults to the notebook-level
        ``text_clf`` pipeline, which is what this function always used before.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    if model is None:
        # Keep the original behaviour: fall back to the global pipeline.
        model = text_clf

    for s in labels:
        test_label = test[test.label == s]
        if len(test_label) == 0:
            # Nothing to score: the mean over zero rows is NaN, and some
            # estimators reject empty input outright. Report it and move on.
            print("Test {}: no samples".format(s))
            continue
        pred = model.predict(test_label.content)
        print("Test {}: {}".format(s, np.mean(pred == test_label.label)))

In [32]:
# Classes that went into the balanced data set, in reporting order.
# (This rebinds `labels`, which earlier held the raw label counts.)
labels = [
    'sentiment',
    'advertisement',
    'other topics',
    'purchase',
    'recruit',
]
evaluate(labels=labels, test=test_set)

Test sentiment: 0.7358870967741935
Test advertisement: 0.8252788104089219
Test other topics: 0.8372549019607843
Test purchase: 0.984251968503937
Test recruit: 1.0
