In [1]:
import numpy as np
import matplotlib.pyplot as plt

In [2]:
import re

In [3]:
import os

# Location of the labelled training corpus. The default is the original
# machine-specific path; set TRAIN_DATA_PATH to run the notebook elsewhere
# without editing this cell.
filepath = os.environ.get(
    'TRAIN_DATA_PATH',
    '/home/anhduc/Downloads/Telegram Desktop/traindatatopic.txt',
)

In [4]:
f = open(filepath,encoding='utf8')

In [None]:
str = f.read()

# Preprocessing

In [None]:
label = re.findall(r'<label>(.*?)</label>', str)
content = re.findall(r'<content>\s*((?:.|\n)*?)</content>', str)
print(len(label))
print(len(content))

4965
4965


In [None]:
# Drop every character that is neither a word character nor whitespace
# (i.e. punctuation and symbols) from each document.
content = [re.sub(r'[^\w\s]', '', text) for text in content]

In [None]:
from underthesea import word_tokenize

In [None]:
# Word-segment each document; format="text" makes word_tokenize return a
# single string rather than a list of tokens.
content = [word_tokenize(text, format="text") for text in content]

In [None]:
import pandas as pd

In [None]:
# One row per document: the cleaned text alongside its class label.
df = pd.DataFrame({'content': content, 'label': label})

In [None]:
# Summarise the frame: columns, dtypes and non-null counts.
df.info()

In [None]:
# Number of rows per class label, to see how imbalanced the classes are.
# Note: `labels` is rebound to a plain list of class names in the evaluation
# section further down.
labels = df['label'].value_counts()
labels

# Resampling

In [None]:
from sklearn.utils import resample

In [None]:
df_majority = df[df.label=='sentiment']

In [None]:
df_imbalanced1 = df[df.label=='advertisement']
df_imbalanced2 = df[df.label=='other topics']
df_imbalanced3 = df[df.label=='purchase']
df_imbalanced4 = df[df.label=='recruit']

In [None]:
df_unsampled1 = resample(df_imbalanced1, replace=True, n_samples=2552, random_state=None)
df_unsampled2 = resample(df_imbalanced2, replace=True, n_samples=2552, random_state=None)
df_unsampled3 = resample(df_imbalanced3, replace=True, n_samples=2552, random_state=None)
df_unsampled4 = resample(df_imbalanced4, replace=True, n_samples=2552, random_state=None)

In [None]:
df_unsampled = pd.concat([df_majority, df_unsampled1])
df_unsampled = pd.concat([df_unsampled, df_unsampled2])
df_unsampled = pd.concat([df_unsampled, df_unsampled3])
df_unsampled = pd.concat([df_unsampled, df_unsampled4])

In [None]:
df_unsampled.label.value_counts()

In [None]:
def split_data(data, test_ratio, random_state=None):
    """Randomly split `data` into a training part and a test part.

    Parameters
    ----------
    data : pandas.DataFrame (or any object supporting len() and .iloc)
        Rows to split.
    test_ratio : float
        Fraction of rows to put in the test part; must lie in [0, 1].
        The test size is int(len(data) * test_ratio), i.e. rounded down.
    random_state : int or None, optional
        Seed for a private RNG, giving a reproducible split. When None
        (the default) NumPy's global RNG is used, exactly as before.

    Returns
    -------
    (train, test) : tuple
        Two positional selections of `data`; together they contain every row
        exactly once.

    Raises
    ------
    ValueError
        If `test_ratio` is outside [0, 1].
    """
    if not 0 <= test_ratio <= 1:
        raise ValueError("test_ratio must be between 0 and 1, got {}".format(test_ratio))

    if random_state is None:
        shuffled_id = np.random.permutation(len(data))
    else:
        # A dedicated generator keeps the split reproducible without touching
        # the global RNG state.
        shuffled_id = np.random.RandomState(random_state).permutation(len(data))

    test_set_size = int(len(data) * test_ratio)
    test_ids = shuffled_id[:test_set_size]
    train_ids = shuffled_id[test_set_size:]
    return data.iloc[train_ids], data.iloc[test_ids]

# Model

In [None]:
# Seed NumPy's global RNG so the shuffle inside split_data, and therefore
# every metric below, is reproducible on Restart & Run All.
np.random.seed(42)

# NOTE(review): df_unsampled was built by resampling minority classes with
# replacement, so identical rows can land in both train_set and test_set.
# Scores measured on test_set are therefore optimistic; splitting before
# upsampling (and upsampling only the training part) would avoid this.
train_set, test_set = split_data(df_unsampled, 0.2)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier

In [None]:
# Bag-of-words counts -> tf-idf weighting -> one linear SVM (hinge loss,
# trained with SGD) per class.
# random_state is fixed because SGD shuffles the training data each epoch;
# left unseeded, every re-run produced different scores.
# max_iter=5 with tol=None runs exactly five epochs with no early stopping.
text_clf = Pipeline([
    ('cv', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', OneVsRestClassifier(SGDClassifier(loss='hinge', penalty='l2',
                                              alpha=0.01, random_state=42,
                                              max_iter=5, tol=None))),
])

In [None]:
text_clf.fit(train_set.content, train_set.label)

In [None]:
pred = text_clf.predict(test_set.content)

In [None]:
np.mean(pred==test_set.label)

# Evaluate

Cross-validation score

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# Cross-validated accuracy on the training split; cv=None selects
# scikit-learn's default fold strategy.
scores = cross_val_score(
    estimator=text_clf,
    X=train_set.content,
    y=train_set.label,
    cv=None,
)
scores.mean()

F1 score

In [None]:
from sklearn.metrics import f1_score

In [None]:
# F1 of the test-set predictions, averaged over classes with each class
# weighted by its number of true instances (average='weighted').
f1_score(test_set.label, pred, average='weighted')

In [None]:
def evaluate(labels, test, model=None):
    """Print, for each class, the fraction of its test rows predicted correctly.

    Parameters
    ----------
    labels : iterable of str
        Class names to report on.
    test : pandas.DataFrame
        Must have a `content` column (model input) and a `label` column
        (true class).
    model : object with a predict() method, optional
        Classifier to evaluate. Defaults to the notebook-level `text_clf`,
        which keeps existing two-argument calls working.

    Returns
    -------
    None
        Results are printed, one line per class.
    """
    clf = text_clf if model is None else model
    for s in labels:
        test_label = test[test.label == s]
        if test_label.empty:
            # Nothing to score; predicting on zero rows would fail or give NaN.
            print("Test {}: no samples".format(s))
            continue
        pred = clf.predict(test_label.content)
        print("Test {}: {}".format(s, np.mean(pred == test_label.label)))

In [None]:
# Class names to report on. This rebinds `labels`, which earlier held the
# value_counts() Series; from here on it is a plain list of class names.
labels = ['sentiment', 'advertisement', 'other topics', 'purchase', 'recruit']
evaluate(labels, test_set)

ROC curve

In [None]:
from sklearn.metrics import roc_curve
from sklearn.model_selection import cross_val_predict
from sklearn.preprocessing import label_binarize

In [None]:
y_train = label_binarize(train_set.label, classes=['sentiment', 'advertisement', 'other topics', 'purchase', 'recruit'])

In [None]:
y_scores = cross_val_predict(text_clf, train_set.content, train_set.label, cv=3, method="decision_function")

In [None]:
y_scores

In [None]:
n_classes = 5
fpr = dict()
tpr = dict()
roc_cur = dict()
for i in range(n_classes):
    fpr[i], tpr[i], thresholds = roc_curve(y_train[:,i], y_scores[:,i]) 

Graph

In [None]:
import matplotlib.pyplot as plt

In [None]:
def plot_roc_curve(fpr, tpr, labels, n_classes):
    """Draw one ROC curve per class on a single figure and show it.

    Parameters
    ----------
    fpr, tpr : mapping or sequence indexed by 0..n_classes-1
        False-positive and true-positive rates for each class.
    labels : sequence of str
        Legend text; labels[i] names the curve built from fpr[i] / tpr[i].
    n_classes : int
        Number of curves to draw.
    """
    _, ax = plt.subplots()
    ax.set_title("Roc_curve")
    # Diagonal reference line from (0, 0) to (1, 1).
    ax.plot([0, 1], [0, 1], 'k--')
    ax.axis([0, 1, 0, 1])
    ax.set_xlabel('FPR')
    ax.set_ylabel('TPR')
    for class_idx in range(n_classes):
        ax.plot(fpr[class_idx], tpr[class_idx], linewidth=2, label=labels[class_idx])
    ax.legend(loc="lower right")
    plt.show()

In [None]:
# Plot the per-class ROC curves computed above; `labels` supplies the legend text.
plot_roc_curve(fpr, tpr, labels, n_classes)