In [1]:
from utilities import get_nodeid2text
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB

In [2]:
# Load the text table together with its train / validation / test split.
# NOTE(review): get_nodeid2text() is assumed to return a pandas object indexed
# by node id plus a 3-tuple of index labels — confirm against utilities.
nodeid2text, split_indices = get_nodeid2text()
train_idx, valid_idx, test_idx = split_indices

# One label-based (.loc) slice of the table per split.
nodeid2text_train = nodeid2text.loc[train_idx]
nodeid2text_valid = nodeid2text.loc[valid_idx]
nodeid2text_test = nodeid2text.loc[test_idx]

In [3]:
# Convert the training text into a sparse matrix of raw token counts.
# The vocabulary is learned from the training split only, so validation and
# test text must later go through vectorizer.transform (never a re-fit).
# fit_transform learns the vocabulary and encodes the corpus in a single pass,
# so the training text is tokenized once instead of twice.
vectorizer = CountVectorizer()
X_train_counts = vectorizer.fit_transform(nodeid2text_train["text"])

In [4]:
# Rescale the raw counts into term frequencies. use_idf=False switches off the
# inverse-document-frequency weighting; the transformer's default norm="l2"
# still applies, so every document row is scaled to unit length.
tf_transformer = TfidfTransformer(use_idf=False)
tf_transformer.fit(X_train_counts)
X_train = tf_transformer.transform(X_train_counts)

In [5]:
# Fit a multinomial Naive Bayes classifier on X_train — the normalized
# term-frequency features, not the raw word counts — with the "label" column
# of the training split as the target.
# NOTE(review): with the default smoothing (alpha=1.0) and L2-normalized rows
# whose entries are all <= 1, the smoothing term may dominate the learned
# likelihoods; worth comparing against raw counts or a smaller alpha — confirm.
clf = MultinomialNB()
clf.fit(X_train, nodeid2text_train["label"])

In [6]:
# evaluate the performance
def predict_labels(texts):
    """Predict a label for each raw text using the fitted pipeline.

    Applies, in order, the fitted ``vectorizer`` (token counts), the fitted
    ``tf_transformer`` (term-frequency normalization) and ``clf.predict``.
    Nothing is re-fitted here; the three objects are read from the notebook
    namespace at call time.

    Parameters
    ----------
    texts : iterable of str
        Raw documents, e.g. the "text" column of one split.

    Returns
    -------
    The array of predicted labels returned by ``clf.predict``, one per text.
    """
    term_counts = vectorizer.transform(texts)
    term_freqs = tf_transformer.transform(term_counts)
    return clf.predict(term_freqs)


# Same feature pipeline for every split, so the three numbers are comparable.
train_pred = predict_labels(nodeid2text_train["text"])
valid_pred = predict_labels(nodeid2text_valid["text"])
test_pred = predict_labels(nodeid2text_test["text"])

# Accuracy = fraction of rows whose prediction equals the "label" column.
print(f"Training Accuracy: {(train_pred == nodeid2text_train['label']).mean()}")
print(f"Validation Accuracy: {(valid_pred == nodeid2text_valid['label']).mean()}")
print(f"Test Accuracy: {(test_pred == nodeid2text_test['label']).mean()}")

Training Accuracy: 0.30238286361487116
Validation Accuracy: 0.3182992717876439
Test Accuracy: 0.29220418492685635
