## Import pyserini

In [1]:
from pyserini.search import pysearch
from pyserini.index import pyutils
from pyserini.analysis.pyanalysis import get_lucene_analyzer
import re

## Download Dataset

In [2]:
# https://scikit-learn.org/0.19/datasets/twenty_newsgroups.html
from sklearn.datasets import fetch_20newsgroups

data_subset_names = ['train', 'test']
data = {}

for data_subset_name in data_subset_names:
    data_subset = fetch_20newsgroups(subset=data_subset_name)
    data[data_subset_name] = data_subset
    num_docs = len(data_subset.data)
    num_categories = len(data_subset.target_names)
    print(f"{data_subset_name}: {num_docs} documents in {num_categories} categories")

# find data @ ~/scikit_learn_data/20news-bydate_py3.pkz

  from collections import Mapping, defaultdict


train: 11314 documents in 20 categories
test: 7532 documents in 20 categories


In [3]:
from pyserini.index import pyutils
from scipy.sparse import csr_matrix
import numpy as np
import math


class AnseriniTwentyNewsgroupTfidfVectorizer:
    def __init__(self, index_path,  train_data, test_data):
        self.index_utils = pyutils.IndexReaderUtils(index_path)
        self.train_data = train_data
        self.test_data = test_data
        self.train_size = len(train_data.data)
        self.test_size = len(test_data.data)
        self.get_unique_terms()

    def get_unique_terms(self):
        unique_words = set()
        self.terms_df = {}
        for term in self.index_utils.terms():
            unique_words.add(term.term)
            self.terms_df[term.term] = term.df

        self.word_to_index = {}
        for index, term in enumerate(unique_words):
            self.word_to_index[term] = index
        self.num_unique_words = len(unique_words)
        print(f'Found {self.num_unique_words} unique words')

    def get_doc_id(self, filename):
        return filename.split('/')[-1]

    def get_train_vectors(self):
        features = self.get_vectors(self.train_data.filenames)
        return features, self.train_data.target

    def get_test_vectors(self):
        features = self.get_vectors(self.test_data.filenames)
        return features, self.test_data.target

    def get_vectors(self, filenames):
        matrix_row, matrix_col, matrix_data = [], [], []

        for index, filename in enumerate(filenames):
            doc_id = self.get_doc_id(filename)
            if index % 2000 == 0:
                print(f'Vectorizing: {index}/{len(filenames)}')

            # Term Frequency
            tf = self.index_utils.get_document_vector(doc_id)

            # Inverse Document Frequency
            df = {t: math.log(
                self.num_unique_words / self.terms_df[t]) for t in tf}

            # Multiplication
            total_num_terms_in_doc = sum(tf.values())
            tfidf = {t: df[t] * tf[t] / total_num_terms_in_doc for t in tf}

            # Convert from dict to sparse matrix
            for term in tfidf:
                i = self.word_to_index[term]
                matrix_row.append(index)
                matrix_col.append(i)
                matrix_data.append(tfidf[term])

        return csr_matrix((matrix_data, (matrix_row, matrix_col)), shape=(
            len(filenames), self.num_unique_words))


In [4]:
vectorizer = AnseriniTwentyNewsgroupTfidfVectorizer(
    './20-newsgroup/lucene-index.20newsgroup.pos+docvectors+raw',
    data['train'],
    data['test'],
)

Found 165633 unique words


In [5]:
features, labels = vectorizer.get_train_vectors()

Vectorizing: 0/11314
Vectorizing: 2000/11314
Vectorizing: 4000/11314
Vectorizing: 6000/11314
Vectorizing: 8000/11314
Vectorizing: 10000/11314


In [6]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

clf = MultinomialNB(alpha=.01)
clf.fit(features, labels)

MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)

In [7]:
test_vectors, test_target = vectorizer.get_test_vectors()

Vectorizing: 0/7532
Vectorizing: 2000/7532
Vectorizing: 4000/7532
Vectorizing: 6000/7532


In [8]:
pred = clf.predict(test_vectors)
metrics.f1_score(test_target, pred, average='macro')

0.5935516287791174