This repository has been archived by the owner on Sep 27, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 6
/
bm25.py
92 lines (83 loc) · 3.18 KB
/
bm25.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# coding: UTF-8
from __future__ import absolute_import, division, print_function, unicode_literals
import numpy as np
import scipy.sparse as sp
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted
from sklearn.feature_extraction.text import _document_frequency
class BM25Transformer(BaseEstimator, TransformerMixin):
"""
Parameters
----------
use_idf : boolean, optional (default=True)
k1 : float, optional (default=2.0)
b : float, optional (default=0.75)
References
----------
Okapi BM25: a non-binary model - Introduction to Information Retrieval
http://nlp.stanford.edu/IR-book/html/htmledition/okapi-bm25-a-non-binary-model-1.html
"""
def __init__(self, use_idf=True, k1=2.0, b=0.75):
self.use_idf = use_idf
self.k1 = k1
self.b = b
def fit(self, X):
"""
Parameters
----------
X : sparse matrix, [n_samples, n_features]
document-term matrix
"""
if not sp.issparse(X):
X = sp.csc_matrix(X)
if self.use_idf:
n_samples, n_features = X.shape
df = _document_frequency(X)
idf = np.log((n_samples - df + 0.5) / (df + 0.5))
self._idf_diag = sp.spdiags(idf, diags=0, m=n_features, n=n_features)
return self
def transform(self, X, copy=True):
"""
Parameters
----------
X : sparse matrix, [n_samples, n_features]
document-term matrix
copy : boolean, optional (default=True)
"""
if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
# preserve float family dtype
X = sp.csr_matrix(X, copy=copy)
else:
# convert counts or binary occurrences to floats
X = sp.csr_matrix(X, dtype=np.float64, copy=copy)
n_samples, n_features = X.shape
# Document length (number of terms) in each row
# Shape is (n_samples, 1)
dl = X.sum(axis=1)
# Number of non-zero elements in each row
# Shape is (n_samples, )
sz = X.indptr[1:] - X.indptr[0:-1]
# In each row, repeat `dl` for `sz` times
# Shape is (sum(sz), )
# Example
# -------
# dl = [4, 5, 6]
# sz = [1, 2, 3]
# rep = [4, 5, 5, 6, 6, 6]
rep = np.repeat(np.asarray(dl), sz)
# Average document length
# Scalar value
avgdl = np.average(dl)
# Compute BM25 score only for non-zero elements
data = X.data * (self.k1 + 1) / (X.data + self.k1 * (1 - self.b + self.b * rep / avgdl))
X = sp.csr_matrix((data, X.indices, X.indptr), shape=X.shape)
if self.use_idf:
check_is_fitted(self, '_idf_diag', 'idf vector is not fitted')
expected_n_features = self._idf_diag.shape[0]
if n_features != expected_n_features:
raise ValueError("Input has n_features=%d while the model"
" has been trained with n_features=%d" % (
n_features, expected_n_features))
# *= doesn't work
X = X * self._idf_diag
return X