# Basic knowledge

## Bag-of-words model

ref:   
[link](https://www.bookstack.cn/read/duoergun0729-nlp/%E8%AF%8D%E8%A2%8B%E6%A8%A1%E5%9E%8B%E5%92%8CTFIDF%E6%A8%A1%E5%9E%8B.md)  
[sklearn text-feature-extraction](https://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction)

In [1]:
from sklearn.feature_extraction.text import CountVectorizer

In [23]:
# Toy corpus: one short sentence per document.
corpus = [
    "This is the first document.",
    "This is the second second document.",
    "And the third one.",
    "Is this the first document?",
    "I like the first document.",
    "He likes the first document.",
]

# Learn the vocabulary and build the document-term count matrix in one step.
vectorizer = CountVectorizer(min_df=1)
X = vectorizer.fit_transform(corpus)

# Dense view of the counts: one row per document, one column per vocabulary term.
print(X.toarray())
# Mapping from each term to its column index in the matrix printed above.
vectorizer.vocabulary_


[[0 1 1 0 1 0 0 0 0 1 0 1]
 [0 1 0 0 1 0 0 0 2 1 0 1]
 [1 0 0 0 0 0 0 1 0 1 1 0]
 [0 1 1 0 1 0 0 0 0 1 0 1]
 [0 1 1 0 0 1 0 0 0 1 0 0]
 [0 1 1 1 0 0 1 0 0 1 0 0]]


{'this': 11,
 'is': 4,
 'the': 9,
 'first': 2,
 'document': 1,
 'second': 8,
 'and': 0,
 'third': 10,
 'one': 7,
 'like': 5,
 'he': 3,
 'likes': 6}

In [18]:
# Same bag-of-words model, but with scikit-learn's built-in English stop-word
# list removed from the vocabulary.
vectorizer2 = CountVectorizer(min_df=1, stop_words='english')
X = vectorizer2.fit_transform(corpus)

print(X.toarray())
print(vectorizer2.vocabulary_)
# Feature names ordered by column index. `get_feature_names()` was deprecated in
# scikit-learn 1.0 and removed in 1.2, and its replacement
# `get_feature_names_out()` does not exist before 1.0. Deriving the list from
# `vocabulary_` works on every version and yields the same plain Python list.
sorted(vectorizer2.vocabulary_, key=vectorizer2.vocabulary_.get)

[[1 0 0 0]
 [1 0 0 2]
 [0 0 0 0]
 [1 0 0 0]
 [1 1 0 0]
 [1 0 1 0]]
{'document': 0, 'second': 3, 'like': 1, 'likes': 2}


['document', 'like', 'likes', 'second']

In [21]:
!pip list

Package                  Version
------------------------ -----------
absl-py                  0.11.0
anyio                    2.1.0
appdirs                  1.4.4
appnope                  0.1.2
APScheduler              3.6.3
argon2-cffi              20.1.0
astunparse               1.6.3
async-generator          1.10
attrs                    20.3.0
Babel                    2.9.0
backcall                 0.2.0
backports.zoneinfo       0.2.1
black                    20.8b1
bleach                   3.3.0
boto3                    1.24.81
botocore                 1.27.81
CacheControl             0.12.6
cachetools               4.2.2
cachy                    0.3.0
certifi                  2020.12.5
cffi                     1.14.5
chardet                  4.0.0
cleo                     0.8.1
click                    7.1.2
clikit                   0.6.2
crashtest                0.3.1
cycler                   0.10.0
decorator                4.4.2
defusedxml               0.6.0
dill             

## tf-idf

In [22]:
from sklearn.feature_extraction.text import TfidfTransformer

# Build a tf-idf transformer with idf smoothing turned off, then echo it so the
# cell displays the configured estimator.
transformer = TfidfTransformer(
    smooth_idf=False,
)
transformer

TfidfTransformer(smooth_idf=False)

In [24]:
# Reuse the `corpus` list defined in the bag-of-words section instead of keeping
# a second copy of the literal that could drift from the first.
# `X` is rebuilt here because the stop-word example above overwrote it.
vectorizer = CountVectorizer(min_df=1)
X = vectorizer.fit_transform(corpus)

# Dense document-term count matrix: one row per document, one column per term.
term_counts = X.toarray()
term_counts

array([[0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1],
       [0, 1, 0, 0, 1, 0, 0, 0, 2, 1, 0, 1],
       [1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0],
       [0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1],
       [0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0],
       [0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0]])

In [26]:
# Fit the transformer on the raw term counts and re-weight them as tf-idf scores.
tfidf = transformer.fit_transform(term_counts)
# Echo the result so the cell displays the sparse-matrix summary; a bare
# assignment produces no output.
tfidf


<6x12 sparse matrix of type '<class 'numpy.float64'>'
	with 28 stored elements in Compressed Sparse Row format>

In [27]:
# Convert the sparse tf-idf matrix to a dense array so every weight is displayed
# (one row per document, one column per vocabulary term).
tfidf.toarray()

array([[0.        , 0.37190386, 0.44209453, 0.        , 0.53258605,
        0.        , 0.        , 0.        , 0.        , 0.3145539 ,
        0.        , 0.53258605],
       [0.        , 0.18858203, 0.        , 0.        , 0.27005947,
        0.        , 0.        , 0.        , 0.89057951, 0.15950148,
        0.        , 0.27005947],
       [0.56538652, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.56538652, 0.        , 0.20251978,
        0.56538652, 0.        ],
       [0.        , 0.37190386, 0.44209453, 0.        , 0.53258605,
        0.        , 0.        , 0.        , 0.        , 0.3145539 ,
        0.        , 0.53258605],
       [0.        , 0.3389545 , 0.40292653, 0.        , 0.        ,
        0.80035708, 0.        , 0.        , 0.        , 0.28668554,
        0.        , 0.        ],
       [0.        , 0.26463289, 0.31457796, 0.62486502, 0.        ,
        0.        , 0.62486502, 0.        , 0.        , 0.22382481,
        0.        ,

In [30]:
# Learned inverse-document-frequency weight for each column of `term_counts`.
# With smooth_idf=False scikit-learn documents this as idf(t) = ln(n / df(t)) + 1,
# so a term that appears in every document gets a weight of exactly 1.
transformer.idf_

array([2.79175947, 1.18232156, 1.40546511, 2.79175947, 1.69314718,
       2.79175947, 2.79175947, 2.79175947, 2.79175947, 1.        ,
       2.79175947, 1.69314718])

In [None]:
# New in scikit-learn version 1.0 (only set when fit on input with string feature names, e.g. a DataFrame)
#transformer.feature_names_in_