# 英文部分

In [1]:
# 匯入套件
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# 假設我們有這些英文文本
docs = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]

# 創建 TF-IDF 範例
vectorizer = TfidfVectorizer()

# 進行 TF-IDF 轉換
X = vectorizer.fit_transform(docs)

# 顯示結果
print(vectorizer.get_feature_names_out())
print(X.toarray())

['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']
[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]
 [0.         0.6876236  0.         0.28108867 0.         0.53864762
  0.28108867 0.         0.28108867]
 [0.51184851 0.         0.         0.26710379 0.51184851 0.
  0.26710379 0.51184851 0.26710379]
 [0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]


In [3]:
# 透過 pandas 來預覽結果
df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out()); df

Unnamed: 0,and,document,first,is,one,second,the,third,this
0,0.0,0.469791,0.580286,0.384085,0.0,0.0,0.384085,0.0,0.384085
1,0.0,0.687624,0.0,0.281089,0.0,0.538648,0.281089,0.0,0.281089
2,0.511849,0.0,0.0,0.267104,0.511849,0.0,0.267104,0.511849,0.267104
3,0.0,0.469791,0.580286,0.384085,0.0,0.0,0.384085,0.0,0.384085


# 中文部分

In [4]:
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer

# 假設我們有這些中文文本
documents = [
    '這是第一份文件',
    '這份文件是第二份文件',
    '而這是第三個',
    '這是第一份文件嗎'
]

# 先進行分詞
documents = [' '.join(jieba.cut(doc)) for doc in documents]

# 創建 TF-IDF 範例
vectorizer = TfidfVectorizer()

# 進行 TF-IDF 轉換
X = vectorizer.fit_transform(documents)

# 顯示結果
print(vectorizer.get_feature_names_out())
print(X.toarray())

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\darren\AppData\Local\Temp\jieba.cache
Loading model cost 0.630 seconds.
Prefix dict has been built successfully.


['份文件' '文件' '第一份' '第三' '第二份' '這是']
[[0.         0.53256952 0.65782931 0.         0.         0.53256952]
 [0.64450299 0.41137791 0.         0.         0.64450299 0.        ]
 [0.         0.         0.         0.84292635 0.         0.53802897]
 [0.         0.53256952 0.65782931 0.         0.         0.53256952]]


In [5]:
# 透過 pandas 來預覽結果
df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out()); df

Unnamed: 0,份文件,文件,第一份,第三,第二份,這是
0,0.0,0.53257,0.657829,0.0,0.0,0.53257
1,0.644503,0.411378,0.0,0.0,0.644503,0.0
2,0.0,0.0,0.0,0.842926,0.0,0.538029
3,0.0,0.53257,0.657829,0.0,0.0,0.53257
