In [2]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

In [3]:
# Read the planet text dataset from the CSV file.
df = pd.read_csv('text_planets.csv')

# Print the frame's shape and the heading first, then the per-category
# row counts underneath the heading.
print(f'データセットの形状: {df.shape}', f'\nカテゴリの数:', sep='\n')
print(df['category'].value_counts())

# The bare last expression renders the first rows as the cell's output.
df.head()

データセットの形状: (8, 2)

カテゴリの数:
category
Earth      1
Jupiter    1
Mars       1
Mercury    1
Neptune    1
Saturn     1
Uranus     1
Venus      1
Name: count, dtype: int64


Unnamed: 0,category,text
0,Earth,"Earth, our home planet, is the third planet fr..."
1,Jupiter,Jupiter is the fifth planet from the sun and t...
2,Mars,Mars is the fourth planet from the sun. It is ...
3,Mercury,Mercury is the closest planet to the sun and t...
4,Neptune,Neptune is the eighth planet from the sun and ...


In [4]:
# The 'text' column holds the training documents and the 'category' column
# holds their labels.
X_train, y_train = df['text'], df['category']

# Show both series, each under its own heading (the second heading is
# preceded by a blank line).
print('訓練データ:', X_train, '\nカテゴリ:', y_train, sep='\n')

訓練データ:
0    Earth, our home planet, is the third planet fr...
1    Jupiter is the fifth planet from the sun and t...
2    Mars is the fourth planet from the sun. It is ...
3    Mercury is the closest planet to the sun and t...
4    Neptune is the eighth planet from the sun and ...
5    Saturn is the sixth planet from the sun and is...
6    Uranus is the seventh planet from the sun and ...
7    Venus is the second planet from the sun and is...
Name: text, dtype: object

カテゴリ:
0      Earth
1    Jupiter
2       Mars
3    Mercury
4    Neptune
5     Saturn
6     Uranus
7      Venus
Name: category, dtype: object


In [5]:
# A single query document, wrapped in a list.
X_test = ['ring system only moon large circle']

# Heading on one line, the list itself on the next.
print('テストデータ:', X_test, sep='\n')

テストデータ:
['ring system only moon large circle']


In [6]:
# Learn the vocabulary from the training documents; fit() returns the fitted
# vectorizer, so construction and fitting can be chained.
vectorizer = CountVectorizer().fit(X_train)

# Feature names of the fitted vectorizer, i.e. the learned vocabulary terms.
vocab = vectorizer.get_feature_names_out()

# Report the vocabulary size and a peek at its first ten entries.
print(f'語彙サイズ: {len(vocab)}', f'最初の10単語: {vocab[:10]}', sep='\n')

語彙サイズ: 486
最初の10単語: ['000' '10' '13' '150' '1600s' '17' '18' '180' '1831' '195']


In [7]:
# Encode the training and test documents with the same fitted vectorizer so
# both matrices share one vocabulary (column layout).
X_train_bow, X_test_bow = (
    vectorizer.transform(docs) for docs in (X_train, X_test)
)

# Print each matrix's repr under its own heading (the second heading is
# preceded by a blank line).
print(
    'X_train_bow:',
    repr(X_train_bow),
    '\nX_test_bow:',
    repr(X_test_bow),
    sep='\n',
)

X_train_bow:
<8x486 sparse matrix of type '<class 'numpy.int64'>'
	with 758 stored elements in Compressed Sparse Row format>

X_test_bow:
<1x486 sparse matrix of type '<class 'numpy.int64'>'
	with 6 stored elements in Compressed Sparse Row format>


In [8]:
# Densify the training bag-of-words matrix into a labelled table:
# one row per training label, one column per vocabulary term.
Xbow = pd.DataFrame(
    data=X_train_bow.toarray(),
    columns=vocab,
    index=y_train,
)
display(Xbow)

Unnamed: 0_level_0,000,10,13,150,1600s,17,18,180,1831,195,...,wide,winds,with,world,would,year,years,yet,you,zips
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Earth,1,0,0,0,0,0,1,0,0,0,...,0,0,1,2,0,0,0,0,0,1
Jupiter,1,1,0,1,0,0,0,0,1,0,...,1,0,1,0,0,0,1,0,0,0
Mars,0,0,0,0,0,0,0,0,0,0,...,0,0,1,1,1,0,1,0,0,0
Mercury,0,0,0,0,0,0,0,1,0,0,...,0,0,1,0,0,1,0,0,0,1
Neptune,0,0,0,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
Saturn,0,0,0,0,1,0,0,0,0,0,...,0,0,4,0,0,0,1,1,0,0
Uranus,0,0,1,0,0,0,0,0,0,1,...,0,0,1,0,0,0,3,0,0,0
Venus,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0


#### 単純ベイズ分類器の学習
事前確率は1/8ずつ(均等分布)、alpha=1.0でLaplace smoothingを適用

In [9]:
# Use a uniform prior over the categories. The number of classes is derived
# from the labels rather than hard-coded: MultinomialNB requires the length of
# class_prior to match the number of classes, so a literal 8 would break as
# soon as the dataset gains or loses a category.
n_classes = y_train.nunique()
class_prior = [1 / n_classes] * n_classes

# alpha=1.0 applies Laplace (add-one) smoothing to the term counts.
model = MultinomialNB(alpha=1.0, class_prior=class_prior)
model.fit(X_train_bow, y_train)
print(f'分類クラス: {model.classes_}')

# Accuracy on the data the model was fitted on (not a held-out estimate).
train_score = model.score(X_train_bow, y_train)
print(f'訓練データに対する精度: {train_score}')

分類クラス: ['Earth' 'Jupiter' 'Mars' 'Mercury' 'Neptune' 'Saturn' 'Uranus' 'Venus']
訓練データに対する精度: 1.0
訓練データに対する精度: 1.0


#### テストデータの予測と事後確率の計算

In [10]:
# Posterior probabilities for the test document (predict_proba already
# returns normalized probabilities), plus the predicted label.
proba = model.predict_proba(X_test_bow)
prediction = model.predict(X_test_bow)

# Tabulate the posteriors with one column per class label.
results = pd.DataFrame(data=proba, columns=model.classes_)
print('各カテゴリの事後確率:')
display(results)

# Predicted category, i.e. the one with the highest posterior probability.
print(f'\n予測結果: {prediction[0]}')

各カテゴリの事後確率:


Unnamed: 0,Earth,Jupiter,Mars,Mercury,Neptune,Saturn,Uranus,Venus
0,0.026538,0.054017,0.009438,0.10171,0.016324,0.768497,0.009868,0.013608



予測結果: Saturn


#### 結果のまとめ(パーセント表示、小数第1位まで)

In [11]:
# Summary report: the test document, the posterior of every category as a
# percentage with one decimal place, and the predicted category.
print('=' * 60)
print('【結果のまとめ】')
print('=' * 60)
print(f'テスト文書: "{X_test[0]}"')
print('\n各カテゴリの事後確率(%, 小数第1位まで):')
print('-' * 60)

# Pair each class label with its posterior and list the pairs alphabetically
# by label. Pairing up front avoids rebuilding a list of the classes and
# searching it on every iteration.
for category, prob in sorted(zip(model.classes_, proba[0])):
    prob_percent = prob * 100
    print(f'{category:10s}: {prob_percent:5.1f}%')

print('-' * 60)
print(f'\n【予測結果】: {prediction[0]}')
print('(最も高い事後確率を示したカテゴリ)')
print('=' * 60)

# Verification: the posteriors must sum to 1 (100%). Assert it instead of only
# printing it, so a violation stops the notebook rather than going unnoticed.
total_prob = proba[0].sum()
assert abs(total_prob - 1.0) < 1e-9, f'posterior probabilities sum to {total_prob}, expected 1.0'
print(f'\n検証: 事後確率の総和 = {total_prob:.10f} (≈ 1.0)')

【結果のまとめ】
テスト文書: "ring system only moon large circle"

各カテゴリの事後確率(%, 小数第1位まで):
------------------------------------------------------------
Earth     :   2.7%
Jupiter   :   5.4%
Mars      :   0.9%
Mercury   :  10.2%
Neptune   :   1.6%
Saturn    :  76.8%
Uranus    :   1.0%
Venus     :   1.4%
------------------------------------------------------------

【予測結果】: Saturn
(最も高い事後確率を示したカテゴリ)

検証: 事後確率の総和 = 1.0000000000 (≈ 1.0)
