In [11]:
###########################################################################
# 5강 : LogisticRegression : Iris의 품종 분류
###########################################################################
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

# Load the Iris dataset and pull out the feature matrix and the label vector.
iris = load_iris()
X = iris.data
y = iris.target

# Build a tabular view of the data: one column per feature plus the label.
# Label encoding:
#   0: setosa
#   1: versicolor
#   2: virginica
df = pd.DataFrame(data=X, columns=iris.feature_names).assign(target=y)

# Fit a logistic-regression classifier on the full dataset
# (the solver is capped at 200 iterations).
clf = LogisticRegression(random_state=0, max_iter=200)
clf.fit(X, y)

# Predict labels and per-class probabilities for the first two samples.
first_two = X[:2]
predictions = clf.predict(first_two)
probabilities = clf.predict_proba(first_two)

# Score on the very same data the model was fitted on (no held-out set here).
score = clf.score(X, y)

# Show the DataFrame.
print(df)

# Show the prediction results.
print("Predictions: ", predictions)
print("Probabilities: ", probabilities)
print("Model Score: ", score)


     sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                  5.1               3.5                1.4               0.2   
1                  4.9               3.0                1.4               0.2   
2                  4.7               3.2                1.3               0.2   
3                  4.6               3.1                1.5               0.2   
4                  5.0               3.6                1.4               0.2   
..                 ...               ...                ...               ...   
145                6.7               3.0                5.2               2.3   
146                6.3               2.5                5.0               1.9   
147                6.5               3.0                5.2               2.0   
148                6.2               3.4                5.4               2.3   
149                5.9               3.0                5.1               1.8   

     target  
0         0  

In [7]:
###########################################################################
# 5강(추가) : LogisticRegression : Iris의 품종 분류 (경사하강법)
###########################################################################
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

# Local import so this cell stays runnable on its own: the scaler is needed
# because the 'saga' solver is sensitive to feature scale (see below).
from sklearn.preprocessing import StandardScaler

# Load the Iris dataset.
iris = load_iris()
X, y = iris.data, iris.target

# Convert the (raw, unscaled) features and the target into a DataFrame.
df = pd.DataFrame(data=X, columns=iris.feature_names)
df['target'] = y
# 0: setosa
# 1: versicolor
# 2: virginica

#################################################################################
# LogisticRegression with the 'saga' solver.
# scikit-learn documents that fast convergence of 'sag'/'saga' is only
# guaranteed when the features have roughly the same scale, so the features are
# standardized first; otherwise the fit can stop at max_iter before converging.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
clf = LogisticRegression(random_state=0, max_iter=200, solver='saga').fit(X_scaled, y)
#################################################################################

# Predict on the first two samples. The model was fitted on scaled features,
# so every input passed to it must be scaled with the same scaler.
predictions = clf.predict(X_scaled[:2, :])
probabilities = clf.predict_proba(X_scaled[:2, :])
# Score on the same (scaled) data the model was fitted on.
score = clf.score(X_scaled, y)

# Show the DataFrame (raw feature values).
print(df)

# Show the prediction results.
print("Predictions: ", predictions)
print("Probabilities: ", probabilities)
print("Model Score: ", score)

     sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                  5.1               3.5                1.4               0.2   
1                  4.9               3.0                1.4               0.2   
2                  4.7               3.2                1.3               0.2   
3                  4.6               3.1                1.5               0.2   
4                  5.0               3.6                1.4               0.2   
..                 ...               ...                ...               ...   
145                6.7               3.0                5.2               2.3   
146                6.3               2.5                5.0               1.9   
147                6.5               3.0                5.2               2.0   
148                6.2               3.4                5.4               2.3   
149                5.9               3.0                5.1               1.8   

     target  
0         0  



In [3]:
###########################################################################
# 5강 : GaussianNB : Iris의 품종 분류
###########################################################################
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

# Load the Iris dataset directly as a (features, labels) pair.
X, y = load_iris(return_X_y=True)

# Report how many samples the full dataset contains.
print(f"Total number of data points in the dataset: {X.shape[0]}")

# Hold out half of the samples for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=0
)

# Fit Gaussian Naive Bayes on the training half, then label the test half.
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)

# Count the test samples whose predicted label differs from the true label.
n_test = X_test.shape[0]
mislabeled_points = (y_test != y_pred).sum()
print("Number of mislabeled points out of a total %d points: %d"
      % (n_test, mislabeled_points))

# Accuracy = fraction of test samples that were labelled correctly.
accuracy = (n_test - mislabeled_points) / n_test
print(f"Accuracy: {accuracy:.4f}")


Total number of data points in the dataset: 150
Number of mislabeled points out of a total 75 points: 4
Accuracy: 0.9467


In [5]:
###########################################################################
# 5강(추가) : GaussianNB : Iris의 품종 분류(데이터 스케일링)
###########################################################################

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Load the Iris dataset directly as a (features, labels) pair.
X, y = load_iris(return_X_y=True)

# Report how many samples the full dataset contains.
print(f"Total number of data points in the dataset: {X.shape[0]}")

# Hold out half of the samples for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=0
)

################################################
# Feature scaling (zero mean, unit variance).
# The scaler is fitted on the training split only and then reused for the
# test split, so the scaling parameters never depend on the test data.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
################################################

# Fit Gaussian Naive Bayes on the scaled training data, then label the scaled test data.
# NOTE(review): the recorded output of this cell (4 of 75 mislabeled) matches the
# unscaled GaussianNB run — presumably per-feature standardization barely
# affects this model; confirm.
gnb = GaussianNB()
gnb.fit(X_train_scaled, y_train)
y_pred = gnb.predict(X_test_scaled)

# Count the test samples whose predicted label differs from the true label.
n_test = X_test.shape[0]
mislabeled_points = (y_test != y_pred).sum()
print("Number of mislabeled points out of a total %d points: %d"
      % (n_test, mislabeled_points))

# Compute and print the accuracy on the test split.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")


Total number of data points in the dataset: 150
Number of mislabeled points out of a total 75 points: 4
Accuracy: 0.9467
