# CÂU 1: Gaussian Naive Bayes on the Iris dataset

In [1]:
import numpy as np

In [19]:
# Load the raw Iris CSV. With dtype=None genfromtxt infers a type per column,
# so each row comes back as one record and the array is one-dimensional.
data = np.genfromtxt('iris.data', delimiter=',', dtype=None, encoding=None)
print(type(data), data.shape)

# Expand the records into a regular 2-D array (one column per field).
data = np.array([list(row) for row in data])
print(data.shape)

# The first four columns are the numeric features, the fifth is the class label.
X = data[:, :4].astype(float)
y = data[:, 4]

# Encode every label as its index in the sorted array of unique labels.
# return_inverse produces those indices in a single vectorised pass instead of
# searching `classes` once per sample.
classes, y_num = np.unique(y, return_inverse=True)

<class 'numpy.ndarray'> (150,)
(150, 5)


In [20]:
# Sanity-check the dimensions of the feature matrix and of the encoded labels.
for caption, array in (("X shape:", X), ("y_num shape:", y_num)):
    print(caption, array.shape)

X shape: (150, 4)
y_num shape: (150,)


In [21]:
# Reproducible 70/30 train/test split: shuffle the row indices once with a
# fixed seed, then cut the shuffled order at the 70% mark.
np.random.seed(42)
idx = np.random.permutation(len(X))
cut = int(0.7 * len(X))
train_idx, test_idx = idx[:cut], idx[cut:]

# Note: the string labels `y` (not the integer codes) are used as targets.
X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

In [22]:
# Fit the Gaussian naive Bayes parameters on the training split only.
# Classes are taken from y_train (string labels): a class that never occurs in
# the training data would otherwise yield an empty slice, a NaN mean and a
# zero prior.
classes = np.unique(y_train)

# Small variance floor, proportional to the largest per-feature variance of the
# training data (same idea as scikit-learn's GaussianNB `var_smoothing`), so a
# feature that is constant within a class cannot cause a division by zero when
# the Gaussian density is evaluated.
var_floor = 1e-9 * X_train.var(axis=0).max()

priors = {}  # class label -> P(class)
mean = {}    # class label -> per-feature mean
var = {}     # class label -> per-feature variance (with floor added)
for c in classes:
    X_c = X_train[y_train == c]  # boolean mask selects the rows of class c
    priors[c] = len(X_c) / len(X_train)
    mean[c] = X_c.mean(axis=0)
    var[c] = X_c.var(axis=0) + var_floor

In [23]:
def pdf(x, mu, sigma2):
    """Evaluate the univariate Gaussian density N(x | mu, sigma2).

    Parameters
    ----------
    x : value(s) at which the density is evaluated.
    mu : mean of the distribution.
    sigma2 : variance (not the standard deviation) of the distribution.

    Returns
    -------
    The density value(s); the arithmetic is element-wise, so NumPy arrays
    broadcast against each other.
    """
    squared_deviation = (x - mu) ** 2
    normaliser = 1.0 / np.sqrt(2.0 * np.pi * sigma2)
    return normaliser * np.exp(-squared_deviation / (2.0 * sigma2))


In [24]:
def predict_one(x):
    """Return the most probable class label for a single sample.

    Uses the module-level Gaussian naive Bayes parameters `classes`,
    `priors`, `mean` and `var` (the latter three keyed by class label).

    Parameters
    ----------
    x : sequence of feature values for one sample.

    Returns
    -------
    The label from `classes` with the highest log-posterior score; on ties
    the first such label in `classes` order wins.
    """
    x = np.asarray(x, dtype=float)
    log_probs = {}
    for c in classes:
        mu = np.asarray(mean[c], dtype=float)
        sigma2 = np.asarray(var[c], dtype=float)
        # Sum of per-feature Gaussian log-densities, evaluated directly in log
        # space. Taking np.log of the density itself underflows to log(0) = -inf
        # for samples far from the class mean, which makes every class tie.
        log_likelihood = -0.5 * np.sum(
            np.log(2.0 * np.pi * sigma2) + (x - mu) ** 2 / sigma2
        )
        log_probs[c] = np.log(priors[c]) + log_likelihood
    return max(log_probs, key=log_probs.get)

In [25]:
# Classify every test sample, one row at a time.
y_pred = np.array(list(map(predict_one, X_test)))

In [9]:
# Accuracy = share of test samples whose predicted label equals the true label.
accuracy = (y_pred == y_test).mean()
print("Accuracy:", accuracy)

Accuracy: 0.9555555555555556


In [10]:
# Show the first few test samples next to their predicted and true labels.
n_shown = 5
for i in range(n_shown):
    print(f"X_test[{i}] = {X_test[i]}, Predicted = {y_pred[i]}, Actual = {y_test[i]}")

X_test[0] = [4.6 3.4 1.4 0.3], Predicted = Iris-setosa, Actual = Iris-setosa
X_test[1] = [6.8 3.  5.5 2.1], Predicted = Iris-virginica, Actual = Iris-virginica
X_test[2] = [6.3 3.3 6.  2.5], Predicted = Iris-virginica, Actual = Iris-virginica
X_test[3] = [4.7 3.2 1.3 0.2], Predicted = Iris-setosa, Actual = Iris-setosa
X_test[4] = [6.1 2.9 4.7 1.4], Predicted = Iris-versicolor, Actual = Iris-versicolor


# CÂU 2: Categorical Naive Bayes on the Letter Recognition dataset

In [11]:
import numpy as np

# Read the data file; every field is kept as text at this point.
data = np.genfromtxt('letter-recognition.data', delimiter=',', dtype=str)

# Column 0 is the label (letters A-Z); the remaining 16 columns are the
# features, converted to integers.
y, X = data[:, 0], data[:, 1:].astype(int)

In [12]:
# Reproducible 80/20 train/test split over shuffled row indices.
np.random.seed(42)
indices = np.arange(len(X))
np.random.shuffle(indices)
n_train = int(0.8 * len(X))
train_idx, test_idx = indices[:n_train], indices[n_train:]

X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

In [13]:
# Class priors P(c): relative frequency of each label in the training set.
classes, class_counts = np.unique(y_train, return_counts=True)
n_classes = len(classes)
class_probs = class_counts / float(len(y_train))

In [14]:
# Dimensions of the likelihood table: the number of features, and the number of
# distinct values a feature may take (0 .. largest value seen in training).
n_features = np.shape(X_train)[1]
V = np.max(X_train) + 1

In [15]:
# Likelihood table: feature_probs[ci, i, v] = P(feature i == v | class ci).
feature_probs = np.zeros((n_classes, n_features, V), dtype=float)

for ci, label in enumerate(classes):
    # Training rows that belong to this class.
    class_rows = X_train[y_train == label]
    # Transposing lets the inner loop walk the features column by column.
    for i, column in enumerate(class_rows.T):
        # How often each value 0..V-1 occurs for feature i within the class.
        value_counts = np.bincount(column, minlength=V)
        # Laplace smoothing: (count + 1) / (total + V), so values never seen
        # for this class still get a non-zero probability.
        feature_probs[ci, i, :] = (value_counts + 1) / (value_counts.sum() + V)

In [16]:
def predict(X_new):
    """Predict a class label for each sample with the categorical naive Bayes model.

    Uses the module-level `classes`, `class_probs`, `n_features` and
    `feature_probs` (indexed as [class, feature, value]).

    Parameters
    ----------
    X_new : array-like of non-negative integers, shape (n_samples, n_features).
        A single 1-D sample is also accepted and treated as one row. Columns
        beyond `n_features` are ignored.

    Returns
    -------
    numpy.ndarray with one label per sample (empty array for empty input).

    Raises
    ------
    IndexError
        If a sample has fewer than `n_features` values, or a feature value
        lies outside the range covered by `feature_probs`. Negative values
        are rejected explicitly because NumPy indexing would otherwise wrap
        them around silently and return a wrong probability.
    """
    samples = np.asarray(X_new)
    if samples.size == 0:
        # Nothing to classify; same result as looping over an empty input.
        return np.array([])
    samples = np.atleast_2d(samples)
    if samples.shape[1] < n_features:
        raise IndexError(
            "each sample needs at least {} feature values, got {}".format(
                n_features, samples.shape[1]
            )
        )
    samples = samples[:, :n_features]

    n_values = feature_probs.shape[2]
    if samples.min() < 0 or samples.max() >= n_values:
        raise IndexError(
            "feature values must lie in [0, {})".format(n_values)
        )

    # Take the logs once for the whole table instead of once per lookup.
    log_priors = np.log(class_probs)
    log_likelihoods = np.log(feature_probs)

    # For every class, pick the log-probability of each sample's value of each
    # feature; the result has shape (n_classes, n_samples, n_features).
    feature_idx = np.arange(n_features)
    per_feature = log_likelihoods[:, feature_idx, samples]

    # Log-score per class and sample: log prior + sum of feature log-likelihoods.
    log_scores = log_priors[:, None] + per_feature.sum(axis=2)

    # Highest score wins; argmax returns the first class on ties.
    return np.asarray(classes)[np.argmax(log_scores, axis=0)]

In [17]:
# Evaluate on the held-out test set: fraction of correctly predicted labels.
y_pred = predict(X_test)
accuracy = (y_pred == y_test).mean()
print("Accuracy = {:.4f}".format(accuracy))

Accuracy = 0.7278


In [18]:
# Two hand-written feature vectors (16 integer features each) to classify.
sample = np.array([
    [2, 8, 3, 5, 1, 8, 13, 0, 6, 6, 10, 8, 0, 8, 0, 8],
    [5, 12, 3, 7, 2, 10, 5, 5, 4, 13, 3, 9, 2, 8, 4, 10],
])

# Predict a letter for each vector and report it with a 1-based sample number.
pred_sample = predict(sample)
for i, p in enumerate(pred_sample):
    print(f"Mẫu {i+1} dự đoán là: {p}")

Mẫu 1 dự đoán là: T
Mẫu 2 dự đoán là: Q
