In [2]:
import numpy as np
from typing import List, Tuple, Dict
import csv

In [3]:
arr=np.array([[1,3,4],[4,5,6]])
print(arr[0,[1,2]])

[3 4]


In [4]:
new_array=np.zeros((2,4))
print(new_array)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [5]:
# create_training_data() store training dataset under 2d array 

def create_training_data():
    
    # Create the training dataset for tennis prediction.
    return np.array([
        ['Sunny', 'Hot', 'High', 'Weak', 'No'],
        ['Sunny', 'Hot', 'High', 'Strong', 'No'],
        ['Overcast', 'Hot', 'High', 'Weak', 'Yes'],
        ['Rain', 'Mild', 'High', 'Weak', 'Yes'],
        ['Rain', 'Cool', 'Normal', 'Weak', 'Yes'],
        ['Rain', 'Cool', 'Normal', 'Strong', 'No'],
        ['Overcast', 'Cool', 'Normal', 'Strong', 'Yes'],
        ['Overcast', 'Mild', 'High', 'Weak', 'No'],
        ['Sunny', 'Cool', 'Normal', 'Weak', 'Yes'],
        ['Rain', 'Mild', 'Normal', 'Weak', 'Yes']
    ])
    
training_data=create_training_data()
print(training_data.shape)

(10, 5)


In [17]:
def compute_prior_probabilities(train_data):
    """
    Calculate prior probabilities P(Play Tennis = Yes/No).
    Args:
        train_data: Training dataset
    Returns:
        Array of prior probabilities [P(No), P(Yes)]
    """
    
    class_names = ['No', 'Yes']
    total_samples = len(train_data)
    prior_probs = np.zeros(len(class_names))

    
    for i,class_name in enumerate(class_names):
        class_count=len(train_data[train_data[:,-1]==class_name])
        prior_probs[i]=class_count/total_samples
    return prior_probs
    
train_data=create_training_data()
print(compute_prior_probabilities(train_data))

[0.4 0.6]


In [22]:
def compute_conditional_probabilities(train_data):
    """
    Calculate conditional probabilities P(Feature|Class) for all features.

    Args:
        train_data: Training dataset

    Returns:
        Tuple of (conditional_probabilities, feature_values)
    """
    class_names = ['No', 'Yes']
    n_features = train_data.shape[1] - 1  # Exclude target column
    conditional_probs = []
    feature_values = []
    
    for feature_idx in range(n_features):
        unique_values=np.unique(train_data[:,feature_idx])
        feature_values.append(unique_values.tolist())
        
        feature_cond_probs = np.zeros((len(class_names), len(unique_values)))
        for class_idx, class_name in enumerate(class_names):
            feature_count=len(train_data[train_data[:,-1]==class_name])
            for idx, value in enumerate(unique_values):
                unique_feature_count=train_data[(train_data[:,feature_idx]==value) & (train_data[:,-1]==class_name)]
                feature_cond_probs[class_idx,idx]=len(unique_feature_count)/feature_count
        conditional_probs.append(feature_cond_probs)
    return conditional_probs,feature_values
    
train_data=create_training_data()
conditional_probs, feature_values  = compute_conditional_probabilities(train_data)
print("x1 = ",feature_values[0])
print("x2 = ",feature_values[1])
print("x3 = ",feature_values[2])
print("x4 = ",feature_values[3])

print(conditional_probs[0])
print(conditional_probs[1])
print(conditional_probs[2])
print(conditional_probs[3])

x1 =  ['Overcast', 'Rain', 'Sunny']
x2 =  ['Cool', 'Hot', 'Mild']
x3 =  ['High', 'Normal']
x4 =  ['Strong', 'Weak']
[[0.25       0.25       0.5       ]
 [0.33333333 0.5        0.16666667]]
[[0.25       0.5        0.25      ]
 [0.5        0.16666667 0.33333333]]
[[0.75       0.25      ]
 [0.33333333 0.66666667]]
[[0.5        0.5       ]
 [0.16666667 0.83333333]]


In [23]:
train_data=create_training_data()
def train_naive_bayes(train_data):
    prior_probabilities=compute_prior_probabilities(train_data)
    conditional_probabilities, feature_names  = compute_conditional_probabilities(train_data)
    return prior_probabilities, conditional_probabilities, feature_names
prior_probs, conditional_probs, feature_names = train_naive_bayes(train_data)
print(prior_probs)
print(conditional_probs)
print(feature_names)

[0.4 0.6]
[array([[0.25      , 0.25      , 0.5       ],
       [0.33333333, 0.5       , 0.16666667]]), array([[0.25      , 0.5       , 0.25      ],
       [0.5       , 0.16666667, 0.33333333]]), array([[0.75      , 0.25      ],
       [0.33333333, 0.66666667]]), array([[0.5       , 0.5       ],
       [0.16666667, 0.83333333]])]
[['Overcast', 'Rain', 'Sunny'], ['Cool', 'Hot', 'Mild'], ['High', 'Normal'], ['Strong', 'Weak']]


In [24]:
# Tìm vị trí của từng feature_value  ['Overcast', 'Rain', 'Sunny']
def get_feature_index(feature_value, feature_values):
    return np.where(feature_value==feature_values)[0][0]
print(get_feature_index("Sunny",np.array(['Overcast', 'Rain', 'Sunny'])))

2


In [None]:
def predict_tennis(
        X, prior_probabilities, conditional_probabilities, feature_names
    ):
    """
    X = ['Sunny','Cool', 'High', 'Strong']
    
    Make a prediction for given features.

    Args:
        X: List of feature values [Outlook, Temperature, Humidity, Wind]
        prior_probabilities: Prior probabilities for each class
        conditional_probabilities: Conditional probabilities for each feature
        feature_names: Names/values for each feature

    Returns:
        Tuple of (prediction, probabilities)
    """
    class_names = ['No', 'Yes']
    class_probabilities=[]
    for class_idx in range(len(class_names)):
        prob=prior_probabilities[class_idx]
        for feature_idx,feature_value in enumerate(X):
            index=get_feature_index(feature_value,np.array(feature_names[feature_idx]))
            prob*=conditional_probs[feature_idx][class_idx,index]
        class_probabilities.append(prob)
    
    # Optional    
    normalized_probs = [p for p in class_probabilities] 
    print(normalized_probs)
    predicted_idx=np.argmax(normalized_probs)
    prediction=class_names[predicted_idx]
    print(prediction)
        
X = ['Sunny','Cool', 'High', 'Strong']
prior_probs, conditional_probs, feature_names = train_naive_bayes(train_data)
prediction=predict_tennis(
    X, prior_probs, conditional_probs, feature_names
)
if prediction=="Yes":
    print("Ad should go!")
else:
    print("Ad should not go!")
    

[np.float64(0.018750000000000003), np.float64(0.002777777777777777)]
No
Ad should not go!


In [16]:
def load_training_data_from_csv(file_path):
    with open(file_path, 'r') as f:
        reader = csv.reader(f)
        next(reader)  # Bỏ dòng header
        data = []
        for row in reader:
            # Bỏ cột 'Day' (chỉ lấy từ Outlook đến PlayTennis)
            data.append(row[1:])  
        return np.array(data)
    
print(load_training_data_from_csv("./Data/data.csv"))

[['Sunny' 'Hot' 'High' 'Weak' 'No']
 ['Sunny' 'Hot' 'High' 'Strong' 'No']
 ['Overcast' 'Hot' 'High' 'Weak' 'Yes']
 ['Rain' 'Mild' 'High' 'Weak' 'Yes']
 ['Rain' 'Cool' 'Normal' 'Weak' 'Yes']
 ['Rain' 'Cool' 'Normal' 'Strong' 'No']
 ['Overcast' 'Cool' 'Normal' 'Strong' 'Yes']
 ['Overcast' 'Mild' 'High' 'Weak' 'No']
 ['Sunny' 'Cool' 'Normal' 'Weak' 'Yes']
 ['Rain' 'Mild' 'Normal' 'Weak' 'Yes']]
