### COM410 - Video Lecture 5 Support Material
This code was developed during the Machine Learning class at UNIVESP.

In [None]:
import pandas as pd

In [None]:
class RuleGenerator:
    """
    Generate single-attribute decision rules for the "play" weather dataset.

    For each attribute, every attribute value is mapped to the majority
    ``play`` label (``Yes`` wins ties), the examples that disagree with that
    label are counted as errors, and the attribute with the lowest error rate
    is selected as the best one.

    Attributes:
        counters (dict): ``counters[attribute][play]`` is a list of counts,
            one slot per value listed in ``_ATTRIBUTE_VALUES[attribute]``.
        error_rates (dict): Error rate per attribute (filled by
            ``generate_rules``).
        rules (dict): Rule text per attribute (filled by ``generate_rules``).
        min_error (float): Lowest error rate found; ``inf`` before
            ``generate_rules`` runs.
        best_attribute (str): Attribute with the lowest error rate; ``""``
            before ``generate_rules`` runs.
        total_examples (int): Number of rows in the dataset loaded through
            ``load_data``; stays 0 if ``load_data`` was never called.

    Methods:
        load_data(filepath): Loads the dataset from a CSV file.
        count_occurrences(df): Counts attribute values per play=Yes/No.
        generate_rules(): Builds the rules and computes the error rates.
        display_results(): Prints rules, error rates and the best attribute.
    """

    # Single source of truth for the value labels of each attribute. The
    # position of a label is its slot in the counter lists.
    _ATTRIBUTE_VALUES = {
        'outlook': ('Sunny', 'Overcast', 'Rain'),
        'temperature': ('Hot', 'Mild', 'Cool'),
        'humidity': ('High', 'Normal', 'Low'),
        'wind': ('Weak', 'Strong', 'Medium'),
    }

    # For these attributes the last slot is a catch-all: any value that is not
    # one of the leading labels is counted there (and reported under the last
    # label, i.e. 'Low' / 'Medium').
    _CATCH_ALL_ATTRIBUTES = frozenset({'humidity', 'wind'})

    def __init__(self):
        """
        Initialize the generator with empty counters and no rules.

        Counters start at zero for every (attribute, play label, value) slot;
        ``min_error`` starts at infinity so the first computed error rate
        always becomes the current best.
        """
        self.counters = {
            attribute: {'Yes': [0] * len(values), 'No': [0] * len(values)}
            for attribute, values in self._ATTRIBUTE_VALUES.items()
        }
        self.error_rates = {attribute: 0 for attribute in self._ATTRIBUTE_VALUES}
        self.rules = {attribute: "" for attribute in self._ATTRIBUTE_VALUES}

        self.min_error = float('inf')
        self.best_attribute = ""
        self.total_examples = 0

    def load_data(self, filepath):
        """
        Load the dataset from a CSV file and record its size.

        Args:
            filepath (str): The path to the CSV file containing the dataset.

        Returns:
            pandas.DataFrame: The loaded dataset.
        """
        df = pd.read_csv(filepath)
        self.total_examples = len(df)
        return df

    def _value_index(self, attribute, value):
        """Return the counter slot for ``value`` of ``attribute``."""
        values = self._ATTRIBUTE_VALUES[attribute]
        if attribute in self._CATCH_ALL_ATTRIBUTES:
            known = values[:-1]
            return known.index(value) if value in known else len(values) - 1
        try:
            return values.index(value)
        except ValueError:
            # Same exception type as a failed list lookup, but the message
            # names the offending attribute and value.
            raise ValueError(
                f"Unknown {attribute} value: {value!r}"
            ) from None

    def count_occurrences(self, df):
        """
        Count occurrences of attribute values for play=Yes/No.

        Counts are added to the existing counters, so calling this method
        several times accumulates over all the DataFrames passed in.

        Args:
            df (pandas.DataFrame): Dataset with the columns ``outlook``,
                ``temperature``, ``humidity``, ``wind`` and ``play``.

        Raises:
            ValueError: If an ``outlook`` or ``temperature`` value is not one
                of the known labels.
            KeyError: If a column is missing or ``play`` is neither ``'Yes'``
                nor ``'No'``.
        """
        for _, row in df.iterrows():
            play = row['play']

            # Resolve every slot before touching the counters so a row with an
            # invalid value is rejected as a whole instead of half-counted.
            slots = {
                attribute: self._value_index(attribute, row[attribute])
                for attribute in self._ATTRIBUTE_VALUES
            }

            for attribute, slot in slots.items():
                self.counters[attribute][play][slot] += 1

    def generate_rules(self):
        """
        Generate decision rules and calculate error rates for each attribute.

        For every attribute value the majority play label becomes the rule
        (``Yes`` on ties, including values that were never observed) and the
        minority count is added to the attribute's error total. The total is
        divided by the number of examples to obtain the error rate, and the
        attribute with the strictly lowest rate is kept as the best one (the
        first attribute wins ties).

        The results are rebuilt from the counters on every call, so calling
        this method repeatedly yields the same rules and error rates instead
        of appending to the previous ones.

        The number of examples is ``total_examples`` when it was set by
        ``load_data``; otherwise it is derived from the counters, so data that
        was counted from an in-memory DataFrame works too. When there are no
        examples at all the error rates are reported as 0.0.
        """
        self.min_error = float('inf')
        self.best_attribute = ""

        for attribute, values in self._ATTRIBUTE_VALUES.items():
            yes_counts = self.counters[attribute]['Yes']
            no_counts = self.counters[attribute]['No']

            rule_parts = []
            errors = 0
            for value, yes, no in zip(values, yes_counts, no_counts):
                if yes >= no:
                    rule_parts.append(f"If {value} then play=Yes; ")
                    errors += no
                else:
                    rule_parts.append(f"If {value} then play=No; ")
                    errors += yes

            # Fall back to the counted rows when load_data() was not used.
            total = self.total_examples or (sum(yes_counts) + sum(no_counts))

            self.rules[attribute] = "".join(rule_parts)
            self.error_rates[attribute] = errors / total if total else 0.0

            # Strict comparison: the earliest attribute wins a tie.
            if self.error_rates[attribute] < self.min_error:
                self.min_error = self.error_rates[attribute]
                self.best_attribute = attribute

    def display_results(self):
        """
        Print the rules and error rate of each attribute and the best attribute.

        If ``generate_rules`` has not been called yet, the best attribute is
        empty and an empty line is printed for its rules.
        """
        print("\nGenerated Rules:")
        for attribute in self.rules:
            print(f"\n{attribute.capitalize()} rules:")
            print(self.rules[attribute])
            print(f"Error rate: {self.error_rates[attribute]:.2f}")

        print("\nBest Attribute:")
        print(f"{self.best_attribute.capitalize()} with error rate {self.min_error:.2f}")
        print("\nBest Rules:")
        # .get() avoids a KeyError('') when no rules have been generated yet.
        print(self.rules.get(self.best_attribute, ""))

In [None]:
# Script entry point: load the CSV at ../data/one-rule.csv, derive the rules
# and print them
if __name__ == "__main__":
    generator = RuleGenerator()
    df = generator.load_data('../data/one-rule.csv')

    # Print the first rows so the loaded columns can be checked by eye
    print("Dataset preview:", df.head(), sep="\n")

    # Tally the values, derive the rules, then report; each step reads the
    # state left on the generator by the previous one
    generator.count_occurrences(df)
    generator.generate_rules()
    generator.display_results()