In [124]:
from sklearn.datasets import make_classification

# Generate a 2-class classification dataset with 5 features and 200 instances
X, y = make_classification(n_samples=200, n_features=50, n_informative=2,  n_classes=2, random_state=42)

In [2]:
from AITIA.heuristics import DisjunctSize, DisjunctClassPercentage, KDisagreeingNeighbors

In [127]:
import numpy as np
from scipy.stats import norm
import pandas as pd

class ClassLikelihoodDifference:

    def __init__(self):
        self.data = None
        self.classes = None

    def fit(self, X, y, categorical_idx=[]):
        """
        Fit the Likelihood class with the dataset.

        Parameters:
        X (array-like): A 2D array (MxN) representing instances with features.
        y (array-like): An array containing the class labels corresponding to each instance in X.

        Returns:
        None
        """
        if len(X) != len(y):
            raise ValueError("X and y must have the same number of instances.")
        
        # Store the data description in a dictionary format
        self.data = self.class_stats(X, y, categorical_idx)
        self.classes = np.unique(y)

    def calculate(self, X, y):
        """
        Calculate the class likelihood difference for a list of new instances based on the training set statistics.

        Parameters:
        X (array-like): List of new instances for which class likelihood differences need to be calculated.
        y (array-like): The target labels corresponding to the dataset.

        Returns:
        list: A list of class likelihood differences for each input instance in X.
        """

        # Check if X is a supported data type
        if not isinstance(X, (np.ndarray, pd.DataFrame, pd.Series)):
            raise ValueError("X must be a NumPy array, pandas DataFrame, or pandas Series.")

        # Convert X to a NumPy array if it's a DataFrame or Series
        if isinstance(X, (pd.DataFrame, pd.Series)):
            X = X.values
        
        # Ensure X is 2-dimensional in shape
        if X.ndim == 1:
            X = X.reshape(-1, 1)

        # Initialize a list to store the likelihood_differences for new instances
        likelihood_difference = []
        
        for instance, instance_class in zip(X, y):
            # Calculate the disagreeing neighbors scores for the instances
            likelihood_difference.append(self.class_likelihood_difference(instance, instance_class))

        return likelihood_difference
    
    def class_stats(self, X, y, categorical_idx):
        # Get the unique class labels from the 'y' array
        num_classes = np.unique(y)
        # Get the number of features in the input data 'X'
        num_features = X.shape[1]

        # Create an empty dictionary to store the class statistics
        class_dict = {}

        for feature in range(num_features):
            # Check if the current feature is in the list of categorical features
            if feature in categorical_idx:
                feature_dict = {'type': 'categorical', 'counts': {}}
                all_categories = np.unique(X[:, feature])
                
                for class_val in num_classes:
                    x_class = X[y == class_val, feature]
                    
                    # Calculate the counts of unique categories and store in a dictionary
                    unique, counts = np.unique(x_class, return_counts=True)
                    class_counts = dict(zip(unique, counts))
                    
                    # Ensure that every possible category is included, even if count is 0
                    for category in all_categories:
                        if category not in class_counts:
                            class_counts[category] = 0
                    
                    feature_dict['counts'][class_val] = class_counts
                    
                class_dict[feature] = feature_dict
            else:
                # If it's continuous, create a dictionary to store mean and standard deviation
                feature_dict = {'type': 'continuous', 'mean': {}, 'std': {}}
                
                for class_val in num_classes:
                    x_class = X[y == class_val, feature]
                    
                    # Calculate the mean and standard deviation and store in the dictionary
                    feature_dict['mean'][class_val] = np.mean(x_class)
                    feature_dict['std'][class_val] = np.std(x_class)
                
                class_dict[feature] = feature_dict

        return class_dict
    
    def class_likelihood(self, instance, target_class):
        """
        Calculate the class likelihood for an instance belonging to a certain class.

        Parameters:
        instance (list or array): A 1-D array or list representing the instance for which to calculate the class likelihood.
        target_class (str): The class label for which to calculate the likelihood.

        Returns:
        float: The class likelihood for the given instance and class.
        """

        likelihood = 1.0
        for idx, feature in self.data.items():
            if feature['type'] == 'continuous':
                likelihood *= norm.cdf(instance[idx], loc=feature['mean'][target_class], scale=feature['std'][target_class])
            else:
                if instance[idx] not in self.data[idx]['counts'][target_class]:
                    raise ValueError(f"Category {instance[idx]} not found in training set for feature {idx}")
                class_total = 0
                for class_val in self.data[idx]['counts'].keys():
                    class_total += self.data[idx]['counts'][class_val][instance[idx]]
                likelihood *= self.data[idx]['counts'][target_class][instance[idx]]/class_total

        return likelihood
    
    def class_likelihood_difference(self, instance, instance_class):
        """
        Calculate the class likelihood difference for an instance belonging to a certain class.

        Parameters:
        instance (list or array): A 1-D array or list representing the instance for which to calculate the class likelihood difference.
        instance_class (any): The target label corresponding to the instance.

        Returns:
        float: The class likelihood difference for the given instance and class.
        """
        # Calculate class likelihood for the instance's actual class
        likelihood_actual = self.class_likelihood(instance, instance_class)
        
        # Calculate class likelihood for all other classes
        likelihood_other = [self.class_likelihood(instance, class_label) for class_label in self.classes if class_label != instance_class]
        print(likelihood_actual, likelihood_other)
        # Calculate the difference between the actual class likelihood and the maximum likelihood of other classes
        likelihood_difference = likelihood_actual - max(likelihood_other)
        
        return likelihood_difference

In [33]:
X[:,1] = X[:,1].astype(int)

In [105]:
X_[:,1] = X_[:,1].astype(int)


In [128]:
test = Likelihood()
test.fit(X, y)
np.array(test.calculate(X,y))

1.0635197690176396e-20 [9.461117441844376e-26]
2.4813202693284795e-22 [2.322776168251409e-25]
4.0946767309735743e-19 [8.211143842612573e-22]
1.3183953619552698e-21 [3.968557008244073e-24]
8.6839576008797e-24 [1.1246915077282813e-25]
2.279843045940032e-23 [1.8285839125856967e-27]
1.4681548906335292e-20 [2.92822041818142e-21]
1.9078516488274347e-22 [4.243834965134074e-22]
3.1921610078626238e-21 [2.3212502621647416e-22]
1.3216916942638696e-26 [1.4722987856040797e-27]
2.20855813377085e-25 [1.023554635845725e-29]
1.3584142308977068e-24 [1.471683860791619e-25]
2.3849249804452355e-22 [7.527125807080779e-23]
1.13984550315395e-18 [6.189637615300281e-20]
4.9061980752711246e-20 [1.2990092416564226e-20]
8.978771619086836e-18 [1.2507938888179365e-18]
4.396704529714967e-20 [3.641193690278124e-23]
2.2764382776579868e-24 [8.103578777520989e-25]
4.237192350150522e-17 [2.9861666782860645e-18]
7.183185157888985e-24 [7.615787581409235e-25]
5.350664016474771e-24 [1.6566253790625766e-25]
1.6269011069495279e

array([ 1.06351031e-20,  2.47899749e-22,  4.08646559e-19,  1.31442680e-21,
        8.57148845e-24,  2.27966019e-23,  1.17533285e-20, -2.33598332e-22,
        2.96003598e-21,  1.17446182e-26,  2.20845578e-25,  1.21124584e-24,
        1.63221240e-22,  1.07794913e-18,  3.60718883e-20,  7.72797773e-18,
        4.39306334e-20,  1.46608040e-24,  3.93857568e-17,  6.42160640e-24,
        5.18500148e-24,  5.30648059e-22, -3.41843826e-21,  5.91955192e-23,
       -3.93846222e-25,  8.00472924e-23,  2.46428719e-21,  9.94888753e-23,
        2.47051594e-20,  1.16154372e-20,  4.52314153e-19,  1.40689219e-21,
        1.96347104e-30,  4.63124788e-19,  9.52991553e-27,  2.97307447e-26,
       -4.58628141e-19,  9.48272622e-27,  2.43673594e-25, -1.08397379e-23,
        2.33920132e-21, -4.66022174e-23, -1.98865002e-20, -1.56520845e-28,
        1.10165695e-20,  5.17517503e-22,  1.58915486e-21,  1.75209629e-25,
        6.81258959e-24, -7.51868370e-22,  1.16182718e-22,  2.44474459e-18,
        8.40725452e-21, -

In [98]:
test.data

{0: {'type': 'continuous',
  'mean': {0: -0.2764548269369244, 1: 0.18599934121329492},
  'std': {0: 0.7729954432823484, 1: 0.7069470696067879}},
 1: {'type': 'categorical',
  'counts': {0: {-1.0: 1, 0.0: 94, 1.0: 3, 2.0: 1},
   1: {-1.0: 4, 0.0: 97, 1.0: 0, 2.0: 0}}},
 2: {'type': 'continuous',
  'mean': {0: -0.08900637753331496, 1: -0.03270705991201869},
  'std': {0: 0.9846292273840472, 1: 1.0320470195747493}},
 3: {'type': 'continuous',
  'mean': {0: -0.9434349784126167, 1: 0.9084386235287324},
  'std': {0: 1.0039145522153368, 1: 0.8131474981448837}},
 4: {'type': 'continuous',
  'mean': {0: 0.09869316172668705, 1: 0.05274444844312825},
  'std': {0: 1.3233479741415304, 1: 1.4018248459750393}}}

In [97]:
test.class_likelihood(np.array([0.5,-1,0.2,0.3,0.4]),1)

0.042984838529665136

In [70]:
x_val = 0.0
class_total = 0
for class_val in test.data[1]['counts'].keys():
    class_total += test.data[1]['counts'][class_val][x_val]

In [71]:
class_total

191

In [68]:
test.data[1]['counts'][class_val][x_val]

94

In [65]:
test.data[1]['counts'][class_val][x_val]

0

In [58]:
total = [v for v in test.data[1]['counts']]

{0: {-1.0: 1, 0.0: 94, 1.0: 3, 2.0: 1}, 1: {-1.0: 4, 0.0: 97, 1.0: 0, 2.0: 0}}

In [55]:
for feature in test.data.values():
    print(feature['mean'])

-0.2764548269369244


KeyError: 'mean'

In [3]:
test = KDisagreeingNeighbors()
test.fit(X,y,n_neighbors=11)
test.calculate(X_,y_)

[0.45454545454545453,
 0.6363636363636364,
 0.45454545454545453,
 0.6363636363636364,
 0.9090909090909091,
 0.8181818181818182,
 0.45454545454545453,
 0.36363636363636365,
 0.6363636363636364,
 0.2727272727272727,
 0.7272727272727273,
 0.7272727272727273,
 0.6363636363636364,
 0.45454545454545453,
 0.7272727272727273,
 0.2727272727272727,
 0.5454545454545454,
 0.5454545454545454,
 0.6363636363636364,
 0.6363636363636364,
 0.5454545454545454,
 0.36363636363636365,
 0.6363636363636364,
 0.36363636363636365,
 0.6363636363636364,
 0.6363636363636364,
 0.6363636363636364,
 0.36363636363636365,
 0.36363636363636365,
 0.7272727272727273,
 0.7272727272727273,
 0.9090909090909091,
 0.45454545454545453,
 0.6363636363636364,
 1.0,
 0.45454545454545453,
 0.45454545454545453,
 0.0,
 0.5454545454545454,
 0.7272727272727273,
 0.8181818181818182,
 0.7272727272727273,
 0.5454545454545454,
 0.6363636363636364,
 0.5454545454545454,
 0.6363636363636364,
 0.45454545454545453,
 0.7272727272727273,
 0.454545

In [8]:
test = DisjunctSize()
test.fit(X,y)
test.calculate(X_)

[0.007547169811320755,
 0.2641509433962264,
 0.19622641509433963,
 0.21132075471698114,
 0.2641509433962264,
 0.0037735849056603774,
 0.03018867924528302,
 0.033962264150943396,
 0.0037735849056603774,
 0.8603773584905661,
 0.8603773584905661,
 0.01509433962264151,
 0.007547169811320755,
 0.007547169811320755,
 0.21132075471698114,
 0.007547169811320755,
 0.033962264150943396,
 0.0037735849056603774,
 0.19622641509433963,
 0.12830188679245283,
 0.2641509433962264,
 0.06792452830188679,
 0.21132075471698114,
 0.0037735849056603774,
 0.01509433962264151,
 0.49056603773584906,
 0.007547169811320755,
 0.19622641509433963,
 0.44528301886792454,
 0.03018867924528302,
 0.045283018867924525,
 0.04150943396226415,
 1.0,
 0.8603773584905661,
 0.15471698113207547,
 0.033962264150943396,
 0.8603773584905661,
 0.03018867924528302,
 0.0037735849056603774,
 0.8603773584905661,
 0.12830188679245283,
 0.03018867924528302,
 0.17358490566037735,
 0.007547169811320755,
 0.007547169811320755,
 0.1547169811

In [7]:
test = DisjunctClassPercentage()
test.fit(X,y,max_depth=2,balanced=True)
test.calculate(X_,y_)

[0.3542600896860987,
 0.745119305856833,
 0.22321428571428573,
 0.745119305856833,
 0.745119305856833,
 0.25488069414316705,
 0.7767857142857143,
 0.25488069414316705,
 0.25488069414316705,
 0.745119305856833,
 0.745119305856833,
 0.22321428571428573,
 0.7767857142857143,
 0.25488069414316705,
 0.745119305856833,
 0.25488069414316705,
 0.25488069414316705,
 0.25488069414316705,
 0.7767857142857143,
 0.7767857142857143,
 0.745119305856833,
 0.7767857142857143,
 0.25488069414316705,
 0.7767857142857143,
 0.7767857142857143,
 0.745119305856833,
 0.25488069414316705,
 0.22321428571428573,
 0.22321428571428573,
 0.22321428571428573,
 0.22321428571428573,
 0.7767857142857143,
 0.22321428571428573,
 0.25488069414316705,
 0.25488069414316705,
 0.7767857142857143,
 0.745119305856833,
 0.3542600896860987,
 0.22321428571428573,
 0.25488069414316705,
 0.22321428571428573,
 0.22321428571428573,
 0.25488069414316705,
 0.22321428571428573,
 0.745119305856833,
 0.25488069414316705,
 0.7767857142857143