# Question 3

In [76]:
import pandas as pd
import numpy as np


## read data

In [77]:
df = pd.read_csv('cardio.csv', ) 

In [78]:

# Summary statistics
print(df.describe())



                 id           age        gender        height        weight  \
count  70000.000000  70000.000000  70000.000000  69904.000000  70000.000000   
mean   49972.419900  19468.865814      1.349571    164.358263     74.205690   
std    28851.302323   2467.251667      0.476838      8.211429     14.395757   
min        0.000000  10798.000000      1.000000     55.000000     10.000000   
25%    25006.750000  17664.000000      1.000000    159.000000     65.000000   
50%    50001.500000  19703.000000      1.000000    165.000000     72.000000   
75%    74889.250000  21327.000000      2.000000    170.000000     82.000000   
max    99999.000000  23713.000000      2.000000    250.000000    200.000000   

              ap_hi         ap_lo   cholesterol          gluc         smoke  \
count  70000.000000  70000.000000  70000.000000  69978.000000  69970.000000   
mean     128.817286     96.630414      1.366871      1.226500      0.088138   
std      154.011419    188.472530      0.680250    

## handle missing value

In [79]:
# Check for missing values
print(df.isnull().sum())

id               0
age              0
gender           0
height          96
weight           0
ap_hi            0
ap_lo            0
cholesterol      0
gluc            22
smoke           30
alco           140
active          14
cardio           0
dtype: int64


# replace missing values

In [80]:

for column in df.columns:
    mode = df[column].mode()[0]  # Calculate the mode for the column
    df[column].fillna(mode, inplace=True)  # Replace missing values with the mode


## convert numberic data to nominal 

In [81]:
# Define age bins and labels
age_bins = [0, 5000, 10000, 15000, 20000, 25000, float('inf')]
age_labels =  ['0-4999', '5000-9999', '10000-14999', '15000-19999', '20000-24999', '25000+']

# Cut the 'Age' column into categories
df['age'] = pd.cut(df['age'], bins=age_bins, labels=age_labels, right=False)
print(df['age'])

0        15000-19999
1        20000-24999
2        15000-19999
3        15000-19999
4        15000-19999
            ...     
69995    15000-19999
69996    20000-24999
69997    15000-19999
69998    20000-24999
69999    20000-24999
Name: age, Length: 70000, dtype: category
Categories (6, object): ['0-4999' < '5000-9999' < '10000-14999' < '15000-19999' < '20000-24999' < '25000+']


In [82]:
height_bins = [-float('inf'), 140, 160, 170, 180, float('inf')]
height_labels = ['Below 140', '140-160', '160-170', '170-180', 'Above 180']

# Discretize the 'height' column
df['height'] = pd.cut(df['height'], bins=height_bins, labels=height_labels, include_lowest=True)


print(df['height'])

0          160-170
1          140-160
2          160-170
3          160-170
4          140-160
           ...    
69995      160-170
69996      140-160
69997    Above 180
69998      160-170
69999      160-170
Name: height, Length: 70000, dtype: category
Categories (5, object): ['Below 140' < '140-160' < '160-170' < '170-180' < 'Above 180']


In [83]:


# Define custom bins and labels for weight ranges
weight_bins = [-float('inf'), 40, 60, 80, 100, float('inf')]
weight_labels = ['Missing', 'Underweight', 'Normal', 'Overweight', 'Obese']


df['weight'] = pd.cut(df['weight'], bins=weight_bins, labels=weight_labels, include_lowest=True)

# Print the DataFrame to see the changes
print(df['weight'])


0             Normal
1         Overweight
2             Normal
3         Overweight
4        Underweight
            ...     
69995         Normal
69996          Obese
69997          Obese
69998         Normal
69999         Normal
Name: weight, Length: 70000, dtype: category
Categories (5, object): ['Missing' < 'Underweight' < 'Normal' < 'Overweight' < 'Obese']


In [84]:
blood_pressure_bins = [-float('inf'), 90, 120, 140, 160, float('inf')]
blood_pressure_labels = ['Low', 'Normal', 'Elevated', 'High', 'Hypertensive']

# Discretize the 'ap_hi' column into blood pressure categories
df['ap_hi'] = pd.cut(df['ap_hi'], bins=blood_pressure_bins, labels=blood_pressure_labels, include_lowest=True)

# Print the DataFrame to see the changes
print(df['ap_hi'])

0              Normal
1            Elevated
2            Elevated
3                High
4              Normal
             ...     
69995          Normal
69996        Elevated
69997    Hypertensive
69998        Elevated
69999          Normal
Name: ap_hi, Length: 70000, dtype: category
Categories (5, object): ['Low' < 'Normal' < 'Elevated' < 'High' < 'Hypertensive']


In [85]:


# Define custom bins and labels for blood pressure levels
ap_lo_bins = [-float('inf'), 60, 69, 79, 89, 99, float('inf')]
ap_lo_labels = ['Low', 'Normal', 'Elevated', 'High Stage 1', 'High Stage 2', 'Hypertensive Crisis']


df['ap_lo'] = pd.cut(df['ap_lo'], bins=ap_lo_bins, labels=ap_lo_labels, include_lowest=True)


# Print the DataFrame to see the changes
print(df['ap_lo'])


0               High Stage 1
1               High Stage 2
2                   Elevated
3        Hypertensive Crisis
4                        Low
                ...         
69995           High Stage 1
69996           High Stage 2
69997           High Stage 2
69998           High Stage 1
69999           High Stage 1
Name: ap_lo, Length: 70000, dtype: category
Categories (6, object): ['Low' < 'Normal' < 'Elevated' < 'High Stage 1' < 'High Stage 2' < 'Hypertensive Crisis']


# remove id

In [86]:
df = df.drop(columns=['id'])

# Remove duplicate rows

In [87]:

df = df.drop_duplicates()

# prism 

In [88]:
class PRISM(object):
    def fit(self, X, Y, min_precision=0.00008, min_coverage=0.00008):
        self.X_train = X
        self.y_train = Y
        self.n_features = X.shape[1]
        self.n_samples = X.shape[0]

        print("\n::: DATASET X,Y:::")
        print(self.X_train)
        print(self.y_train)

        print("\n:::PRISM Algorithm:::")

        prism_rule_set = []
        for label in set(self.y_train):
            print("<<<<<<<<< CURRENT LABEL: " + str(label) + " >>>>>>>>>")

            instances = [i for i, val in enumerate(self.y_train) if val == label]

            while instances:
                rule = []
                X_train_ = self.X_train.copy(deep=True)
                instances_covered = []
                perfect_rule = False

                rule_precision = 0.0
                rule_coverage = 0.0

                while perfect_rule is False and len(rule) < self.n_features + 1:
                    optimal_selector = [("","")]
                    optimal_selector_prec = [0.0, 0.0, 0.0]
                    instances_covered = []

                    for attribute in X_train_.columns:
                        attr_column = X_train_.loc[:, attribute]

                        for attr_value in set(attr_column):
                            total_attr_values_instances = attr_column[attr_column == attr_value].index._values
                            total_matches = len(total_attr_values_instances)
                            print("::::TOTALS::: size = " + str(total_matches))

                            positive_attr_values_instances = list(set(total_attr_values_instances) & set(instances))
                            positive_matches = len(positive_attr_values_instances)
                            print("::::POSITIVES::: size = " + str(positive_matches))

                            precision = (1.0 * positive_matches) / total_matches
                            coverage = (1.0 * positive_matches) / self.n_samples

                            if precision > optimal_selector_prec[2]:
                                optimal_selector = (attribute, attr_value)
                                optimal_selector_prec[0] = positive_matches
                                optimal_selector_prec[1] = total_matches
                                optimal_selector_prec[2] = precision
                                rule_precision = precision
                                rule_coverage = coverage
                                instances_covered = positive_attr_values_instances

                            elif precision == optimal_selector_prec[2] and positive_matches > optimal_selector_prec[0]:
                                optimal_selector = (attribute, attr_value)
                                optimal_selector_prec[0] = positive_matches
                                optimal_selector_prec[1] = total_matches
                                optimal_selector_prec[2] = precision
                                instances_covered = positive_attr_values_instances
                                rule_precision = precision
                                rule_coverage = coverage

                    if optimal_selector_prec[2] > 0.0 and optimal_selector_prec[2] < 1.0:
                        rule.append(optimal_selector)
                        selector = rule[-1]

                        filtered_rows = X_train_[(X_train_[selector[0]] != selector[1])].index._values
                        X_train_ = X_train_.drop(filtered_rows).copy(deep=True)
                        X_train_ = X_train_.drop(selector[0], axis=1)

                        if len(X_train_.columns) == 0:
                            perfect_rule = True
                            continue

                    elif optimal_selector_prec[2] == 1.0:
                        rule.append(optimal_selector)
                        perfect_rule = True
                        continue

                    elif optimal_selector_prec[2] == 0.0:
                        print("....... UNSUAL CASE .......")

                instances = list(set(instances) - set(instances_covered))
                rule.append(label)
                rule.append([rule_precision, rule_coverage])

                print("++++++++ RULE FOUND +++++++++")
                metrics = rule[-1]
                print("Rule:")
                print(rule)
                print("Rule-Precision: " + str(metrics[0]))
                print("Rule-Coverage: " + str(metrics[1]))
                print("\n")

                prism_rule_set.append(rule)

                # Check if the rule meets early stopping criteria
                rule_precision, rule_coverage = rule[-1]
                if rule_precision >= min_precision and rule_coverage >= min_coverage:
                    break

        return prism_rule_set
    
    def predict(self, test_data, rule_set):
        most_frequent_label = self.find_most_frequent_label()
        predictions = []

        for _, row in test_data.iterrows():
            instance_predictions = []

            for rule in rule_set:
                label = rule[-2]
                selectors = rule[:-2]

                # Initialize a flag to check if all selectors in the rule match the instance
                rule_matches = True

                for selector in selectors:
                    attribute, attr_value = selector
                    if row[attribute] != attr_value:
                        rule_matches = False
                        break

                if rule_matches:
                    instance_predictions.append(label)

            if instance_predictions:
                
                if isinstance(instance_predictions[0], (list, tuple)):
                    # If labels are in the format [precision, coverage],
                    # find the label with the highest precision and coverage
                    best_label = max(instance_predictions, key=lambda label: label[0])
                else:
                    # If labels are integers, just use the majority label
                    best_label = max(set(instance_predictions), key=instance_predictions.count)

                predictions.append(best_label)
            else:
                # If no rule matches, assign a default label or handle as needed
                predictions.append(most_frequent_label)

        return predictions
    def find_most_frequent_label(self):
    # Determine the most frequent label from your training data
     from collections import Counter
     label_counts = Counter(self.y_train)
     most_common_label = label_counts.most_common(1)[0][0]
     return most_common_label














# K fold

In [89]:


def k_fold_cross_validation(model, data, labels, k):
    num_samples = len(data)
    fold_size = num_samples // k

    # Initialize lists to store evaluation metrics
    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []

    for i in range(k):
        # Define the indices for the current fold
        start = i * fold_size
        end = (i + 1) * fold_size
        test_indices = list(range(start, end))

        # Select the test set for the current fold
        test_data = data.iloc[test_indices]
        test_labels = labels.iloc[test_indices]

        # Select the training set for the current fold
        train_indices = list(range(0, start)) + list(range(end, num_samples))
        train_data = data.iloc[train_indices]
        train_labels = labels.iloc[train_indices]

        # Train the model on the training data
        model.fit(train_data, train_labels)

        # Make predictions on the test data
        y_pred = model.predict(test_data)

        # Calculate evaluation metrics for this fold
        accuracy = np.mean(y_pred == test_labels)
        if np.sum(y_pred ==1) == 0:
            precision = 0.0  # Handle the case when there are no positive predictions
            recall = 0.0
            f1 = 0.0
        else:
            precision = np.mean(y_pred[y_pred ==1] == test_labels[y_pred == 1])
            recall = np.mean(y_pred[y_pred == 1] == test_labels[y_pred == 1])
            f1 = 2 * (precision * recall) / (precision + recall)

        # Append the scores to the respective lists
        accuracy_scores.append(accuracy)
        precision_scores.append(precision)
        recall_scores.append(recall)
        f1_scores.append(f1)

    # Return the mean scores across all folds
    return {
        "Accuracys": accuracy_scores,
        "Precisions": precision_scores,
        "Accuracy mean": np.mean(accuracy_scores),
        "Precision mean": np.mean(precision_scores),
        "Recall ": np.mean(recall_scores),
        "F1 Score ": np.mean(f1_scores)
    }


In [90]:
rule = PRISM()
data =  df.drop(['cardio'], axis=1)
labels = df['cardio']

k_values = [3, 5, 7]

for k in k_values:
    result = k_fold_cross_validation(rule, data, labels, k)
    print("\n\n")
    print(f"Results for one R  with k={k}: \n")
    for metric, value in result.items():
        print(f"{metric}: {value}")






::: DATASET X,Y:::
               age  gender     height      weight         ap_hi         ap_lo  \
11866  20000-24999       1  Above 180  Overweight        Normal  High Stage 1   
11868  15000-19999       2    140-160      Normal        Normal  High Stage 1   
11870  15000-19999       1    170-180      Normal      Elevated  High Stage 2   
11872  15000-19999       2    170-180  Overweight      Elevated  High Stage 2   
11873  20000-24999       1    160-170      Normal  Hypertensive           Low   
...            ...     ...        ...         ...           ...           ...   
69918  20000-24999       2    140-160      Normal      Elevated  High Stage 2   
69949  20000-24999       1    160-170      Normal  Hypertensive  High Stage 1   
69970  20000-24999       2    170-180       Obese      Elevated  High Stage 1   
69985  15000-19999       1    140-160       Obese      Elevated  High Stage 1   
69997  15000-19999       2  Above 180       Obese  Hypertensive  High Stage 2   

       

::::TOTALS::: size = 5184
::::POSITIVES::: size = 0
::::TOTALS::: size = 1865
::::POSITIVES::: size = 0
::::TOTALS::: size = 1617
::::POSITIVES::: size = 0
::::TOTALS::: size = 6544
::::POSITIVES::: size = 0
::::TOTALS::: size = 2122
::::POSITIVES::: size = 0
::::TOTALS::: size = 6956
::::POSITIVES::: size = 0
::::TOTALS::: size = 1710
::::POSITIVES::: size = 0
::::TOTALS::: size = 2788
::::POSITIVES::: size = 0
::::TOTALS::: size = 5878
::::POSITIVES::: size = 0
....... UNSUAL CASE .......
::::TOTALS::: size = 4040
::::POSITIVES::: size = 0
::::TOTALS::: size = 779
::::POSITIVES::: size = 0
::::TOTALS::: size = 3847
::::POSITIVES::: size = 0
::::TOTALS::: size = 4195
::::POSITIVES::: size = 0
::::TOTALS::: size = 4471
::::POSITIVES::: size = 0
::::TOTALS::: size = 3315
::::POSITIVES::: size = 0
::::TOTALS::: size = 2419
::::POSITIVES::: size = 0
::::TOTALS::: size = 2188
::::POSITIVES::: size = 0
::::TOTALS::: size = 610
::::POSITIVES::: size = 0
::::TOTALS::: size = 134
::::POSITIVES

KeyboardInterrupt: 