In [52]:
# Methods to be used repeatedly
def Verify(expression: bool, message: str):
    if not expression:
        raise Exception(message)
    else:
        return
    
def report_missing_features(X):
    report = X.isna().sum(axis=1) 
    report = report[report > 0]
    print("Entry index | Number of missing features")
    print(report)



In [53]:
from ucimlrepo import fetch_ucirepo
import pandas as pd
import numpy as np


### step 1 is to obtain the data and remove rows with missing features, we will store these in two objects of type dataset
class dataset:
    def __init__(self, features : np.ndarray, targets : np.ndarray, var_info : np.ndarray):
        self.features = features
        self.targets = targets
        self.var_info = var_info

# fetch dataset for age predictions
national_health_and_nutrition_health_survey_2013_2014_nhanes_age_prediction_subset = fetch_ucirepo(id=887) 

X = national_health_and_nutrition_health_survey_2013_2014_nhanes_age_prediction_subset.data.features
missing_values = X.isna().sum().sum()
if (missing_values>1):
    print("Report of missing features for age prediction:")
    report_missing_features(X)
print("Number of missing values in the age prediction dataset: " + str(missing_values))
nan_indices = X[X.isna().any(axis=1)].index
X = X.dropna().to_numpy()

y = national_health_and_nutrition_health_survey_2013_2014_nhanes_age_prediction_subset.data.targets
y = y.drop(index=nan_indices)
y = y.to_numpy()
if not np.all((y == "Adult") | (y == "Senior")):
    raise ValueError("Array contains an entry that is not 'Adult' or 'Senior'")
y = np.where(y == "Senior", 1, 0)

Verify(len(X) == len(y), "Features and targets are different lengths.")
var_info = np.array(["Respondent Gender", "Activity", "BMI", "Blood Glucose", "Diabetic", "Oral", "Blood Insulin"])
ageDataset = dataset(X, y, var_info)






# fetch dataset for breast cancer detection
breast_cancer_wisconsin_original = fetch_ucirepo(id=15)  

X = breast_cancer_wisconsin_original.data.features
missing_values = X.isna().sum().sum()
if (missing_values>1):
    print("Report of missing features for breast cancer detection:")
    report_missing_features(X)
print("Number of missing values in the breast cancer prediction dataset: " + str(missing_values))
nan_indices = X[X.isna().any(axis=1)].index
X = X.dropna().to_numpy()

y = breast_cancer_wisconsin_original.data.targets
y = y.drop(index=nan_indices)
y = y.to_numpy()
if not np.all((y == 4) | (y == 2)):
    raise ValueError("Array contains an entry that is not 4 or 2")
y = np.where(y == 4, 1, 0)


Verify(len(X) == len(y), "Features and targets are different lengths.")
var_info = np.array(["Clump Thickness", "Uniformity of Cell Size", "Uniformity of Cell Shape", "Marginal Adhesion", "Single Epithelial Cell Size", "Bare Nuclei", "Bland Chromatin", "Normal Nucleoli", "Mitoses"])
breastDataset = dataset(X,y,var_info)

print("\n\n Datasets cleaned.")

Number of missing values in the age prediction dataset: 0
Report of missing features for breast cancer detection:
Entry index | Number of missing features
23     1
40     1
139    1
145    1
158    1
164    1
235    1
249    1
275    1
292    1
294    1
297    1
315    1
321    1
411    1
617    1
dtype: int64
Number of missing values in the breast cancer prediction dataset: 16


 Datasets cleaned.


In [59]:
### Step 2 is to find the means in the positive and negative set for each features
### We'll do this by going back to pandas
import matplotlib.pyplot as plt

# first for the age dataset
targets = ageDataset.targets.squeeze()
x = ageDataset.features.copy()
positive_x = x[targets == 1]
negative_x = x[targets == 0]
means_positive = np.mean(positive_x, axis=0)
means_negative = np.mean(negative_x, axis=0)
squared_difference = np.power(means_positive-means_negative, 2)

# tabling
mean_values_df = pd.DataFrame({
    'Mean Positive': means_positive,
    'Mean Negative': means_negative,
    'Squared Difference': squared_difference
}, index=ageDataset.var_info)
print(mean_values_df)


#second for the breast dataset
targets = breastDataset.targets.squeeze()
x = breastDataset.features.copy()
positive_x = x[targets == 1]
negative_x = x[targets == 0]
means_positive = np.mean(positive_x, axis=0)
means_negative = np.mean(negative_x, axis=0)
squared_difference = np.power(means_positive-means_negative, 2)

# tabling
mean_values_df = pd.DataFrame({
    'Mean Positive': means_positive,
    'Mean Negative': means_negative,
    'Squared Difference': squared_difference
}, index=breastDataset.var_info)
print(mean_values_df)





                   Mean Positive  Mean Negative  Squared Difference
Respondent Gender       1.508242       1.512017            0.000014
Activity                1.909341       1.806165            0.010645
BMI                    27.886264      27.968286            0.006728
Blood Glucose         104.329670      98.644723           32.318625
Diabetic                2.027473       2.014107            0.000179
Oral                  141.208791     109.990596          974.575736
Blood Insulin          10.405247      12.106661            2.894810


IndexError: boolean index did not match indexed array along dimension 0; dimension is 683 but corresponding boolean dimension is 2278