In [12]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the spambase dataset
spambase = pd.read_csv('data/spambase.csv')

# Separate the dataset into features (X) and labels (y)
X = spambase.iloc[:, :-1]
y = spambase.iloc[:, -1]

X_pos = spambase[spambase['spam'] == 1]

# Split the dataset into training and testing sets
X_train,_, y_train, _ =train_test_split(X_pos.iloc[:, :-1], X_pos.iloc[:, -1], test_size=0.4, random_state=0)
_,X_test,_,y_test= train_test_split(X,y, test_size=0.4, random_state=0)

# Discretize the training set using equal-width binning
num_bins = 10
bin_width = np.ptp(X_train, axis=0) / num_bins
X_train_discrete = (X_train / bin_width).apply(np.floor).astype(int)

# Discretize the testing set using the extracted ranges from the training set
X_test_discrete = (X_test / bin_width).apply(np.floor).astype(int)

# Initialize the specific hypothesis to the first row of the training set
specific_hypo = X_train_discrete[0:1]

# Iterate over the training examples
for i in range(1, len(X_train_discrete)):
  # Iterate over the attributes
  for j in range(len(specific_hypo.columns)):
    # If the attribute value does not match the corresponding value in the training example, set it to 0
    if specific_hypo.iloc[0, j] != X_train_discrete.iloc[i, j]:
       specific_hypo.iloc[0, j] = 0

# Calculate the accuracy of the specific hypothesis on the testing set
num_correct = 0
for i in range(len(X_test_discrete)):
  # Iterate over the attributes
  for j in range(len(specific_hypo.columns)):
    # If the attribute value does not match the corresponding value in the testing example, increment the false example count
    if specific_hypo.iloc[0, j] != X_test_discrete.iloc[i, j]:
      num_correct += 1
      break



# Calculate the bounds for the class attributes. 
#It converts the discrete values of the specific hypothesis back to the original continuous values,
#and thus obtain the specific bounds for the class attributes.
bounds = specific_hypo * bin_width



print("Specific hypothesis:")
for i,col in enumerate(specific_hypo.columns):
    print(f'{col}: {specific_hypo.iloc[0, i]}')

print("Specific bounds:")
for i,col in enumerate(specific_hypo.columns):
    print(f'{col}: {bounds.iloc[0, i]}')



Specific hypothesis:
Word_freq_make: 0
 Word_freq_address: 0
 Word_freq_all: 0
 Word_freq_3d: 0
 Word_freq_our: 0
 Word_freq_over: 0
 Word_freq_remove: 0
 Word_freq_internet: 0
 Word_freq_order: 0
 Word_freq_mail: 0
 Word_freq_receive: 0
 Word_freq_will: 0
 Word_freq_people: 0
 Word_freq_report: 0
 Word_freq_addresses: 0
 Word_freq_free: 0
 Word_freq_business: 0
 Word_freq_email: 0
 Word_freq_you: 0
 Word_freq_credit: 0
 Word_freq_your: 0
 Word_freq_font: 0
 Word_freq_000: 0
 Word_freq_money: 0
 Word_freq_hp: 0
 Word_freq_hpl: 0
 Word_freq_george: 0
 Word_freq_650: 0
 Word_freq_lab: 0
 Word_freq_labs: 0
 Word_freq_telnet: 0
 Word_freq_857: 0
 Word_freq_data: 0
 Word_freq_415: 0
 Word_freq_85: 0
 Word_freq_technology: 0
 Word_freq_1999: 0
 Word_freq_parts: 0
 Word_freq_pm: 0
 Word_freq_direct: 0
 Word_freq_cs: 0
 Word_freq_meeting: 0
 Word_freq_original: 0
 Word_freq_project: 0
 Word_freq_re: 0
 Word_freq_edu: 0
 Word_freq_table: 0
 Word_freq_conference: 0
 Char_freq1: 0
 Char_freq2: 0


In [13]:
accuracy = num_correct / len(X_test_discrete)
# Print the results
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.97


Accuracy: 0.97
