In [65]:
import pandas as pd
import plotly.express as px

In [66]:
# Location of the CSV file that is loaded into `dataset` (resolved against the
# current working directory).
IRIS_CSV_PATH = "iris_dataset.csv"
dataset = pd.read_csv(IRIS_CSV_PATH)

In [67]:
# Inspect Dataset
# The 3-D scatter is drawn first so that the preview can be the cell's last
# expression: a notebook cell only renders the value of its final expression,
# so a `dataset.head()` placed before the plot is evaluated and then discarded.
px.scatter_3d(dataset,
              x='SepalLengthCm',
              y='SepalWidthCm',
              z='PetalLengthCm',
              color='Species').show()

# First rows of the table, rendered as the cell output.
dataset.head()

In [68]:
# Data Discretization

def determine_equal_width_boundaries(num_intervals: int, samples: pd.Series) -> tuple:
    """Return the edges of ``num_intervals`` equally wide bins spanning ``samples``.

    Args:
        num_intervals: Number of bins to create; must be at least 1.
        samples: Values whose minimum and maximum define the covered range.

    Returns:
        A tuple of ``num_intervals + 1`` ascending edges. The first edge is the
        minimum of ``samples`` and the last edge is exactly the maximum.

    Raises:
        ValueError: If ``num_intervals`` is smaller than 1.
    """
    if num_intervals < 1:
        raise ValueError("num_intervals must be at least 1")

    x_min = min(samples)
    x_max = max(samples)

    width = (x_max - x_min) / num_intervals

    # Each inner edge is derived directly from x_min instead of adding `width`
    # to the previous edge, so rounding errors do not accumulate.
    inner_edges = tuple(x_min + step * width for step in range(1, num_intervals))

    # The last edge is x_max itself: a summed-up edge can end up marginally
    # below the maximum (e.g. 0.9999999999999999 instead of 1.0), which would
    # leave the largest sample outside every bin.
    return (x_min,) + inner_edges + (x_max,)

def determine_equal_frequency_boundaries(num_intervals: int, samples: pd.Series) -> tuple:
    """Return bin edges that split ``samples`` into bins of roughly equal size.

    Args:
        num_intervals: Number of bins to create; must be at least 1.
        samples: Values to derive the edges from; must not be empty.

    Returns:
        A tuple of ``num_intervals + 1`` edges: the smallest sample, then
        ``num_intervals - 1`` values picked from the sorted samples, then the
        largest sample.

    Raises:
        ValueError: If ``num_intervals`` is smaller than 1 or ``samples`` is
            empty.
    """
    if num_intervals < 1:
        raise ValueError("num_intervals must be at least 1")

    num_elements = len(samples)
    if num_elements == 0:
        raise ValueError("samples must not be empty")

    sorted_samples = samples.sort_values(ignore_index=True)

    x_min = sorted_samples.iloc[0]
    x_max = sorted_samples.iloc[-1]

    # Inner edges sit at proportional positions step * n // k. Because
    # step < num_intervals, the position is always a valid index; stepping by a
    # rounded interval size instead can run past the end of the data
    # (e.g. 8 samples in 5 bins would ask for position 8).
    inner_edges = tuple(
        sorted_samples.iloc[(step * num_elements) // num_intervals]
        for step in range(1, num_intervals)
    )

    return (x_min,) + inner_edges + (x_max,)


def discretize(samples: pd.Series, boundaries: tuple, labels: tuple) -> pd.Series:
    """Replace every value in ``samples`` by the label of the bin it falls into.

    Bin ``i`` is labelled ``labels[i]`` and is bounded by ``boundaries[i]`` and
    ``boundaries[i + 1]``; a value belongs to the first bin whose upper edge is
    not smaller than the value. Values at or below the first edge get the first
    label and values above the last edge get the last label. Missing values
    (NaN/None) stay missing in the result.

    Args:
        samples: Values to discretize.
        boundaries: Bin edges, expected in ascending order; one more than
            ``labels``.
        labels: One label per bin.

    Returns:
        A new, unnamed Series with the index labels of ``samples`` and the
        assigned labels as values.

    Raises:
        AssertionError: If ``len(boundaries) != len(labels) + 1``.
    """
    assert len(boundaries) == len(labels) + 1

    last_label_index = len(labels) - 1

    index_list = []
    discrete_value_list = []

    for index, value in samples.items():

        index_list.append(index)

        if pd.isna(value):
            # A missing value compares False against every edge; keep it
            # missing so values and index stay the same length.
            discrete_value_list.append(None)
            continue

        # Position of the first edge that is not below the value, or None when
        # the value lies above every edge.
        edge_position = next(
            (position for position, edge in enumerate(boundaries) if value <= edge),
            None,
        )

        if edge_position is None:
            # Above the last edge: clamp to the last bin, mirroring how values
            # at or below the first edge are clamped to the first bin.
            label_index = last_label_index
        else:
            label_index = max(edge_position - 1, 0)

        discrete_value_list.append(labels[label_index])

    return pd.Series(discrete_value_list, index_list)

def count_occurances(samples):
    """Tally how often each distinct value appears in ``samples``.

    Thin wrapper that returns the result of ``samples.value_counts()``.
    """
    occurrence_counts = samples.value_counts()
    return occurrence_counts


In [77]:
# Draw 150 rows with a fixed seed and keep only the sepal-width column.
samples = dataset.sample(n=150, random_state=1)["SepalWidthCm"]

# Edges of three equally wide bins over the sampled widths.
equal_width_boundaries = determine_equal_width_boundaries(num_intervals=3, samples=samples)

# Replace every measurement by the label of the bin it falls into.
discrete_samples = discretize(samples=samples,
                              boundaries=equal_width_boundaries,
                              labels=("small", "medium", "long"))

In [78]:
# Histogram of the raw sepal widths with the equal-width bin edges overlaid.
fig = px.histogram(samples,
                   x="SepalWidthCm",
                   nbins=30,
                   title="Sample distribution based on Sepal Width (red vertical lines mark the boundaries)")

# NOTE(review): the trace name 'ROC' looks like a leftover from another
# notebook; confirm nothing depends on it before renaming or removing it.
fig['data'][0]['name'] = 'ROC'

# One dotted red vertical line per bin edge.
# NOTE(review): y1=40 is forwarded unchanged to the shape; verify against the
# Plotly add_vline reference that this is the intended vertical extent.
for boundary in equal_width_boundaries:
    fig.add_vline(x=boundary,
                  y1=40,
                  line_dash="dot",
                  line_color='red')
fig.show()

# Histogram of the assigned labels. `discrete_samples` is an unnamed Series,
# which is presumably why its values are addressed as column 0.
fig = px.histogram(discrete_samples,
                   x=0,
                   title="Sample distribution given labels")
fig.show()

# Last expression of the cell: the per-label counts become the cell output.
count_occurances(discrete_samples)

medium    88
small     47
long      15
dtype: int64

In [75]:
# Edges of three bins that each hold roughly the same number of samples.
equal_frequency_boundaries = determine_equal_frequency_boundaries(num_intervals=3, samples=samples)

# Re-label the same samples with the equal-frequency bins. This rebinds
# `discrete_samples`, so later cells see the equal-frequency labels.
discrete_samples = discretize(samples=samples,
                              boundaries=equal_frequency_boundaries,
                              labels=("small", "medium", "long"))

In [76]:
# Histogram of the raw sepal widths with the equal-frequency bin edges overlaid.
fig = px.histogram(samples,
                   x="SepalWidthCm",
                   nbins=30,
                   title="Sample distribution based on Sepal Width (red vertical lines mark the boundaries)")

# NOTE(review): the trace name 'ROC' looks like a leftover from another
# notebook; confirm nothing depends on it before renaming or removing it.
fig['data'][0]['name'] = 'ROC'

# One dotted red vertical line per bin edge.
# NOTE(review): y1=40 is forwarded unchanged to the shape; verify against the
# Plotly add_vline reference that this is the intended vertical extent.
for boundary in equal_frequency_boundaries:
    fig.add_vline(x=boundary,
                  y1=40,
                  line_dash="dot",
                  line_color='red')
fig.show()

# Histogram of the assigned labels. `discrete_samples` is an unnamed Series,
# which is presumably why its values are addressed as column 0.
fig = px.histogram(discrete_samples,
                   x=0,
                   title="Sample distribution given labels")
fig.show()

# Last expression of the cell: the per-label counts become the cell output.
count_occurances(discrete_samples)

small     57
medium    51
long      42
dtype: int64

In [79]:
def expand_database(series, column_name_prefix = ""):
    """Turn ``series`` into a table of boolean indicator columns.

    One column is created per distinct value of ``series``, in order of first
    appearance, named ``f"{column_name_prefix}{value}"``. A cell is True when
    the row's value equals the column's value and False otherwise. The rows
    carry the index labels of ``series``.
    """
    categories = series.unique()

    # Walk the series once, separating index labels from values.
    row_labels = []
    row_values = []
    for row_label, row_value in series.items():
        row_labels.append(row_label)
        row_values.append(row_value)

    # One boolean list per distinct value, keyed by the prefixed column name.
    indicator_columns = {
        f"{column_name_prefix}{category}": [
            bool(category == row_value) for row_value in row_values
        ]
        for category in categories
    }

    return pd.DataFrame(data=indicator_columns, index=row_labels)

In [80]:
# One boolean column per label of the discretized sepal widths, shown as the
# cell output.
expand_database(discrete_samples, column_name_prefix="SepalWidthCm_")

Unnamed: 0,SepalWidthCm_long,SepalWidthCm_small,SepalWidthCm_medium
14,True,False,False
98,False,True,False
75,False,False,True
16,True,False,False
131,True,False,False
...,...,...,...
133,False,True,False
137,False,False,True
72,False,True,False
140,False,False,True


In [82]:
labels = ("small", "medium", "long")

# Draw the 150 rows once and take every feature from that single draw, so all
# four columns are guaranteed to share the same rows and index without relying
# on repeated sampling calls reproducing the same result.
sampled_dataset = dataset.sample(150, random_state=1)

samples_SepalLengthCm = sampled_dataset["SepalLengthCm"]
samples_SepalWidthCm = sampled_dataset["SepalWidthCm"]
samples_PetalLengthCm = sampled_dataset["PetalLengthCm"]
samples_PetalWidthCm = sampled_dataset["PetalWidthCm"]

# Equal-frequency bin edges (three bins) for each feature.
boundaries_SepalLengthCm = determine_equal_frequency_boundaries(3, samples_SepalLengthCm)
boundaries_SepalWidthCm = determine_equal_frequency_boundaries(3, samples_SepalWidthCm)
boundaries_PetalLengthCm = determine_equal_frequency_boundaries(3, samples_PetalLengthCm)
boundaries_PetalWidthCm = determine_equal_frequency_boundaries(3, samples_PetalWidthCm)

# Replace every measurement by the label of its bin.
discrete_samples_SepalLengthCm = discretize(samples_SepalLengthCm, boundaries_SepalLengthCm, labels)
discrete_samples_SepalWidthCm = discretize(samples_SepalWidthCm, boundaries_SepalWidthCm, labels)
discrete_samples_PetalLengthCm = discretize(samples_PetalLengthCm, boundaries_PetalLengthCm, labels)
discrete_samples_PetalWidthCm = discretize(samples_PetalWidthCm, boundaries_PetalWidthCm, labels)

# One boolean indicator column per (feature, label) pair.
database_SepalLengthCm = expand_database(discrete_samples_SepalLengthCm, "SepalLengthCm_")
database_SepalWidthCm = expand_database(discrete_samples_SepalWidthCm, "SepalWidthCm_")
database_PetalLengthCm = expand_database(discrete_samples_PetalLengthCm, "PetalLengthCm_")
database_PetalWidthCm = expand_database(discrete_samples_PetalWidthCm, "PetalWidthCm_")

# Combine the per-feature tables side by side, joined on the row index.
database = (
    database_SepalLengthCm
    .join(database_SepalWidthCm)
    .join(database_PetalLengthCm)
    .join(database_PetalWidthCm)
)

     SepalLengthCm_medium  SepalLengthCm_small  SepalLengthCm_long  \
14                   True                False               False   
98                  False                 True               False   
75                  False                False                True   
16                  False                 True               False   
131                 False                False                True   
..                    ...                  ...                 ...   
133                  True                False               False   
137                 False                False                True   
72                   True                False               False   
140                 False                False                True   
37                  False                 True               False   

     SepalWidthCm_long  SepalWidthCm_small  SepalWidthCm_medium  \
14                True               False                False   
98               False   

In [83]:
# Show the combined indicator table as the cell output.
database

Unnamed: 0,SepalLengthCm_medium,SepalLengthCm_small,SepalLengthCm_long,SepalWidthCm_long,SepalWidthCm_small,SepalWidthCm_medium,PetalLengthCm_small,PetalLengthCm_medium,PetalLengthCm_long,PetalWidthCm_small,PetalWidthCm_medium,PetalWidthCm_long
14,True,False,False,True,False,False,True,False,False,True,False,False
98,False,True,False,False,True,False,True,False,False,False,True,False
75,False,False,True,False,False,True,False,True,False,False,True,False
16,False,True,False,True,False,False,True,False,False,True,False,False
131,False,False,True,True,False,False,False,False,True,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...
133,True,False,False,False,True,False,False,False,True,False,True,False
137,False,False,True,False,False,True,False,False,True,False,False,True
72,True,False,False,False,True,False,False,True,False,False,True,False
140,False,False,True,False,False,True,False,False,True,False,False,True
