# TP3 - Discretization & Normalization

- MOULAI Mohamed Youcef, 191931089336, G4

In [104]:
import math
import pandas as pd

In [105]:
data = pd.read_excel("Dataset-Exos.xlsx")
data = data.drop(data.columns[0], axis=1)
data

Unnamed: 0,petal length,petal width,sepal length,sepal width,Class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


1. Discretization

 **Method**: Equal-width Discretization (Equal Intervals)

- Define or calculate the number of intervals, denoted as "k," to use.
- The width of each value interval is equal to: (MaxValue - MinValue) / k.
- All values falling within the same interval are represented by the same category.

In [100]:
def discretization(data):
    
    # huntsberger
    K = round(1 + (3/10) * math.log10(data.shape[0]))
              
    # iterating through columns
    for col in data.columns:
        k = 0
        # target column condition
        if data[col].dtype != 'float64':
            return data
        # init variables & calculating width
        intervals = []
        width = (data[col].max() - data[col].min())/K
        lower_bound = data[col].min()
        # getting all interval bounds in a list
        while lower_bound < data[col].max():
            intervals.append((lower_bound, lower_bound + width))
            lower_bound += width
        # iterating through intervals
        for interval in intervals:
            # interval mean
            interval_mean = (interval[1] + interval[0])/2
            # function that checks if the value belong to the interval or not
            def edit_col(value):
                # swapping the elt with a categorical number
                if value >= interval[0] and value < interval[1]:
                    return interval_mean
                # no editting
                else:
                    return value
            # update dataset values
            data[col] = data[col].apply(edit_col)
            # incrementing the categorical number 'k' for the next interval
            k += 1
            
    return data
        

In [101]:
dataset_discretizied = discretization(data)
dataset_discretizied

Unnamed: 0,petal length,petal width,sepal length,sepal width,Class
0,5.2,3.8,2.475,0.7,Iris-setosa
1,5.2,2.6,2.475,0.7,Iris-setosa
2,5.2,3.8,2.475,0.7,Iris-setosa
3,5.2,2.6,2.475,0.7,Iris-setosa
4,5.2,3.8,2.475,0.7,Iris-setosa
...,...,...,...,...,...
145,7.0,2.6,5.425,1.9,Iris-virginica
146,7.0,2.6,5.425,1.9,Iris-virginica
147,7.0,2.6,5.425,1.9,Iris-virginica
148,7.0,3.8,5.425,1.9,Iris-virginica


2. Normalization (with *MinMax*)

In [102]:
def normalization(data):
    
    # new interval
    interval = (0,1)
    # iterating through columns
    for col in data.columns:
        # target column condition
        if data[col].dtype != 'float64':
            return data
        # getting Min & Max
        Min = data[col].min()
        Max = data[col].max()
        # function implementing MinMax
        def MinMax(value):
            return ((value - Min) * (interval[1] - interval[0])/(Max - Min)) + interval[0]
        # updating dataset values
        data[col] = data[col].apply(MinMax)
        
    return data

In [106]:
dataset_normalized = normalization(data)
dataset_normalized

Unnamed: 0,petal length,petal width,sepal length,sepal width,Class
0,0.222222,0.625000,0.067797,0.041667,Iris-setosa
1,0.166667,0.416667,0.067797,0.041667,Iris-setosa
2,0.111111,0.500000,0.050847,0.041667,Iris-setosa
3,0.083333,0.458333,0.084746,0.041667,Iris-setosa
4,0.194444,0.666667,0.067797,0.041667,Iris-setosa
...,...,...,...,...,...
145,0.666667,0.416667,0.711864,0.916667,Iris-virginica
146,0.555556,0.208333,0.677966,0.750000,Iris-virginica
147,0.611111,0.416667,0.711864,0.791667,Iris-virginica
148,0.527778,0.583333,0.745763,0.916667,Iris-virginica
