# TP3 - Discretization & Normalization

- MOULAI Mohamed Youcef, 191931089336, G4

In [4]:
import math
import pandas as pd

In [5]:
data = pd.read_excel("Dataset-Exos.xlsx")
data = data.drop(data.columns[0], axis=1)
data

Unnamed: 0,petal length,petal width,sepal length,sepal width,Class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


1. Discretization

 **Method**: Equal-width Discretization (Equal Intervals)

- Define or calculate the number of intervals, denoted as "k," to use.
- The width of each value interval is equal to: (MaxValue - MinValue) / k.
- All values falling within the same interval are represented by the same category.

In [6]:
def discretization(data):
    
    # huntsberger
    K = round(1 + (10/3) * math.log10(data.shape[0]))
    print(K)
              
    # iterating through columns
    for col in data.columns:
        k = 0
        # target column condition
        if data[col].dtype != 'float64':
            return data
        # init variables & calculating width
        intervals = []
        width = (data[col].max() - data[col].min())/K
        lower_bound = data[col].min()
        # getting all interval bounds in a list
        while lower_bound < data[col].max():
            intervals.append((lower_bound, lower_bound + width))
            lower_bound += width
        # iterating through intervals
        for interval in intervals:
            # interval mean
            interval_mean = (interval[1] + interval[0])/2
            # function that checks if the value belong to the interval or not
            def edit_col(value):
                # swapping the elt with a categorical number
                if value >= interval[0] and value < interval[1]:
                    return interval_mean
                # no editting
                else:
                    return value
            # update dataset values
            data[col] = data[col].apply(edit_col)
            # incrementing the categorical number 'k' for the next interval
            k += 1
            
    return data
        

In [7]:
dataset_discretizied = discretization(data)
dataset_discretizied

8


Unnamed: 0,petal length,petal width,sepal length,sepal width,Class
0,4.975,3.65,1.36875,0.25,Iris-setosa
1,4.975,3.05,1.36875,0.25,Iris-setosa
2,4.525,3.35,1.36875,0.25,Iris-setosa
3,4.525,3.05,1.36875,0.25,Iris-setosa
4,4.975,3.65,1.36875,0.25,Iris-setosa
...,...,...,...,...,...
145,6.775,3.05,5.05625,2.35,Iris-virginica
146,6.325,2.45,5.05625,1.75,Iris-virginica
147,6.325,3.05,5.05625,2.05,Iris-virginica
148,6.325,3.35,5.05625,2.35,Iris-virginica


2. Normalization 

a. with *MinMax*

In [8]:
def normalizationMinMax(data):
    
    # new interval
    interval = (0,1)
    # iterating through columns
    for col in data.columns:
        # target column condition
        if data[col].dtype != 'float64':
            return data
        # getting Min & Max
        Min = data[col].min()
        Max = data[col].max()
        # function implementing MinMax
        def MinMax(value):
            return ((value - Min) * (interval[1] - interval[0])/(Max - Min)) + interval[0]
        # updating dataset values
        data[col] = data[col].apply(MinMax)
        
    return data

In [9]:
dataset_normalized = normalizationMinMax(data)
dataset_normalized

Unnamed: 0,petal length,petal width,sepal length,sepal width,Class
0,0.142857,0.625,0.000,0.000000,Iris-setosa
1,0.142857,0.375,0.000,0.000000,Iris-setosa
2,0.000000,0.500,0.000,0.000000,Iris-setosa
3,0.000000,0.375,0.000,0.000000,Iris-setosa
4,0.142857,0.625,0.000,0.000000,Iris-setosa
...,...,...,...,...,...
145,0.714286,0.375,0.625,0.933333,Iris-virginica
146,0.571429,0.125,0.625,0.666667,Iris-virginica
147,0.571429,0.375,0.625,0.800000,Iris-virginica
148,0.571429,0.500,0.625,0.933333,Iris-virginica


In [10]:
def normalizationZScore(data):
    
    # iterating through columns
    for col in data.columns:
        # target column condition
        if data[col].dtype != 'float64':
            return data
        # getting mean & std-dev
        mean = data[col].mean()
        std = data[col].std()
        # function implementing Z-Score formula
        def ZScore(value):
            return (value - mean) / std
        # updating dataset values
        data[col] = data[col].apply(ZScore)
        
    return data

In [11]:
data_normalized = normalizationZScore(data)
data_normalized

Unnamed: 0,petal length,petal width,sepal length,sepal width,Class
0,-1.052809,1.219346,-1.320865,-1.331178,Iris-setosa
1,-1.052809,-0.110850,-1.320865,-1.331178,Iris-setosa
2,-1.604982,0.554248,-1.320865,-1.331178,Iris-setosa
3,-1.604982,-0.110850,-1.320865,-1.331178,Iris-setosa
4,-1.052809,1.219346,-1.320865,-1.331178,Iris-setosa
...,...,...,...,...,...
145,1.155882,-0.110850,0.721710,1.501116,Iris-virginica
146,0.603709,-1.441046,0.721710,0.691889,Iris-virginica
147,0.603709,-0.110850,0.721710,1.096502,Iris-virginica
148,0.603709,0.554248,0.721710,1.501116,Iris-virginica
