# Discretization of pre-processed data
## Dataset: austratlia
By: Sam
Replicate using Malina script
Update: 13/10/22

### About Dataset
*Continuous attributes:*

- A2:	continuous.
- A3:	continuous.
- A7:	continuous.
- A10: continuous.
- A13: continuous.
- A14: continuous.

*Categorical attributes:*
- A1:	0,1    CATEGORICAL a,b
- A4:	1,2,3         CATEGORICAL p,g,gg
- A5:  1, 2,3,4,5, 6,7,8,9,10,11,12,13,14    CATEGORICAL ff,d,i,k,j,aa,m,c,w, e, q, r,cc, x 
- A6:	 1, 2,3, 4,5,6,7,8,9    CATEGORICAL ff,dd,j,bb,v,n,o,h,z 
- A8:	1, 0       CATEGORICAL t, f.
- A9: 1, 0	    CATEGORICAL t, f.
- A11:  1, 0	    CATEGORICAL t, f.
- A12:    1, 2, 3    CATEGORICAL s, g, p 

*Label*
A15:   1,2 +,- (class attribute)

# 1. Preparing data

In [10]:
# Import library
import pandas as pd
import numpy as np
from collections import Counter #for Chi Merge

In [11]:
# Read clean dataset for discretization
data0 = pd.read_csv('clean_australia.csv')
#australia dataset
aus = data0

In [12]:
aus

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,label
0,1,22.08,11.460,2,4,4,1.585,0,0,0,1,2,100,1213,0
1,0,22.67,7.000,2,8,4,0.165,0,0,0,0,2,160,1,0
2,0,29.58,1.750,1,4,4,1.250,0,0,0,1,2,280,1,0
3,0,21.67,11.500,1,5,3,0.000,1,1,11,1,2,0,1,1
4,1,20.17,8.170,2,6,4,1.960,1,1,14,0,2,60,159,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,1,31.57,10.500,2,14,4,6.500,1,0,0,0,2,0,1,1
686,1,20.67,0.415,2,8,4,0.125,0,0,0,0,2,0,45,0
687,0,18.83,9.540,2,6,4,0.085,1,0,0,0,2,100,1,1
688,0,27.42,14.500,2,14,8,3.085,1,1,1,0,2,120,12,1


In [13]:
# Import label encoder
from sklearn import preprocessing
  
# label_encoder object knows how to understand word labels.
label_encoder = preprocessing.LabelEncoder()
  
# Encode labels in column 'species'.
aus['label']= label_encoder.fit_transform(aus['label'])
  
aus['label'].unique()

array([0, 1], dtype=int64)

In [14]:
# List of continuous feature to discretize
num_list = ["A2", "A3", "A7", "A10", "A13", "A14"]

In [15]:
y_list = pd.DataFrame(aus['label'])

In [16]:
num_list
y_list

Unnamed: 0,label
0,0
1,0
2,0
3,1
4,1
...,...
685,1
686,0
687,1
688,1


# 2. Chi Merge  discretization implementation 1 based on Kerber 1992
Source: https://gist.github.com/alanzchen/17d0c4a45d59b79052b1cd07f531689e

In [17]:
#Define function for chiMerge
def chimerge(data, attr, label, max_intervals):
    distinct_vals = sorted(set(data[attr])) # Sort the distinct values
    labels = sorted(set(data[label])) # Get all possible labels
    empty_count = {l: 0 for l in labels} # A helper function for padding the Counter()
    intervals = [[distinct_vals[i], distinct_vals[i]] for i in range(len(distinct_vals))] # Initialize the intervals for each attribute
    while len(intervals) > max_intervals: # While loop
        chi = []
        for i in range(len(intervals)-1):
            # Calculate the Chi2 value
            obs0 = data[data[attr].between(intervals[i][0], intervals[i][1])]
            obs1 = data[data[attr].between(intervals[i+1][0], intervals[i+1][1])]
            total = len(obs0) + len(obs1)
            count_0 = np.array([v for i, v in {**empty_count, **Counter(obs0[label])}.items()])
            count_1 = np.array([v for i, v in {**empty_count, **Counter(obs1[label])}.items()])
            count_total = count_0 + count_1
            expected_0 = count_total*sum(count_0)/total
            expected_1 = count_total*sum(count_1)/total
            chi_ = (count_0 - expected_0)**2/expected_0 + (count_1 - expected_1)**2/expected_1
            chi_ = np.nan_to_num(chi_) # Deal with the zero counts
            chi.append(sum(chi_)) # Finally do the summation for Chi2
        min_chi = min(chi) # Find the minimal Chi2 for current iteration
        for i, v in enumerate(chi):
            if v == min_chi:
                min_chi_index = i # Find the index of the interval to be merged
                break
        new_intervals = [] # Prepare for the merged new data array
        skip = False
        done = False
        for i in range(len(intervals)):
            if skip:
                skip = False
                continue
            if i == min_chi_index and not done: # Merge the intervals
                t = intervals[i] + intervals[i+1]
                new_intervals.append([min(t), max(t)])
                skip = True
                done = True
            else:
                new_intervals.append(intervals[i])
        intervals = new_intervals
    for i in intervals:
        print('[', i[0], ',', i[1], ']', sep='')
    return intervals
        

In [18]:
# REVISED FUNCTION: Mapping interval to dataframe 
def chi_map_2 (attr, data):
    disc_list = []
    for j in range(0,len(data)):
        #print('Raw value:', data[attr].loc[j])
        for i in range (0,len(chi_intervals[attr])):      
            if int(data[attr].loc[j]) in range(int(chi_intervals[attr][i][0]),int(chi_intervals[attr][i][1])):
                disc_list.append(i)
                #print('label',i)
            elif int(data[attr].loc[j]) in chi_intervals[attr][i]: # In case upper and lower are equal
                disc_list.append(i)
                #print('label',i)
    return disc_list

In [19]:
# DEBUG CHIMAP FUNCTION: Mapping interval to dataframe 
def chi_map (attr, data):
    disc_list = []
    for j in range(0,len(data)):
        #print('Raw value:', data[attr].loc[j])
        for i in range (0,len(chi_intervals[attr])):      
            if (chi_intervals[attr][i][0] <= data[attr].loc[j]) and (data[attr].loc[j] <= chi_intervals[attr][i][1]):
                disc_list.append(i)
    return disc_list

In [20]:
# #Solution 1: Convert range to int, pass input as int
# australia['A2']
# chi_intervals['A2']
# range(int(chi_intervals['A2'][0][0]),int(chi_intervals['A2'][0][1]))
# int(australia['A2'][0]) in range(int(chi_intervals['A2'][0][0]),int(chi_intervals['A2'][0][1]))


## 2.1 Chi merge with 6 intervals

In [24]:
#Chi Merge with 6 intervals
import time
start = time.time() # For measuring time execution

# Create a dictionary for saving intervals
chi_intervals = {} 
for i in num_list:
    interval = chimerge(data=aus, attr=i, label='label', max_intervals=6)
    chi_intervals[i]=interval
    
# For checking result: Get the number of intervals after chi merge discretization for each attribute
for k, v in chi_intervals.items():
    print(k)
    print(len(v)) # number of intervals
    

# Mapping original data to the intervals created 
aus_disc_6intervals = {}
for k in num_list:
    print(k)
    v = chi_map(k, aus) 
    print(v)
    aus_disc_6intervals[k]=v



  chi_ = (count_0 - expected_0)**2/expected_0 + (count_1 - expected_1)**2/expected_1


[13.75,23.0]
[23.08,23.33]
[23.42,40.92]
[41.0,51.58]
[51.83,52.42]
[52.5,80.25]
[0.0,1.125]
[1.165,1.375]
[1.415,4.165]
[4.25,6.75]
[7.0,7.25]
[7.5,28.0]
[0.0,0.585]
[0.625,1.375]
[1.415,1.46]
[1.5,4.0]
[4.165,13.875]
[14.0,28.5]
[0,0]
[1,2]
[3,4]
[5,9]
[10,10]
[11,67]
[0,102]
[108,369]
[370,399]
[400,480]
[487,520]
[523,2000]
[1,1]
[2,6]
[7,287]
[301,328]
[341,365]
[368,100001]
A2
6
A3
6
A7
6
A10
6
A13
6
A14
6
A2
[0, 0, 2, 0, 0, 0, 0, 5, 2, 5, 2, 3, 0, 2, 5, 3, 2, 0, 0, 0, 2, 0, 3, 3, 0, 2, 0, 2, 2, 1, 2, 0, 4, 1, 3, 5, 2, 2, 3, 3, 1, 0, 2, 5, 2, 0, 2, 0, 2, 5, 2, 0, 2, 0, 2, 2, 2, 2, 3, 5, 3, 2, 0, 2, 2, 3, 0, 0, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 0, 2, 2, 2, 2, 3, 2, 0, 2, 3, 2, 2, 0, 0, 2, 2, 0, 0, 2, 2, 2, 2, 0, 2, 2, 0, 3, 2, 2, 3, 2, 2, 2, 2, 0, 2, 0, 3, 2, 0, 2, 5, 2, 0, 5, 2, 3, 0, 2, 0, 2, 2, 2, 2, 0, 2, 2, 3, 0, 0, 0, 5, 0, 5, 5, 0, 2, 3, 1, 3, 0, 0, 5, 0, 0, 2, 0, 3, 2, 2, 3, 2, 3, 0, 1, 0, 0, 0, 2, 0, 0, 5, 2, 2, 5, 5, 3, 3, 3, 2, 2, 2, 1, 2, 2, 2, 2, 0, 0, 2, 2, 2

In [25]:
# Convert the discrete output to dataframe.
# For training, need to merge the label column of the original dataframe to this one
aus_disc_6intervals = pd.DataFrame.from_dict(aus_disc_6intervals)

aus_disc_6intervals.head()

end = time.time()
print(end - start) # Total time execution for this sample

97.17157411575317


In [26]:
tmp = pd.concat([aus_disc_6intervals,y_list], axis=1)
tmp
# Export this dataset for discretization
#convert to csv file
tmp.to_csv('chim_aus_6int.csv',index=False)

In [27]:
tmp

Unnamed: 0,A2,A3,A7,A10,A13,A14,label
0,0,5,3,0,0,5,0
1,0,4,0,0,1,0,0
2,2,2,1,0,1,0,0
3,0,5,0,5,0,0,1
4,0,5,3,5,0,2,1
...,...,...,...,...,...,...,...
685,2,5,4,0,0,0,1
686,0,0,0,0,0,2,0
687,0,5,0,0,0,0,1
688,2,5,3,1,1,2,1


In [28]:
#Chi Merge with 6 intervals
import time
start = time.time() # For measuring time execution

# Create a dictionary for saving intervals
chi_intervals = {} 
for i in num_list:
    interval = chimerge(data=aus, attr=i, label='label', max_intervals=6)
    chi_intervals[i]=interval
    
# For checking result: Get the number of intervals after chi merge discretization for each attribute
for k, v in chi_intervals.items():
    print(k)
    print(len(v)) # number of intervals
    

# Mapping original data to the intervals created
aus_disc_6intervals = {}
for k in num_list:
    print(k)
    v = chi_map(k, aus)
    print(v)
    aus_disc_6intervals[k]=v


# Convert the discrete output to dataframe.
# For training, need to merge the label column of the original dataframe to this one
aus_disc_6intervals = pd.DataFrame.from_dict(aus_disc_6intervals)

aus_disc_6intervals.head(10)

end = time.time()
print(end - start) # Total time execution for this sample

#convert to csv file

  chi_ = (count_0 - expected_0)**2/expected_0 + (count_1 - expected_1)**2/expected_1


[13.75,23.0]
[23.08,23.33]
[23.42,40.92]
[41.0,51.58]
[51.83,52.42]
[52.5,80.25]
[0.0,1.125]
[1.165,1.375]
[1.415,4.165]
[4.25,6.75]
[7.0,7.25]
[7.5,28.0]
[0.0,0.585]
[0.625,1.375]
[1.415,1.46]
[1.5,4.0]
[4.165,13.875]
[14.0,28.5]
[0,0]
[1,2]
[3,4]
[5,9]
[10,10]
[11,67]
[0,102]
[108,369]
[370,399]
[400,480]
[487,520]
[523,2000]
[1,1]
[2,6]
[7,287]
[301,328]
[341,365]
[368,100001]
A2
6
A3
6
A7
6
A10
6
A13
6
A14
6
A2
[0, 0, 2, 0, 0, 0, 0, 5, 2, 5, 2, 3, 0, 2, 5, 3, 2, 0, 0, 0, 2, 0, 3, 3, 0, 2, 0, 2, 2, 1, 2, 0, 4, 1, 3, 5, 2, 2, 3, 3, 1, 0, 2, 5, 2, 0, 2, 0, 2, 5, 2, 0, 2, 0, 2, 2, 2, 2, 3, 5, 3, 2, 0, 2, 2, 3, 0, 0, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 0, 2, 2, 2, 2, 3, 2, 0, 2, 3, 2, 2, 0, 0, 2, 2, 0, 0, 2, 2, 2, 2, 0, 2, 2, 0, 3, 2, 2, 3, 2, 2, 2, 2, 0, 2, 0, 3, 2, 0, 2, 5, 2, 0, 5, 2, 3, 0, 2, 0, 2, 2, 2, 2, 0, 2, 2, 3, 0, 0, 0, 5, 0, 5, 5, 0, 2, 3, 1, 3, 0, 0, 5, 0, 0, 2, 0, 3, 2, 2, 3, 2, 3, 0, 1, 0, 0, 0, 2, 0, 0, 5, 2, 2, 5, 5, 3, 3, 3, 2, 2, 2, 1, 2, 2, 2, 2, 0, 0, 2, 2, 2

## 2.2 ChiMerge with 8 Intervals

In [29]:
#Chi Merge with 8 intervals
import time
start = time.time() # For measuring time execution

# Create a dictionary for saving intervals
chi_intervals = {} 
for i in num_list:
    interval = chimerge(data=aus, attr=i, label='label', max_intervals=8)
    chi_intervals[i]=interval
    
# For checking result: Get the number of intervals after chi merge discretization for each attribute
for k, v in chi_intervals.items():
    print(k)
    print(len(v)) # number of intervals
    

# Mapping original data to the intervals created
aus_disc_8intervals = {}
for k in num_list:
    print(k)
    v = chi_map(k, aus)
    print(v)
    aus_disc_8intervals[k]=v


# Convert the discrete output to dataframe.
# For training, need to merge the label column of the original dataframe to this one
aus_disc_8intervals = pd.DataFrame.from_dict(aus_disc_8intervals)

aus_disc_8intervals.head(10)

end = time.time()
print(end - start) # Total time execution for this sample

#convert to csv file

  chi_ = (count_0 - expected_0)**2/expected_0 + (count_1 - expected_1)**2/expected_1


[13.75,20.08]
[20.17,21.0]
[21.08,23.0]
[23.08,23.33]
[23.42,40.92]
[41.0,51.58]
[51.83,52.42]
[52.5,80.25]
[0.0,1.125]
[1.165,1.375]
[1.415,4.165]
[4.25,6.75]
[7.0,7.25]
[7.5,19.0]
[19.5,25.125]
[25.21,28.0]
[0.0,0.455]
[0.46,0.46]
[0.5,0.585]
[0.625,1.375]
[1.415,1.46]
[1.5,4.0]
[4.165,13.875]
[14.0,28.5]
[0,0]
[1,2]
[3,4]
[5,7]
[8,9]
[10,10]
[11,19]
[20,67]
[0,102]
[108,168]
[170,180]
[181,369]
[370,399]
[400,480]
[487,520]
[523,2000]
[1,1]
[2,6]
[7,99]
[100,287]
[301,328]
[341,365]
[368,5553]
[5778,100001]
A2
8
A3
8
A7
8
A10
8
A13
8
A14
8
A2
[2, 2, 4, 2, 1, 0, 0, 7, 4, 7, 4, 5, 1, 4, 7, 5, 4, 0, 0, 2, 4, 0, 5, 5, 0, 4, 2, 4, 4, 3, 4, 1, 6, 3, 5, 7, 4, 4, 5, 5, 3, 2, 4, 7, 4, 1, 4, 0, 4, 7, 4, 2, 4, 0, 4, 4, 4, 4, 5, 7, 5, 4, 0, 4, 4, 5, 0, 2, 4, 4, 5, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 0, 4, 4, 4, 4, 5, 4, 2, 4, 5, 4, 4, 0, 2, 4, 4, 0, 1, 4, 4, 4, 4, 0, 4, 4, 2, 5, 4, 4, 5, 4, 4, 4, 4, 1, 4, 2, 5, 4, 1, 4, 7, 4, 2, 7, 4, 5, 1, 4, 1, 4, 4, 4, 4, 0, 4, 4, 5, 0, 2, 0, 7, 0, 7, 7, 0, 4, 

[6, 0, 0, 0, 3, 0, 3, 6, 6, 2, 6, 0, 3, 6, 0, 6, 0, 6, 1, 6, 6, 0, 0, 0, 5, 0, 2, 7, 2, 6, 3, 2, 3, 3, 3, 5, 0, 0, 6, 6, 0, 2, 6, 0, 2, 0, 0, 0, 2, 6, 0, 3, 6, 2, 1, 2, 6, 3, 6, 0, 0, 1, 4, 0, 6, 0, 0, 2, 3, 6, 0, 2, 0, 2, 6, 2, 0, 0, 0, 6, 7, 3, 2, 0, 0, 0, 1, 0, 2, 0, 0, 3, 3, 6, 0, 2, 6, 0, 2, 0, 0, 0, 6, 6, 1, 7, 7, 0, 0, 0, 0, 3, 0, 6, 0, 6, 2, 6, 6, 6, 6, 0, 0, 6, 0, 3, 6, 3, 6, 1, 6, 0, 0, 0, 1, 0, 6, 6, 0, 6, 0, 0, 1, 6, 2, 6, 3, 0, 0, 7, 0, 0, 0, 1, 7, 1, 2, 1, 0, 0, 2, 0, 3, 0, 3, 0, 6, 0, 6, 6, 0, 6, 2, 0, 0, 0, 1, 2, 0, 0, 3, 0, 3, 0, 4, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 6, 7, 2, 3, 0, 6, 0, 7, 1, 6, 0, 3, 2, 3, 0, 6, 4, 0, 1, 2, 0, 0, 6, 5, 6, 6, 0, 6, 2, 0, 0, 1, 0, 6, 6, 2, 1, 0, 2, 3, 4, 6, 0, 0, 0, 7, 6, 6, 5, 0, 2, 1, 6, 0, 0, 0, 6, 3, 0, 6, 3, 0, 0, 0, 0, 2, 0, 0, 0, 6, 3, 3, 7, 0, 0, 6, 0, 0, 6, 3, 0, 6, 0, 6, 2, 3, 0, 0, 1, 0, 2, 0, 0, 2, 0, 0, 1, 6, 2, 0, 1, 3, 6, 0, 6, 0, 0, 0, 0, 3, 0, 1, 0, 6, 2, 0, 2, 0, 3, 0, 0, 0, 0, 2, 6, 4, 0, 4, 0, 0, 3, 0, 0, 2, 5, 3, 0, 0, 

In [30]:
tmp = pd.concat([aus_disc_8intervals,y_list], axis=1)
tmp
# Export this dataset for discretization
#convert to csv file
tmp.to_csv('chim_aus_8int.csv',index=False)

## 2.3 ChiMerge with 10 Intervals

In [31]:
#Chi Merge with 10 intervals
import time
start = time.time() # For measuring time execution

# Create a dictionary for saving intervals
chi_intervals = {} 
for i in num_list:
    interval = chimerge(data=aus, attr=i, label='label', max_intervals=10)
    chi_intervals[i]=interval
    
# For checking result: Get the number of intervals after chi merge discretization for each attribute
for k, v in chi_intervals.items():
    print(k)
    print(len(v)) # number of intervals
    

# Mapping original data to the intervals created (sample size = 50)
aus_disc_10intervals = {}
for k in num_list:
    print(k)
    v = chi_map(k, aus) # musk 50: sample data
    print(v)
    aus_disc_10intervals[k]=v


# Convert the discrete output to dataframe.
# For training, need to merge the label column of the original dataframe to this one
aus_disc_10intervals = pd.DataFrame.from_dict(aus_disc_10intervals)

aus_disc_10intervals.head(10)

end = time.time()
print(end - start) # Total time execution for this sample

#convert to csv file

  chi_ = (count_0 - expected_0)**2/expected_0 + (count_1 - expected_1)**2/expected_1


[13.75,20.08]
[20.17,21.0]
[21.08,21.17]
[21.25,21.33]
[21.42,23.0]
[23.08,23.33]
[23.42,40.92]
[41.0,51.58]
[51.83,52.42]
[52.5,80.25]
[0.0,1.125]
[1.165,1.375]
[1.415,4.165]
[4.25,6.75]
[7.0,7.25]
[7.5,14.585]
[14.79,15.0]
[15.5,19.0]
[19.5,25.125]
[25.21,28.0]
[0.0,0.04]
[0.085,0.455]
[0.46,0.46]
[0.5,0.585]
[0.625,0.625]
[0.665,1.375]
[1.415,1.46]
[1.5,4.0]
[4.165,13.875]
[14.0,28.5]
[0,0]
[1,2]
[3,4]
[5,7]
[8,9]
[10,10]
[11,12]
[13,19]
[20,20]
[23,67]
[0,102]
[108,168]
[170,180]
[181,369]
[370,399]
[400,420]
[422,434]
[440,480]
[487,520]
[523,2000]
[1,1]
[2,6]
[7,56]
[59,60]
[61,99]
[100,287]
[301,328]
[341,365]
[368,5553]
[5778,100001]
A2
10
A3
10
A7
10
A10
10
A13
10
A14
10
A2
[4, 4, 6, 4, 1, 0, 0, 9, 6, 9, 6, 7, 1, 6, 9, 7, 6, 0, 0, 4, 6, 0, 7, 7, 0, 6, 4, 6, 6, 5, 6, 1, 8, 5, 7, 9, 6, 6, 7, 7, 5, 4, 6, 9, 6, 1, 6, 0, 6, 9, 6, 2, 6, 0, 6, 6, 6, 6, 7, 9, 7, 6, 0, 6, 6, 7, 0, 4, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6, 0, 6, 6, 6, 6, 7, 6, 4, 6, 7, 6, 6, 0, 3, 6, 6, 0, 1, 6, 6, 6

[0, 1, 3, 0, 0, 0, 0, 0, 2, 0, 3, 7, 1, 0, 3, 0, 3, 0, 1, 1, 3, 1, 1, 1, 0, 0, 3, 2, 3, 0, 3, 1, 3, 3, 3, 0, 0, 0, 0, 8, 0, 1, 0, 2, 1, 0, 1, 0, 3, 0, 3, 1, 3, 1, 3, 6, 0, 3, 0, 3, 1, 3, 1, 9, 3, 0, 4, 0, 2, 9, 0, 0, 3, 1, 0, 7, 3, 3, 0, 1, 0, 0, 0, 9, 3, 3, 3, 0, 3, 0, 1, 1, 0, 0, 3, 1, 0, 0, 0, 0, 7, 3, 8, 3, 0, 1, 1, 0, 0, 3, 1, 0, 0, 0, 5, 3, 0, 3, 0, 3, 0, 3, 0, 0, 3, 5, 0, 3, 0, 1, 1, 3, 3, 0, 1, 1, 3, 3, 3, 4, 1, 3, 2, 0, 1, 0, 0, 1, 3, 4, 3, 1, 9, 3, 0, 8, 3, 1, 0, 8, 7, 0, 3, 3, 0, 3, 0, 1, 3, 1, 3, 0, 5, 1, 1, 9, 0, 0, 3, 0, 0, 3, 1, 0, 1, 3, 3, 0, 3, 3, 1, 1, 2, 3, 2, 3, 3, 1, 1, 9, 0, 1, 1, 3, 0, 3, 1, 3, 5, 1, 4, 2, 0, 1, 2, 0, 1, 3, 3, 8, 0, 0, 0, 7, 9, 3, 0, 0, 0, 1, 0, 0, 2, 0, 3, 0, 3, 0, 3, 1, 0, 1, 0, 0, 3, 3, 3, 0, 3, 3, 0, 1, 0, 1, 0, 0, 6, 3, 3, 0, 0, 0, 2, 4, 0, 0, 2, 8, 3, 0, 0, 3, 1, 3, 0, 7, 3, 3, 1, 1, 3, 0, 5, 3, 3, 4, 0, 0, 0, 1, 5, 3, 3, 9, 0, 2, 3, 3, 7, 1, 0, 1, 3, 1, 0, 4, 3, 0, 3, 5, 3, 0, 1, 0, 0, 0, 9, 1, 1, 0, 0, 0, 0, 0, 3, 8, 3, 0, 1, 0, 0, 1, 1, 

In [32]:
tmp = pd.concat([aus_disc_10intervals,y_list], axis=1)
tmp
# Export this dataset for discretization
#convert to csv file
tmp.to_csv('chim_aus_10int.csv',index=False)

In [33]:
tmp

Unnamed: 0,A2,A3,A7,A10,A13,A14,label
0,4,5,7,0,0,8,0
1,4,4,1,0,1,0,0
2,6,2,5,0,3,0,0
3,4,5,0,6,0,0,1
4,1,5,7,7,0,5,1
...,...,...,...,...,...,...,...
685,6,5,8,0,0,0,1
686,1,0,1,0,0,2,0
687,0,5,1,0,0,0,1
688,6,5,7,1,1,2,1


## 2.4 ChiMerge with 15 Intervals

In [34]:
#Chi Merge with 15 intervals
import time
start = time.time() # For measuring time execution

# Create a dictionary for saving intervals
chi_intervals = {} 
for i in num_list:
    interval = chimerge(data=aus, attr=i, label='label', max_intervals=15)
    chi_intervals[i]=interval
    
# For checking result: Get the number of intervals after chi merge discretization for each attribute
for k, v in chi_intervals.items():
    print(k)
    print(len(v)) # number of intervals
    

# Mapping original data to the intervals created
aus_disc_15intervals = {}
for k in num_list:
    print(k)
    v = chi_map(k, aus) 
    print(v)
    aus_disc_15intervals[k]=v


# Convert the discrete output to dataframe.
# For training, need to merge the label column of the original dataframe to this one
aus_disc_15intervals = pd.DataFrame.from_dict(aus_disc_15intervals)

aus_disc_15intervals.head(10)

end = time.time()
print(end - start) # Total time execution for this sample

#convert to csv file

  chi_ = (count_0 - expected_0)**2/expected_0 + (count_1 - expected_1)**2/expected_1


[13.75,16.17]
[16.25,17.42]
[17.5,20.08]
[20.17,21.0]
[21.08,21.17]
[21.25,21.33]
[21.42,23.0]
[23.08,23.33]
[23.42,23.75]
[23.92,28.17]
[28.25,28.42]
[28.5,40.92]
[41.0,51.58]
[51.83,52.42]
[52.5,80.25]
[0.0,0.29]
[0.335,0.335]
[0.375,0.58]
[0.585,0.79]
[0.83,1.125]
[1.165,1.375]
[1.415,2.125]
[2.165,4.165]
[4.25,6.75]
[7.0,7.25]
[7.5,14.585]
[14.79,15.0]
[15.5,19.0]
[19.5,25.125]
[25.21,28.0]
[0.0,0.04]
[0.085,0.455]
[0.46,0.46]
[0.5,0.585]
[0.625,0.625]
[0.665,1.0]
[1.04,1.085]
[1.165,1.165]
[1.21,1.375]
[1.415,1.46]
[1.5,3.085]
[3.125,3.17]
[3.25,4.0]
[4.165,13.875]
[14.0,28.5]
[0,0]
[1,1]
[2,2]
[3,3]
[4,4]
[5,5]
[6,6]
[7,7]
[8,9]
[10,10]
[11,11]
[12,12]
[13,19]
[20,20]
[23,67]
[0,102]
[108,168]
[170,180]
[181,195]
[200,288]
[290,312]
[320,329]
[330,333]
[340,369]
[370,399]
[400,420]
[422,434]
[440,480]
[487,520]
[523,2000]
[1,1]
[2,6]
[7,14]
[15,16]
[17,56]
[59,60]
[61,99]
[100,287]
[301,328]
[341,365]
[368,3377]
[3553,4501]
[4608,5125]
[5201,5553]
[5778,100001]
A2
15
A3
15
A7
15


[10, 0, 0, 0, 7, 0, 7, 10, 10, 4, 10, 0, 7, 10, 0, 10, 0, 10, 1, 10, 10, 0, 0, 0, 9, 0, 6, 14, 4, 10, 7, 4, 7, 7, 7, 9, 0, 0, 10, 10, 0, 6, 13, 0, 2, 0, 0, 0, 4, 10, 0, 7, 10, 2, 1, 4, 10, 7, 10, 0, 0, 1, 8, 0, 10, 0, 0, 6, 7, 10, 0, 4, 0, 2, 10, 4, 0, 0, 0, 10, 14, 7, 4, 0, 0, 0, 1, 0, 2, 0, 0, 7, 7, 10, 0, 2, 10, 0, 2, 0, 0, 0, 10, 10, 1, 14, 14, 0, 0, 0, 0, 7, 0, 10, 0, 10, 4, 11, 10, 10, 10, 0, 0, 10, 0, 7, 10, 7, 10, 1, 10, 0, 0, 0, 1, 0, 10, 12, 0, 11, 0, 0, 1, 10, 6, 10, 7, 0, 0, 14, 0, 0, 0, 1, 14, 1, 4, 1, 0, 0, 4, 0, 7, 0, 7, 0, 11, 0, 10, 10, 0, 10, 2, 0, 0, 0, 1, 4, 0, 0, 7, 0, 7, 0, 8, 0, 0, 0, 0, 4, 4, 0, 0, 0, 0, 10, 14, 2, 7, 0, 10, 0, 14, 1, 10, 0, 7, 4, 7, 0, 10, 8, 0, 1, 6, 0, 0, 10, 9, 10, 10, 0, 12, 6, 0, 0, 1, 0, 10, 10, 2, 1, 0, 3, 7, 8, 10, 0, 0, 0, 14, 10, 12, 9, 0, 4, 1, 10, 0, 0, 0, 10, 7, 0, 10, 7, 0, 0, 0, 0, 4, 0, 0, 0, 10, 7, 7, 14, 0, 0, 10, 0, 0, 10, 7, 0, 10, 0, 10, 3, 7, 0, 0, 1, 0, 4, 0, 0, 4, 0, 0, 1, 10, 5, 0, 1, 7, 10, 0, 10, 0, 0, 0, 0, 7, 0, 1, 

In [35]:
tmp = pd.concat([aus_disc_15intervals,y_list], axis=1)
tmp
# Export this dataset for discretization
#convert to csv file
tmp.to_csv('chim_aus_15int.csv',index=False)

In [36]:
tmp

Unnamed: 0,A2,A3,A7,A10,A13,A14,label
0,6,10,10,0,0,10,0
1,6,9,1,0,1,0,0
2,11,6,8,0,4,0,0
3,6,10,0,10,0,0,1
4,3,10,10,12,0,7,1
...,...,...,...,...,...,...,...
685,11,10,13,0,0,0,1
686,3,2,1,0,0,4,0
687,2,10,1,0,0,0,1
688,9,10,10,1,1,2,1
