# Discretization of pre-processed data
## Dataset: pima
By: Sam
Replicate using Malina script
Update: 13/10/22

### About Dataset
The dataset has one target (dependent) variable, `Outcome`, and 8 attributes (TYNECKI, 2018):
- pregnancies,
- OGTT (Oral Glucose Tolerance Test),
- blood pressure,
- skin thickness,
- insulin,
- BMI (Body Mass Index),
- age,
- diabetes pedigree function

# 1. Preparing data

In [1]:
# Import library
import pandas as pd
import numpy as np
from collections import Counter #for Chi Merge

In [2]:
# Read the cleaned dataset that will be discretized
data0 = pd.read_csv('clean_pima.csv')
# Work on a copy: `pima = data0` would only alias the same object, so the
# in-place column edits made to `pima` later would also modify `data0`.
pima = data0.copy()

In [3]:
# Display the loaded DataFrame as the cell output for a quick visual check.
pima

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [4]:
# Import the preprocessing module that provides LabelEncoder
from sklearn import preprocessing
  
# Create the encoder used to turn the class labels into integer codes.
label_encoder = preprocessing.LabelEncoder()
  
# Fit the encoder on 'Outcome' and overwrite the column with the encoded values.
pima['Outcome']= label_encoder.fit_transform(pima['Outcome'])
  
# Show the distinct encoded values as a sanity check.
pima['Outcome'].unique()

array([1, 0], dtype=int64)

In [5]:
# Names of the attributes to discretize: every column except the class label.
num_list = pima.drop(columns=['Outcome']).columns

In [6]:
# Display the names of the attributes that will be discretized.
num_list

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age'],
      dtype='object')

In [7]:
# Keep the class label as a one-column DataFrame so it can be concatenated
# back onto the discretized attributes later.
y_list = pima['Outcome'].to_frame()

In [8]:
# Only the last expression of the cell is rendered, so the bare `num_list`
# line produces no output here; the table shown below is `y_list`.
num_list
y_list

Unnamed: 0,Outcome
0,1
1,0
2,1
3,0
4,1
...,...
763,0
764,0
765,0
766,1


# 2. ChiMerge discretization (implementation 1), based on Kerber (1992)
Source: https://gist.github.com/alanzchen/17d0c4a45d59b79052b1cd07f531689e

In [9]:
#Define function for chiMerge
def _adjacent_chi2(count_0, count_1):
    """Return the chi-square statistic of two adjacent intervals.

    count_0 and count_1 are integer arrays holding the number of rows per
    class label in each interval, both in the same label order.
    """
    rows_0 = sum(count_0)
    rows_1 = sum(count_1)
    total = rows_0 + rows_1
    count_total = count_0 + count_1
    # A label that occurs in neither interval has an expected count of 0 and
    # produces 0/0 below. nan_to_num() maps that to 0, so the NumPy warning
    # carries no information and is silenced locally.
    with np.errstate(divide='ignore', invalid='ignore'):
        expected_0 = count_total * rows_0 / total
        expected_1 = count_total * rows_1 / total
        chi_ = (count_0 - expected_0)**2/expected_0 + (count_1 - expected_1)**2/expected_1
    chi_ = np.nan_to_num(chi_)  # Deal with the zero counts
    return sum(chi_)


def chimerge(data, attr, label, max_intervals):
    """Discretize one numeric attribute with bottom-up ChiMerge.

    Every distinct value of ``data[attr]`` starts as its own interval. The
    adjacent pair with the smallest chi-square statistic (the leftmost pair
    on ties) is merged repeatedly until at most ``max_intervals`` remain.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing both the attribute and the label column.
    attr : str
        Name of the column to discretize.
    label : str
        Name of the class-label column.
    max_intervals : int
        Maximum number of intervals to keep; must be at least 1.

    Returns
    -------
    list of [low, high]
        The final closed intervals in ascending order. They are also printed,
        one per line, as ``[low,high]``.

    Raises
    ------
    ValueError
        If ``max_intervals`` is smaller than 1.
    """
    if max_intervals < 1:
        raise ValueError('max_intervals must be at least 1')

    distinct_vals = sorted(set(data[attr]))  # Sort the distinct values
    labels = sorted(set(data[label]))  # Get all possible labels

    # Count every (value, label) combination once up front. An interval's
    # label counts are then just the sum over the values it covers, so the
    # DataFrame never has to be filtered again inside the merge loop.
    pair_counts = Counter(zip(data[attr], data[label]))
    intervals = [[val, val] for val in distinct_vals]
    counts = [
        np.array([pair_counts[(val, lab)] for lab in labels])
        for val in distinct_vals
    ]

    # chi[k] is the statistic between intervals k and k + 1.
    chi = [
        _adjacent_chi2(counts[k], counts[k + 1])
        for k in range(len(counts) - 1)
    ]

    while len(intervals) > max_intervals:
        # list.index returns the first match, i.e. the leftmost minimal pair.
        merge_at = chi.index(min(chi))

        # Replace the pair by one interval spanning both, and add their counts.
        intervals[merge_at:merge_at + 2] = [
            [intervals[merge_at][0], intervals[merge_at + 1][1]]
        ]
        counts[merge_at:merge_at + 2] = [counts[merge_at] + counts[merge_at + 1]]

        # Only the statistics that involve the merged interval have changed.
        del chi[merge_at]
        if merge_at > 0:
            chi[merge_at - 1] = _adjacent_chi2(counts[merge_at - 1], counts[merge_at])
        if merge_at < len(chi):
            chi[merge_at] = _adjacent_chi2(counts[merge_at], counts[merge_at + 1])

    for i in intervals:
        print('[', i[0], ',', i[1], ']', sep='')
    return intervals
        

In [10]:
# DEBUG CHIMAP FUNCTION: Mapping interval to dataframe 
def chi_map(attr, data, intervals=None):
    """Map each value of ``data[attr]`` to the index of its interval.

    Parameters
    ----------
    attr : str
        Name of the column to map.
    data : pandas.DataFrame
        Table containing the column.
    intervals : list of [low, high], optional
        Closed intervals to map onto. When omitted, the module-level
        ``chi_intervals[attr]`` is used, as in the original notebook flow.

    Returns
    -------
    list
        One entry per row, in row order: the index of the first interval
        whose bounds contain the value, or ``None`` if no interval does.
        Emitting one entry per row keeps the result aligned with ``data``.
    """
    if intervals is None:
        intervals = chi_intervals[attr]
    disc_list = []
    # Iterate over the values themselves instead of `.loc[j]`, so the mapping
    # does not depend on the DataFrame having a default 0..n-1 index.
    for value in data[attr]:
        disc_list.append(next(
            (idx for idx, bounds in enumerate(intervals)
             if bounds[0] <= value <= bounds[1]),
            None,
        ))
    return disc_list

## 2.1 ChiMerge with 6 Intervals

In [11]:
# ChiMerge with 6 intervals
import time
start = time.perf_counter()  # monotonic clock, suited to measuring elapsed time

# Fit the cut points: attribute name -> list of [low, high] intervals.
# chi_map() reads this module-level dictionary, so it must be named chi_intervals.
chi_intervals = {
    col: chimerge(data=pima, attr=col, label='Outcome', max_intervals=6)
    for col in num_list
}

# For checking result: number of intervals obtained for each attribute
for col, col_intervals in chi_intervals.items():
    print(col)
    print(len(col_intervals))  # number of intervals

# Map the original values to the index of the interval that contains them
pima_disc_6intervals = {}
for col in num_list:
    print(col)
    codes = chi_map(col, pima)
    print(codes)
    pima_disc_6intervals[col] = codes

# Convert the discrete output to a DataFrame.
# The label column is not part of it; for training it still has to be merged
# back in from the original DataFrame.
pima_disc_6intervals = pd.DataFrame.from_dict(pima_disc_6intervals)

end = time.perf_counter()
print(end - start)  # Total execution time in seconds

  chi_ = (count_0 - expected_0)**2/expected_0 + (count_1 - expected_1)**2/expected_1


[0,0]
[1,2]
[3,6]
[7,9]
[10,13]
[14,17]
[0,0]
[44,99]
[100,127]
[128,154]
[155,166]
[167,199]
[0,40]
[44,61]
[62,75]
[76,100]
[102,104]
[106,122]
[0,7]
[8,23]
[24,31]
[32,51]
[52,54]
[56,99]
[0,14]
[15,87]
[88,95]
[96,99]
[100,112]
[114,846]
[0.0,26.3]
[26.4,32.8]
[32.9,32.9]
[33.1,33.2]
[33.3,47.9]
[48.3,67.1]
[0.078,0.271]
[0.272,0.278]
[0.279,1.174]
[1.182,1.394]
[1.4,1.781]
[1.893,2.42]
[21,24]
[25,30]
[31,42]
[43,54]
[55,62]
[63,81]
Pregnancies
6
Glucose
6
BloodPressure
6
SkinThickness
6
Insulin
6
BMI
6
DiabetesPedigreeFunction
6
Age
6
Pregnancies
[2, 1, 3, 1, 0, 2, 2, 4, 1, 3, 2, 4, 4, 1, 2, 3, 0, 3, 1, 1, 2, 3, 3, 3, 4, 4, 3, 1, 4, 2, 2, 2, 2, 2, 4, 2, 4, 3, 1, 2, 2, 3, 3, 3, 3, 0, 1, 1, 3, 3, 1, 1, 2, 3, 3, 1, 3, 0, 0, 0, 1, 3, 2, 1, 3, 2, 0, 1, 1, 2, 1, 2, 4, 2, 1, 1, 3, 2, 0, 1, 2, 1, 3, 0, 2, 1, 4, 1, 5, 1, 1, 2, 3, 2, 1, 2, 1, 1, 2, 1, 1, 1, 0, 1, 1, 1, 1, 2, 2, 0, 2, 3, 1, 2, 3, 2, 2, 2, 2, 2, 0, 2, 1, 2, 0, 1, 2, 1, 1, 0, 2, 3, 2, 3, 1, 1, 0, 0, 0, 2, 2, 2, 1, 4, 2, 0, 3,

[4, 1, 0, 1, 4, 0, 1, 4, 1, 0, 4, 4, 1, 1, 0, 1, 4, 1, 4, 4, 4, 4, 4, 1, 4, 1, 4, 0, 0, 4, 4, 1, 0, 0, 1, 0, 3, 2, 4, 4, 4, 4, 0, 4, 1, 4, 1, 1, 4, 0, 0, 0, 0, 4, 4, 0, 4, 4, 4, 4, 0, 2, 0, 0, 1, 1, 1, 4, 0, 1, 2, 1, 4, 4, 1, 0, 1, 4, 4, 0, 0, 0, 1, 0, 5, 1, 4, 4, 4, 1, 0, 1, 4, 0, 0, 4, 1, 0, 1, 5, 4, 0, 0, 1, 4, 1, 0, 1, 4, 4, 4, 4, 1, 4, 1, 1, 4, 4, 1, 0, 5, 4, 4, 1, 4, 5, 4, 4, 4, 1, 1, 4, 4, 4, 0, 4, 1, 1, 1, 4, 0, 4, 1, 1, 1, 0, 1, 1, 4, 1, 4, 0, 4, 4, 4, 5, 0, 0, 1, 4, 1, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 4, 1, 1, 1, 5, 4, 4, 0, 4, 1, 1, 1, 4, 1, 1, 1, 1, 0, 3, 1, 5, 0, 4, 0, 0, 4, 1, 1, 4, 1, 0, 4, 0, 4, 4, 3, 4, 1, 4, 4, 4, 4, 4, 4, 1, 1, 4, 4, 1, 0, 1, 0, 4, 4, 4, 4, 4, 4, 4, 0, 4, 1, 4, 4, 4, 1, 0, 1, 3, 0, 1, 4, 1, 1, 5, 4, 1, 1, 1, 0, 4, 1, 4, 1, 1, 0, 4, 1, 1, 1, 1, 1, 4, 4, 4, 0, 1, 4, 0, 0, 3, 4, 4, 1, 1, 0, 0, 4, 4, 1, 1, 1, 0, 4, 4, 0, 4, 4, 4, 4, 4, 0, 4, 1, 1, 4, 0, 1, 1, 4, 5, 0, 4, 0, 0, 1, 2, 0, 4, 1, 1, 4, 4, 0, 1, 4, 0, 1, 1, 1, 1, 4, 0, 4, 4, 4, 1, 0, 1, 4, 

In [13]:
# Put the class label back next to the discretized attributes, then save the
# combined table as CSV without the row index.
tmp = pd.concat(
    [pima_disc_6intervals, y_list],
    axis=1,
)
tmp.to_csv('chim_pima_6int.csv', index=False)

In [14]:
# Display the exported table: discretized attributes plus the 'Outcome' label.
tmp

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,2,3,2,3,0,4,2,3,1
1,1,1,2,2,0,1,2,2,0
2,3,5,2,0,0,0,2,2,1
3,1,1,2,1,2,1,0,0,0
4,0,3,0,3,5,4,5,2,1
...,...,...,...,...,...,...,...,...,...
763,4,2,3,3,5,2,0,5,0
764,1,2,2,2,0,4,2,1,0
765,2,2,2,1,4,0,0,1,0
766,1,2,1,0,0,1,2,3,1


## 2.2 ChiMerge with 8 Intervals

In [15]:
# ChiMerge with 8 intervals
import time
start = time.perf_counter()  # monotonic clock, suited to measuring elapsed time

# Fit the cut points: attribute name -> list of [low, high] intervals.
# chi_map() reads this module-level dictionary, so it must be named chi_intervals.
chi_intervals = {
    col: chimerge(data=pima, attr=col, label='Outcome', max_intervals=8)
    for col in num_list
}

# For checking result: number of intervals obtained for each attribute
for col, col_intervals in chi_intervals.items():
    print(col)
    print(len(col_intervals))  # number of intervals

# Map the original values to the index of the interval that contains them
pima_disc_8intervals = {}
for col in num_list:
    print(col)
    codes = chi_map(col, pima)
    print(codes)
    pima_disc_8intervals[col] = codes

# Convert the discrete output to a DataFrame.
# The label column is not part of it; for training it still has to be merged
# back in from the original DataFrame.
pima_disc_8intervals = pd.DataFrame.from_dict(pima_disc_8intervals)

end = time.perf_counter()
print(end - start)  # Total execution time in seconds

  chi_ = (count_0 - expected_0)**2/expected_0 + (count_1 - expected_1)**2/expected_1


[0,0]
[1,2]
[3,6]
[7,9]
[10,10]
[11,11]
[12,13]
[14,17]
[0,0]
[44,99]
[100,114]
[115,115]
[116,127]
[128,154]
[155,166]
[167,199]
[0,40]
[44,46]
[48,54]
[55,61]
[62,75]
[76,100]
[102,104]
[106,122]
[0,7]
[8,23]
[24,31]
[32,49]
[50,50]
[51,51]
[52,54]
[56,99]
[0,14]
[15,79]
[81,87]
[88,91]
[92,95]
[96,99]
[100,112]
[114,846]
[0.0,26.3]
[26.4,32.8]
[32.9,32.9]
[33.1,33.2]
[33.3,45.3]
[45.4,45.8]
[46.1,47.9]
[48.3,67.1]
[0.078,0.126]
[0.127,0.271]
[0.272,0.278]
[0.279,0.536]
[0.537,1.174]
[1.182,1.394]
[1.4,1.781]
[1.893,2.42]
[21,21]
[22,24]
[25,30]
[31,42]
[43,54]
[55,62]
[63,65]
[66,81]
Pregnancies
8
Glucose
8
BloodPressure
8
SkinThickness
8
Insulin
8
BMI
8
DiabetesPedigreeFunction
8
Age
8
Pregnancies
[2, 1, 3, 1, 0, 2, 2, 4, 1, 3, 2, 4, 4, 1, 2, 3, 0, 3, 1, 1, 2, 3, 3, 3, 5, 4, 3, 1, 6, 2, 2, 2, 2, 2, 4, 2, 5, 3, 1, 2, 2, 3, 3, 3, 3, 0, 1, 1, 3, 3, 1, 1, 2, 3, 3, 1, 3, 0, 0, 0, 1, 3, 2, 1, 3, 2, 0, 1, 1, 2, 1, 2, 6, 2, 1, 1, 3, 2, 0, 1, 2, 1, 3, 0, 2, 1, 6, 1, 7, 1, 1, 2, 3, 2, 1, 2, 

[0, 0, 0, 4, 7, 0, 3, 0, 7, 0, 0, 0, 0, 7, 7, 0, 7, 0, 2, 5, 7, 0, 0, 0, 7, 7, 0, 7, 6, 0, 0, 7, 1, 0, 0, 7, 0, 0, 0, 7, 1, 0, 0, 7, 0, 0, 0, 0, 0, 0, 2, 1, 1, 7, 7, 0, 7, 6, 0, 7, 0, 0, 0, 7, 0, 0, 0, 0, 1, 6, 3, 7, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 7, 0, 1, 6, 0, 0, 7, 1, 0, 1, 7, 0, 1, 1, 7, 0, 0, 0, 1, 0, 7, 0, 7, 1, 1, 7, 7, 1, 0, 7, 0, 0, 0, 0, 1, 6, 0, 6, 0, 0, 5, 7, 4, 7, 0, 7, 0, 7, 0, 1, 7, 1, 4, 0, 7, 0, 0, 1, 0, 7, 0, 0, 7, 0, 0, 7, 0, 7, 7, 0, 0, 4, 7, 1, 7, 0, 6, 7, 0, 0, 7, 0, 0, 0, 1, 0, 7, 0, 1, 1, 7, 0, 7, 0, 0, 0, 4, 1, 0, 0, 0, 7, 1, 7, 7, 0, 4, 0, 0, 0, 7, 0, 1, 5, 7, 0, 0, 0, 1, 7, 0, 7, 0, 2, 0, 0, 0, 0, 7, 7, 7, 7, 7, 0, 0, 7, 0, 0, 7, 1, 1, 0, 0, 7, 1, 0, 7, 1, 0, 1, 0, 7, 0, 0, 0, 0, 3, 0, 7, 7, 0, 0, 7, 7, 0, 0, 0, 1, 0, 7, 0, 0, 0, 7, 7, 7, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 7, 0, 7, 0, 7, 7, 0, 0, 7, 7, 7, 1, 1, 1, 1, 7, 7, 0, 7, 7, 7, 7, 0, 0, 7, 1, 0, 0, 6, 7, 7, 7, 7, 0, 7, 5, 2, 0, 4, 1, 0, 7, 0, 7, 0, 0, 1, 0, 7, 7, 0, 7, 1, 0, 1, 0, 

In [17]:
# Put the class label back next to the discretized attributes, then save the
# combined table as CSV without the row index.
tmp = pd.concat(
    [pima_disc_8intervals, y_list],
    axis=1,
)
tmp.to_csv('chim_pima_8int.csv', index=False)

In [18]:
# Display the exported table: discretized attributes plus the 'Outcome' label.
tmp

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,2,5,4,3,0,4,4,4,1
1,1,1,4,2,0,1,3,3,0
2,3,7,4,0,0,0,4,3,1
3,1,1,4,1,4,1,1,0,0
4,0,5,0,3,7,4,7,3,1
...,...,...,...,...,...,...,...,...,...
763,4,2,5,3,7,2,1,6,0
764,1,4,4,2,0,4,3,2,0
765,2,4,4,1,6,0,1,2,0
766,1,4,3,0,0,1,3,4,1


## 2.3 ChiMerge with 10 Intervals

In [19]:
# ChiMerge with 10 intervals
import time
start = time.perf_counter()  # monotonic clock, suited to measuring elapsed time

# Fit the cut points: attribute name -> list of [low, high] intervals.
# chi_map() reads this module-level dictionary, so it must be named chi_intervals.
chi_intervals = {
    col: chimerge(data=pima, attr=col, label='Outcome', max_intervals=10)
    for col in num_list
}

# For checking result: number of intervals obtained for each attribute
for col, col_intervals in chi_intervals.items():
    print(col)
    print(len(col_intervals))  # number of intervals

# Map the original values to the index of the interval that contains them.
# The whole `pima` frame is mapped here, not a sample.
pima_disc_10intervals = {}
for col in num_list:
    print(col)
    codes = chi_map(col, pima)
    print(codes)
    pima_disc_10intervals[col] = codes

# Convert the discrete output to a DataFrame.
# The label column is not part of it; for training it still has to be merged
# back in from the original DataFrame.
pima_disc_10intervals = pd.DataFrame.from_dict(pima_disc_10intervals)

end = time.perf_counter()
print(end - start)  # Total execution time in seconds

  chi_ = (count_0 - expected_0)**2/expected_0 + (count_1 - expected_1)**2/expected_1


[0,0]
[1,1]
[2,2]
[3,6]
[7,8]
[9,9]
[10,10]
[11,11]
[12,13]
[14,17]
[0,0]
[44,99]
[100,114]
[115,115]
[116,123]
[124,125]
[126,127]
[128,154]
[155,166]
[167,199]
[0,40]
[44,46]
[48,54]
[55,61]
[62,68]
[70,74]
[75,75]
[76,100]
[102,104]
[106,122]
[0,7]
[8,13]
[14,23]
[24,31]
[32,48]
[49,49]
[50,50]
[51,51]
[52,54]
[56,99]
[0,14]
[15,57]
[58,78]
[79,79]
[81,87]
[88,91]
[92,95]
[96,99]
[100,112]
[114,846]
[0.0,26.3]
[26.4,32.8]
[32.9,32.9]
[33.1,33.2]
[33.3,42.8]
[42.9,44.5]
[44.6,45.3]
[45.4,45.8]
[46.1,47.9]
[48.3,67.1]
[0.078,0.126]
[0.127,0.129]
[0.13,0.271]
[0.272,0.278]
[0.279,0.536]
[0.537,0.543]
[0.545,1.174]
[1.182,1.394]
[1.4,1.781]
[1.893,2.42]
[21,21]
[22,24]
[25,30]
[31,42]
[43,43]
[44,48]
[49,54]
[55,62]
[63,65]
[66,81]
Pregnancies
10
Glucose
10
BloodPressure
10
SkinThickness
10
Insulin
10
BMI
10
DiabetesPedigreeFunction
10
Age
10
Pregnancies
[3, 1, 4, 1, 0, 3, 3, 6, 2, 4, 3, 6, 6, 1, 3, 4, 0, 4, 1, 1, 3, 4, 4, 5, 7, 6, 4, 1, 8, 3, 3, 3, 3, 3, 6, 3, 7, 5, 2, 3, 3, 4, 4, 5, 4

[4, 1, 0, 1, 5, 0, 1, 4, 1, 0, 4, 4, 1, 1, 0, 1, 7, 1, 5, 4, 4, 4, 4, 1, 4, 1, 4, 0, 0, 4, 4, 1, 0, 0, 1, 0, 3, 2, 4, 4, 4, 4, 0, 7, 1, 4, 1, 1, 4, 0, 0, 0, 0, 4, 4, 0, 4, 8, 4, 4, 0, 2, 0, 0, 1, 1, 1, 4, 0, 1, 2, 1, 5, 4, 1, 0, 1, 4, 5, 0, 0, 0, 1, 0, 9, 1, 4, 4, 4, 1, 0, 1, 8, 0, 0, 4, 1, 0, 1, 9, 4, 0, 0, 1, 4, 1, 0, 1, 4, 4, 4, 4, 1, 4, 1, 1, 4, 4, 1, 0, 9, 4, 4, 1, 4, 9, 5, 4, 4, 1, 1, 4, 4, 4, 0, 4, 1, 1, 1, 4, 0, 4, 1, 1, 1, 0, 1, 1, 4, 1, 4, 0, 4, 4, 8, 9, 0, 0, 1, 4, 1, 4, 5, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 5, 1, 1, 1, 9, 6, 4, 0, 4, 1, 1, 1, 4, 1, 1, 1, 1, 0, 3, 1, 9, 0, 4, 0, 0, 4, 1, 1, 4, 1, 0, 4, 0, 4, 4, 3, 4, 1, 4, 4, 4, 4, 4, 4, 1, 1, 4, 4, 1, 0, 1, 0, 4, 4, 4, 4, 6, 5, 8, 0, 4, 1, 5, 4, 5, 1, 0, 1, 3, 0, 1, 4, 1, 1, 9, 4, 1, 1, 1, 0, 4, 1, 4, 1, 1, 0, 4, 1, 1, 1, 1, 1, 4, 4, 4, 0, 1, 7, 0, 0, 3, 4, 4, 1, 1, 0, 0, 4, 4, 1, 1, 1, 0, 4, 7, 0, 4, 4, 4, 5, 4, 0, 4, 1, 1, 4, 0, 1, 1, 4, 9, 0, 4, 0, 0, 1, 2, 0, 4, 1, 1, 4, 4, 0, 1, 4, 0, 1, 1, 1, 1, 4, 0, 4, 4, 7, 1, 0, 1, 5, 

In [20]:
# Put the class label back next to the discretized attributes, then save the
# combined table as CSV without the row index.
tmp = pd.concat(
    [pima_disc_10intervals, y_list],
    axis=1,
)
tmp.to_csv('chim_pima_10int.csv', index=False)

In [21]:
# Display the exported table: discretized attributes plus the 'Outcome' label.
tmp

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,3,7,5,4,0,4,6,6,1
1,1,1,4,3,0,1,4,3,0
2,4,9,4,0,0,0,6,3,1
3,1,1,4,2,6,1,2,0,0
4,0,7,0,4,9,5,9,3,1
...,...,...,...,...,...,...,...,...,...
763,6,2,7,4,9,2,2,8,0
764,2,4,5,3,0,4,4,2,0
765,3,4,5,2,8,0,2,2,0
766,1,6,3,0,0,1,4,5,1


## 2.4 ChiMerge with 15 Intervals

In [22]:
# ChiMerge with 15 intervals
import time
start = time.perf_counter()  # monotonic clock, suited to measuring elapsed time

# Fit the cut points: attribute name -> list of [low, high] intervals.
# chi_map() reads this module-level dictionary, so it must be named chi_intervals.
chi_intervals = {
    col: chimerge(data=pima, attr=col, label='Outcome', max_intervals=15)
    for col in num_list
}

# For checking result: number of intervals obtained for each attribute
for col, col_intervals in chi_intervals.items():
    print(col)
    print(len(col_intervals))  # number of intervals

# Map the original values to the index of the interval that contains them
pima_disc_15intervals = {}
for col in num_list:
    print(col)
    codes = chi_map(col, pima)
    print(codes)
    pima_disc_15intervals[col] = codes

# Convert the discrete output to a DataFrame.
# The label column is not part of it; for training it still has to be merged
# back in from the original DataFrame.
pima_disc_15intervals = pd.DataFrame.from_dict(pima_disc_15intervals)

end = time.perf_counter()
print(end - start)  # Total execution time in seconds

[0,0]
[1,1]
[2,2]
[3,3]
[4,4]
[5,5]
[6,6]
[7,7]
[8,8]
[9,9]
[10,10]
[11,11]
[12,12]
[13,13]
[14,17]


  chi_ = (count_0 - expected_0)**2/expected_0 + (count_1 - expected_1)**2/expected_1


[0,0]
[44,77]
[78,99]
[100,114]
[115,115]
[116,123]
[124,125]
[126,127]
[128,152]
[153,154]
[155,156]
[157,157]
[158,164]
[165,166]
[167,199]
[0,38]
[40,40]
[44,46]
[48,54]
[55,58]
[60,61]
[62,68]
[70,74]
[75,75]
[76,76]
[78,80]
[82,100]
[102,104]
[106,114]
[122,122]
[0,7]
[8,13]
[14,14]
[15,16]
[17,23]
[24,31]
[32,40]
[41,48]
[49,49]
[50,50]
[51,51]
[52,54]
[56,56]
[60,60]
[63,99]
[0,14]
[15,46]
[48,48]
[49,57]
[58,58]
[59,78]
[79,79]
[81,87]
[88,91]
[92,95]
[96,99]
[100,112]
[114,135]
[140,142]
[144,846]
[0.0,0.0]
[18.2,22.7]
[22.9,23.2]
[23.3,23.5]
[23.6,26.3]
[26.4,28.8]
[28.9,32.8]
[32.9,32.9]
[33.1,33.2]
[33.3,42.8]
[42.9,44.5]
[44.6,45.3]
[45.4,45.8]
[46.1,47.9]
[48.3,67.1]
[0.078,0.126]
[0.127,0.129]
[0.13,0.238]
[0.239,0.241]
[0.243,0.271]
[0.272,0.278]
[0.279,0.295]
[0.296,0.297]
[0.299,0.3]
[0.302,0.536]
[0.537,0.543]
[0.545,1.174]
[1.182,1.394]
[1.4,1.781]
[1.893,2.42]
[21,21]
[22,24]
[25,30]
[31,33]
[34,34]
[35,36]
[37,37]
[38,42]
[43,43]
[44,47]
[48,48]
[49,54]
[55,62]
[6

[0, 0, 0, 9, 14, 0, 8, 0, 14, 0, 0, 0, 0, 14, 14, 0, 14, 0, 7, 10, 14, 0, 0, 0, 14, 12, 0, 13, 11, 0, 0, 14, 3, 0, 0, 14, 0, 0, 0, 14, 5, 0, 0, 14, 0, 0, 0, 0, 0, 0, 7, 1, 1, 14, 14, 0, 14, 11, 0, 13, 0, 0, 0, 12, 0, 0, 0, 0, 1, 11, 8, 13, 0, 14, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 12, 0, 5, 11, 0, 0, 14, 2, 0, 5, 14, 0, 5, 5, 14, 0, 0, 0, 1, 0, 14, 0, 13, 1, 1, 12, 14, 1, 0, 14, 0, 0, 0, 0, 3, 11, 0, 11, 0, 0, 10, 12, 9, 14, 0, 14, 0, 14, 0, 3, 13, 3, 9, 0, 14, 0, 0, 5, 0, 14, 0, 0, 12, 0, 0, 14, 0, 14, 14, 0, 0, 9, 12, 3, 12, 0, 11, 14, 0, 0, 14, 0, 0, 0, 5, 0, 12, 0, 2, 3, 12, 0, 12, 0, 0, 0, 9, 1, 0, 0, 0, 14, 4, 12, 14, 0, 9, 0, 0, 0, 14, 0, 2, 10, 14, 0, 0, 0, 1, 14, 0, 14, 0, 7, 0, 0, 0, 0, 12, 14, 14, 12, 12, 0, 0, 14, 0, 0, 14, 3, 1, 0, 0, 14, 3, 0, 14, 1, 0, 1, 0, 14, 0, 0, 0, 0, 8, 0, 14, 14, 0, 0, 14, 14, 0, 0, 0, 3, 0, 14, 0, 0, 0, 14, 14, 12, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 3, 0, 1, 0, 3, 0, 12, 0, 14, 0, 12, 14, 0, 0, 12, 14, 14, 3, 5, 1, 5, 14, 14, 0, 12, 14, 14, 14, 0, 0, 1

In [23]:
# Put the class label back next to the discretized attributes, then save the
# combined table as CSV without the row index.
tmp = pd.concat(
    [pima_disc_15intervals, y_list],
    axis=1,
)
tmp.to_csv('chim_pima_15int.csv', index=False)

In [24]:
# Display the exported table: discretized attributes plus the 'Outcome' label.
tmp

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,8,7,6,0,9,11,11,1
1,1,2,6,5,0,5,9,3,0
2,8,14,6,0,0,3,11,3,1
3,1,2,6,4,9,5,2,0,0
4,0,8,1,6,14,10,14,3,1
...,...,...,...,...,...,...,...,...,...
763,10,3,9,7,14,7,2,13,0
764,2,5,7,5,0,9,9,2,0
765,5,5,7,4,11,4,4,2,0
766,1,7,5,0,0,6,9,9,1
