In [37]:
from IPython.display import display
# Notebook banner: assignment title, author and date. Each call hands display()
# a ready-made MIME dictionary (HTML heading plus plain-text fallback), which
# is why raw=True is passed.
display(
    {
        'text/html': '<h1>Topic1 Assignment</h1>',
        'text/plain': 'Topic1 Assignment',
    },
    raw=True,
)
display(
    {
        'text/html': '<h1>Anoop Korappath</h1>',
        'text/plain': 'Anoop Korappath',
    },
    raw=True,
)
display(
    {
        'text/html': '<h1>05/04/2022</h1>',
        'text/plain': '05/04/2022',
    },
    raw=True,
)

#### Import Libraries

In [38]:
import pandas as pd
import numpy as np
import math

#### Read data set

In [39]:
# The CSV is read without a header row (header=None), so the five columns are
# labelled here: four numeric measurements followed by the class label.
data_iris = pd.read_csv('iris.csv', header=None).set_axis(
    [
        'sepal_length',
        'sepal_width',
        'petal_length',
        'petal_width',
        'target_class',
    ],
    axis='columns',
)


#### Chi Merge function

In [40]:
def do_chimerge(feature, data, max_interval):
    """Discretize one numeric attribute with ChiMerge and print the result.

    The data is sorted by ``feature``, a frequency table with one row per
    distinct value is built, and neighbouring rows are merged (lowest
    chi-square first) until at most ``max_interval`` rows remain. The lower
    bound of every remaining row is printed as a split point, followed by the
    closed intervals ``[lower,upper]``.

    Parameters
    ----------
    feature : str
        Name of the column in ``data`` to discretize.
    data : pandas.DataFrame
        Must contain ``feature`` and the ``target_class`` column read by
        ``calculate_contingency_matrix``.
    max_interval : int
        Merging stops once the table has this many rows or fewer. Because all
        pairs sharing the lowest chi-square are merged in the same pass, the
        final number of intervals can be smaller than ``max_interval``.

    Returns
    -------
    None
        Results are written to stdout only.
    """
    df = data.sort_values(by=[feature], ascending=True).reset_index()

    # Frequency table: initially every distinct value is its own interval.
    contingency_matrix = calculate_contingency_matrix(df, feature)
    num_intervals = contingency_matrix.shape[0]

    # A single row cannot be merged any further, so also stop at one row;
    # otherwise max_interval < 1 would loop forever.
    while num_intervals > max_interval and num_intervals > 1:
        # Chi-square for every pair of neighbouring rows.
        chi2_df = update_chi2(contingency_matrix, feature)

        # Merge the neighbouring rows that share the lowest chi-square.
        contingency_matrix = perform_merge(chi2_df, feature)

        if contingency_matrix.shape[0] >= num_intervals:
            # No rows were merged in this pass; bail out instead of spinning.
            break
        num_intervals = contingency_matrix.shape[0]

    lower_bounds = contingency_matrix[feature].tolist()
    sorted_values = df[feature]

    print('The split points for '+feature+' are:')
    for lower in lower_bounds:
        print(lower)

    print('The final intervals for '+feature+' are:')
    last_position = len(lower_bounds) - 1
    for position, lower in enumerate(lower_bounds):
        if position != last_position:
            # Upper bound = largest observed value strictly below the next
            # split point (values are sorted, so it is the last match).
            below_next = sorted_values[sorted_values < lower_bounds[position + 1]]
            upper = below_next.iloc[-1]
        else:
            # The last interval extends to the maximum observed value.
            upper = sorted_values.iloc[-1]
        print("["+str(lower)+","+str(upper)+"]")
    print(" ")

#### Function to calculate the frequency table

In [41]:
def calculate_contingency_matrix(dataframe, feature):
    """Build the class-frequency table for one attribute.

    For every distinct value of ``dataframe[feature]`` (ascending order) one
    row is produced holding the value itself, the number of rows of each of
    the three iris classes found in ``target_class``, and a ``chi2``
    placeholder initialised to 0.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain ``feature`` and ``target_class`` columns.
    feature : str
        Column whose distinct values become the rows of the table.

    Returns
    -------
    pandas.DataFrame
        Columns ``[feature, 'Iris-setosa', 'Iris-versicolor',
        'Iris-virginica', 'chi2']`` with row labels 0..k-1.
    """
    class_labels = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
    frequency_table = pd.DataFrame(columns=[feature] + class_labels + ['chi2'])

    for value in sorted(set(dataframe[feature])):
        # Class distribution among the rows carrying this attribute value.
        is_match = dataframe[feature] == value
        tally = dataframe.loc[is_match, "target_class"].value_counts().to_dict()

        # Classes that never occur for this value count as zero.
        class_counts = [tally.get(label, 0) for label in class_labels]
        frequency_table.loc[len(frequency_table)] = [value] + class_counts + [0]

    return frequency_table

#### Function to calculate chi square values for each row pair

In [42]:
def update_chi2(contingency_matrix, feature):
    """Store the chi-square of each row and its successor in the ``chi2`` column.

    Row ``i`` receives the chi-square statistic of the 2x3 table formed by the
    class counts of rows ``i`` and ``i + 1``. The last row has no successor,
    so its ``chi2`` value is left as it was.

    The column is assigned in one step. The previous
    ``frame.loc[i]['chi2'] = value`` form is chained assignment, which writes
    to a temporary row (and silently loses the value) whenever pandas returns
    a copy, e.g. for mixed-dtype frames or with Copy-on-Write enabled.

    Parameters
    ----------
    contingency_matrix : pandas.DataFrame
        Table with ``'Iris-setosa'``, ``'Iris-versicolor'``,
        ``'Iris-virginica'`` and ``'chi2'`` columns. Modified in place.
    feature : str
        Unused here; kept so the signature matches the other helpers.

    Returns
    -------
    pandas.DataFrame
        The same ``contingency_matrix`` object, with ``chi2`` updated.
    """
    class_columns = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
    n_rows = contingency_matrix.shape[0]

    # Fewer than two rows means there is no neighbouring pair to score.
    if n_rows < 2:
        return contingency_matrix

    counts = contingency_matrix[class_columns].to_numpy(dtype=float)

    # counts[i:i + 2] is the 2x3 observed table for rows i and i + 1.
    chi2_values = [calculate_chi2(counts[i:i + 2]) for i in range(n_rows - 1)]

    # Keep whatever the last row already holds; it has no successor.
    chi2_values.append(contingency_matrix['chi2'].iloc[-1])

    contingency_matrix['chi2'] = chi2_values
    return contingency_matrix


#### Chi-square function

In [43]:
def calculate_chi2(array):
    """Return the Pearson chi-square statistic of a 2-D table of counts.

    For every cell the expected count is ``row_total * column_total / total``
    and ``(observed - expected)**2 / expected`` is accumulated. Cells whose
    expected count is zero contribute nothing, which avoids dividing by zero.

    Parameters
    ----------
    array : numpy.ndarray
        Two-dimensional array of observed frequencies.

    Returns
    -------
    float
        The chi-square statistic (``0`` for a table without any cells).
    """
    n_rows = array.shape[0]
    n_cols = array.shape[1]
    grand_total = float(array.sum())  # number of observations in the table

    # Marginal totals.
    row_totals = [array[r].sum() for r in range(n_rows)]
    col_totals = [array[:, c].sum() for c in range(n_cols)]

    statistic = 0
    for r, row_total in enumerate(row_totals):
        for c, col_total in enumerate(col_totals):
            expected = row_total * col_total / grand_total
            observed = array[r, c]
            if expected == 0.:
                # An empty row or column: skip the cell instead of dividing by zero.
                statistic += 0.
            else:
                statistic += math.pow((observed - expected), 2) / float(expected)

    return statistic

#### Function to merge rows based on least chi square values

In [44]:
def perform_merge(df, feature):
    """Merge neighbouring rows whose pairwise chi-square is the lowest.

    ``df['chi2'][i]`` is taken to be the statistic of rows ``i`` and ``i + 1``;
    the value on the last row is ignored because that row has no successor.
    Every row whose predecessor carries the minimum chi-square is folded into
    the interval currently being built: its class counts are added and the
    interval keeps the ``feature`` value (lower bound) of its first row.
    Consecutive minimum pairs therefore collapse into a single interval.

    The result is assembled from plain lists and turned into a DataFrame
    once. The previous ``frame.loc[i]['col'] += value`` form is chained
    assignment, which can update a temporary copy and drop the merged counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with ``feature``, ``'Iris-setosa'``, ``'Iris-versicolor'``,
        ``'Iris-virginica'`` and ``'chi2'`` columns, ordered by ``feature``.
        It is not modified.
    feature : str
        Name of the attribute column holding the interval lower bounds.

    Returns
    -------
    pandas.DataFrame
        New table with the same columns and row labels 0..k-1. Class counts
        are returned as floats and ``chi2`` is reset to ``0.`` on every row.
    """
    class_columns = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
    col_names = [feature] + class_columns + ['chi2']

    lower_bounds = df[feature].tolist()
    class_counts = df[class_columns].to_numpy(dtype=float).tolist()
    chi2_values = df['chi2'].tolist()

    # The last row's chi2 does not describe a pair, so leave it out of the minimum.
    lowest_chi2 = min(chi2_values[:-1]) if len(chi2_values) > 1 else None

    merged_rows = []
    for position, (lower, counts) in enumerate(zip(lower_bounds, class_counts)):
        if position > 0 and chi2_values[position - 1] == lowest_chi2:
            # Fold this row into the interval currently being built.
            current = merged_rows[-1]
            for offset, count in enumerate(counts, start=1):
                current[offset] += count
        else:
            # Start a new interval; stale chi2 values are discarded.
            merged_rows.append([lower] + counts + [0.])

    return pd.DataFrame(merged_rows, columns=col_names)


#### Initialization

In [45]:
if __name__ == '__main__':
    # Run ChiMerge on each of the four numeric attributes, stopping once at
    # most six intervals remain for that attribute.
    for feature in ('sepal_length', 'sepal_width', 'petal_length', 'petal_width'):
        do_chimerge(feature=feature, data=data_iris, max_interval=6)



The split points for sepal_length are:
4.3
4.9
5.0
5.5
5.8
7.1
The final intervals for sepal_length are:
[4.3,4.8]
[4.9,4.9]
[5.0,5.4]
[5.5,5.7]
[5.8,7.0]
[7.1,7.9]
 
The split points for sepal_width are:
2.0
2.3
2.5
2.9
3.0
3.4
The final intervals for sepal_width are:
[2.0,2.2]
[2.3,2.4]
[2.5,2.8]
[2.9,2.9]
[3.0,3.3]
[3.4,4.4]
 
The split points for petal_length are:
1.0
3.0
4.5
4.8
5.0
5.2
The final intervals for petal_length are:
[1.0,1.9]
[3.0,4.4]
[4.5,4.7]
[4.8,4.9]
[5.0,5.1]
[5.2,6.9]
 
The split points for petal_width are:
0.1
1.0
1.4
1.7
1.8
1.9
The final intervals for petal_width are:
[0.1,0.6]
[1.0,1.3]
[1.4,1.6]
[1.7,1.7]
[1.8,1.8]
[1.9,2.5]
 
