In [97]:
import pandas as pd
import numpy as np
import heapq
import pdb
from scipy.stats import chi2_contingency
from collections import defaultdict

In [98]:
# Load the whitespace-delimited sample data set.
# `delim_whitespace=True` is deprecated since pandas 2.2; `sep=r"\s+"` is the
# documented equivalent and parses the file the same way.
df = pd.read_csv("tests/data-tiniest2/data.txt", sep=r"\s+")

In [99]:
# Show the column labels of the loaded frame.
df.columns

Index(['target', 'num1', 'num2', 'num3'], dtype='object')

In [100]:
# Exploratory check: 25th/50th/75th/100th percentile edges of every column.
# The edges are collected per column; previously each iteration overwrote
# `quartile_edges`, so only the last column's edges survived the loop.
quartile_edges_by_col = {}
for col in df.columns:
    quartile_edges = np.percentile(
        df[col].to_numpy(), [25, 50, 75, 100], method='linear')
    quartile_edges_by_col[col] = quartile_edges

quartile_edges_by_col

In [101]:
# Print the dtype and non-null count of each column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60 entries, 0 to 59
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   target  60 non-null     float64
 1   num1    60 non-null     float64
 2   num2    60 non-null     int64  
 3   num3    60 non-null     float64
dtypes: float64(3), int64(1)
memory usage: 2.0 KB


In [102]:
## Optional fixture: run the cells below on a single node's subset
# With NODE_2 = True, `df` is rebound to the rows where num2 <= 5.5 and the
# frame it previously referred to is kept in `old_df`.
# NOTE: re-running this cell with NODE_2 = True rebinds `old_df` to the
# already-filtered frame, so reload the data before toggling the flag.
NODE_2 = False
if NODE_2:
    old_df = df
    df = old_df[old_df['num2'].le(5.5)]

In [103]:
## Variable selection
def _calc_chi2_stat(y_mean, col) -> np.float64:
    """ Split numeric into 4 quartiles, split categoricals into c bins
    Calculate chi2_contingency and return p-value """
    # @NOTE: can we use pvalues as is or do I need to use modified wilson-hilferty to 
    # get the chi-squared degree 1 and rank the variables like that? The 2002 regression
    # paper says to sort based on p-values but the tutorial video part 1 says to rank
    # based on the chi-squared degree of freedom 1. Note the video is 20 years older
    # than the paper.
    residuals = df.target - y_mean
    pvalue = 999.99

    # Convert the column to a NumPy array
    column_array = df[col].values
    indexes = df[col].index.values
    # Bin the quartiles
    quartile_edges = np.percentile(
        column_array, [25, 50, 75, 100], method='linear')
    # Bin the data using np.digitize
    quartile_bins = np.digitize(
        column_array, quartile_edges, right=True)
    # Create a defaultdict to store grouped indexes
    grouped_indexes = defaultdict(list)

    # Iterate through the bins and indexes arrays
    for bin_value, index in zip(quartile_bins, indexes):
        grouped_indexes[bin_value].append(index)

    grouped_index_keys = list(grouped_indexes.keys())
    num_groups = len(grouped_indexes.keys())
    chi_squared = np.zeros(shape=(2, num_groups))
    for _bin in range(0, num_groups):
        chi_squared[0, _bin] = (
            residuals[grouped_indexes[grouped_index_keys[_bin]]] >= 0).sum()
        chi_squared[1, _bin] = (
            residuals[grouped_indexes[grouped_index_keys[_bin]]] < 0).sum()
    print(chi_squared)
    statistic = chi2_contingency(chi_squared).statistic
    dof = chi2_contingency(chi_squared).dof
    print(f"statistic, dof = {statistic} {dof}")
    return statistic
            


In [104]:
## _get_best_variable function
split_vars = ['num1', 'num2', 'num3']
node_y_mean = df.target.mean()
# Node residuals, kept as a notebook-level name; the ranking below does not
# read it (_calc_chi2_stat recomputes the residuals itself).
residuals = df.target - node_y_mean

# Layout of the 2 x k contingency table behind each statistic:
#
# numerical val          | 0 to 0.25   | 0.25 to 0.50 | 0.50 to 0.75 | 0.75 - 1.0 |
#                   pos
#                   neg
#
# categorical val
#                        |   cat1     |   cat2       |    cat3      |   NA       | ... etc
#                   pos
#                   neg

# NOTE: despite the name, the values are chi-squared statistics, not p-values.
stat_pval = {
    col: _calc_chi2_stat(y_mean=node_y_mean, col=col) for col in split_vars}

# A larger chi-squared statistic means a stronger association, so the best
# candidates are the three LARGEST values. nlargest returns (name, statistic)
# pairs already sorted in descending order of the statistic.
top_3_keys = heapq.nlargest(3, stat_pval.items(), key=lambda item: item[1])


[[ 1.  5. 10. 15.]
 [14. 10.  5.  0.]]
statistic, dof = 29.566184649610676 3
[[ 9.  5. 12.  5.]
 [ 7.  9.  2. 11.]]
statistic, dof = 10.730970920069918 3
[[8. 7. 8. 8.]
 [8. 9. 6. 6.]]
statistic, dof = 0.7556014619418404 3


In [86]:
# Reference ranking pasted for comparison with `top_3_keys` below. The
# reference values are 1-df chi-squared values, whereas the statistics
# printed by _calc_chi2_stat report dof = 3, so the magnitudes are not
# directly comparable; only the ordering (num1, num2, num3) agrees.
"""
Top-ranked variables and 1-df chi-squared values at root node
      1  0.2245E+02   num1
      2  0.6132E+01   num2
      3  0.3108E-01   num3
"""

top_3_keys

[('num1', 29.566184649610676),
 ('num2', 10.730970920069918),
 ('num3', 0.7556014619418404)]

In [75]:
# Candidate split points for the top-ranked variable: the midpoint between
# each pair of consecutive distinct values of the column, in ascending order.
col = split_var = top_3_keys[0][0]
x_uniq = df[col].drop_duplicates().sort_values()
# Each midpoint is the lower value plus half the gap to the next value.
half_gaps = 0.5 * np.diff(x_uniq)
cutpoints = x_uniq.iloc[:-1] + half_gaps


In [76]:
# Disabled draft of the split-point search, kept as a bare string literal so
# it does not execute (the cell's output is just the string's repr).
# For each candidate in `cutpoints` the draft sends rows with df[col] > cut to
# the right and the rest to the left, scores the split as
#     nAL * nAR / (nAL + nAR) * (left_mean - right_mean)**2
# and keeps the cut with the largest score in `best_cut`.
# If this is re-enabled, compare with `is None` rather than `== None`.
"""
greatest_tot_sse = None
node_sse = ((df.target - df.target.mean())**2).sum()
best_cut = None

for cut in cutpoints:
    right_idx = df[df[col] > cut].index
    left_idx = df.drop(right_idx, axis=0).index
    left_mean = df.loc[left_idx].target.mean()
    right_mean = df.loc[right_idx].target.mean()
    tot_items = len(left_idx) + len(right_idx)
    #left_sse = ((df.loc[left_idx].target - left_mean)**2).sum()
    #right_sse = ((df.loc[right_idx].target - right_mean)**2).sum()
    nAL = len(left_idx)
    nAR = len(right_idx)
    cut_sse = (nAL * nAR / tot_items) * (left_mean - right_mean)**2
    #weights = 1, len(left_idx) / tot_items, len(right_idx) / tot_items
    #cut_sse = weights[0]*node_sse - weights[1]*left_sse - weights[2]*right_sse
    print(cut, cut_sse)
    if greatest_tot_sse == None or cut_sse > greatest_tot_sse:
        greatest_tot_sse = cut_sse
        best_cut = cut
"""
        

'\ngreatest_tot_sse = None\nnode_sse = ((df.target - df.target.mean())**2).sum()\nbest_cut = None\n\nfor cut in cutpoints:\n    right_idx = df[df[col] > cut].index\n    left_idx = df.drop(right_idx, axis=0).index\n    left_mean = df.loc[left_idx].target.mean()\n    right_mean = df.loc[right_idx].target.mean()\n    tot_items = len(left_idx) + len(right_idx)\n    #left_sse = ((df.loc[left_idx].target - left_mean)**2).sum()\n    #right_sse = ((df.loc[right_idx].target - right_mean)**2).sum()\n    nAL = len(left_idx)\n    nAR = len(right_idx)\n    cut_sse = (nAL * nAR / tot_items) * (left_mean - right_mean)**2\n    #weights = 1, len(left_idx) / tot_items, len(right_idx) / tot_items\n    #cut_sse = weights[0]*node_sse - weights[1]*left_sse - weights[2]*right_sse\n    print(cut, cut_sse)\n    if greatest_tot_sse == None or cut_sse > greatest_tot_sse:\n        greatest_tot_sse = cut_sse\n        best_cut = cut\n'