# Categorical feature selection by comparing chi-squared test p-values across every pair of categorical variables

In [53]:
# Autoreload allows you to refresh methods from imports without re-running this cell.
%load_ext autoreload
%autoreload 2

# Import the required functions from eda_helpers.py.
from eda_helpers import col_details

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [54]:
from typing import List

import numpy as np
import pandas as pd
import plotly.graph_objects as go
from scipy.stats import chi2_contingency

def test_table(feature1: str, feature2: str, df: pd.DataFrame) -> List[List[float]]:
    '''
    Generates a sub-table with categories of feature1 as rows and feature 2 as cols
    '''
    # Create a contingency table
    contingency_table = pd.crosstab(df[feature1], df[feature2])

    # Get the observed frequencies
    observed_frequencies = contingency_table.values

    # Calculate the chi-squared statistic
    chi2_stat, p_val, dof, expected = chi2_contingency(observed_frequencies)

    # Return the chi-squared statistic
    return p_val
    

In [55]:
# Read the public survey results CSV into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer='stack-overflow-developer-survey-2024/survey_results_public.csv'
)
# Show the helper's per-column overview (judging by the output below: dtype
# plus value counts for each column).
col_details(df)


- ResponseId: int64
- MainBranch: object (5 unique values)
  - I am a developer by profession (str): 50207
  - I am not primarily a developer, but I write code sometimes as part of my work/studies (str): 6511
  - I am learning to code (str): 3875
  - I code primarily as a hobby (str): 3334
  - I used to be a developer by profession, but no longer am (str): 1510
- Age: object (8 unique values)
  - 25-34 years old (str): 23911
  - 35-44 years old (str): 14942
  - 18-24 years old (str): 14098
  - 45-54 years old (str): 6249
  - 55-64 years old (str): 2575
  - Under 18 years old (str): 2568
  - 65 years or older (str): 772
  - Prefer not to say (str): 322
- Employment: object (110 unique values)
  - Employed, full-time (str): 39041
  - Independent contractor, freelancer, or self-employed (str): 4846
  - Student, full-time (str): 4709
  - Employed, full-time;Independent contractor, freelancer, or self-employed (str): 3557
  - Not employed, but looking for work (str): 2341
  - Employed, part

In [56]:
# Keep the names of the columns whose dtype is object or category; these are
# the columns treated as categorical in the pairwise tests.
cat_cols = list(
    filter(lambda name: df[name].dtype in ['object', 'category'], df.columns)
)
print(len(cat_cols))
print(cat_cols)

100
['MainBranch', 'Age', 'Employment', 'RemoteWork', 'Check', 'CodingActivities', 'EdLevel', 'LearnCode', 'LearnCodeOnline', 'TechDoc', 'YearsCode', 'YearsCodePro', 'DevType', 'OrgSize', 'PurchaseInfluence', 'BuyNewTool', 'BuildvsBuy', 'TechEndorse', 'Country', 'Currency', 'LanguageHaveWorkedWith', 'LanguageWantToWorkWith', 'LanguageAdmired', 'DatabaseHaveWorkedWith', 'DatabaseWantToWorkWith', 'DatabaseAdmired', 'PlatformHaveWorkedWith', 'PlatformWantToWorkWith', 'PlatformAdmired', 'WebframeHaveWorkedWith', 'WebframeWantToWorkWith', 'WebframeAdmired', 'EmbeddedHaveWorkedWith', 'EmbeddedWantToWorkWith', 'EmbeddedAdmired', 'MiscTechHaveWorkedWith', 'MiscTechWantToWorkWith', 'MiscTechAdmired', 'ToolsTechHaveWorkedWith', 'ToolsTechWantToWorkWith', 'ToolsTechAdmired', 'NEWCollabToolsHaveWorkedWith', 'NEWCollabToolsWantToWorkWith', 'NEWCollabToolsAdmired', 'OpSysPersonal use', 'OpSysProfessional use', 'OfficeStackAsyncHaveWorkedWith', 'OfficeStackAsyncWantToWorkWith', 'OfficeStackAsyncAdmir

## Run this on a subset of the columns

In [81]:
from collections import Counter
import pprint

# Significance threshold applied to every pairwise chi-squared p-value.
# NOTE(review): no multiple-comparison correction is applied, and with a
# large sample almost any dependence is significant; an effect size such as
# Cramer's V may discriminate better between features.
alpha = 0.05

# Restrict this run to the first few categorical columns (never more than
# are actually available, so the matrix and the labels always line up).
n = min(10, len(cat_cols))
selected_cols = cat_cols[:n]

# For each feature, the number of other features it is significantly
# associated with.
correlated_counts = Counter()

# Holds 1 - p for every pair. NaN marks cells that were never tested (the
# diagonal), so they are not mistaken for "no association".
correlation_matrix = np.full((n, n), np.nan)

for i in range(n):
    for j in range(i + 1, n):
        col1, col2 = selected_cols[i], selected_cols[j]
        chi2_p = test_table(col1, col2, df)
        if chi2_p < alpha:
            correlated_counts[col1] += 1
            correlated_counts[col2] += 1
        correlation_likelihood = np.float64(1) - chi2_p
        # The test is symmetric in its two features, so mirror the value
        # instead of leaving the lower triangle at a misleading 0.
        correlation_matrix[i, j] = correlation_likelihood
        correlation_matrix[j, i] = correlation_likelihood

fig = go.Figure(data=go.Heatmap(
                   z=correlation_matrix,
                   x=selected_cols,
                   y=selected_cols,
                   colorscale='Viridis'))

fig.update_layout(
    title_text='Correlation Matrix',
    xaxis_nticks=36,
    yaxis_nticks=36,
    font=dict(
        family="Courier New, monospace",
        size=18,
        color="RebeccaPurple"
    ),
    # One text label per tested cell; the untested diagonal is skipped.
    annotations=[
        dict(
            text=str(round(correlation_matrix[i, j], 2)),
            x=selected_cols[j],
            y=selected_cols[i],
            showarrow=False,
            font_size=14
        ) for i in range(n) for j in range(n) if i != j
    ]
)

fig.show()

# Display categorical variables that are closely correlated with the most number of other variables
k = 20
# most_common() with no argument returns every (feature, count) pair sorted by
# count, highest first.
sorted_correlated_counts = correlated_counts.most_common()
print(f'Displaying the {k} categorical features that are the most closely correlated with other categorical features')
print('<FeatureName>, <CountOtherCorrelatedFeatures>')
pprint.pprint(sorted_correlated_counts[:k])

Displaying the 20 categorical features that are the most closely correlated with other categorical features
<FeatureName>, <CountOtherCorrelatedFeatures>
[('MainBranch', 8),
 ('Age', 8),
 ('Employment', 8),
 ('CodingActivities', 8),
 ('EdLevel', 8),
 ('LearnCode', 8),
 ('TechDoc', 8),
 ('RemoteWork', 7),
 ('LearnCodeOnline', 7)]


## Run this on all the columns

In [85]:
from collections import Counter
import pprint

# Significance threshold applied to every pairwise chi-squared p-value.
# NOTE(review): no multiple-comparison correction is applied, and with a
# large sample almost any dependence is significant; an effect size such as
# Cramer's V may discriminate better between features.
alpha = 0.05

# Use every categorical column for this run.
n = len(cat_cols)
selected_cols = cat_cols[:n]

# For each feature, the number of other features it is significantly
# associated with.
correlated_counts = Counter()

# Holds 1 - p for every pair. NaN marks cells that were never tested (the
# diagonal), so they are not mistaken for "no association".
correlation_matrix = np.full((n, n), np.nan)

for i in range(n):
    for j in range(i + 1, n):
        col1, col2 = selected_cols[i], selected_cols[j]
        chi2_p = test_table(col1, col2, df)
        if chi2_p < alpha:
            correlated_counts[col1] += 1
            correlated_counts[col2] += 1
        correlation_likelihood = np.float64(1) - chi2_p
        # The test is symmetric in its two features, so mirror the value
        # instead of leaving the lower triangle at a misleading 0.
        correlation_matrix[i, j] = correlation_likelihood
        correlation_matrix[j, i] = correlation_likelihood

fig = go.Figure(data=go.Heatmap(
                   z=correlation_matrix,
                   x=selected_cols,
                   y=selected_cols,
                   colorscale='Viridis'))

fig.update_layout(
    title_text='Correlation Matrix',
    xaxis_nticks=36,
    yaxis_nticks=36,
    font=dict(
        family="Courier New, monospace",
        size=4,
        color="RebeccaPurple"
    ),
    # One text label per tested cell; the untested diagonal is skipped.
    annotations=[
        dict(
            text=str(round(correlation_matrix[i, j], 2)),
            x=selected_cols[j],
            y=selected_cols[i],
            showarrow=False,
            font_size=4
        ) for i in range(n) for j in range(n) if i != j
    ]
)

fig.show()

# Display categorical variables that are closely correlated with the most number of other variables
k = 20
# most_common() with no argument returns every (feature, count) pair sorted by
# count, highest first.
sorted_correlated_counts = correlated_counts.most_common()
print(f'Displaying the {k} categorical features that are the most closely correlated with other categorical features')
print('<FeatureName>, <CountOtherCorrelatedFeatures>')
pprint.pprint(sorted_correlated_counts[:k])

Displaying the 20 categorical features that are the most closely correlated with other categorical features
<FeatureName>, <CountOtherCorrelatedFeatures>
[('CodingActivities', 97),
 ('LearnCode', 97),
 ('ProfessionalQuestion', 97),
 ('Employment', 96),
 ('DevType', 96),
 ('TechEndorse', 96),
 ('Country', 96),
 ('OpSysPersonal use', 96),
 ('OpSysProfessional use', 96),
 ('NEWSOSites', 96),
 ('SOHow', 96),
 ('AIComplex', 96),
 ('BuyNewTool', 95),
 ('OfficeStackSyncHaveWorkedWith', 95),
 ('OfficeStackSyncWantToWorkWith', 95),
 ('AIBen', 95),
 ('AIChallenges', 95),
 ('SurveyLength', 95),
 ('AIAcc', 94),
 ('RemoteWork', 93)]


In [92]:
# Report how many categorical features were tested and how many of them had
# at least one significant pairing.
print(f'Total number of categorical features: {n}')
print(f'There are {len(sorted_correlated_counts)} features that are correlated with at least one other feature')

# Only features with at least one significant pairing appear in
# sorted_correlated_counts, so features with zero partners are not part of
# the statistics below.
counts = [partner_count for _, partner_count in sorted_correlated_counts]

if counts:
    mean = np.mean(counts)
    median = np.median(counts)
    q1 = np.percentile(counts, 25)
    q3 = np.percentile(counts, 75)
    iqr = q3 - q1
else:
    # Nothing to summarise: leave every statistic unset.
    mean = median = q1 = q3 = iqr = None

# Distribution of the per-feature partner counts.
fig = go.Figure(data=[go.Histogram(x=counts, nbinsx=10)])
fig.update_layout(
    title_text='Histogram of Correlated Feature Counts',
    xaxis_title_text='Count of Other Correlated Features',
    yaxis_title_text='Frequency'
)

print('Statistics for the number of features a given feature is correlated with:')
if counts:
    print(f"Mean: {mean}")
    print(f"Median: {median}")
    print(f"IQR: {q1} to {q3}")
fig.show()


Total number of categorical features: 100
There are 99 features that are correlated with at least one other feature
Statistics for the number of features a given feature is correlated with:
Mean: 82.60606060606061
Median: 85.0
IQR: 75.0 to 91.0
