In [268]:
import pandas as pd

In [269]:
# Let pandas render up to 500 rows before truncating displayed frames.
pd.options.display.max_rows = 500

In [270]:
# Read the CSV into `df`, then print its column / non-null / dtype summary.
df = pd.read_csv(filepath_or_buffer="../Source/Clean/2025-08.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3120 entries, 0 to 3119
Data columns (total 54 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   manufacturer                      3120 non-null   object 
 1   name                              3120 non-null   object 
 2   gpu_name                          3120 non-null   object 
 3   generation                        3120 non-null   object 
 4   base_clock_mhz                    3120 non-null   float64
 5   boost_clock_mhz                   3120 non-null   float64
 6   architecture                      3092 non-null   object 
 7   foundry                           2995 non-null   object 
 8   process_size_nm                   3116 non-null   float64
 9   transistor_count_m                2877 non-null   float64
 10  transistor_density_k_mm2          2846 non-null   float64
 11  die_size_mm2                      2912 non-null   float64
 12  chip_p

In [271]:
# Column labels of `df` that contain at least one missing value.
nan_columns = df.columns[df.isna().any(axis=0)]

# One print call with a newline separator: header line, then the Index repr.
print("Columns with NaN values:", nan_columns, sep="\n")

Columns with NaN values:
Index(['architecture', 'foundry', 'process_size_nm', 'transistor_count_m',
       'transistor_density_k_mm2', 'die_size_mm2', 'chip_package',
       'release_date', 'bus_interface', 'memory_clock_mhz', 'memory_bus_bits',
       'memory_bandwidth_gb_s', 'thermal_design_power_w', 'board_length_mm',
       'board_width_mm', 'board_slot_width', 'suggested_psu_w',
       'power_connectors', 'display_connectors', 'directx_major_version',
       'directx_minor_version', 'opengl_major_version', 'opengl_minor_version',
       'vulkan_major_version', 'vulkan_minor_version', 'opencl_major_version',
       'opencl_minor_version', 'cuda_major_version', 'cuda_minor_version',
       'shader_model_major_version', 'shader_model_minor_version',
       'half_float_performance_gflop_s', 'single_float_performance_gflop_s',
       'double_float_performance_gflop_s'],
      dtype='object')


In [272]:
def get_association_len(df, parent, child):
    """Count the `child` values associated with more than one `parent` value.

    Rows are grouped by `child`; a group is counted when its `parent` column
    holds more than one distinct value according to ``nunique`` (which, by
    pandas default, does not count missing values).

    Args:
        df: DataFrame containing both columns.
        parent: Name of the column whose distinct values are counted per group.
        child: Name of the column to group by.

    Returns:
        int: Number of `child` groups with more than one distinct `parent` value.
    """
    # Only the per-group distinct counts are needed for the tally, so there is
    # no need to also collect the unique values or build an intermediate frame.
    distinct_per_group = df.groupby([child])[parent].nunique()
    return int((distinct_per_group > 1).sum())

In [273]:
def get_association_df(df, parent, child):
    """Return the `child` values that map to more than one `parent` value.

    Args:
        df: DataFrame containing both columns.
        parent: Name of the column whose values are collected per group.
        child: Name of the column to group by.

    Returns:
        DataFrame indexed by `child` value with two columns: one named after
        `parent` holding the group's unique values, and ``'count'`` holding
        the group's ``nunique`` result. Only rows with ``count > 1`` are kept.
    """
    per_child = df.groupby([child])[parent]
    summary = pd.DataFrame(
        data={
            f'{parent}': per_child.unique(),
            'count': per_child.nunique(),
        }
    )
    # Keep only the groups that are ambiguous, i.e. have several parent values.
    is_ambiguous = summary['count'] > 1
    return summary[is_ambiguous]

In [274]:
def get_association_counts(df, first, second):
    """Summarise, per column, its association counts against two key columns.

    For every column of `df` other than `first` and `second`, records the
    result of ``get_association_len(df, column, first)`` and
    ``get_association_len(df, column, second)``, plus whether the column
    contains any null value.

    Args:
        df: DataFrame to analyse.
        first: Name of the first column to group by.
        second: Name of the second column to group by.

    Returns:
        DataFrame with one row per analysed column and the columns
        ``'column'``, ``f'{first}_count'``, ``f'{second}_count'`` and
        ``'contains_null'``.
    """
    first_count_col = f'{first}_count'
    second_count_col = f'{second}_count'
    # dict.fromkeys de-duplicates while preserving order, so passing the same
    # name for `first` and `second` still yields a single count column.
    output_cols = list(
        dict.fromkeys(['column', first_count_col, second_count_col, 'contains_null'])
    )

    # Collect plain records and build the frame once, instead of enlarging an
    # empty DataFrame one row at a time.
    records = []
    for column in df.columns:
        if column == first or column == second:
            continue
        records.append({
            'column': column,
            first_count_col: get_association_len(df, column, first),
            second_count_col: get_association_len(df, column, second),
            'contains_null': bool(df[column].isnull().any()),
        })

    return pd.DataFrame(records, columns=output_cols)

In [275]:
def get_association_counts_generalized(df, target_columns):
    """Summarise, per column, its association counts against several targets.

    Target names that are not columns of `df` are ignored. For every
    remaining (non-target) column, one row is produced holding
    ``get_association_len(df, column, target)`` for each target and a flag
    telling whether the column contains any null value.

    Args:
        df: DataFrame to analyse.
        target_columns: Iterable of column names to group by.

    Returns:
        DataFrame with the columns ``'column'``, one ``f'{target}_count'``
        per retained target (in the given order), and ``'contains_null'``.
    """
    # Drop requested targets that the frame does not actually have.
    targets = [name for name in target_columns if name in df.columns]

    count_col_for = {target: f'{target}_count' for target in targets}
    output_cols = ['column', *[f'{target}_count' for target in targets], 'contains_null']

    rows = [
        {
            'column': column,
            **{
                count_col_for[target]: get_association_len(df, column, target)
                for target in targets
            },
            'contains_null': df[column].isnull().any(),
        }
        for column in df.columns
        if column not in targets
    ]

    return pd.DataFrame(rows, columns=output_cols)

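**Build association counts against the product-identifying columns (architecture, GPU name, generation, manufacturer, product name) to check which attributes are unique to a product name**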

In [276]:
# For every other column, tally how many values of each identifier column
# are associated with more than one distinct value of that column.
get_association_counts_generalized(
    df,
    target_columns=['architecture', 'gpu_name', 'generation', 'manufacturer', 'name'],
)

Unnamed: 0,column,architecture_count,gpu_name_count,generation_count,manufacturer_count,name_count,contains_null
0,base_clock_mhz,69,319,420,8,139,False
1,boost_clock_mhz,76,354,448,8,161,False
2,foundry,14,2,51,5,10,True
3,process_size_nm,35,1,136,8,46,True
4,transistor_count_m,47,0,268,8,93,True
5,transistor_density_k_mm2,48,1,271,7,97,True
6,die_size_mm2,52,1,279,7,104,True
7,chip_package,26,0,155,4,54,True
8,release_date,75,327,379,8,139,True
9,bus_interface,57,170,194,7,36,True
