**PROGRAM OVERVIEW**

The purpose of this program is to convert the transposed document labels, together with their embeddings, into a .gdf file (GDF, the GUESS graph format supported by Gephi). The .gdf file serves as the input for building a network-analysis graph in Gephi.

More details - https://github.com/jphall663/corr_graph/blob/master/csv2gdf.ipynb

Import the required **Libraries** and **Packages**

In [1]:
import re
#import time
#from IPython.display import Image # type: ignore
#from IPython.display import display # type: ignore
import numpy as np # type: ignore
import pandas as pd # type: ignore

Identify the features (labels)

In [2]:
# Set the directory path to the location where the .csv file containing the transposed embeddings is located.
# NOTE(review): this path is repeated as IN_FILE in the constants cell; keep the two in sync.
df = pd.read_csv('/Users/arnabraychaudhari/Documents/6317/Project_LLM_and_RAG_2024_GWU/Farm_Bill_label_with_transposed_embeddings.csv')

# Leading 'DocID_<n>_Label_' prefix for ANY positive document id, so the cell
# does not have to be edited when the number of columns in the .csv changes.
_DOC_LABEL_PREFIX = re.compile(r'DocID_[1-9][0-9]*_Label_')

# Step 2: Collect the label part of every prefixed column name
X_1 = []
for col in df.columns:
    prefix_match = _DOC_LABEL_PREFIX.match(col)
    if prefix_match:
        # Strip only the leading prefix; the rest of the name is the label
        X_1.append(col[prefix_match.end():])

# Step 3: Remove duplicates to create X_2
# dict.fromkeys keeps first-seen order, so X_2 is identical on every run
# (list(set(...)) would reshuffle the labels between kernel restarts).
X_2 = list(dict.fromkeys(X_1))
num_elements_X_2 = len(X_2)
# Display the lists
#print("X_1 (with possible duplicates):", X_1)
print(f"Number of unique elements in X_2: {num_elements_X_2}")
print("X_2 (unique values):", X_2)

Number of unique elements in X_2: 154
X_2 (unique values): ['cobank', 'cotton', 'proposals', '–', 'land', 'urban', 'fs', 'reenrollment', 'usda', '-400', 'renewable', 'veterinary', 'programs', 'federal', 'livestock', 'hazard', 'broadband', 'fcic', 'similar', 'duration', 'easement', 'csp', 'specified', 'coverage', 'survey', 'payments', 'labeling', 'provides', 'ce', 'conservation', 'technical', 'title', 'state', 'grants', 'farm', 'saf', 'epa', 'crp', 'oak', 'biobased', 'functions', '”', 'million', 'fifra', 'easements', 'acres', 'billion', 'dairy', 'partnerships', 'requires', 'request', 'cbo', 'would', 'per', 'task', 'u.s.c', 'ohs', 'report', 'projects', 'income', 'microloans', 'forest', 'honey', 'acep', 'property', 'appropriations', 'hemp', 'practices', 'insurance', 'senate', '400', 'institutions', 'base', 'water', 'allows', 'administrative', 'defines', 'species', '•', 'tsp', 'commodities', 'ffp', 'annually', 'rac', 'ventures', 'use', 'sugar', '2024', 'hazardous', 'park', 'payment', 'nati

Define user-supplied constants

In [3]:
# --- file locations ---------------------------------------------------------

# input .csv holding the labels with their transposed embeddings
IN_FILE = '/Users/arnabraychaudhari/Documents/6317/Project_LLM_and_RAG_2024_GWU/Farm_Bill_label_with_transposed_embeddings.csv'

# output .gdf destination
# WARNING: an existing file at this path is over-written!!
OUT_FILE = '/Users/arnabraychaudhari/Documents/6317/Project_LLM_and_RAG_2024_GWU/Farm_Bill_train.gdf'

# --- tuning knobs -----------------------------------------------------------

# a pair of variables becomes an edge in the gdf only when the absolute
# value of its Pearson correlation is above this threshold
CORR_THRESHOLD = 0.3

# a categorical variable with more distinct levels than this
# is not encoded and not written to the gdf
NUM_LEVELS_THRESHOLD = 25

# --- input variables --------------------------------------------------------

# the variables considered for encoding and writing to the gdf are the
# labels in X_2, built in the label-identification cell above

Utility function for writing gdf from Pearson correlation matrix

In [4]:
def write_gdf(corr_frame, out_file=None, corr_threshold=None):

    """ Writes a GDF suitable for use with Gephi: https://gephi.org/.

    Every column of corr_frame becomes a node whose name is its positional
    index and whose label is the column label. Every pair (i, j) with i > j
    whose absolute correlation is strictly above the threshold becomes an
    edge weighted by that absolute correlation.

    Args:
        corr_frame: Pandas DataFrame of pair-wise Pearson correlations.
        out_file: Path to write. Defaults to the notebook-level OUT_FILE.
            WARNING: an existing file at this path is over-written.
        corr_threshold: Absolute-correlation cut-off for writing an edge.
            Defaults to the notebook-level CORR_THRESHOLD.

    Return: Path of written file.

    """

    # fall back to the notebook constants so existing write_gdf(corr_frame)
    # calls behave exactly as before
    if out_file is None:
        out_file = OUT_FILE
    if corr_threshold is None:
        corr_threshold = CORR_THRESHOLD

    def _gdf_text(value):
        """ Renders a column label as one GDF text field. """
        text = str(value)  # labels may be non-strings (e.g. integer columns)
        if ',' in text or '\n' in text or '\r' in text:
            # a raw comma or line break would split the node row, so flatten
            # line breaks and wrap the field in double quotes
            # NOTE(review): embedded double quotes are backslash-escaped on
            # the assumption that Gephi's GDF importer accepts that - confirm
            text = text.replace('\r', ' ').replace('\n', ' ')
            text = '"' + text.replace('"', '\\"') + '"'
        return text

    n_rows, n_cols = corr_frame.shape

    with open(out_file, 'w', encoding='utf-8') as out:

        # write node list
        out.write('nodedef>name VARCHAR,label VARCHAR\n')
        for i in range(n_rows):
            out.write(str(i) + ',' + _gdf_text(corr_frame.columns[i]) + '\n')

        # write edge list
        # edge weight is absolute Pearson correlation
        out.write('edgedef>node1 VARCHAR,node2 VARCHAR, weight DOUBLE\n')
        for i in range(n_rows):
            # only j < i: each unordered pair once, no self-loops
            for j in range(min(i, n_cols)):
                ij_ = np.abs(corr_frame.iat[i, j])
                # NaN correlations compare False here and are skipped
                if ij_ > corr_threshold:
                    out.write(str(i) + ',' + str(j) + ',' + str(ij_) + '\n')

    return out_file

**MAIN ROUTINE**

The number of iterations needs to be adjusted in accordance with the number of columns in the transposed .csv file.

In [5]:
def csv2gdf():

    """ Encodes categorical variables, calculates Pearson correlations,
        and calls write_gdf.

    Reads IN_FILE, strips the leading 'DocID_<n>_Label_' prefix from the
    column names, keeps the columns listed in X_2, one-hot encodes the
    object-typed columns according to their number of distinct levels,
    and writes the correlation graph through write_gdf.

    Object-typed columns are handled by level count (NaN counts as a level):
        1 level                         -> dropped (unary)
        2 levels                        -> encoded, first dummy dropped
        3 .. NUM_LEVELS_THRESHOLD       -> encoded
        more than NUM_LEVELS_THRESHOLD  -> dropped, not encoded

    Return: None. Progress messages are printed.

    """

    # read csv and keep inputs in X list
    frame = pd.read_csv(IN_FILE)

    # Step 2: map every prefixed column name to its bare label.
    # The pattern accepts any positive document id, so nothing has to be
    # adjusted when the number of columns in the .csv changes.
    doc_prefix = re.compile(r'DocID_[1-9][0-9]*_Label_')
    rename_dict = {}
    for col in frame.columns:
        prefix_match = doc_prefix.match(col)
        if prefix_match:
            # strip only the leading prefix; the rest of the name is the label
            rename_dict[col] = col[prefix_match.end():]

    # Step 3: Rename the columns using the rename_dict
    frame = frame.rename(columns=rename_dict)
    frame = frame[X_2]
    # NOTE(review): when several DocID columns share one label the renamed
    # frame carries duplicate column names; frame[name] is then presumably a
    # DataFrame and the level counting below would fail for object columns
    # - confirm against the data.

    # collect names of variables
    # to attempt to encode
    try_name_list = [name for name, type_ in frame.dtypes.items()
                     if type_ == 'object']

    print('Encoding categorical columns ...')

    # count the levels once, before any column is dropped or replaced
    level_counts = {name: len(frame[name].unique())
                    for name in try_name_list}

    # the lists keep the column order of the frame so that the encoded
    # columns (and the node ids derived from them) are the same on every run
    unary_list = [name for name in try_name_list
                  if level_counts[name] == 1]
    binary_list = [name for name in try_name_list
                   if level_counts[name] == 2]
    nominal_list = [name for name in try_name_list
                    if 2 < level_counts[name] <= NUM_LEVELS_THRESHOLD]
    too_many_levels_list = [name for name in try_name_list
                            if level_counts[name] > 2 and
                            level_counts[name] > NUM_LEVELS_THRESHOLD]

    # handle unary
    # don't encode unary categorical columns
    if len(unary_list) > 0:
        frame = frame.drop(unary_list, axis=1)

    # handle high-cardinality
    # these columns are neither encoded nor written to the gdf; left in
    # place as strings they make frame.corr() raise on pandas >= 2.0
    if len(too_many_levels_list) > 0:
        frame = frame.drop(too_many_levels_list, axis=1)

    # encode binary
    # don't create perfectly, negatively correlated encoded columns
    if len(binary_list) > 0:
        dummies = pd.get_dummies(frame[binary_list], dummy_na=True,
                                 drop_first=True)
        frame = frame.drop(binary_list, axis=1)
        frame = pd.concat([frame, dummies], axis=1)

    # encode nominal
    if len(nominal_list) > 0:
        dummies = pd.get_dummies(frame[nominal_list], dummy_na=True)
        frame = frame.drop(nominal_list, axis=1)
        frame = pd.concat([frame, dummies], axis=1)

    print('Done.')

    # calculate Pearson correlations
    print('Calculating Pearson correlations ...')
    corr_frame = frame.corr()
    print('Done.')

    # write gdf
    print('Writing GDF file to %s ...' % write_gdf(corr_frame))
    print('Done.')

Execute the Main Routine

In [6]:
# Run the full pipeline: read IN_FILE, encode, correlate and write OUT_FILE.
# Relies on X_2 and the constants defined in the cells above.
csv2gdf()

Encoding categorical columns ...
Done.
Calculating Pearson correlations ...
Done.
Writing GDF file to /Users/arnabraychaudhari/Documents/6317/Project_LLM_and_RAG_2024_GWU/Farm_Bill_train.gdf ...
Done.
