# 1. Install libraries (as needed)

In [148]:
# Install required libraries (uncomment and run if not already installed)
# !pip install pandas neo4j
# !pip install beautifulsoup4 lxml 
# !pip install requests tqdm
# !pip install html5lib

# Import necessary libraries
import pandas as pd
from neo4j import GraphDatabase
import os
import sys

# Register the sibling "../src" directory (resolved against the current working
# directory) on the import path so the project modules imported below resolve.
# The membership test keeps repeated runs of this cell from adding duplicates.
src_path = os.path.abspath(os.path.join(os.getcwd(), "../src"))
if sys.path.count(src_path) == 0:
    sys.path.append(src_path)

# Now, import the modules
from web_data_extraction import create_robust_session, fetch_url
from progress_tracker import track_progress
import pandas as pd
import json

print("Modules and dependencies loaded successfully.")


Modules and dependencies loaded successfully.


## 2. Create Table of all unique survey IDs
- The first step is to create our survey table
- This will be the 'Central Table' for organizing all of our information

### 2.1 Loading Census API Dataset info
- File was obtained from `https://api.census.gov/data.html`

In [122]:
# Import required libraries
from bs4 import BeautifulSoup
import pandas as pd

# Saved copy of the Census API dataset listing (https://api.census.gov/data.html)
html_file_path = './data/CensusDataAPI_data.html'  # Update this path as needed

# Columns whose cells carry hyperlinks; for these we keep the link targets
# instead of the visible text.
url_columns = [
    'Geography List',
    'Variable List',
    'Group List',
    'SortList',
    'Examples',
    'Developer Documentation',
    'API Base URL'
]

# Parse the saved page with the lxml parser
with open(html_file_path, 'r', encoding='utf-8') as handle:
    soup = BeautifulSoup(handle, 'lxml')

# Locate the tables and stop early when the page has none
tables = soup.find_all('table')
print(f"Number of tables found: {len(tables)}")
if len(tables) == 0:
    raise ValueError("No tables found in the HTML file.")

# The dataset listing is assumed to be the first table; adjust the index if necessary
table = tables[0]

# Column names come from every <th> element in the table
headers = [th.get_text(strip=True) for th in table.find_all('th')]
print(f"Headers: {headers}")


def _column_label(position):
    """Return the header for a cell position, or a 'Column_<n>' fallback name."""
    if position < len(headers):
        return headers[position]
    return f'Column_{position+1}'


def _cell_value(cell, header):
    """Return the stored value for one table cell.

    URL columns yield the cell's href targets joined with '; ' (falling back to
    the cell text when it is itself a plain http(s) URL, or None when no URL is
    present). All other columns yield the stripped cell text.
    """
    if header not in url_columns:
        return cell.get_text(strip=True)

    urls = [anchor.get('href') for anchor in cell.find_all('a') if anchor.get('href')]
    if not urls:
        text = cell.get_text(strip=True)
        if text.startswith(('http://', 'https://')):
            urls = [text]
    return '; '.join(urls) if urls else None


# One dict per body row (the first <tr> is the header row and is skipped)
rows = []
for table_row in table.find_all('tr')[1:]:
    record = {}
    for position, cell in enumerate(table_row.find_all(['td', 'th'])):
        label = _column_label(position)
        record[label] = _cell_value(cell, label)
    rows.append(record)

# Assemble the survey table
df_census = pd.DataFrame(rows)

# Preview
df_census.head()


Number of tables found: 1
Headers: ['Title', 'Description', 'Vintage', 'Dataset Name', 'Dataset Type', 'Geography List', 'Variable List', 'Group List', 'SortList', 'Examples', 'Developer Documentation', 'API Base URL']


Unnamed: 0,Title,Description,Vintage,Dataset Name,Dataset Type,Geography List,Variable List,Group List,SortList,Examples,Developer Documentation,API Base URL
0,1648 datasets,,,,,,,,,,,
1,1986 County Business Patterns: Business Patterns,County Business Patterns (CBP) is an annual se...,1986.0,cbp,Aggregate,http://api.census.gov/data/1986/cbp/geography....,http://api.census.gov/data/1986/cbp/variables....,http://api.census.gov/data/1986/cbp/groups.html,http://api.census.gov/data/1986/cbp/sorts.html,http://api.census.gov/data/1986/cbp/examples.html,http://www.census.gov/developer/,http://api.census.gov/data/1986/cbp
2,1987 County Business Patterns: Business Patterns,County Business Patterns (CBP) is an annual se...,1987.0,cbp,Aggregate,http://api.census.gov/data/1987/cbp/geography....,http://api.census.gov/data/1987/cbp/variables....,http://api.census.gov/data/1987/cbp/groups.html,http://api.census.gov/data/1987/cbp/sorts.html,http://api.census.gov/data/1987/cbp/examples.html,http://www.census.gov/developer/,http://api.census.gov/data/1987/cbp
3,1988 County Business Patterns: Business Patterns,County Business Patterns (CBP) is an annual se...,1988.0,cbp,Aggregate,http://api.census.gov/data/1988/cbp/geography....,http://api.census.gov/data/1988/cbp/variables....,http://api.census.gov/data/1988/cbp/groups.html,http://api.census.gov/data/1988/cbp/sorts.html,http://api.census.gov/data/1988/cbp/examples.html,http://www.census.gov/developer/,http://api.census.gov/data/1988/cbp
4,1989 County Business Patterns: Business Patterns,County Business Patterns (CBP) is an annual se...,1989.0,cbp,Aggregate,http://api.census.gov/data/1989/cbp/geography....,http://api.census.gov/data/1989/cbp/variables....,http://api.census.gov/data/1989/cbp/groups.html,http://api.census.gov/data/1989/cbp/sorts.html,http://api.census.gov/data/1989/cbp/examples.html,http://www.census.gov/developer/,http://api.census.gov/data/1989/cbp


### 2.2 Parsing data set name column
- There is hierarchical information embedded in this column that could potentially be used when creating the graph database.
- To keep that option open, we extract the hierarchy levels now and pull out the 'month' as time data that we may want to use as a relationship.

In [123]:
# Delimiter that separates hierarchy levels inside the "Dataset Name" column
delimiter = '›'

# Names of the (at most) four hierarchy levels produced by the split
hierarchy_columns = ['Survey', 'Subtype1', 'Subtype2', 'Subtype3']

# Split "Dataset Name" into hierarchy levels: at most 3 splits -> 4 parts.
# reindex() guarantees all four columns exist even when no dataset name has
# that many levels, so the column selection at the end of the cell cannot fail.
hierarchy_split = (
    df_census['Dataset Name']
    .str.split(delimiter, n=3, expand=True)
    .reindex(columns=range(len(hierarchy_columns)))
)
hierarchy_split.columns = hierarchy_columns

# Attach the hierarchy columns. Any copies left over from a previous run of
# this cell are dropped first so that re-running does not create duplicate
# 'Survey' / 'Subtype*' columns.
df_census = pd.concat(
    [df_census.drop(columns=hierarchy_columns, errors='ignore'), hierarchy_split],
    axis=1,
)

# Display the first few rows after parsing
df_census[['Survey', 'Subtype1', 'Subtype2', 'Subtype3', 'Vintage']].head(7)


Unnamed: 0,Survey,Subtype1,Subtype2,Subtype3,Vintage
0,,,,,
1,cbp,,,,1986.0
2,cbp,,,,1987.0
3,cbp,,,,1988.0
4,cbp,,,,1989.0
5,cps,basic,apr,,1989.0
6,cps,basic,aug,,1989.0


### 2.3 Extract Month from subtype columns

In [124]:
# Lower-case month abbreviations used to recognise month tokens
months = 'jan feb mar apr may jun jul aug sep oct nov dec'.split()


def extract_month(row):
    """Return the first month abbreviation found in the subtype columns.

    The columns 'Subtype1', 'Subtype2' and 'Subtype3' are inspected in that
    order. Null values are skipped; the first value whose stripped, lower-cased
    text is one of ``months`` is returned stripped and capitalised (e.g. 'Apr').
    Returns None when no subtype column holds a month.
    """
    for column in ('Subtype1', 'Subtype2', 'Subtype3'):
        value = row[column]
        if pd.isnull(value):
            continue
        token = value.strip()
        if token.lower() in months:
            return token.capitalize()
    return None

# Derive the 'Month' column row by row from the subtype columns
df_census['Month'] = df_census.apply(extract_month, axis=1)


def _drop_month_token(value):
    """Return None when ``value`` is a month abbreviation, otherwise ``value`` unchanged."""
    if pd.notnull(value) and value.strip().lower() in months:
        return None
    return value


# The month now lives in its own column, so blank it out of the subtype
# columns to avoid storing the same information twice.
for subtype_col in ['Subtype1', 'Subtype2', 'Subtype3']:
    df_census[subtype_col] = df_census[subtype_col].apply(_drop_month_token)

# Preview the result
df_census[['Survey', 'Subtype1', 'Subtype2', 'Subtype3', 'Month', 'Vintage']].head()


Unnamed: 0,Survey,Subtype1,Subtype2,Subtype3,Month,Vintage
0,,,,,,
1,cbp,,,,,1986.0
2,cbp,,,,,1987.0
3,cbp,,,,,1988.0
4,cbp,,,,,1989.0


### 2.4 Get a unique ID for each row
1. Use the identifier from the API URL which is a JSON file that contains a unique identifier (aka KEY) for each dataset.
2. Why Use identifier as the Key:
- Uniqueness: The identifier provides a unique reference for each dataset, ensuring there are no duplicates.
- Consistency: Using a standardized key helps in linking data across different sources and maintaining data integrity within your knowledge graph.
- Efficiency: It simplifies data retrieval and relationships within the knowledge graph.

#### 2.4.1 First, drop the first row, which contains only the number of records and otherwise NaN values

In [133]:
# Remove the summary row of the dataset listing (its Title reads
# "<N> datasets" and every other field is empty).
#
# The row is located by its Title instead of being dropped by position, so
# re-running this cell after the row is already gone does not delete a real
# dataset row.
print("Before dropping row:")
print(df_census.head(1))

# True for rows whose Title is just a dataset count, e.g. "1648 datasets"
summary_rows = df_census['Title'].astype(str).str.fullmatch(
    r'\s*\d+\s+datasets?\s*', case=False, na=False
)
print(f"\nSummary rows found: {int(summary_rows.sum())}")

# Keep everything else and renumber the index from 0
df_census = df_census[~summary_rows].reset_index(drop=True)

print("\nAfter dropping row:")
print(df_census.head(1))

Before dropping row:
           Title Description Vintage Dataset Name Dataset Type Geography List  \
0  1648 datasets         NaN     NaN          NaN          NaN            NaN   

  Variable List Group List SortList Examples Developer Documentation  \
0           NaN        NaN      NaN      NaN                     NaN   

  API Base URL Survey Subtype1 Subtype2 Subtype3 Month identifier id_name  
0          NaN    NaN      NaN      NaN      NaN  None       None    None  

After dropping row:
                                              Title  \
0  1986 County Business Patterns: Business Patterns   

                                         Description Vintage Dataset Name  \
0  County Business Patterns (CBP) is an annual se...    1986          cbp   

  Dataset Type                                     Geography List  \
0    Aggregate  http://api.census.gov/data/1986/cbp/geography....   

                                       Variable List  \
0  http://api.census.gov/data/1986/cb

#### 2.4.2 Extracting the Identifier Field to create key column SurveyID
- Robust error handling was added to account for network issues

In [135]:
# Import required libraries
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import pandas as pd
from tqdm import tqdm  # For progress bar

# Discard rows in which every field is missing, then renumber the index
df_census = df_census.dropna(how='all').reset_index(drop=True)

# Show the columns we are starting from
print("Current DataFrame Columns:")
print(df_census.columns.tolist())

# Step 1: Build one shared HTTP session with a retry policy
retries = Retry(
    total=5,                                # give up after five retries
    backoff_factor=1,                       # back-off factor applied between attempts
    status_forcelist=[500, 502, 503, 504],  # server-side errors worth retrying
    allowed_methods=['GET']                 # 'allowed_methods' replaces 'method_whitelist'
)
adapter = HTTPAdapter(max_retries=retries)

session = requests.Session()
# Apply the same retrying adapter to both URL schemes
for scheme in ('http://', 'https://'):
    session.mount(scheme, adapter)

# Step 2: Define a Function to Extract 'identifier' and 'SurveyID' from a Single URL with Retries

def extract_identifier(url, session):
    """
    Download the JSON document at ``url`` and pull out its dataset identifier.

    Parameters:
        url (str): The API Base URL pointing to the JSON file.
        session: Object used for the request; only ``session.get(url, timeout=10)``
            is called on it.

    Returns:
        tuple: (identifier, SurveyID) where:
               - identifier (str or None): value of the 'identifier' key of the
                 first entry in the JSON document's 'dataset' list.
               - SurveyID (str or None): the text after the last '/id/' in the
                 identifier, or None when the identifier has no '/id/'.
               (None, None) is returned for an invalid URL and for any request
               or processing error; errors are reported with print().
    """
    failure = (None, None)
    try:
        # Anything that is not a usable string cannot be fetched
        if not isinstance(url, str) or pd.isna(url):
            print("Invalid URL encountered.")
            return failure

        # Fetch and decode the JSON document; bad HTTP statuses raise here
        response = session.get(url, timeout=10)
        response.raise_for_status()
        payload = response.json()

        # The identifier lives on the first entry of the 'dataset' list
        datasets = payload.get('dataset', [])
        has_entries = isinstance(datasets, list) and len(datasets) > 0
        identifier = datasets[0].get('identifier', None) if has_entries else None

        # The survey key is whatever follows '/id/' in the identifier
        SurveyID = identifier.split('/id/')[-1] if identifier and '/id/' in identifier else None

        return identifier, SurveyID

    except requests.exceptions.RequestException as exc:
        # Report the most specific category first: Timeout, then connection
        # problems, then HTTP status errors, then any other request failure.
        if isinstance(exc, requests.exceptions.Timeout):
            print(f"Timeout error occurred while fetching {url}")
        elif isinstance(exc, requests.exceptions.ConnectionError):
            print(f"Connection error occurred while fetching {url}")
        elif isinstance(exc, requests.exceptions.HTTPError):
            print(f"HTTP error occurred while fetching {url}: {exc}")
        else:
            print(f"Error occurred while fetching {url}: {exc}")
        return failure
    except Exception as exc:
        print(f"An unexpected error occurred while processing {url}: {exc}")
        return failure

# Step 3: Apply the Extraction Function to All Rows

# Collect the results in plain lists and attach them to the frame in one go
identifiers = []
survey_ids = []

# One request per API Base URL, with a progress bar
for url in tqdm(df_census['API Base URL'], desc="Fetching identifiers"):
    identifier, SurveyID = extract_identifier(url, session)
    identifiers.append(identifier)
    survey_ids.append(SurveyID)

# Step 4: Add the Extracted Data to the DataFrame

df_census['identifier'] = identifiers
df_census['SurveyID'] = survey_ids

# Display the first few rows to verify the extraction
print("\nAfter extracting 'identifier' and 'SurveyID':")
print(df_census[['Survey', 'Subtype1', 'Subtype2', 'Subtype3', 'Vintage', 'identifier', 'SurveyID']].head())

# Step 5: Handle Missing Identifiers

# A row is incomplete when either the identifier or the SurveyID derived from
# it is missing (an identifier without '/id/' yields no SurveyID, and the
# SurveyID is the key used by the rest of the notebook).
missing_mask = df_census['identifier'].isnull() | df_census['SurveyID'].isnull()
missing_identifiers = int(missing_mask.sum())
print(f"\nNumber of missing identifiers: {missing_identifiers}")

# If there are missing identifiers, display them and save to a separate CSV for manual correction
if missing_identifiers > 0:
    print("\nRows with missing identifiers:")
    missing_identifiers_df = df_census[missing_mask]
    print(missing_identifiers_df[['Survey', 'API Base URL']].head())

    # Save these rows to a separate CSV file
    missing_identifiers_df.to_csv('./data/missing_identifiers.csv', index=False)
    print("\nSaved rows with missing identifiers to 'missing_identifiers.csv' for manual correction.")
else:
    print("All identifiers were successfully extracted.")

# SurveyID is meant to be the unique key of the survey table, so surface any
# value that is shared by more than one row instead of assuming uniqueness.
duplicated_ids = df_census.loc[
    df_census['SurveyID'].notna() & df_census['SurveyID'].duplicated(keep=False),
    'SurveyID',
]
if not duplicated_ids.empty:
    print(f"\nWarning: {duplicated_ids.nunique()} SurveyID value(s) occur on more than one row:")
    print(duplicated_ids.value_counts().head())

# Step 6: Save the Updated DataFrame for Backup

# Save the updated DataFrame to a CSV file
output_file = './data/df_census_with_identifiers.csv'
df_census.to_csv(output_file, index=False)
print(f"\nUpdated DataFrame with 'identifier' and 'SurveyID' successfully saved to {output_file}")


Current DataFrame Columns:
['Title', 'Description', 'Vintage', 'Dataset Name', 'Dataset Type', 'Geography List', 'Variable List', 'Group List', 'SortList', 'Examples', 'Developer Documentation', 'API Base URL', 'Survey', 'Subtype1', 'Subtype2', 'Subtype3', 'Month', 'identifier', 'id_name']


Fetching identifiers: 100%|█████████████████| 1648/1648 [02:27<00:00, 11.17it/s]



After extracting 'identifier' and 'SurveyID':
  Survey Subtype1 Subtype2 Subtype3 Vintage  \
0    cbp     None     None     None    1986   
1    cbp     None     None     None    1987   
2    cbp     None     None     None    1988   
3    cbp     None     None     None    1989   
4    cps    basic     None     None    1989   

                                      identifier        SurveyID  
0          http://api.census.gov/data/id/CBP1986         CBP1986  
1          http://api.census.gov/data/id/CBP1987         CBP1987  
2          http://api.census.gov/data/id/CBP1988         CBP1988  
3          http://api.census.gov/data/id/CBP1989         CBP1989  
4  https://api.census.gov/data/id/CPSBASIC198904  CPSBASIC198904  

Number of missing identifiers: 0
All identifiers were successfully extracted.

Updated DataFrame with 'identifier' and 'SurveyID' successfully saved to ./data/df_census_with_identifiers.csv


#### 2.4.3 Manually fixing errors (Only if needed)
- Recommended: Rerun! Maybe wait if there is a network issue.
- Manually fixing is an acceptable strategy, but prone to human error.
- robust error handling was added to avoid errors, but stuff happens.


## 3. Create External Tables and Linkages for Survey Table
1. Several columns have URLs that link to other tables: 'Geography List', 'Variable List', 'Group List', 'SortList', 'Examples'
2. In order to avoid issues with data scraping, collect a copy of the data with linkages back to SurveyID
3. Variables and Group have more complex structures that will require additional processing to capture the information
4. 'SortList' is believed to be a parameter and probably doesn't have any info, but we'll look anyways

> We will programmatically create the graph database from these tables. The CSV files are an intermediary step, but one done out of practical necessity to avoid retrieval issues and create a solid foundation to work from with all data local to the compute.

## 3.1 Process 'Examples' Column
- This contains example API calls to use this survey
- For building the graph databases later, we will document if the relationship exists (Has Example) ...
- If there is no relationship, we won't waste cycles trying to look or retrieve anything.


In [36]:
import os
import sys
import pandas as pd
from bs4 import BeautifulSoup
from io import StringIO
import warnings
import re
from progress_tracker import track_progress
from web_data_extraction import create_robust_session, fetch_url

# Silence the FutureWarning whose message starts with "Passing literal html to
# 'read_html' is deprecated". The filter is process-wide, so it also applies to
# any later pd.read_html call that is handed a raw HTML string.
warnings.filterwarnings("ignore", category=FutureWarning, message="Passing literal html to 'read_html' is deprecated")

def is_summary_row(row):
    """
    Check if a row is a summary row (contains count of items/examples).

    All cell values are lower-cased, joined with spaces and searched for a
    count marker ("<n> item(s)", "<n> example(s)", "<n> group(s)") or for the
    substrings 'n/a' / 'total'.

    Parameters:
    -----------
    row : pandas.Series
        A row from the table

    Returns:
    --------
    bool
        True if the row is a summary row
    """
    row_str = ' '.join(str(val).lower() for val in row.values)
    summary_patterns = [
        r'\d+\s*items?',
        r'\d+\s*examples?',
        r'\d+\s*groups?',
        # NOTE(review): these two match anywhere in the row, so a data row that
        # merely contains "N/A" or "total" in one cell is also treated as a
        # summary row -- confirm this is intended.
        'n/a',
        'total'
    ]
    return any(re.search(pattern, row_str) for pattern in summary_patterns)


def clean_table(df):
    """
    Clean a table by removing summary rows and empty rows.

    Parameters:
    -----------
    df : pandas.DataFrame
        The table to clean

    Returns:
    --------
    pandas.DataFrame
        The rows of ``df`` that are not summary rows (see ``is_summary_row``),
        not entirely NA, and not made up solely of blank strings. An empty
        input is returned unchanged.
    """
    if df.empty:
        return df

    # Remove summary rows
    summary_mask = df.apply(is_summary_row, axis=1).astype(bool)
    df = df[~summary_mask]

    # Remove rows where all values are NA
    df = df.dropna(how='all')

    # A row-wise apply() on a frame with no rows does not yield a boolean
    # mask, and negating it with ~ raises TypeError; stop here when every row
    # has already been removed.
    if df.empty:
        return df

    # Remove rows in which every cell is an empty / whitespace-only string
    blank_mask = df.apply(
        lambda cells: cells.astype(str).str.strip().eq('').all(), axis=1
    ).astype(bool)
    return df[~blank_mask]

def process_example_url(session, url, survey_id):
    """
    Fetch one 'Examples' page and turn its first table into example records.

    Parameters:
    -----------
    session : object
        Passed through to ``fetch_url``.
    url : str
        Address of the examples page; non-string or NA values are rejected.
    survey_id : any
        Stored under the 'SurveyID' key of every returned record.

    Returns:
    --------
    tuple
        ("Has Example", list of dicts) when the cleaned first table has rows,
        otherwise ("No Example", None). Any exception is printed and also
        reported as ("No Example", None).
    """
    no_example = ("No Example", None)
    try:
        # Nothing to fetch without a usable URL
        if pd.isna(url) or not isinstance(url, str):
            return no_example

        html_content = fetch_url(url, session)
        if not html_content:
            return no_example

        # Re-serialise through BeautifulSoup and let pandas read the tables
        # from a StringIO buffer
        markup = str(BeautifulSoup(html_content, 'html.parser'))
        tables = pd.read_html(StringIO(markup))
        if not tables:
            return no_example

        # Only the first table on the page is used
        example_table = tables[0]
        example_table.columns = example_table.columns.str.strip()

        clean_df = clean_table(example_table)
        if clean_df.empty:
            return no_example

        # One record per remaining row, tagged with the owning survey
        examples = [
            {**record.to_dict(), 'SurveyID': survey_id}
            for _, record in clean_df.iterrows()
        ]
        return "Has Example", examples

    except Exception as e:
        print(f"Error processing URL {url}: {str(e)}")
        return no_example

def main():
    """
    Collect the example tables for every survey and write the node CSV files.

    Reads '../data/data_extraction/df_census_with_identifiers.csv', processes
    each row's 'Examples' URL, records the outcome in a 'Has Example' column,
    and writes 'SurveyNode.csv' plus (when any examples were found)
    'ExamplesNode.csv' into '../data/data_extraction/'.

    Raises:
        KeyError: when the survey table has no survey-key column.
        Exception: any other failure outside the per-row handling is printed
            and re-raised.
    """
    try:
        # Load survey data
        # NOTE(review): the identifier-extraction cell saves this file to
        # './data/df_census_with_identifiers.csv'; confirm the path read here
        # points at the same file.
        survey_df = pd.read_csv('../data/data_extraction/df_census_with_identifiers.csv')

        # The identifier cell stores the survey key in 'SurveyID', which is
        # also the column the SortList and Geography cells read. Older exports
        # used 'id_name'; accept it as a fallback so existing CSVs still work.
        if 'SurveyID' in survey_df.columns:
            id_column = 'SurveyID'
        elif 'id_name' in survey_df.columns:
            id_column = 'id_name'
        else:
            raise KeyError("Survey table has neither a 'SurveyID' nor an 'id_name' column.")

        # Initialize session and examples data list; one session is shared by all rows
        session = create_robust_session()
        examples_data = []

        # Process each row
        for index, row in track_progress(survey_df.iterrows(), description="Processing Examples", total=survey_df.shape[0]):
            try:
                survey_id = row[id_column]
                example_url = row['Examples']

                # Process the URL
                has_example, example_list = process_example_url(session, example_url, survey_id)

                # Update survey DataFrame
                survey_df.at[index, 'Has Example'] = has_example

                # Add valid examples to the list
                if example_list:
                    examples_data.extend(example_list)

            except Exception as e:
                # Best effort: report the row and keep going with the rest
                print(f"Error processing row {index}: {str(e)}")
                continue

        # Save updated survey data
        survey_df.to_csv('../data/data_extraction/SurveyNode.csv', index=False)

        # Save examples data if we have any
        if examples_data:
            examples_df = pd.DataFrame(examples_data)
            examples_df.to_csv('../data/data_extraction/ExamplesNode.csv', index=False)

        print("Data extraction completed successfully!")
        print(f"Total rows processed: {len(survey_df)}")
        print(f"Total examples found: {len(examples_data)}")

    except Exception as e:
        print(f"Error in main execution: {str(e)}")
        raise

if __name__ == "__main__":
    main()

Processing Examples: 100%|███████████████| 1648/1648 [04:20<00:00,  6.32items/s]


Data extraction completed successfully!
Total rows processed: 1648
Total examples found: 22649


## 3.2 Process 'SortList' Column
- Same strategy as above.


In [37]:
import os
import sys

import pandas as pd

# Register the src directory on sys.path BEFORE importing the project modules
# below, so this cell does not rely on an earlier cell having done so.
src_path = os.path.abspath(os.path.join(os.getcwd(), "../src"))
if src_path not in sys.path:
    sys.path.append(src_path)

from progress_tracker import track_progress
from web_data_extraction import create_robust_session, fetch_url

def load_dataframe(file_path):
    """Read the CSV file at ``file_path`` and return it as a pandas DataFrame."""
    frame = pd.read_csv(file_path)
    return frame

def save_dataframe(df, file_path):
    """Write ``df`` to ``file_path`` as CSV without the index column."""
    df.to_csv(path_or_buf=file_path, index=False)

def is_empty_table(table):
    """
    Check if a table is effectively empty (contains only N/A or no meaningful data).

    A table counts as empty when it has no rows/columns, when every cell is
    missing or one of 'N/A', '' or 'nan' (after stripping whitespace), or when
    any cell contains a "0 groups" marker.

    Parameters:
    -----------
    table : pandas.DataFrame
        The table to check

    Returns:
    --------
    bool
        True if the table is empty or contains only N/A values
    """
    if table.empty:
        return True

    # Compare on stripped text so numbers and stray whitespace are handled alike
    text = table.astype(str).apply(lambda column: column.str.strip())

    # Missing values are checked on the original frame: depending on dtype they
    # do not always stringify to 'nan'.
    blank = table.isna() | text.isin(['N/A', '', 'nan'])
    if blank.all().all():
        return True

    # "0 groups" marker. The look-behind requires that the 0 is not the tail of
    # a larger number, so "10 groups" or "20 groups" is not mistaken for it.
    zero_groups = r'(?<!\d)0\s+groups'
    has_zero_groups = text.apply(
        lambda column: column.str.contains(zero_groups, case=False, regex=True, na=False)
    )
    return bool(has_zero_groups.any().any())

def process_sort_list(row, session=None):
    """
    Process the SortList column to determine if it has meaningful data.

    Parameters:
    -----------
    row : pandas.Series
        A single row of the DataFrame; its "SortList" and "SurveyID" entries
        are read with ``row.get``.
    session : object, optional
        Session handed to ``fetch_url``. When omitted, one session is created
        with ``create_robust_session()`` on first use and reused by later calls
        instead of building a new one for every row.

    Returns:
    --------
    tuple
        ("Has Sort", list of node dicts) when at least one valid sort item was
        found, otherwise ("No Sort", None). Unexpected errors are printed and
        reported as ("No Sort", None).
    """
    # Imported here so the function does not depend on another cell's imports
    from io import BytesIO, StringIO

    sort_list_url = row.get("SortList")
    survey_id = row.get("SurveyID")

    # Early return for invalid URLs
    if pd.isna(sort_list_url) or not isinstance(sort_list_url, str):
        return "No Sort", None

    try:
        # Reuse one session across calls
        if session is None:
            session = getattr(process_sort_list, "_shared_session", None)
            if session is None:
                session = create_robust_session()
                process_sort_list._shared_session = session

        # Fetch URL content
        html_content = fetch_url(sort_list_url, session)
        if not html_content:
            return "No Sort", None

        # Hand pandas a file-like object: passing literal HTML to read_html is
        # deprecated. Bytes are handled as well in case fetch_url returns raw content.
        if isinstance(html_content, (bytes, bytearray)):
            buffer = BytesIO(html_content)
        else:
            buffer = StringIO(html_content)

        try:
            tables = pd.read_html(buffer)
        except ValueError:
            # read_html raises ValueError when the page contains no tables
            return "No Sort", None

        if not tables:
            return "No Sort", None

        sort_table = tables[0]

        # Check if table is effectively empty
        if is_empty_table(sort_table):
            return "No Sort", None

        # Clean column names; str() keeps this working for non-string headers
        sort_table.columns = [str(column).strip() for column in sort_table.columns]

        # Verify required columns exist
        required_columns = ["SortItem", "Description"]
        if not all(col in sort_table.columns for col in required_columns):
            print(f"Warning: Missing required columns for SurveyID: {survey_id}")
            return "No Sort", None

        # Remove metadata rows (any cell mentioning "groups" or "N/A") and empty entries
        metadata_rows = sort_table.apply(
            lambda cells: cells.astype(str).str.contains('groups|N/A', case=False, na=False).any(),
            axis=1,
        ).astype(bool)
        sort_table = sort_table[~metadata_rows]
        sort_table = sort_table.dropna(how='all')

        # If no valid rows remain after cleaning
        if sort_table.empty:
            return "No Sort", None

        # Create nodes from valid rows
        sort_nodes = []
        for _, valid_row in sort_table.iterrows():
            if pd.notna(valid_row["SortItem"]) and str(valid_row["SortItem"]).strip().upper() != "N/A":
                sort_nodes.append({
                    "SortItem": valid_row["SortItem"],
                    "Description": valid_row.get("Description", "N/A"),
                    "SurveyID": survey_id
                })

        # Without any node there is nothing to link, so do not claim "Has Sort"
        if not sort_nodes:
            return "No Sort", None

        return "Has Sort", sort_nodes

    except Exception as e:
        print(f"Error processing SurveyID {survey_id}: {str(e)}")
        return "No Sort", None

def main():
    """
    Run the SortList extraction for every survey.

    Loads '../data/data_extraction/SurveyNode.csv', records each row's
    "Has Sort" status, writes the table back to the same file, and writes
    'SortListNode.csv' when any sort nodes were collected. Errors are printed
    and re-raised.
    """
    survey_node_path = '../data/data_extraction/SurveyNode.csv'
    try:
        survey_df = load_dataframe(survey_node_path)

        # Sort nodes gathered across all surveys
        all_sort_nodes = []

        progress = track_progress(
            survey_df.iterrows(),
            description="Processing SortList",
            total=survey_df.shape[0],
        )
        for index, row in progress:
            has_sort, nodes = process_sort_list(row)
            survey_df.at[index, "Has Sort"] = has_sort
            if nodes:
                all_sort_nodes += nodes

        # Persist the updated survey table
        save_dataframe(survey_df, survey_node_path)

        # Persist the sort nodes only when there is something to write
        if all_sort_nodes:
            save_dataframe(pd.DataFrame(all_sort_nodes), '../data/data_extraction/SortListNode.csv')

        print("Processing completed successfully!")
        print(f"Total rows processed: {len(survey_df)}")
        print(f"Total sort nodes found: {len(all_sort_nodes)}")

    except Exception as e:
        print(f"Error in main execution: {str(e)}")
        raise

if __name__ == "__main__":
    main()

Processing SortList: 100%|███████████████| 1648/1648 [09:42<00:00,  2.83items/s]

Processing completed successfully!
Total rows processed: 1648
Total sort nodes found: 0





## 3.3 Process 'Geography List' Column

In [38]:
import os
import pandas as pd
from progress_tracker import track_progress
from web_data_extraction import create_robust_session, fetch_url


def safe_strip(value):
    """
    Convert a value to text and trim surrounding whitespace.

    Parameters:
    -----------
    value : any
        The value to convert; it is passed through ``str()`` unless it is NA.

    Returns:
    --------
    str
        The stripped text, or an empty string when ``value`` is None/NaN.
    """
    return "" if pd.isna(value) else str(value).strip()


def load_dataframe(file_path):
    """
    Read the CSV file at ``file_path`` and return it as a pandas DataFrame.
    """
    frame = pd.read_csv(file_path)
    return frame


def save_dataframe(df, file_path):
    """
    Write ``df`` to ``file_path`` as CSV without the index column.
    """
    df.to_csv(path_or_buf=file_path, index=False)


def clean_geography_hierarchy(hierarchy_text):
    """
    Strip markup tokens and redundant whitespace from a geography hierarchy string.

    Parameters:
    -----------
    hierarchy_text : any
        The hierarchy value; non-string values are converted with ``str()``.

    Returns:
    --------
    str
        The text without "{.hier}" / "{.hier-sep}" markers and square brackets,
        with runs of whitespace collapsed to single spaces. An empty string is
        returned for None/NaN.
    """
    if pd.isna(hierarchy_text):
        return ""

    cleaned = str(hierarchy_text)

    # Remove the markers first, then the square brackets they were attached to
    for token in ("{.hier}", "{.hier-sep}", "[", "]"):
        cleaned = cleaned.replace(token, "")

    # Collapse any whitespace runs left behind
    return " ".join(cleaned.split())


def process_geography_list(row, session=None):
    """
    Process the Geography List column to determine if it has meaningful data and prepare a node structure.

    Parameters:
    -----------
    row : pandas.Series
        A single row of the DataFrame; its "Geography List" and "SurveyID"
        entries are read with ``row.get``.
    session : object, optional
        Session handed to ``fetch_url``. When omitted, one session is created
        with ``create_robust_session()`` on first use and reused by later calls
        instead of building a new one for every row.

    Returns:
    --------
    tuple:
        - str: "Has Geography" or "No Geography" based on the presence of meaningful data.
        - list or None: Extracted Geography List data as a list of nodes, or None if no meaningful data exists.
    """
    # Imported here so the function does not depend on another cell's imports
    from io import BytesIO, StringIO

    # Use Geography List URL and SurveyID as key for linking nodes
    geography_list_url = row.get("Geography List")
    survey_id = row.get("SurveyID")

    # If the Geography List URL is missing or invalid, mark as "No Geography"
    if pd.isna(geography_list_url) or not isinstance(geography_list_url, str):
        return "No Geography", None

    # Reuse one session across calls
    if session is None:
        session = getattr(process_geography_list, "_shared_session", None)
        if session is None:
            session = create_robust_session()
            process_geography_list._shared_session = session

    # Fetch the URL content
    html_content = fetch_url(geography_list_url, session)

    # If no content is retrieved, mark as "No Geography"
    if not html_content:
        return "No Geography", None

    # Hand pandas a file-like object: passing literal HTML to read_html is
    # deprecated. Bytes are handled as well in case fetch_url returns raw content.
    if isinstance(html_content, (bytes, bytearray)):
        buffer = BytesIO(html_content)
    else:
        buffer = StringIO(html_content)

    # Parse the HTML to extract tables
    try:
        tables = pd.read_html(buffer)
    except ValueError:
        # No tables found in the HTML
        return "No Geography", None

    if not tables:
        return "No Geography", None

    geography_table = tables[0]

    # Normalise the header names BEFORE any column is looked up by name, so
    # stray whitespace in a header cannot cause a lookup failure below.
    geography_table.columns = [str(column).strip() for column in geography_table.columns]

    # Without the key columns there is nothing usable; report it rather than
    # letting a KeyError abort the whole run.
    required_columns = ["Geography Level", "Geography Hierarchy"]
    missing_columns = [col for col in required_columns if col not in geography_table.columns]
    if missing_columns:
        print(f"Warning: Missing required columns {missing_columns} for SurveyID: {survey_id}")
        return "No Geography", None

    # A header-only table has no rows to filter (and a row-wise apply() on it
    # does not produce a boolean mask), so stop here.
    if geography_table.empty:
        return "No Geography", None

    # Remove rows with 'items' in any column (e.g., "1 item" footer rows)
    item_rows = geography_table.apply(
        lambda cells: cells.astype(str).str.contains('item', case=False, na=False).any(),
        axis=1,
    ).astype(bool)
    geography_table = geography_table[~item_rows]

    # Remove rows where "Geography Hierarchy" contains "(default geography)"
    geography_table = geography_table[
        ~geography_table["Geography Hierarchy"].astype(str).str.contains(r"\(default geography\)", na=False, case=False)
    ]

    # Remove rows where key columns ("Geography Level" or "Geography Hierarchy") are all `N/A`
    geography_table = geography_table.dropna(how="all", subset=required_columns)

    # If no meaningful rows are left, mark as "No Geography"
    if geography_table.empty:
        return "No Geography", None

    # Convert remaining rows into GeographyListNode structure
    geography_nodes = []
    for _, valid_row in geography_table.iterrows():
        # Clean the hierarchy text
        hierarchy = clean_geography_hierarchy(valid_row.get("Geography Hierarchy", ""))

        geography_nodes.append({
            "ReferenceDate": safe_strip(valid_row.get("Reference Date", "")),
            "GeographyLevel": safe_strip(valid_row.get("Geography Level", "")),
            "GeographyHierarchy": hierarchy,
            "Limit": safe_strip(valid_row.get("Limit", "")),
            "SurveyID": safe_strip(survey_id)
        })

    return "Has Geography", geography_nodes


def process_rows(df, column_name, processing_function):
    """
    Run a per-row processing function over a DataFrame and record its status.

    Parameters:
    -----------
    df : pandas.DataFrame
        Frame to iterate over. It is modified in place: `column_name` is
        created (filled with None) when missing, then overwritten row by row.
    column_name : str
        Column that receives the status value returned for each row.
    processing_function : function
        Called with each row; must return a `(status, node_list)` pair.

    Returns:
    --------
    tuple:
        - pandas.DataFrame: The same DataFrame object, with `column_name` filled in.
        - list: Every item from the non-empty node lists, in row order.
    """
    # Make sure the status column exists before any row is written
    if column_name not in df.columns:
        df[column_name] = None

    collected = []
    progress = track_progress(
        df.iterrows(),
        description=f"Processing {column_name}",
        total=df.shape[0],
    )

    for row_label, record in progress:
        status, produced = processing_function(record)
        df.at[row_label, column_name] = status

        # None / empty results contribute nothing to the node list
        if not produced:
            continue
        collected.extend(produced)

    return df, collected


# Main Script Logic
# Runs the geography pass over every survey row and persists both outputs.
# NOTE: the same SurveyNode.csv path is read below and then overwritten, so
# re-running this cell starts from the already-updated file.

# Load the SurveyNode DataFrame
# (load_dataframe is defined outside this section — presumably a CSV reader; verify)
survey_df = load_dataframe('../data/data_extraction/SurveyNode.csv')

# Process Geography List column and update Has Geography status
# process_rows fills the "Has Geography" column on survey_df and returns the
# node dicts produced by process_geography_list for all rows.
survey_df, geography_nodes = process_rows(survey_df, "Has Geography", process_geography_list)

# Save the updated SurveyNode DataFrame
save_dataframe(survey_df, '../data/data_extraction/SurveyNode.csv')

# Save the GeographyListNode if nodes exist
# (when no row produced nodes, no GeographyListNode.csv is written)
if geography_nodes:
    geography_list_df = pd.DataFrame(geography_nodes)
    save_dataframe(geography_list_df, '../data/data_extraction/GeographyListNode.csv')

print("Geography List processing completed successfully!")


Processing Has Geography: 100%|██████████| 1648/1648 [08:31<00:00,  3.22items/s]

Geography List processing completed successfully!





## 3.4 Group Lists
- Check whether any groups (collections of selected variables) are tied to each survey
- If there are none, the survey is marked "No Group", which becomes its node relationship
- If the survey is marked "Has Group", we need to create the table of survey groups
- Each group is specific to an individual survey. Variables sometimes change from year to year for the same survey, so for now every survey/group combination is kept unique.

In [3]:
import os
import pandas as pd
from web_data_extraction import create_robust_session, fetch_url
from progress_tracker import track_progress
import json

def load_json_from_url(url, session):
    """Load JSON data from a URL using the provided session.

    Parameters
    ----------
    url : str
        Address to request.
    session : object
        Anything with a ``get(url)`` method returning a response that exposes
        ``status_code`` and ``json()``.

    Returns
    -------
    The decoded JSON body, or None when the status code is not 200 or the body
    cannot be decoded as JSON. Exceptions raised by ``session.get`` itself are
    not caught here and propagate to the caller.
    """
    response = session.get(url)
    if response.status_code != 200:
        return None

    try:
        return response.json()
    except ValueError:
        # json.JSONDecodeError is a ValueError subclass. Catching the base
        # class also covers decode errors from HTTP clients / JSON backends
        # whose exception does not derive from json.JSONDecodeError, which
        # would otherwise abort the whole row loop.
        return None

def process_group_json(row, session=None):
    """Process a single row to fetch and process the group JSON data.

    Parameters
    ----------
    row : mapping
        Must support ``.get``; the "Group List" and "SurveyID" entries are read.
    session : object, optional
        HTTP session to reuse. When omitted, a session is created with
        ``create_robust_session()`` for this call and closed afterwards, so
        repeated calls do not accumulate open sessions.

    Returns
    -------
    tuple
        ``("No Group", None)`` when the URL is missing or not a string, or when
        the fetched payload is empty, not a dict, or has no "groups" entries.
        Otherwise ``("Has Group", group_list)`` where each item carries
        SurveyID, GroupName, GroupDescription and GroupLink.
    """
    group_list_url = row.get("Group List")
    survey_id = row.get("SurveyID")

    # Handle missing or invalid URL (NaN, None and any other non-string value)
    if not isinstance(group_list_url, str):
        return "No Group", None

    # Modify URL to fetch JSON
    group_list_url = group_list_url.replace(".html", ".json")

    # Only create (and later close) a session when the caller did not supply one
    owns_session = session is None
    if owns_session:
        session = create_robust_session()
    try:
        json_data = load_json_from_url(group_list_url, session)
    finally:
        if owns_session:
            # presumably a requests.Session; close() is looked up defensively
            # because the session type is defined outside this file — verify
            close = getattr(session, "close", None)
            if callable(close):
                close()

    # If JSON data is empty, not an object, or has no groups, mark "No Group"
    if not isinstance(json_data, dict) or not json_data.get("groups"):
        return "No Group", None

    # Parse groups from JSON data
    group_list = [
        {
            "SurveyID": survey_id,
            "GroupName": group.get("name", ""),
            "GroupDescription": group.get("description", ""),
            "GroupLink": group.get("variables", "")
        }
        for group in json_data["groups"]
    ]

    return "Has Group", group_list

def process_survey_groups(df, column_name):
    """
    Walk every survey row, record whether it has groups, and collect the
    SurveyGroupNode records.

    `df` is modified in place: `column_name` receives the status returned by
    process_group_json, and "SurveyGroupID" receives the semicolon-joined IDs
    of the row's groups (it stays "NA" for rows without groups).

    Returns the DataFrame and the flat list of group dicts, each extended with
    its "SurveyGroupID".
    """
    # Create the two output columns when they are not there yet
    for new_column, fill_value in ((column_name, None), ("SurveyGroupID", "NA")):
        if new_column not in df.columns:
            df[new_column] = fill_value

    collected_nodes = []
    progress = track_progress(
        df.iterrows(),
        description="Processing Group List",
        total=df.shape[0],
    )

    for row_label, record in progress:
        status, groups = process_group_json(record)
        df.at[row_label, column_name] = status

        if not groups:
            continue

        # Tag each group with "<SurveyID>_<GroupName>"
        for entry in groups:
            entry["SurveyGroupID"] = f"{record['SurveyID']}_{entry['GroupName']}"

        # The survey row keeps all of its group IDs, separated by semicolons
        df.at[row_label, "SurveyGroupID"] = ";".join(
            entry["SurveyGroupID"] for entry in groups
        )

        collected_nodes.extend(groups)

    return df, collected_nodes

# Main Script
# Runs the group pass over every survey row: one JSON request per row is made
# inside process_survey_groups, so this cell is slow on the full table.

# Load the SurveyNode DataFrame
# NOTE(review): the geography cell in this notebook writes SurveyNode.csv;
# confirm which step produces SurveyNode-ex-srt-geo.csv before a fresh run.
survey_df = pd.read_csv("../data/data_extraction/SurveyNode-ex-srt-geo.csv")

# Process the Group List and generate SurveyGroupNode table
# Adds/fills the "Has Group" and "SurveyGroupID" columns on survey_df and
# returns one dict per group found.
survey_df, survey_group_nodes = process_survey_groups(survey_df, "Has Group")

# Save the updated SurveyNode table
# (written to a new file name; the input CSV is left untouched)
survey_df.to_csv("../data/data_extraction/SurveyNode-ex-srt-geo-grp.csv", index=False)

# Save the SurveyGroupNode table
# (skipped entirely when no survey returned any group)
if survey_group_nodes:
    survey_group_df = pd.DataFrame(survey_group_nodes)
    survey_group_df.to_csv("../data/data_extraction/SurveyGroupNode.csv", index=False)

print("Survey Group processing completed successfully!")


Processing Group List: 100%|█████████████| 1648/1648 [14:17<00:00,  1.92items/s]


Survey Group processing completed successfully!


## 3.5 Build the GroupNode table with all groups
- This is more complex due to the fact that the HTML table for a group (group.html) contains more information provided by the group.json PLUS each variable's individual json file (name column is link to the variable.json)
- We need to break this up into two parts:
  1. extract the group.html data
  2. enrich with additional variable metadata (critical: 'attribute' and 'attribute of' are relational data for our graph database!)

### 3.5.1 Extract the Group.html info
- We start by extracting all of the HTML table info
- Groups are processed in batches of 500, with the ability to restart from the last completed batch (with roughly 64k rows, a full run takes a long time)
- Failures are written to an error log, which we can later use to retry the failed groups
- Extra columns are added for linkage info (e.g. SurveyID and SurveyGroupID), and both the variable name and the variable.json link are stored


In [45]:
import os
import re
import pandas as pd
from bs4 import BeautifulSoup
from progress_tracker import track_progress
from web_data_extraction import create_robust_session


def replace_extension(group_link, new_extension="html"):
    """Swap the trailing ``.json`` of `group_link` for ``.{new_extension}``.

    Only the final suffix is rewritten; any other ``.json`` that happens to
    appear earlier in the link is left alone.

    Returns None when `group_link` is not a string or does not end in ``.json``.
    """
    suffix = '.json'
    if not isinstance(group_link, str) or not group_link.endswith(suffix):
        return None
    return f"{group_link[:-len(suffix)]}.{new_extension}"


def parse_html_group_table(html_content, survey_id, survey_group_id):
    """Parse HTML content containing a group table and extract relevant fields.

    The first <table> in the document is read. Header names come from its <th>
    cells. When a "Name" header exists, that cell is split into two values: the
    visible name and the href of its link (stored under "Variable Link", None
    when the cell has no link). Tables without a "Name" header are parsed as
    plain text columns.

    Rows with fewer <td> cells than the header count minus one are skipped as
    malformed. Shorter accepted rows are padded with None, and rows that turn
    out longer than the header list get additional "Extra_Column_<i>" headers,
    so every row ends up with the same width.

    Parameters
    ----------
    html_content : str
        HTML text of the group page.
    survey_id, survey_group_id :
        Values copied into the "SurveyID" and "SurveyGroupID" columns.

    Returns
    -------
    pandas.DataFrame
        One row per accepted table row, or an empty DataFrame when there is no
        table, no header, no usable row, or the frame cannot be built.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find the table
    table = soup.find('table')
    if table is None:
        return pd.DataFrame()  # Return empty DataFrame

    # Extract headers
    headers = [th.text.strip() for th in table.find_all('th')]
    if not headers:
        return pd.DataFrame()

    # Add additional header for the variable link. name_index stays None when
    # the table has no "Name" column, so no cell index can match it below.
    name_index = None
    if "Name" in headers:
        name_index = headers.index("Name")
        headers.insert(name_index + 1, "Variable Link")  # Add a column for the link

    # Threshold is fixed from the declared headers so that a single over-long
    # row cannot cause every normal row to be treated as malformed.
    min_cells = len(headers) - 1

    # Extract rows
    data = []
    for row in table.find_all('tr')[1:]:  # Skip header row
        cells = row.find_all('td')
        if len(cells) < min_cells:
            # Skip obviously malformed rows
            continue

        row_data = []
        for i, cell in enumerate(cells):
            if i == name_index:
                # Handle the Name column (extract text and link)
                link = cell.find('a')
                if link:
                    row_data.append(link.text.strip())  # Variable Name
                    row_data.append(link.get('href', ''))  # Variable Link
                else:
                    row_data.append(cell.text.strip())  # Add name even if no link
                    row_data.append(None)  # No link available
            else:
                row_data.append(cell.text.strip())

        data.append(row_data)

    if not data:
        return pd.DataFrame()

    # Normalise widths from the values actually collected (the Name cell yields
    # two values, so counting <td> cells alone would under-estimate the width).
    width = max(len(headers), max(len(row_data) for row_data in data))
    headers += [f"Extra_Column_{i}" for i in range(len(headers), width)]
    data = [row_data + [None] * (width - len(row_data)) for row_data in data]

    # Create DataFrame
    try:
        df = pd.DataFrame(data, columns=headers)
        # Add metadata columns
        df['SurveyID'] = survey_id
        df['SurveyGroupID'] = survey_group_id
        return df
    except Exception as e:
        print(f"Error creating DataFrame: {e}")
        return pd.DataFrame()


def process_group_htmls(survey_group_df, output_chunk_csv, error_log_csv, session):
    """Process group HTMLs and save results.

    For each row of `survey_group_df` the ".json" GroupLink is turned into its
    ".html" counterpart, fetched with `session`, and parsed with
    parse_html_group_table. All parsed tables are concatenated and written to
    `output_chunk_csv`; rows that yield no data are appended to `error_log_csv`.

    A failure on one group (invalid link, request/parse exception, or an empty
    table) is logged and the loop moves on to the next group. It does not
    abort the chunk: the chunk file is what the resume logic uses to decide a
    chunk is finished, so stopping early would silently drop the remaining
    groups.

    Parameters
    ----------
    survey_group_df : pandas.DataFrame
        Needs the columns "SurveyGroupID", "SurveyID" and "GroupLink".
    output_chunk_csv : str
        Destination CSV for the parsed rows (only written when there are any).
    error_log_csv : str
        CSV that error entries are appended to (only touched when there are any).
    session : object
        HTTP session exposing ``get(url)``.
    """
    all_group_data = []
    error_log = []

    def record_error(row, link, message):
        # One entry per group that contributed no data to the chunk.
        error_log.append({
            "SurveyGroupID": row['SurveyGroupID'],
            "SurveyID": row['SurveyID'],
            "GroupLink": link,
            "Error": message
        })

    for _, row in track_progress(survey_group_df.iterrows(),
                                 description="Processing Groups",
                                 total=survey_group_df.shape[0]):

        group_link = replace_extension(row['GroupLink'])
        if not group_link:
            # Keep the original (unconverted) link in the log for diagnosis
            record_error(row, row['GroupLink'], "Invalid GroupLink")
            continue

        try:
            # Fetch HTML
            response = session.get(group_link)
            response.raise_for_status()

            # Parse table
            group_data = parse_html_group_table(
                response.text,
                row['SurveyID'],
                row['SurveyGroupID']
            )
        except Exception as e:
            # Log the failure and keep going with the next group
            record_error(row, group_link, str(e))
            continue

        if group_data.empty:
            record_error(row, group_link, "No data rows found")
        else:
            all_group_data.append(group_data)

    # Save all collected data to the single output CSV
    if all_group_data:
        final_df = pd.concat(all_group_data, ignore_index=True)
        final_df.to_csv(output_chunk_csv, index=False)
        print(f"Saved chunk to {output_chunk_csv}")

    # Save error log
    if error_log:
        # The log is shared across chunks and opened in append mode; write the
        # header only when the file is new so it is not repeated mid-file.
        write_header = (
            not os.path.exists(error_log_csv)
            or os.path.getsize(error_log_csv) == 0
        )
        pd.DataFrame(error_log).to_csv(
            error_log_csv, index=False, mode='a', header=write_header
        )
        print(f"Error log saved to {error_log_csv}")


def save_chunk(data, output_folder, chunk_index):
    """Save a chunk of processed data to a CSV file.

    Parameters
    ----------
    data : list of pandas.DataFrame
        Frames to concatenate into one chunk.
    output_folder : str
        Destination folder; created when it does not exist.
    chunk_index : int
        Chunk number, zero-padded to three digits in the file name.

    The file is named ``GroupNode_chunk_<NNN>.csv``, the same pattern the
    chunk loop and get_resume_start_index in this notebook look for, so chunks
    written here are recognised when a run is resumed.
    """
    os.makedirs(output_folder, exist_ok=True)
    chunk_path = os.path.join(output_folder, f"GroupNode_chunk_{chunk_index:03d}.csv")
    pd.concat(data, ignore_index=True).to_csv(chunk_path, index=False)
    print(f"Saved chunk {chunk_index} to {chunk_path}")

import os
import re
import pandas as pd

def get_resume_start_index(output_folder, survey_group_df, chunk_size=500):
    """
    Determine the starting index for resumption by checking existing chunks.

    Parameters:
    -----------
    output_folder : str
        Path to the folder where output chunks are stored.
    survey_group_df : pd.DataFrame
        Input DataFrame containing the full dataset (currently not consulted).
    chunk_size : int, optional
        Number of input rows per chunk. Defaults to 500, the value used by the
        chunk loop in this notebook; it is a parameter so the function no
        longer depends on a global of that name having been defined first.

    Returns:
    --------
    int
        Starting index for resumption; 0 when the folder is missing or holds
        no ``GroupNode_chunk_<N>.csv`` files.
    """
    # A folder that does not exist yet simply means nothing has been processed
    if not os.path.isdir(output_folder):
        return 0

    # Map chunk number -> actual file name, so the file that was found is the
    # one that gets read (regardless of how many digits its number has).
    chunk_files = {}
    for file_name in os.listdir(output_folder):
        match = re.search(r"^GroupNode_chunk_(\d+)\.csv$", file_name)
        if match:
            chunk_files[int(match.group(1))] = file_name

    if not chunk_files:
        # No files processed yet; start from the beginning
        return 0

    # Find the maximum chunk number and calculate start index
    last_chunk_number = max(chunk_files)
    last_chunk_size = len(pd.read_csv(os.path.join(output_folder, chunk_files[last_chunk_number])))
    # NOTE(review): last_chunk_size counts rows of the *output* chunk file;
    # confirm that this matches the number of input rows consumed before
    # relying on the result as an input-row offset.
    start_index = last_chunk_number * chunk_size + last_chunk_size

    print(f"Resuming from index {start_index}, skipping {last_chunk_number} completed chunks.")
    return start_index

# Driver: reads SurveyGroupNode.csv in fixed-size chunks and writes one output
# CSV per chunk, so an interrupted run can be restarted without redoing
# chunks whose output file already exists.
if __name__ == "__main__":
    # Create session
    # (a single session is shared by every chunk processed below)
    session = create_robust_session()
    
    # Paths
    input_csv = '../data/data_extraction/SurveyGroupNode.csv'
    output_folder = '../data/data_extraction/GroupNodeChunks'
    error_log_csv = '../data/data_extraction/GroupNode_errors.csv'
    
    # Create the output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)
    
    try:
        # Process input data in chunks
        # chunk_size is also read as a global by get_resume_start_index
        chunk_size = 500
        chunk_index = 0
        
        for chunk in pd.read_csv(input_csv, chunksize=chunk_size):
            # Define chunk-specific output path
            output_chunk_csv = os.path.join(output_folder, f'GroupNode_chunk_{chunk_index:03d}.csv')
            
            # Skip processing if the chunk already exists
            # (existence of the file is the only completeness check made here)
            if os.path.exists(output_chunk_csv):
                print(f"Chunk {chunk_index} already processed. Skipping.")
                chunk_index += 1
                continue
        
            # Process the chunk
            process_group_htmls(chunk, output_chunk_csv, error_log_csv, session)
            # Printed whenever process_group_htmls returns, including when it
            # logged errors for some groups in the chunk.
            print(f"Chunk {chunk_index} processed successfully.")
            
            chunk_index += 1
    
    # NOTE(review): this handler wraps the whole loop, so an exception raised
    # while processing a chunk is also reported as an input-read error.
    except Exception as e:
        print(f"Error reading input file in chunks: {str(e)}")





Chunk 0 already processed. Skipping.


Processing Groups: 100%|███████████████████| 500/500 [00:54<00:00,  9.25items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_001.csv
Chunk 1 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [00:58<00:00,  8.60items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_002.csv
Chunk 2 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [00:57<00:00,  8.67items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_003.csv
Chunk 3 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [00:56<00:00,  8.79items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_004.csv
Chunk 4 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [00:56<00:00,  8.78items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_005.csv
Chunk 5 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [00:57<00:00,  8.68items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_006.csv
Chunk 6 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [00:57<00:00,  8.69items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_007.csv
Chunk 7 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [00:57<00:00,  8.73items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_008.csv
Chunk 8 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [00:58<00:00,  8.51items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_009.csv
Chunk 9 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [00:57<00:00,  8.73items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_010.csv
Chunk 10 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [00:59<00:00,  8.42items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_011.csv
Chunk 11 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:01<00:00,  8.07items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_012.csv
Chunk 12 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:11<00:00,  6.96items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_013.csv
Chunk 13 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:09<00:00,  7.23items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_014.csv
Chunk 14 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:08<00:00,  7.25items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_015.csv
Chunk 15 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:04<00:00,  7.79items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_016.csv
Chunk 16 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:03<00:00,  7.82items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_017.csv
Chunk 17 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:04<00:00,  7.80items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_018.csv
Chunk 18 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:06<00:00,  7.57items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_019.csv
Chunk 19 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:04<00:00,  7.76items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_020.csv
Chunk 20 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:04<00:00,  7.77items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_021.csv
Chunk 21 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:06<00:00,  7.51items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_022.csv
Chunk 22 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:06<00:00,  7.56items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_023.csv
Chunk 23 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:07<00:00,  7.41items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_024.csv
Chunk 24 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:02<00:00,  7.94items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_025.csv
Chunk 25 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:03<00:00,  7.86items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_026.csv
Chunk 26 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:02<00:00,  7.98items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_027.csv
Chunk 27 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:05<00:00,  7.64items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_028.csv
Chunk 28 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:04<00:00,  7.69items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_029.csv
Chunk 29 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:07<00:00,  7.39items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_030.csv
Chunk 30 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:07<00:00,  7.38items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_031.csv
Chunk 31 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:07<00:00,  7.37items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_032.csv
Chunk 32 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:05<00:00,  7.59items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_033.csv
Chunk 33 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:03<00:00,  7.91items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_034.csv
Chunk 34 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:03<00:00,  7.82items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_035.csv
Chunk 35 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:19<00:00,  6.25items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_036.csv
Chunk 36 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:21<00:00,  6.15items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_037.csv
Chunk 37 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:02<00:00,  7.94items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_038.csv
Chunk 38 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:05<00:00,  7.61items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_039.csv
Error log saved to ../data/data_extraction/GroupNode_errors.csv
Chunk 39 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:10<00:00,  7.08items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_040.csv
Error log saved to ../data/data_extraction/GroupNode_errors.csv
Chunk 40 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:08<00:00,  7.33items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_041.csv
Chunk 41 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:04<00:00,  7.69items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_042.csv
Chunk 42 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [00:55<00:00,  9.00items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_043.csv
Chunk 43 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [00:58<00:00,  8.58items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_044.csv
Chunk 44 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [00:59<00:00,  8.42items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_045.csv
Chunk 45 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:07<00:00,  7.41items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_046.csv
Error log saved to ../data/data_extraction/GroupNode_errors.csv
Chunk 46 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:09<00:00,  7.18items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_047.csv
Chunk 47 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:05<00:00,  7.69items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_048.csv
Chunk 48 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:16<00:00,  6.55items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_049.csv
Chunk 49 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:07<00:00,  7.38items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_050.csv
Chunk 50 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:04<00:00,  7.76items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_051.csv
Chunk 51 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:14<00:00,  6.68items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_052.csv
Chunk 52 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:05<00:00,  7.59items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_053.csv
Chunk 53 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:12<00:00,  6.91items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_054.csv
Error log saved to ../data/data_extraction/GroupNode_errors.csv
Chunk 54 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:06<00:00,  7.58items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_055.csv
Chunk 55 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:07<00:00,  7.42items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_056.csv
Chunk 56 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:19<00:00,  6.31items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_057.csv
Chunk 57 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:06<00:00,  7.47items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_058.csv
Chunk 58 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:12<00:00,  6.87items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_059.csv
Chunk 59 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:12<00:00,  6.85items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_060.csv
Error log saved to ../data/data_extraction/GroupNode_errors.csv
Chunk 60 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:01<00:00,  8.09items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_061.csv
Chunk 61 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:04<00:00,  7.79items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_062.csv
Chunk 62 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:04<00:00,  7.76items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_063.csv
Chunk 63 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:03<00:00,  7.90items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_064.csv
Chunk 64 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:04<00:00,  7.74items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_065.csv
Chunk 65 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:13<00:00,  6.83items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_066.csv
Chunk 66 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:02<00:00,  8.05items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_067.csv
Chunk 67 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:25<00:00,  5.85items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_068.csv
Chunk 68 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:20<00:00,  6.22items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_069.csv
Error log saved to ../data/data_extraction/GroupNode_errors.csv
Chunk 69 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:08<00:00,  7.25items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_070.csv
Chunk 70 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:22<00:00,  6.08items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_071.csv
Chunk 71 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:15<00:00,  6.64items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_072.csv
Chunk 72 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:09<00:00,  7.17items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_073.csv
Chunk 73 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:17<00:00,  6.49items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_074.csv
Chunk 74 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:02<00:00,  8.04items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_075.csv
Chunk 75 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:10<00:00,  7.04items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_076.csv
Chunk 76 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [00:58<00:00,  8.49items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_077.csv
Chunk 77 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:12<00:00,  6.94items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_078.csv
Chunk 78 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:28<00:00,  5.65items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_079.csv
Chunk 79 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:10<00:00,  7.09items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_080.csv
Chunk 80 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:08<00:00,  7.27items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_081.csv
Chunk 81 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:27<00:00,  5.72items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_082.csv
Chunk 82 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:11<00:00,  6.99items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_083.csv
Chunk 83 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:11<00:00,  6.96items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_084.csv
Chunk 84 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:18<00:00,  6.34items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_085.csv
Chunk 85 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:07<00:00,  7.45items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_086.csv
Chunk 86 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:02<00:00,  7.98items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_087.csv
Chunk 87 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:20<00:00,  6.18items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_088.csv
Chunk 88 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:09<00:00,  7.25items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_089.csv
Chunk 89 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:12<00:00,  6.92items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_090.csv
Chunk 90 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:25<00:00,  5.83items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_091.csv
Chunk 91 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:12<00:00,  6.86items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_092.csv
Chunk 92 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:19<00:00,  6.27items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_093.csv
Chunk 93 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:03<00:00,  7.81items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_094.csv
Chunk 94 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:08<00:00,  7.26items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_095.csv
Chunk 95 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:06<00:00,  7.51items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_096.csv
Chunk 96 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:23<00:00,  5.98items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_097.csv
Chunk 97 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:11<00:00,  7.01items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_098.csv
Chunk 98 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:26<00:00,  5.80items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_099.csv
Chunk 99 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:06<00:00,  7.50items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_100.csv
Chunk 100 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:11<00:00,  6.96items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_101.csv
Chunk 101 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:25<00:00,  5.86items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_102.csv
Chunk 102 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:31<00:00,  5.47items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_103.csv
Chunk 103 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:05<00:00,  7.62items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_104.csv
Chunk 104 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:22<00:00,  6.05items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_105.csv
Chunk 105 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:10<00:00,  7.10items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_106.csv
Chunk 106 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:21<00:00,  6.13items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_107.csv
Chunk 107 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:32<00:00,  5.42items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_108.csv
Chunk 108 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:20<00:00,  6.19items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_109.csv
Chunk 109 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:02<00:00,  7.99items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_110.csv
Chunk 110 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:00<00:00,  8.22items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_111.csv
Chunk 111 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [00:58<00:00,  8.51items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_112.csv
Error log saved to ../data/data_extraction/GroupNode_errors.csv
Chunk 112 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [00:59<00:00,  8.36items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_113.csv
Error log saved to ../data/data_extraction/GroupNode_errors.csv
Chunk 113 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:20<00:00,  6.24items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_114.csv
Chunk 114 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:12<00:00,  6.89items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_115.csv
Chunk 115 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:19<00:00,  6.25items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_116.csv
Chunk 116 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:13<00:00,  6.83items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_117.csv
Chunk 117 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:05<00:00,  7.61items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_118.csv
Chunk 118 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:13<00:00,  6.82items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_119.csv
Chunk 119 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:23<00:00,  6.02items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_120.csv
Chunk 120 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:07<00:00,  7.38items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_121.csv
Chunk 121 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:05<00:00,  7.60items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_122.csv
Chunk 122 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:19<00:00,  6.25items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_123.csv
Chunk 123 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:06<00:00,  7.53items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_124.csv
Chunk 124 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:17<00:00,  6.44items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_125.csv
Chunk 125 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:00<00:00,  8.20items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_126.csv
Chunk 126 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:06<00:00,  7.51items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_127.csv
Chunk 127 processed successfully.


Processing Groups: 100%|███████████████████| 500/500 [01:08<00:00,  7.29items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_128.csv
Chunk 128 processed successfully.


Processing Groups: 100%|███████████████████| 373/373 [01:00<00:00,  6.19items/s]


Saved chunk to ../data/data_extraction/GroupNodeChunks/GroupNode_chunk_129.csv
Chunk 129 processed successfully.


## 3.5.1.1 Process the Error Log -- Special Supplement: Workflow for Cyclic Processing of Errors

- Despite best efforts, not all files will process on the first pass, which necessitates iterating the above process.
- Save all iterations of the error file as backup/evidence (Cycle1..CycleN folders).
- Manually add the additional batches to the GroupNode folder, incrementing the chunk numbers as appropriate.
- The worst-case scenario is a manual download to avoid whatever is causing the issue.

  >NOTE: It took four cycles to get everything on the original run.


### **Step-by-Step Guide**

### **1. Initial Run:**
- Place your raw `GroupNode_errors.csv` file in the specified `error_file` path.
- Run the script:
  - The script will clean the error log and process the cleaned data in chunks.
  - Results are saved in the `Reprocessed_GroupNodeChunks` folder.
  - Errors from processing are logged into a new `Reprocessed_GroupNode_errors.csv`.

---

### **2. Subsequent Runs for Remaining Errors:**
- Use the latest `Reprocessed_GroupNode_errors.csv` as your new `error_file`.
- Repeat the script:
  - Clean this new error file.
  - Process the cleaned data.
  - Save processed chunks and log new errors.
- **Verify Results:**
  - Confirm that the number of rows in the error file decreases with each iteration.

---

### **3. Final Check:**
- Keep cycling the error file through the script until:
  - The `Reprocessed_GroupNode_errors.csv` file is either empty or contains only truly invalid records (e.g., broken links, malformed tables).
- The final consolidated data will reside in the `Reprocessed_GroupNodeChunks` folder.

---

## **Plan for Edge Case Handling**

### **Duplicate Records in Error File:**
- If the same records appear repeatedly in the error log despite being processed, investigate:
  - Table structure changes in those HTML files.
  - Potential issues with URL accessibility (e.g., timeouts, restrictions).

---

### **Stuck or Missing Rows:**
- Manually inspect `Reprocessed_GroupNode_errors.csv` for persistent errors.
- Validate the URLs or files manually to confirm if they are genuinely problematic.

---

## **Additional Tip for Manual Validations:**
To streamline manual inspections:
1. Use a browser or Postman to validate individual URLs directly from the error file.
2. Fix any recurring issues in the HTML parser (`parse_html_group_table`)


In [87]:
import os
import pandas as pd
import requests
from bs4 import BeautifulSoup
from progress_tracker import track_progress


def create_robust_session():
    """Return a ``requests.Session`` whose requests carry a browser-like User-Agent header."""
    http_session = requests.Session()
    http_session.headers['User-Agent'] = 'Mozilla/5.0'
    return http_session


def parse_html_group_table(html_content, survey_id, survey_group_id):
    """
    Parse HTML content containing a group table and extract relevant fields.

    The first ``<table>`` in the page is read. Its first ``<tr>`` supplies the
    column names; every later row that has ``<td>`` cells becomes one record.
    When a row contains an ``<a>`` element, the ``href`` of the last such
    anchor in that row is stored in a separate "Variable Link" column.

    Args:
        html_content: HTML text of the group page.
        survey_id: Value written to the "SurveyID" column of every row.
        survey_group_id: Value written to the "SurveyGroupID" column of every
            row; also used in diagnostic messages.

    Returns:
        pandas.DataFrame holding the table columns plus "SurveyID" and
        "SurveyGroupID" (and "Variable Link" when at least one row had a
        link). An empty DataFrame is returned when the page has no table or
        the header row has no cells.

    Raises:
        KeyError: If the parsed table has no "Group" or "Label" column.
        ValueError: Raised by pandas when a data row has more cells than the
            header row.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    table = soup.find('table')
    if not table:
        print(f"No table found for SurveyGroupID {survey_group_id}")
        return pd.DataFrame()

    # Extract headers
    header_row = table.find('tr')
    headers = [th.text.strip() for th in header_row.find_all(['th', 'td'])] if header_row else []
    if not headers:
        print(f"No headers found for SurveyGroupID {survey_group_id}")
        return pd.DataFrame()

    # Extract rows. Cell text and hyperlinks are collected separately so a
    # link can never be shifted into a regular column when a row is short.
    rows = []
    links = []
    for row in table.find_all('tr')[1:]:  # Skip header row
        cells = row.find_all('td')
        if not cells:
            continue

        row_data = [cell.text.strip() for cell in cells]
        # Pad rows that have fewer cells than the header so every record has
        # the same width; the padded cells become missing values.
        row_data.extend([None] * (len(headers) - len(row_data)))
        rows.append(row_data)

        variable_link = None
        for cell in cells:
            link = cell.find('a')
            if link:
                variable_link = link.get('href', '')  # last link in the row wins
        links.append(variable_link or None)  # an empty href counts as "no link"

    # Create DataFrame
    df = pd.DataFrame(rows, columns=headers)
    if any(link is not None for link in links):
        df["Variable Link"] = links
    df['SurveyID'] = survey_id
    df['SurveyGroupID'] = survey_group_id

    # Fail with a readable message when the table is not shaped as expected.
    missing_columns = [col for col in ('Group', 'Label') if col not in df.columns]
    if missing_columns:
        raise KeyError(
            f"Expected column(s) {missing_columns} not found for SurveyGroupID {survey_group_id}"
        )

    # Drop rows where both 'Group' and 'Label' are missing (padded rows)
    df = df[~(df['Group'].isna() & df['Label'].isna())]

    # Reorder columns as needed; columns not listed keep their relative order
    desired_column_order = [
        "Name", "Variable Link", "Label", "Concept", "Required", "Attributes",
        "Limit", "Predicate Type", "Group", "SurveyID", "SurveyGroupID"
    ]
    existing_columns = [col for col in desired_column_order if col in df.columns]
    remaining_columns = [col for col in df.columns if col not in existing_columns]
    df = df[existing_columns + remaining_columns]

    return df



def reprocess_cleaned_data(cleaned_file, output_folder, session, chunk_size=100, timeout=30):
    """
    Reprocess cleaned data from a consolidated file.

    The cleaned error log is read and walked in slices of ``chunk_size`` rows.
    For every row the page at ``GroupLink`` is downloaded and parsed with
    ``parse_html_group_table``. Per slice, the parsed tables are written to
    ``Reprocessed_GroupNode_chunk_NNN.csv`` and the rows that failed are
    written to ``Reprocessed_GroupNode_errors_chunk_NNN.csv``, both inside
    ``output_folder``.

    Args:
        cleaned_file: CSV with at least the columns "GroupLink",
            "SurveyGroupID" and "SurveyID".
        output_folder: Directory for the result and error files; created if
            it does not exist.
        session: Object exposing ``get(url, timeout=...)`` that returns a
            response with ``raise_for_status()`` and ``text``.
        chunk_size: Number of rows handled per output file.
        timeout: Seconds to wait for each HTTP request, so that one stalled
            connection cannot block the whole run.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    cleaned_df = pd.read_csv(cleaned_file)
    os.makedirs(output_folder, exist_ok=True)
    total_failed = 0

    for chunk_start in range(0, len(cleaned_df), chunk_size):
        chunk = cleaned_df.iloc[chunk_start:chunk_start + chunk_size]
        chunk_index = chunk_start // chunk_size

        output_chunk_csv = os.path.join(output_folder, f"Reprocessed_GroupNode_chunk_{chunk_index:03d}.csv")
        error_chunk_log = os.path.join(output_folder, f"Reprocessed_GroupNode_errors_chunk_{chunk_index:03d}.csv")

        chunk_group_data = []
        chunk_errors = []

        for _, row in track_progress(chunk.iterrows(), description=f"Processing Chunk {chunk_index}", total=len(chunk)):
            try:
                group_link = row['GroupLink']
                survey_group_id = row['SurveyGroupID']
                survey_id = row['SurveyID']

                response = session.get(group_link, timeout=timeout)
                response.raise_for_status()

                group_data = parse_html_group_table(response.text, survey_id, survey_group_id)
                if not group_data.empty:
                    chunk_group_data.append(group_data)
                else:
                    # Record why the row failed instead of carrying over a
                    # stale message from the previous cycle.
                    chunk_errors.append({**row.to_dict(), "Error": "No rows parsed from group table"})
            except Exception as e:
                chunk_errors.append({**row.to_dict(), "Error": str(e)})

        # Save chunk results
        if chunk_group_data:
            pd.concat(chunk_group_data, ignore_index=True).to_csv(output_chunk_csv, index=False)
            print(f"Chunk {chunk_index} processed successfully. Saved to {output_chunk_csv}")

        # Save errors for the chunk
        if chunk_errors:
            pd.DataFrame(chunk_errors).to_csv(error_chunk_log, index=False)
            print(f"Errors logged to {error_chunk_log}")
            total_failed += len(chunk_errors)

    if total_failed:
        print(f"{total_failed} row(s) could not be reprocessed; see the error files in {output_folder}")
    print("Reprocessing completed successfully!")


def clean_error_file(error_file, cleaned_output_file):
    """
    Load an error log, strip unusable rows, and write the remainder to disk.

    Rows are removed when every cell contains one of the header names (a
    header line repeated inside the data), when "GroupLink" or
    "SurveyGroupID" is missing, or when "GroupLink" does not start with
    "http".

    Returns the cleaned DataFrame. If anything goes wrong the problem is
    printed and an empty DataFrame is returned instead.
    """
    header_pattern = "SurveyGroupID|SurveyID|GroupLink|Error"

    def looks_like_header(record):
        # True only when every cell of the row contains a header name.
        return record.str.contains(header_pattern, na=False).all()

    try:
        raw = pd.read_csv(error_file)

        header_mask = raw.apply(looks_like_header, axis=1)
        without_headers = raw[~header_mask]
        with_keys = without_headers.dropna(subset=["GroupLink", "SurveyGroupID"])
        has_http_link = with_keys['GroupLink'].str.startswith("http", na=False)
        cleaned = with_keys[has_http_link]

        cleaned.to_csv(cleaned_output_file, index=False)
        print(f"Cleaned error log saved to {cleaned_output_file}")
        return cleaned
    except Exception as exc:
        print(f"Error cleaning file {error_file}: {exc}")
        return pd.DataFrame()


if __name__ == "__main__":
    # Define file paths
    error_file = "../data/data_extraction/GroupNode_errors.csv"
    cleaned_error_file = "../data/data_extraction/Cleaned_GroupNode_errors.csv"
    output_folder = "../data/data_extraction/Reprocessed_GroupNodeChunks"

    # Ensure output folder exists
    os.makedirs(output_folder, exist_ok=True)

    # Step 1: Clean the error file
    cleaned_df = clean_error_file(error_file, cleaned_error_file)
    if cleaned_df.empty:
        # Do not call exit() here: inside a notebook it shuts the kernel down
        # instead of merely ending this cell.
        print("No valid rows to process after cleaning. Exiting.")
    else:
        # Step 2: Create session
        session = create_robust_session()

        # Step 3: Reprocess cleaned data
        reprocess_cleaned_data(cleaned_error_file, output_folder, session, chunk_size=500)


Cleaned error log saved to ../data/data_extraction/Cleaned_GroupNode_errors.csv


Processing Chunk 0: 100%|██████████████████████| 1/1 [00:00<00:00,  3.40items/s]

Chunk 0 processed successfully. Saved to ../data/data_extraction/Reprocessed_GroupNodeChunks/Reprocessed_GroupNode_chunk_000.csv
Reprocessing completed successfully!





## 3.5.1.2 Supplementary Error Processing Log

- The URL processing for 'Variable Link' inadvertently stored the group URL instead of the variable URL.
- Rather than reprocess everything, and just in case it is an issue elsewhere, we fix the affected files as a batch.
- Something like this may be useful for further URL fixing later, and the code here can be reused for that.

In [96]:
import os
import pandas as pd

def fix_group_url(url, name):
    """
    Fixes a group URL to point to the correct variable URL.

    A URL that contains "/groups/" and ends with ".html" is rewritten to
    "<prefix>/variables/<name>.json", where <prefix> is everything before the
    first "/groups/". Any other value is returned unchanged, including
    non-string values such as the NaN pandas produces for an empty CSV cell.

    Args:
        url: The value of the "Variable Link" cell.
        name: Variable name used to build the JSON file name.

    Returns:
        The rewritten URL, or ``url`` itself when no rewrite applies.
    """
    # Missing links arrive as float NaN / None; `in` and `.endswith` would
    # raise on them, so pass them through untouched.
    if not isinstance(url, str):
        return url
    if "/groups/" in url and url.endswith(".html"):
        base_url = url.split('/groups/')[0]  # Get everything before '/groups/'
        return f"{base_url}/variables/{name}.json"
    return url

def batch_url_transformation(input_files, output_folder):
    """
    Processes the Variable Link column in the given batch of files,
    applying the fix_group_url logic and saving the result.

    Each input CSV must contain the columns "Variable Link" and "Name". The
    corrected file is written to ``output_folder`` under the same base name
    as the input; the input file itself is not modified unless
    ``output_folder`` is the folder it already lives in.

    Args:
        input_files: Iterable of CSV file paths to process.
        output_folder: Directory for the corrected files; created if needed.
    """
    # Ensure the output folder exists
    os.makedirs(output_folder, exist_ok=True)

    for chunk_file in input_files:
        # Load the chunk file
        df = pd.read_csv(chunk_file)

        # Transform and replace the Variable Link column. Pairing the two
        # columns directly avoids building a Series per row and works
        # unchanged when the file has no data rows.
        df['Variable Link'] = [
            fix_group_url(url, name)
            for url, name in zip(df['Variable Link'], df['Name'])
        ]

        # Prepare output file path
        output_file = os.path.join(output_folder, os.path.basename(chunk_file))

        # Save the updated DataFrame
        df.to_csv(output_file, index=False)
        print(f"Processed file saved to {output_file}")

# Chunk files 130-133 need their 'Variable Link' column corrected
input_files = [
    f"../data/data_extraction/GroupNodeChunks/GroupNode_chunk_{chunk_id}.csv"
    for chunk_id in range(130, 134)
]
output_folder = "../data/data_extraction/Group_url_fix/"

# Write the corrected copies into the output folder
batch_url_transformation(input_files=input_files, output_folder=output_folder)


Processed file saved to ../data/data_extraction/Group_url_fix/GroupNode_chunk_130.csv
Processed file saved to ../data/data_extraction/Group_url_fix/GroupNode_chunk_131.csv
Processed file saved to ../data/data_extraction/Group_url_fix/GroupNode_chunk_132.csv
Processed file saved to ../data/data_extraction/Group_url_fix/GroupNode_chunk_133.csv


## 3.5.2 Processes each Variable JSON file in the GroupNode Table
- Reminder: each variable JSON file contains `attributes`, which define directional relationships with other variables.
- **Goal**:get the attributes and add the relationship as extra columns

**NOTE** Single-file processing was too time-consuming. A Python script was created to launch 5 concurrent sessions and extract all the data. This process, errors notwithstanding, took 3.25 days. More detail is in Section 3.5.2.2 below.

## Documentation: Processing Census GroupNode Chunks

## Overview
This process handles the sequential processing of Census GroupNode chunks stored in CSV format. The data includes variable links pointing to JSON files containing additional metadata. The goal is to enhance the dataset by:
1. Constructing full URLs for the JSON files.
2. Fixing any inconsistencies in file extensions.
3. Extracting metadata from the JSON files and appending it to the dataset.

---

## Step-by-Step Logic

### 1. Read Data Sequentially
- Load each chunk file from the `GroupNodeChunks` folder in sequence.
- Use a loop to iterate through all the files stored in the folder.

### 2. Construct Full URLs
- Census provides variable links in a relative path format (e.g., `/data/2000/dec/aian/variables/HCT110005.json`).
- Prepend the base URL `https://api.census.gov` to each relative path in the `Variable Link` column.
- Replace `.html` extensions with `.json` if any URLs are incorrectly formatted.

### 3. Add New Columns
- Add the following columns to the dataset with default values:
  - **`Has Attribute`**: Default to `'no attribute'`. Updated to `'has attribute'` if the JSON file contains the `attributes` field.
  - **`Attribute of`**: Default to `'NA'`. Updated to the value of the `attribute of` field if present in the JSON file.
  - **`Attribute Type`**: Default to `'NA'`. Updated to the value of the `attribute type` field if present in the JSON file.

### 4. Process Each Row
- For each row in the dataset:
  1. Fetch the JSON file from the `Variable Link` URL using an HTTP GET request.
  2. Parse the JSON response and extract relevant fields:
     - **`attributes`**: If present, update the `Has Attribute` column to `'has attribute'` and populate the `attributes` column with the value.
     - **`attribute of`**: If present, populate the `Attribute of` column.
     - **`attribute type`**: If present, populate the `Attribute Type` column.
  3. Handle any errors gracefully and log them for debugging.

### 5. Save the Processed File
- Save the updated dataset to a new CSV file in the `Processed_GroupNodeChunks` folder.

---

## Example JSON File
```json
{
  "name": "DP02_0001PM",
  "label": "Percent!!Margin of Error!!HOUSEHOLDS BY TYPE!!Total households",
  "concept": "Selected Social Characteristics in the United States: 2008",
  "predicateType": "int",
  "group": "DP02",
  "limit": 0,
  "attributes": "DP02_0001PMA",
  "attribute of": "DP02_0001PE",
  "attribute type": "Margin of Error"
}
```



## 3.5.2.1 Single Chunk Example for Demonstrating Functionality 

```
import os
import pandas as pd
import json
import requests
from progress_tracker import track_progress

# Create robust session
def create_robust_session():
    """Return a ``requests.Session`` whose requests carry a browser-like User-Agent header."""
    http_session = requests.Session()
    http_session.headers['User-Agent'] = 'Mozilla/5.0'
    return http_session

# Process one chunk file
def process_groupnode_chunk(chunk_file, output_folder, timeout=30):
    """
    Enrich one GroupNode chunk with attribute data from each variable's JSON file.

    The chunk CSV is loaded, "Name" is renamed to "Variable Name", and every
    "Variable Link" is turned into a full ``.json`` URL. For each row the JSON
    is downloaded and its "attributes", "attribute of" and "attribute type"
    values (or 'na' when absent) are stored in the "Attributes",
    "Attribute Of" and "Attribute Type" columns.

    Output, both inside ``output_folder``:
        Processed_<chunk name>.csv  - the enriched table
        Errors_<chunk name>.csv     - rows whose request or parsing failed
                                      (only written when there are any)

    Args:
        chunk_file: Path of the chunk CSV to process.
        output_folder: Directory for the output files; created if needed.
        timeout: Seconds to wait for each HTTP request.

    Raises:
        KeyError: If the chunk lacks one of the columns listed in
            ``column_order`` (or "Attributes"/"Variable Link").
    """
    base_url = "https://api.census.gov"
    session = create_robust_session()

    # Load the chunk
    df = pd.read_csv(chunk_file)

    # Rename 'Name' column to 'Variable Name'
    df = df.rename(columns={'Name': 'Variable Name'})

    # Ensure the output folder exists
    os.makedirs(output_folder, exist_ok=True)

    # Prepare paths for output
    chunk_name = os.path.splitext(os.path.basename(chunk_file))[0]
    output_file = os.path.join(output_folder, f"Processed_{chunk_name}.csv")
    error_log_file = os.path.join(output_folder, f"Errors_{chunk_name}.csv")

    # Ensure the full URL and clean extensions. Non-string cells (missing
    # links) are left as they are instead of raising on `.startswith`.
    df['Variable Link'] = df['Variable Link'].apply(
        lambda x: f"{base_url}{x}" if isinstance(x, str) and x.startswith('/data') else x
    )
    # regex=False: '.html' must be matched literally; as a regex the dot
    # would match any character.
    df['Variable Link'] = df['Variable Link'].str.replace('.html', '.json', regex=False)

    # Ensure existing columns have the correct dtype
    df['Attributes'] = df['Attributes'].astype('object')
    if 'Attribute Of' not in df.columns:
        df['Attribute Of'] = pd.Series(dtype='object')
    if 'Attribute Type' not in df.columns:
        df['Attribute Type'] = pd.Series(dtype='object')

    errors = []  # To store error information

    # Process each variable
    for i, row in track_progress(df.iterrows(), description=f"Processing {chunk_name}", total=len(df)):
        variable_url = row['Variable Link']

        try:
            # Request the JSON
            response = session.get(variable_url, timeout=timeout)
            response.raise_for_status()
            variable_data = response.json()

            # Update columns based on JSON content
            df.at[i, 'Attributes'] = variable_data.get('attributes', 'na')
            df.at[i, 'Attribute Of'] = variable_data.get('attribute of', 'na')
            df.at[i, 'Attribute Type'] = variable_data.get('attribute type', 'na')
        except Exception as e:
            # Capture errors in the log
            errors.append({
                "Index": i,
                "Variable Name": row.get('Variable Name', 'na'),
                "Group": row.get('Group', 'na'),
                "Variable Link": variable_url,
                "Error": str(e)
            })

    # Reorder columns
    column_order = [
        "SurveyID", "SurveyGroupID", "Group", "Variable Name", "Variable Link",
        "Label", "Concept", "Required", "Attributes", "Attribute Of",
        "Attribute Type", "Limit", "Predicate Type"
    ]
    df = df[column_order]

    # Save the processed DataFrame
    df.to_csv(output_file, index=False)
    print(f"Processed chunk saved to {output_file}")

    # Save the error log if there are any errors
    if errors:
        error_df = pd.DataFrame(errors)
        error_df.to_csv(error_log_file, index=False)
        print(f"Error log saved to {error_log_file}")

# Batch processing
def process_all_chunks(input_folder, output_folder, start_chunk=0):
    """
    Run process_groupnode_chunk on every chunk file at or after ``start_chunk``.

    Files in ``input_folder`` named "GroupNode_chunk_<number>.csv" are
    ordered by their chunk number and processed one after another; files
    whose number is below ``start_chunk`` are skipped.
    """
    def chunk_number(file_name):
        # "GroupNode_chunk_012.csv" -> 12
        return int(file_name.split("_")[-1].split(".")[0])

    candidates = [
        name for name in os.listdir(input_folder)
        if name.startswith("GroupNode_chunk_") and name.endswith(".csv")
    ]
    # Ordering by (number, name) equals an alphabetical sort followed by a
    # stable sort on the chunk number.
    candidates.sort(key=lambda name: (chunk_number(name), name))

    files_to_process = []
    for name in candidates:
        if chunk_number(name) >= start_chunk:
            files_to_process.append(os.path.join(input_folder, name))

    print(f"Found {len(files_to_process)} files to process starting from chunk {start_chunk}.")

    # Process each file
    for chunk_file in files_to_process:
        process_groupnode_chunk(chunk_file, output_folder)

# Example run: process every chunk from `start_chunk` onwards
input_folder = "../data/data_extraction/GroupNodeChunks"
output_folder = "../data/data_extraction/ValidationSet-GroupNodesWithVariables"
start_chunk = 133  # Change this to the chunk number to start from (e.g., 132)

process_all_chunks(
    input_folder=input_folder,
    output_folder=output_folder,
    start_chunk=start_chunk,
)
```


# We used the multi-core processing rather than single file ...
## Discuss file in /src to explain how to use
## Additional processing for error file logs still needed. 

## Error File Processing Script for Errors in Multi-Core Processing Process

This script processes an error file for a specific chunk of the main table, fetches missing data using the `Variable Link`, and updates the main table with corrected values.

## Key Features
- **Dynamic Chunk Selection**: Specify the chunk number (`chunk_number`) to process the corresponding error file and main table.
- **Data Fetching and Updates**: Retrieves JSON data from the `Variable Link` in the error file and updates the `Attributes`, `Attribute Of`, and `Attribute Type` columns in the main table.
- **Error Logging**: Logs all unresolved errors into a new error file named `<error_file>--RemainingToFix.csv` for retrying later.
- **Output Files**:
  - The updated main table is saved as `<chunk_file>-FIXED.csv`.
  - Any unresolved errors are saved for future processing.

This ensures incremental error correction while maintaining logs for unresolved issues.

**NOTE** This script was run one error file at a time, all error files and logs are saved to another directory for historical purposes.


In [None]:
import pandas as pd
import requests
from progress_tracker import track_progress
from web_data_extraction import create_robust_session
import logging
import os

# Configure logging
# force=True replaces any handlers installed by earlier cells; without it
# basicConfig does nothing once the root logger is already configured, and
# the errors below would not reach error_processing.log.
logging.basicConfig(
    filename="error_processing.log",
    level=logging.ERROR,
    format="%(asctime)s - %(levelname)s - %(message)s",
    force=True,
)

# Specify the chunk number to process
chunk_number = "065"

# Seconds to wait for each HTTP request so a stalled connection cannot
# block the run indefinitely
request_timeout = 30

# File paths
main_table_path = f"../data/data_extraction/GroupNodesWithVariables/Processed_GroupNode_chunk_{chunk_number}.csv"
error_file_path = f"../data/data_extraction/GroupNodesWithVariables/Errors_GroupNode_chunk_{chunk_number}.csv"
updated_main_table_path = f"../data/data_extraction/GroupNodesWithVariables/Processed_GroupNode_chunk_{chunk_number}-FIXED.csv"
remaining_errors_path = f"../data/data_extraction/GroupNodesWithVariables/Errors_GroupNode_chunk_{chunk_number}--RemainingToFix.csv"

# Load the tables
main_df = pd.read_csv(main_table_path)
errors_df = pd.read_csv(error_file_path)

# The three attribute columns receive strings below; make sure they can hold
# them even if read_csv inferred a numeric dtype (e.g. an all-empty column).
attribute_columns = ['Attributes', 'Attribute Of', 'Attribute Type']
for column in attribute_columns:
    if column in main_df.columns:
        main_df[column] = main_df[column].astype('object')

# Create robust session
session = create_robust_session()

# Track remaining errors
remaining_errors = []

# Process the error file
for i, row in track_progress(errors_df.iterrows(), description=f"Processing Errors (Chunk {chunk_number})", total=len(errors_df)):
    variable_link = row['Variable Link']
    variable_name = row['Variable Name']
    index = row['Index']  # row label of this variable in the main table

    try:
        # Fetch JSON data from the Variable Link
        response = session.get(variable_link, timeout=request_timeout)
        response.raise_for_status()
        variable_data = response.json()

        # Extract the updated data
        updated_data = {
            "Attributes": variable_data.get('attributes', 'na'),
            "Attribute Of": variable_data.get('attribute of', 'na'),
            "Attribute Type": variable_data.get('attribute type', 'na')
        }

        # Retrieve the corresponding row from the main table
        if index in main_df.index:
            main_table_link = main_df.at[index, 'Variable Link']
            # Only update when the link matches, so a stale or mismatched
            # error file cannot overwrite the wrong row.
            if variable_link == main_table_link:
                # Update the main table with new data
                for column in attribute_columns:
                    main_df.at[index, column] = updated_data[column]
            else:
                raise ValueError(f"Variable Link mismatch at index {index}.")
        else:
            raise KeyError(f"Index {index} not found in main table.")

    except Exception as e:
        # Capture and log the error
        logging.error(f"Error processing Variable Link {variable_link} (Index {index}): {e}")
        remaining_errors.append({
            "Index": index,
            "Variable Name": variable_name,
            "Variable Link": variable_link,
            "Error": str(e)
        })

# Save the updated main table
main_df.to_csv(updated_main_table_path, index=False)
print(f"Updated main table saved to {updated_main_table_path}")

# Save remaining errors
if remaining_errors:
    remaining_errors_df = pd.DataFrame(remaining_errors)
    remaining_errors_df.to_csv(remaining_errors_path, index=False)
    print(f"Remaining errors saved to {remaining_errors_path}")
else:
    print("No remaining errors!")


Processing Errors (Chunk 065):  45%|▍| 20357/45214 [1:45:07<2:01:12,  3.42items/

In [195]:
# Preview the first rows of errors_df to check its columns and error messages.
errors_df.head()


Unnamed: 0,Chunk File,Index,Variable Name,Variable Link,Error
0,../data/data_extraction/GroupNodeChunks/GroupNode_chunk_064.csv,0,B17015_001E,https://api.census.gov/data/2013/acs/acs1/variables/B17015_001E.json,"HTTPSConnectionPool(host='api.census.gov', port=443): Max retries exceeded with url: /data/2013/acs/acs1/variables/B17015_001E.json (Caused by NameResolutionError(""<urllib3.connection.HTTPSConnection object at 0x15f8a8890>: Failed to resolve 'api.census.gov' ([Errno 8] nodename nor servname provided, or not known)""))"
1,../data/data_extraction/GroupNodeChunks/GroupNode_chunk_064.csv,1,B17015_001EA,https://api.census.gov/data/2013/acs/acs1/variables/B17015_001EA.json,"HTTPSConnectionPool(host='api.census.gov', port=443): Max retries exceeded with url: /data/2013/acs/acs1/variables/B17015_001EA.json (Caused by NameResolutionError(""<urllib3.connection.HTTPSConnection object at 0x15f8a93d0>: Failed to resolve 'api.census.gov' ([Errno 8] nodename nor servname provided, or not known)""))"
2,../data/data_extraction/GroupNodeChunks/GroupNode_chunk_064.csv,2,B17015_001M,https://api.census.gov/data/2013/acs/acs1/variables/B17015_001M.json,"HTTPSConnectionPool(host='api.census.gov', port=443): Max retries exceeded with url: /data/2013/acs/acs1/variables/B17015_001M.json (Caused by NameResolutionError(""<urllib3.connection.HTTPSConnection object at 0x15f8aa110>: Failed to resolve 'api.census.gov' ([Errno 8] nodename nor servname provided, or not known)""))"
3,../data/data_extraction/GroupNodeChunks/GroupNode_chunk_064.csv,3,B17015_001MA,https://api.census.gov/data/2013/acs/acs1/variables/B17015_001MA.json,"HTTPSConnectionPool(host='api.census.gov', port=443): Max retries exceeded with url: /data/2013/acs/acs1/variables/B17015_001MA.json (Caused by NameResolutionError(""<urllib3.connection.HTTPSConnection object at 0x15f8aae50>: Failed to resolve 'api.census.gov' ([Errno 8] nodename nor servname provided, or not known)""))"
4,../data/data_extraction/GroupNodeChunks/GroupNode_chunk_064.csv,4,B17015_002E,https://api.census.gov/data/2013/acs/acs1/variables/B17015_002E.json,"HTTPSConnectionPool(host='api.census.gov', port=443): Max retries exceeded with url: /data/2013/acs/acs1/variables/B17015_002E.json (Caused by NameResolutionError(""<urllib3.connection.HTTPSConnection object at 0x15f8abb90>: Failed to resolve 'api.census.gov' ([Errno 8] nodename nor servname provided, or not known)""))"


## Summary Statistics of all Survey-Group Variables
- This function counts the total number of variables across all processed chunk files in a specified directory.
- It uses a progress tracker to display real-time progress while iterating through the files.
- At the end, it outputs the total number of variables and the number of chunk files processed.


In [182]:
from progress_tracker import track_progress
import os
import pandas as pd

# Where the processed chunk files live and how they are named
chunk_directory = "../data/data_extraction/GroupNodesWithVariables"
chunk_file_prefix = "Processed_GroupNode_chunk_"
chunk_file_suffix = ".csv"

# Keep only the directory entries that match the chunk naming pattern
chunk_files = list(filter(
    lambda f: f.startswith(chunk_file_prefix) and f.endswith(chunk_file_suffix),
    os.listdir(chunk_directory)
))

# Running totals; chunk_count stays 0 when no file matches
total_variables = 0
chunk_count = 0

# Walk the chunk files under a progress tracker, numbering them from 1
for chunk_count, file_name in enumerate(
    track_progress(chunk_files, description="Counting Variables"), start=1
):
    file_path = os.path.join(chunk_directory, file_name)

    # len() of the parsed frame is the number of data rows (header excluded)
    chunk_df = pd.read_csv(file_path, low_memory=False)
    row_count = len(chunk_df)
    total_variables += row_count

# Report the grand total together with the number of files read
print(f"Total variables across {chunk_count} chunk files: {total_variables}")


Counting Variables: 100%|██████████████████| 134/134 [00:50<00:00,  2.65items/s]

Total variables across 134 chunk files: 7696043





## 3.6 Build Table: SurveyVariablesNoGroupNode 
- The variables link has a table with variables in two categories: those with a GroupID and those whose group is N/A
- In 3.5 above we built the extensive tables of grouped variables tied to each SurveyID
- We must now gather all data on the variables linked to each SurveyID whose Group column in the variables table is N/A

**Note on Knowledge Graph Construction**: Surveys will have a group node and a no-group node. Then all variables will be the union of the two groups. We could treat 'no group' as a special kind of group. Haven't decided yet. 

In [139]:
import os
import pandas as pd
import requests
from bs4 import BeautifulSoup
from progress_tracker import track_progress
import logging

# Configure logging for failures:
# records at ERROR level and above go to processing_failures.log (path is
# relative to the current working directory), one timestamped line per record.
# NOTE(review): logging.basicConfig is documented to do nothing when the root
# logger already has handlers, so if logging was configured earlier in this
# kernel these settings may not take effect — confirm.
logging.basicConfig(
    filename="processing_failures.log",
    level=logging.ERROR,
    format="%(asctime)s - %(levelname)s - %(message)s"
)

def process_html_to_extract_rows(row):
    """
    Download a survey's variable-list page and return its ungrouped variables.

    Args:
        row: Mapping-like object (anything with ``.get``) providing the
            "Variable List" URL and the "SurveyID" of one survey.

    Returns:
        list[dict]: One dict per 8-cell table row whose last cell (Group) is
        "N/A". An empty list is returned when the URL is not a string ending
        in ".html", when the response status is not 200, or when any
        exception occurs; each of those cases is logged as an error.
    """
    survey_id = row.get("SurveyID")
    variable_list_url = row.get("Variable List")

    # Guard: only string URLs that point at an .html page are fetched
    url_is_usable = isinstance(variable_list_url, str) and variable_list_url.endswith(".html")
    if not url_is_usable:
        logging.error(f"Invalid or missing URL for SurveyID {survey_id}, URL: {variable_list_url}")
        return []

    try:
        response = requests.get(variable_list_url, timeout=10)

        # Guard: anything other than HTTP 200 is treated as a failed fetch
        if response.status_code != 200:
            logging.error(f"Failed to fetch URL {variable_list_url} for SurveyID {survey_id}, Status: {response.status_code}")
            return []

        soup = BeautifulSoup(response.content, 'html.parser')
        extracted_rows = []

        for tr in soup.find_all('tr'):
            cells = [td.get_text(strip=True) for td in tr.find_all('td')]

            # Rows without exactly 8 data cells (e.g. header rows) are ignored
            if len(cells) != 8:
                continue

            (name, label, concept, required,
             attributes, limit, predicate_type, group) = cells

            # Only variables that belong to no group are collected here
            if group != "N/A":
                continue

            extracted_rows.append({
                "SurveyID": survey_id,
                "Variable Name": name,
                "Label": label,
                "Concept": concept,
                "Required": required,
                "Attributes": attributes or "N/A",  # blank attributes become "N/A"
                "Limit": limit,
                "Predicate Type": predicate_type,
                "Group": group
            })
        return extracted_rows
    except Exception as e:
        logging.error(f"Error processing URL {variable_list_url} for SurveyID {survey_id}: {e}")
        return []

def process_survey_variables(df):
    """
    Collect the ungrouped (Group = N/A) variables for every survey in ``df``.

    Args:
        df (pd.DataFrame): One row per survey; each row is handed to
            ``process_html_to_extract_rows``.

    Returns:
        list[dict]: All extracted variable rows, concatenated in survey order.
    """
    survey_rows = track_progress(
        df.iterrows(), description="Processing Variable List", total=len(df)
    )

    no_group_nodes = []
    for _, survey_row in survey_rows:
        # Each survey contributes zero or more variable dicts
        no_group_nodes += process_html_to_extract_rows(survey_row)

    return no_group_nodes

# Main Script: read the survey table and scrape every survey's variable list
survey_df = pd.read_csv("../data/data_extraction/SurveyNode-ex-srt-geo-grp.csv")
no_group_nodes = process_survey_variables(survey_df)

# Persist the SurveyVariablesNoGroupNode table only when something was extracted
if not no_group_nodes:
    print("No rows were processed successfully.")
else:
    no_group_df = pd.DataFrame(no_group_nodes)
    no_group_df.to_csv("../data/data_extraction/SurveyVariablesNoGroupNode.csv", index=False)
    print(f"Processed {len(no_group_nodes)} rows successfully!")

print("Survey Variable processing completed successfully!")


Processing Variable List: 100%|██████████| 1648/1648 [20:04<00:00,  1.37items/s]


Processed 539733 rows successfully!
Survey Variable processing completed successfully!


# Other file summary stats stuff

In [189]:
# Count variables in a file (aka number of data rows)
import pandas as pd

def count_variables_in_files(file_paths):
    """
    Counts the number of data rows (header excluded) for each CSV file in the list.

    Args:
        file_paths (list): List of file paths to process.

    Returns:
        dict: A dictionary with file paths as keys. The value is the row count
        (int) when the file was read, or the string "Error: <message>" when
        reading it raised an exception, so one bad path does not abort the
        remaining files.
    """
    counts = {}
    for file_path in file_paths:
        try:
            # low_memory=False parses each column in a single pass, which avoids
            # the mixed-type DtypeWarning on the large files and matches how the
            # chunk files are read elsewhere in this notebook.
            df = pd.read_csv(file_path, low_memory=False)
            counts[file_path] = len(df)
        except Exception as e:
            counts[file_path] = f"Error: {e}"
    return counts

# Node tables whose row counts we want to report
file_paths = [
    "../data/data_extraction/SurveyNode.csv",
    "../data/data_extraction/SurveyGroupNode.csv",
    "../data/data_extraction/SurveyVariablesNoGroupNode.csv"
]

variable_counts = count_variables_in_files(file_paths)

# Print one line per file, in the order the counts were recorded
for file in variable_counts:
    count = variable_counts[file]
    print(f"{file}: {count} variables")



../data/data_extraction/SurveyNode.csv: 1648 variables
../data/data_extraction/SurveyGroupNode.csv: 64873 variables
../data/data_extraction/SurveyVariablesNoGroupNode.csv: 539733 variables


  df = pd.read_csv(file_path)


# 3. Clean the Data

## **Data Structuring and Relationship Design**

### **Overview**
The goal of structuring the data is to create a hierarchical and navigable knowledge graph that balances usability and precision. This involves grouping repeated surveys into logical parent-child relationships while maintaining links to their associated variables and years.

### **Key Design Decisions**
1. **Parent-Child Relationships for Datasets:**
   - **Why:** Many surveys are conducted multiple times per year, with variations in their metadata. A parent-child hierarchy simplifies navigation for users querying high-level information while retaining granularity for specific queries.
   - **How:** We use the `parent_dataset` field to represent the high-level grouping (e.g., `cps` for the Current Population Survey) and `dataset_name` for specific instances (e.g., `cps/basic/jan` for the January survey).

2. **Linking Variables to Child Datasets:**
   - **Why:** Each dataset instance includes specific variables. Establishing this connection allows users to query datasets for their variables or find which datasets a variable belongs to.
   - **How:** We create `Variable` nodes linked to `ChildDataset` nodes via an `INCLUDES` relationship.

3. **Year-Based Relationships:**
   - **Why:** Many datasets are time-specific. Linking datasets to their respective years ensures queries can filter datasets by year and handle temporal questions like, "What data is available for 1986?"
   - **How:** We create `Year` nodes and connect them to `ChildDataset` nodes via a `BELONGS_TO_YEAR` relationship.

### **Graph Schema**
Here is the schema we use to represent the relationships:
- **ParentDataset**: Represents high-level groupings of surveys (e.g., `cps`, `cbp`).
  - **Relationships:**
    - `PARENT_OF` → `ChildDataset`
- **ChildDataset**: Represents individual survey instances (e.g., `cps/basic/jan`).
  - **Relationships:**
    - `INCLUDES` → `Variable`
    - `BELONGS_TO_YEAR` → `Year`
- **Variable**: Represents specific data variables (e.g., `employment_status`).
- **Year**: Represents the temporal context for datasets (e.g., `1986`).

### **Why This Structure?**
This design ensures:
- **Scalability**: Easily add new datasets, variables, and years.
- **Usability**: Queries can target high-level overviews or specific details.
- **Flexibility**: Supports both general and granular user queries.


In [78]:
import pandas as pd

def restructure_for_graph(datasets_metadata, variables_metadata):
    """
    Restructure datasets and variables metadata for hierarchical knowledge graph.

    Note: this function adds columns to its inputs in place
    (``parent_survey`` and ``month`` on ``datasets_metadata``,
    ``parent_survey`` on ``variables_metadata``) and prints the size of every
    table it builds.

    Args:
        datasets_metadata (pd.DataFrame): Metadata for datasets. Must contain
            the columns ``dataset_name``, ``year``, ``title`` and ``description``.
        variables_metadata (pd.DataFrame): Metadata for variables. Must contain
            the columns ``dataset_name``, ``year``, ``variable_name``,
            ``label`` and ``concept``.
    Returns:
        dict: Nodes and relationships for building the graph, with keys
        ``survey_nodes``, ``dataset_nodes``, ``variable_nodes`` and
        ``relationships`` (itself a dict with ``survey_to_dataset`` and
        ``dataset_to_variable``).
    """
    # Step 1: Extract unique surveys (the first path segment of dataset_name)
    parent_survey = datasets_metadata['dataset_name'].str.split('/').str[0]
    survey_nodes = pd.DataFrame({'survey': parent_survey.unique()})
    print(f"Survey Nodes: {survey_nodes.shape[0]}")

    # Step 2: Create dataset nodes
    datasets_metadata['parent_survey'] = parent_survey
    datasets_metadata['month'] = datasets_metadata['title'].str.extract(r'(\bJan|\bFeb|\bMar|\bApr|\bMay|\bJun|\bJul|\bAug|\bSep|\bOct|\bNov|\bDec)', expand=False)

    dataset_nodes = datasets_metadata[['parent_survey', 'year', 'month', 'title', 'description']].drop_duplicates()

    # Titles without a month abbreviation leave `month` as NaN. NaN is truthy,
    # so `month or 'Annual'` never fell back and produced IDs ending in "_nan";
    # fillna applies the intended 'Annual' label. Building the ID column-wise
    # also works when dataset_nodes is empty.
    month_label = dataset_nodes['month'].fillna('Annual').astype(str)
    dataset_nodes['dataset_id'] = (
        dataset_nodes['parent_survey'].astype(str)
        + '_' + dataset_nodes['year'].astype(str)
        + '_' + month_label
    )
    print(f"Dataset Nodes: {dataset_nodes.shape[0]}")

    # Step 3: Create variable nodes
    variables_metadata['parent_survey'] = variables_metadata['dataset_name'].str.split('/').str[0]
    variable_nodes = variables_metadata[['parent_survey', 'dataset_name', 'year', 'variable_name', 'label', 'concept']].drop_duplicates()
    print(f"Variable Nodes: {variable_nodes.shape[0]}")

    # Step 4: Create relationships
    # NOTE(review): dataset_to_variable is keyed by `dataset_name`, whereas
    # dataset nodes are identified by `dataset_id`; confirm that whatever
    # consumes these tables joins them on matching values.
    survey_to_dataset = dataset_nodes[['parent_survey', 'dataset_id']]
    dataset_to_variable = variable_nodes[['dataset_name', 'variable_name']]
    print(f"Survey-to-Dataset Relationships: {survey_to_dataset.shape[0]}")
    print(f"Dataset-to-Variable Relationships: {dataset_to_variable.shape[0]}")

    return {
        'survey_nodes': survey_nodes,
        'dataset_nodes': dataset_nodes,
        'variable_nodes': variable_nodes,
        'relationships': {
            'survey_to_dataset': survey_to_dataset,
            'dataset_to_variable': dataset_to_variable
        }
    }

# Reload the raw metadata tables from disk
print("Reloading datasets_metadata and variables_metadata...")
datasets_metadata = pd.read_csv('./data/census_datasets_metadata.csv')
variables_metadata = pd.read_csv('./data/combined_variables_metadata.csv')
print("Metadata loaded.")

# Build the node and relationship tables
graph_data = restructure_for_graph(datasets_metadata, variables_metadata)

# Unpack nodes and relationships into module-level names
survey_nodes = graph_data['survey_nodes']
dataset_nodes = graph_data['dataset_nodes']
variable_nodes = graph_data['variable_nodes']
relationships = graph_data['relationships']
survey_to_dataset_relationships = relationships['survey_to_dataset']
dataset_to_variable_relationships = relationships['dataset_to_variable']

# Save every table to a debug CSV (optional), then list what was written
debug_exports = [
    (survey_nodes, './data/survey_nodes_debug.csv'),
    (dataset_nodes, './data/dataset_nodes_debug.csv'),
    (variable_nodes, './data/variable_nodes_debug.csv'),
    (survey_to_dataset_relationships, './data/survey_to_dataset_relationships_debug.csv'),
    (dataset_to_variable_relationships, './data/dataset_to_variable_relationships_debug.csv'),
]
for frame, csv_path in debug_exports:
    frame.to_csv(csv_path, index=False)

print("Debug files saved:")
for _, csv_path in debug_exports:
    # Show only the file name, not the directory part
    print("- " + csv_path.rsplit('/', 1)[-1])


Reloading datasets_metadata and variables_metadata...


  variables_metadata = pd.read_csv('./data/combined_variables_metadata.csv')


Metadata loaded.
Survey Nodes: 298
Dataset Nodes: 3954
Variable Nodes: 2693973
Survey-to-Dataset Relationships: 3954
Dataset-to-Variable Relationships: 2693973
Debug files saved:
- survey_nodes_debug.csv
- dataset_nodes_debug.csv
- variable_nodes_debug.csv
- survey_to_dataset_relationships_debug.csv
- dataset_to_variable_relationships_debug.csv


In [80]:
# First path segment of every dataset name, computed once and reused below
top_level_names = datasets_metadata['dataset_name'].str.split('/').str[0]

# Unique top-level survey names, one per line
unique_surveys = top_level_names.unique()
print("Unique Surveys:")
for survey in unique_surveys:
    print(survey)

# How many datasets fall under each top-level name
survey_counts = top_level_names.value_counts()
print("\nSurvey Counts:")
print(survey_counts)

Unique Surveys:
cps
basic
jun
cbp
zbp
mar
apr
pep
int_charagegroups
aug
int_natcivpop
int_natresafo
dec
int_natrespop
may
ewks
jan
jul
feb
nov
oct
sep
nonemp
int_charage
int_housingunits
int_natmonthly
int_population
surname
acs
acs1
cprofile
pums
acs5
plnat
profile
subject
flows
cd113
cd115
sf1
sf2
ecnbridge2
ecnadmben
ecnbranddeal
ecnbridge1
ecncashadv
ecnbrordeal
ecnccard
ecninvval
ecnclcust
ecnipa
ecncomm
ecnkob
ecncomp
ecnlabor
ecnfran
ecnconact
ecnconcess
ecnlifomfg
ecncrfin
ecnlifomine
ecngrant
ecndissmed
ecnempfunc
ecnlifoval
ecnentsup
ecnlines
ecnguest
ecneoyinv
ecneoyinvwh
ecnloan
ecnequip
ecnlocmfg
ecnguestsize
ecnexpnrg
ecnexpsvc
ecnlocmine
ecnflspace
ecnmargin
ecnfoodsvc
ecnhosp
ecnmatfuel
ecnmealcost
ecnmenutype
ecnpatient
ecnpetrfac
ecnpetrprod
ecnpetrrec
ecnpetrstat
ecnprofit
ecnpurelec
ecnpurmode
ecnrdacq
ecnrdofc
ecnseat
ecnsize
ecnsocial
ecntype
ecntypop
ecnvalcon
sbo
cscbo
popproj
births
deaths
nim
pop
pubschlfin
cre
cscb
language
cochar5
cochar6
cty
housing
monthly

In [81]:
# Top-level names that are not real surveys ('basic' plus month abbreviations)
surveys_to_remove = [
    'basic', 'jan', 'feb', 'mar', 'apr', 'may', 
    'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec'
]

# Flag rows whose first path segment is one of those names, then drop them
is_redundant = datasets_metadata['dataset_name'].str.split('/').str[0].isin(surveys_to_remove)
datasets_metadata = datasets_metadata[~is_redundant]

# Show which top-level names survive the filter
remaining_surveys = datasets_metadata['dataset_name'].str.split('/').str[0].unique()
print("\nRemaining Surveys After Filtering:")
print(remaining_surveys)

# Write the filtered table out for further inspection
datasets_metadata.to_csv('./data/cleaned_datasets_metadata.csv', index=False)
print("\nFiltered datasets saved to './data/cleaned_datasets_metadata.csv'")



Remaining Surveys After Filtering:
['cps' 'cbp' 'zbp' 'pep' 'int_charagegroups' 'int_natcivpop'
 'int_natresafo' 'int_natrespop' 'ewks' 'nonemp' 'int_charage'
 'int_housingunits' 'int_natmonthly' 'int_population' 'surname' 'acs'
 'acs1' 'cprofile' 'pums' 'acs5' 'plnat' 'profile' 'subject' 'flows'
 'cd113' 'cd115' 'sf1' 'sf2' 'ecnbridge2' 'ecnadmben' 'ecnbranddeal'
 'ecnbridge1' 'ecncashadv' 'ecnbrordeal' 'ecnccard' 'ecninvval'
 'ecnclcust' 'ecnipa' 'ecncomm' 'ecnkob' 'ecncomp' 'ecnlabor' 'ecnfran'
 'ecnconact' 'ecnconcess' 'ecnlifomfg' 'ecncrfin' 'ecnlifomine' 'ecngrant'
 'ecndissmed' 'ecnempfunc' 'ecnlifoval' 'ecnentsup' 'ecnlines' 'ecnguest'
 'ecneoyinv' 'ecneoyinvwh' 'ecnloan' 'ecnequip' 'ecnlocmfg' 'ecnguestsize'
 'ecnexpnrg' 'ecnexpsvc' 'ecnlocmine' 'ecnflspace' 'ecnmargin'
 'ecnfoodsvc' 'ecnhosp' 'ecnmatfuel' 'ecnmealcost' 'ecnmenutype'
 'ecnpatient' 'ecnpetrfac' 'ecnpetrprod' 'ecnpetrrec' 'ecnpetrstat'
 'ecnprofit' 'ecnpurelec' 'ecnpurmode' 'ecnrdacq' 'ecnrdofc' 'ecnseat'
 'ecn

In [82]:
# Ensure `parent_survey` column exists in the datasets_metadata.
# `.assign` returns a new frame instead of writing a column into
# `datasets_metadata` in place; the frame may itself be the result of an
# earlier boolean filter, where in-place column assignment can trigger
# pandas' SettingWithCopyWarning.
datasets_metadata = datasets_metadata.assign(
    parent_survey=datasets_metadata['dataset_name'].str.split('/').str[0]
)

# Keep only rows whose dataset_name has no sub-path (parent_survey equals dataset_name)
datasets_metadata = datasets_metadata[datasets_metadata['parent_survey'] == datasets_metadata['dataset_name']]

# Check the resulting DataFrame
print("\nFiltered Datasets Metadata:")
print(datasets_metadata.head())

# Save the filtered dataset to a new file for further inspection
datasets_metadata.to_csv('./data/cleaned_datasets_metadata.csv', index=False)
print("\nFiltered datasets saved to './data/cleaned_datasets_metadata.csv'")



Filtered Datasets Metadata:
  dataset_name  year                                              title  \
0          cps  1994  Jun 1994 Current Population Survey: Basic Monthly   
3          cbp  1986   1986 County Business Patterns: Business Patterns   
4          zbp  1994  1994 County Business Patterns - Zip Code Busin...   
5          cbp  1987   1987 County Business Patterns: Business Patterns   
6          cbp  1995   1995 County Business Patterns: Business Patterns   

                                         description  \
0  To provide estimates of employment, unemployme...   
3  County Business Patterns (CBP) is an annual se...   
4  ZIP Code Business Patterns (ZBP) is an annual ...   
5  County Business Patterns (CBP) is an annual se...   
6  County Business Patterns (CBP) is an annual se...   

                                      identifier    contact access_level  \
0  https://api.census.gov/data/id/CPSBASIC199406  CPS Staff       public   
3          http://api.census.go

# 4. Connect to Neo4j
- This cell sets up the Neo4j connection.

In [48]:
from neo4j import GraphDatabase

import os

# Neo4j connection settings. Each value can be overridden through an
# environment variable (NEO4J_URI / NEO4J_USER / NEO4J_PASSWORD) so real
# credentials do not have to be written into the notebook; the fallbacks are
# the previous local-development defaults.
neo4j_uri = os.getenv("NEO4J_URI", "bolt://localhost:7687")
neo4j_user = os.getenv("NEO4J_USER", "neo4j")
neo4j_password = os.getenv("NEO4J_PASSWORD", "password")

try:
    # Create a driver instance
    driver = GraphDatabase.driver(neo4j_uri, auth=(neo4j_user, neo4j_password))
    
    # Test connection by opening a session and executing a simple query
    with driver.session() as session:
        session.run("RETURN 1")  # Simple query to check connection
    print("Connection successful.")
except Exception as e:
    # Report the failure instead of raising so the notebook keeps running
    print(f"Connection failed: {e}")


Connection successful.


# 5. Ingest Data into Neo4j
- This cell contains the ingestion logic.

## Clear the Neo4J dB if necessary
To delete all the data in your Neo4j database, you can use the following Cypher query, which will remove all nodes and relationships: \
> MATCH (n) \
> DETACH DELETE n

### Explanation:
- MATCH (n): This matches all nodes in the graph.
- DETACH DELETE n: This deletes the nodes and any relationships attached to them.

### How to Run:
1. Open your Neo4j browser or a Neo4j client.
1. Paste the query and execute it.

This will completely clear your Neo4j database of all nodes and relationships, giving you a fresh starting point for your new data ingestion.



In [68]:
from neo4j import GraphDatabase
import pandas as pd
from tqdm import tqdm

import os

# Initialize Neo4j connection. Settings can be overridden through the
# NEO4J_URI / NEO4J_USER / NEO4J_PASSWORD environment variables so credentials
# need not be hardcoded; the fallbacks are the previous local defaults.
neo4j_uri = os.getenv("NEO4J_URI", "bolt://localhost:7687")
neo4j_user = os.getenv("NEO4J_USER", "neo4j")
neo4j_password = os.getenv("NEO4J_PASSWORD", "password")
driver = GraphDatabase.driver(neo4j_uri, auth=(neo4j_user, neo4j_password))

# Load the node and relationship tables from CSV files
print("Creating DataFrames...")
survey_nodes = pd.read_csv('./data/survey_nodes.csv')
dataset_nodes = pd.read_csv('./data/dataset_nodes.csv')
variable_nodes = pd.read_csv('./data/variable_nodes.csv')
survey_to_dataset_relationships = pd.read_csv('./data/survey_to_dataset_relationships.csv')
dataset_to_variable_relationships = pd.read_csv('./data/dataset_to_variable_relationships.csv')
print("DataFrames created. Starting Neo4j loading process...")

def _run_rows_in_batches(driver, df, query, desc, batch_size):
    """Run ``query`` once per row of ``df``, committing one transaction per batch."""
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    with tqdm(total=len(df), desc=desc) as pbar:
        for start in range(0, len(df), batch_size):
            # One plain dict per row; column names become Cypher parameters
            records = df.iloc[start:start + batch_size].to_dict(orient="records")
            with driver.session() as session:
                # A single explicit transaction per batch replaces one
                # auto-commit transaction per row, which is what made the
                # original "batches" no faster than row-by-row loading.
                with session.begin_transaction() as tx:
                    for params in records:
                        tx.run(query, params)
                    tx.commit()
            pbar.update(len(records))


# Function to load nodes in batches with progress bar
def load_nodes_in_batches(driver, df, query, desc, batch_size=1000):
    """
    Execute a node-creating Cypher ``query`` for every row of ``df``.

    Args:
        driver: Neo4j driver used to open sessions.
        df (pd.DataFrame): Rows to load; each column is passed to the query as
            a parameter with the same name.
        query (str): Cypher statement run once per row.
        desc (str): Label shown on the progress bar.
        batch_size (int): Number of rows committed per transaction.

    Raises:
        ValueError: If ``batch_size`` is less than 1.
    """
    _run_rows_in_batches(driver, df, query, desc, batch_size)


# Function to load relationships in batches with progress bar
def load_relationships_in_batches(driver, df, query, desc, batch_size=1000):
    """
    Execute a relationship-creating Cypher ``query`` for every row of ``df``.

    Takes the same arguments as ``load_nodes_in_batches`` and shares its
    implementation; it exists so call sites read clearly.

    Raises:
        ValueError: If ``batch_size`` is less than 1.
    """
    _run_rows_in_batches(driver, df, query, desc, batch_size)

# Clear existing data
# WARNING: this deletes every node and relationship in the connected database
# before the load starts.
with driver.session() as session:
    print("Clearing existing data...")
    session.run("MATCH (n) DETACH DELETE n")
print("Existing data cleared.")

# Load Survey nodes (one node per distinct `survey` value)
load_nodes_in_batches(
    driver,
    survey_nodes,
    """
    MERGE (s:Survey {survey: $survey})
    """,
    "Loading Survey Nodes"
)

# Load Dataset nodes, keyed by `dataset_id`; the remaining columns are stored
# as properties on the node
load_nodes_in_batches(
    driver,
    dataset_nodes,
    """
    MERGE (d:Dataset {dataset_id: $dataset_id})
    SET d.year = $year, d.month = $month, d.title = $title, d.description = $description
    """,
    "Loading Dataset Nodes"
)

# Load Variable nodes
# The MERGE key is `variable_name` alone, so rows sharing a variable_name map
# to the same node and the SET keeps the label/concept of the last row loaded.
# NOTE(review): confirm that collapsing same-named variables across datasets
# is intended.
load_nodes_in_batches(
    driver,
    variable_nodes,
    """
    MERGE (v:Variable {variable_name: $variable_name})
    SET v.label = $label, v.concept = $concept
    """,
    "Loading Variable Nodes"
)

# Create Survey -> Dataset relationships (HAS_DATASET)
load_relationships_in_batches(
    driver,
    survey_to_dataset_relationships,
    """
    MATCH (s:Survey {survey: $parent_survey})
    MATCH (d:Dataset {dataset_id: $dataset_id})
    MERGE (s)-[:HAS_DATASET]->(d)
    """,
    "Creating Survey -> Dataset Relationships"
)

# Create Dataset -> Variable relationships (HAS_VARIABLE)
# NOTE(review): this matches Dataset.dataset_id against the `$dataset_name`
# parameter. Verify that the two hold the same values; if they do not, the
# MATCH finds nothing and no HAS_VARIABLE relationships are created.
load_relationships_in_batches(
    driver,
    dataset_to_variable_relationships,
    """
    MATCH (d:Dataset {dataset_id: $dataset_name})
    MATCH (v:Variable {variable_name: $variable_name})
    MERGE (d)-[:HAS_VARIABLE]->(v)
    """,
    "Creating Dataset -> Variable Relationships"
)

print("All data loaded into Neo4j successfully!")

Creating DataFrames...
DataFrames created. Starting Neo4j loading process...
Clearing existing data...
Existing data cleared.


Loading Survey Nodes: 100%|██████████████████| 298/298 [00:02<00:00, 109.76it/s]
Loading Dataset Nodes: 100%|████████████████| 3954/3954 [00:42<00:00, 94.01it/s]
Loading Variable Nodes:   0%|         | 7000/2693973 [01:03<6:44:00, 110.85it/s]


KeyboardInterrupt: 

# Step 6: Securely Loading OpenAI API Key and Using LLM for Concept Extraction

The goal of this step is to securely load the OpenAI API key from a `.env` file and utilize the LLM (Large Language Model) to enhance the knowledge graph. By extracting key concepts and terms from variable descriptions, we enrich the graph with semantic information.

**Key Steps:**
1. **Loading the API Key**: 
    - We load the OpenAI API key securely from a `.env` file using the `python-dotenv` package. This avoids hardcoding sensitive credentials in the source code, ensuring better security and flexibility.
   
2. **Using LLM for Concept Extraction**:
    - Once the API key is loaded, we use OpenAI’s GPT-based model to process the variable descriptions. The model extracts key concepts, terms, and entities from the descriptions (e.g., "income," "education level"), which we can then use to create new concept nodes in the knowledge graph.
   
3. **Enhancing the Knowledge Graph**:
    - After extracting the concepts, we create **concept nodes** and link them to the relevant **variables** using `:MEASURES` relationships. We can also link surveys that cover similar concepts, allowing us to analyze relationships between datasets based on the concepts they measure.

This approach leverages the power of GPT to enhance the graph beyond simple variable names, creating a richer, more semantically aware dataset that will be useful for querying, analysis, and discovering relationships that were not obvious at first glance.

## 6.1 Loading the API Key


In [None]:
import openai
from dotenv import load_dotenv
import os

# Load environment variables from .env file
load_dotenv()

# Get the OpenAI API key from the environment.
# os.getenv returns None when OPENAI_API_KEY is not set, so a missing key is
# not reported here; it only surfaces when the API is first called.
openai.api_key = os.getenv("OPENAI_API_KEY")

## 6.2 Testing functionality/connectivity of the GPT

### Extracting Key Concepts with ChatGPT
This is for testing, to be sure things are working. In this step, we'll define a function that uses OpenAI's GPT model to process variable descriptions and extract important concepts, keywords, or themes. The extracted concepts will help us create concept nodes in the knowledge graph.

### Here’s the process:
1. Input: We'll pass the variable descriptions to the GPT model via the OpenAI API.
1. Output: The model will return key concepts or keywords that are semantically related to the descriptions.
1. Linking: These concepts will be used to enrich the knowledge graph, creating concept nodes and linking them to the variables.

In [None]:
import openai

def extract_concepts_from_description(description):
    """
    Ask an OpenAI chat model for the key concepts in a variable description.

    Args:
        description (str): The text description to analyze.

    Returns:
        str | None: The model's reply with surrounding whitespace stripped, or
        None when anything fails (the error is printed, not raised).
    """
    try:
        # The client picks up OPENAI_API_KEY from the environment
        client = openai.OpenAI()

        conversation = [
            {"role": "system", "content": "You are a helpful assistant specializing in concept extraction."},
            {"role": "user", "content": f"Extract key concepts and terms from this variable description: {description}"}
        ]
        request_options = {
            "model": "gpt-3.5-turbo",  # Can upgrade to gpt-4 if available
            "messages": conversation,
            "max_tokens": 100,
            "temperature": 0.5,
        }
        response = client.chat.completions.create(**request_options)

        # The first choice carries the extracted concepts
        first_choice = response.choices[0]
        return first_choice.message.content.strip()

    except Exception as e:
        print(f"Error in concept extraction: {e}")
        return None

# Example usage: a one-off smoke test of the extraction call.
# Prints "Extracted Concepts: None" when the call fails, because the function
# returns None on any error.
if __name__ == "__main__":
    sample_description = "Flag for Production workers average for year"
    concepts = extract_concepts_from_description(sample_description)
    print("Extracted Concepts:", concepts)

  

## Step 6.3: Link Variables to Concepts

The goal here is to take the concepts extracted from the variable descriptions and link them to their corresponding variables in the Neo4j knowledge graph.


In [None]:
import openai
import pandas as pd
import logging
import os

# Configure logging
log_file_path = 'concept_extraction.log'

# filemode="w" truncates the log on every run, so no separate delete step is
# needed. force=True replaces any handlers already attached to the root
# logger: without it basicConfig does nothing once logging has been configured
# earlier in the same kernel, and INFO records for this step would never
# reach this file.
logging.basicConfig(
    filename=log_file_path,
    filemode="w",
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s: %(message)s',
    force=True
)

# Function to extract concepts from the variable description using OpenAI
def extract_concepts_from_description(description):
    """
    Extract key concepts from a variable description with an OpenAI chat model.

    Uses the client-based chat completions interface (``openai.OpenAI()``),
    the same one the connectivity test in section 6.2 uses. The previous
    ``openai.Completion.create`` call belongs to the legacy interface and does
    not accept a chat model such as gpt-3.5-turbo, so every call failed and
    was logged as an error.

    Args:
        description (str): Text to analyze (the variable's label).

    Returns:
        str | None: The model's reply with surrounding whitespace stripped, or
        None when the request fails (the error is logged, not raised).
    """
    try:
        client = openai.OpenAI()  # Uses OPENAI_API_KEY from environment
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",  # You can upgrade to gpt-4 if desired
            messages=[
                {"role": "user", "content": f"Extract key concepts and terms from this variable description: {description}"}
            ],
            max_tokens=100,  # Limit tokens to get a concise response
            n=1,
            temperature=0.3
        )
        concepts = response.choices[0].message.content.strip()
        return concepts
    except Exception as e:
        logging.error(f"Error in concept extraction for description '{description}': {e}")
        return None

# Main extraction process
def extract_missing_concepts(child_to_variable):
    """
    Fill in the ``concept`` column for rows where it is missing.

    A concept counts as missing when it is NaN or an empty string. For each
    such row the variable's ``label`` is sent to
    ``extract_concepts_from_description`` and a truthy result is written back
    into the frame in place.

    Args:
        child_to_variable (pd.DataFrame): Must contain the columns
            ``variable_name``, ``label`` and ``concept``.

    Returns:
        pd.DataFrame: The same frame object that was passed in.
    """
    logging.info("Starting concept extraction for missing entries")

    for row_label, record in child_to_variable.iterrows():
        current_concept = record['concept']

        # Rows that already carry a concept are left untouched
        concept_present = not (pd.isna(current_concept) or current_concept == '')
        if concept_present:
            continue

        logging.info(f"Attempting to extract concept for variable {record['variable_name']}")

        # The label doubles as the description handed to the model
        extracted_concepts = extract_concepts_from_description(record['label'])

        if not extracted_concepts:
            logging.warning(f"Failed to extract concept for {record['variable_name']}")
            continue

        logging.info(f"Successfully linked Variable: {record['variable_name']} to Concept: {extracted_concepts}")
        child_to_variable.at[row_label, 'concept'] = extracted_concepts

    logging.info("Concept extraction process completed")
    return child_to_variable

# Actual execution
# NOTE(review): `child_to_variable` is not defined in this cell; it presumably
# comes from an earlier cell — confirm it exists on a fresh kernel run.
if __name__ == "__main__":
    # Ensure OpenAI API key is set; fail fast rather than log one error per row
    if not os.getenv('OPENAI_API_KEY'):
        logging.error("OpenAI API key not set. Please set the OPENAI_API_KEY environment variable.")
        raise ValueError("OpenAI API key is required")

    # Process all data (one API request per row whose concept is missing)
    updated_child_to_variable = extract_missing_concepts(child_to_variable)
    
    # Save the updated DataFrame to the current working directory
    updated_child_to_variable.to_csv('updated_child_to_variable.csv', index=False)
    
    # Display updated entries (first rows only)
    print("Concepts successfully populated for missing entries:")
    print(updated_child_to_variable.head())

In [None]:
# Spot check: show every row for the variable named 'EMPAVPW_F'
print(child_to_variable[child_to_variable['variable_name'] == 'EMPAVPW_F'])
