In [44]:
## INDUSTRY BY COUNTY DATA - Alanis Perez

In [46]:
import pandas as pd
import glob
import os

# Ignore all warnings
# NOTE(review): filterwarnings("ignore") installs a catch-all filter, so every
# warning category is hidden for the rest of the kernel session (library
# deprecation notices included). Consider narrowing it to a specific category
# once the noisy warning has been identified.
import warnings
warnings.filterwarnings("ignore")

In [48]:
# Define paths (input & output)
# Both paths are relative to the notebook's current working directory.
# input_directory  : folder that is scanned for the raw per-county '.csv' files.
# output_directory : folder that receives the 'combined_<filename>' results.
input_directory = 'data/OG_county'
output_directory = 'data/Industry_combined_data'

# Function to combine industry into categories
def industry_combine(file_path):
    """Group the indented industry rows of one county file into main categories.

    The file is read as plain text. Everything up to and including the first
    line containing "Total, All Industries" is skipped. For each remaining
    non-blank line the indentation level is the number of leading whitespace
    characters divided by two (2 spaces per level):

    * level 3 rows start a new main category,
    * level 4 rows are collected as subcategories of the current main category,
    * every other level is ignored.

    The stored text is the whole stripped line, so any comma-separated values
    that follow the industry name on that line are kept as part of the string.

    Parameters
    ----------
    file_path : str
        Path of the county file to read.

    Returns
    -------
    pandas.DataFrame
        Columns 'Main Category' and 'Subcategories' (subcategories joined with
        ', '). The frame is empty, but still carries both columns, when the
        marker line is missing or no level-3 row exists, so a CSV written from
        it always has a header row.
    """
    columns = ['Main Category', 'Subcategories']

    with open(file_path, 'r') as file:
        lines = file.readlines()

    # Find the index of "Total, All Industries" (SKIP ALL ROWS PRIOR THIS LINE!)
    start_index = next(
        (i + 1 for i, line in enumerate(lines) if "Total, All Industries" in line),
        None,
    )

    if start_index is None:
        print("No main categories found.")
        return pd.DataFrame(columns=columns)  # In case any data frames come back empty

    combined_data = []
    current_main_category = None
    current_subcategories = []

    for line in lines[start_index:]:
        industry = line.strip()
        if not industry:
            # Blank / whitespace-only lines carry no category information.
            continue

        # Measure LEADING whitespace only. Comparing against line.strip()
        # would also count the trailing newline and trailing spaces, which
        # inflates the level for rows that end with extra whitespace.
        leading_spaces = len(line) - len(line.lstrip())
        indentation_level = leading_spaces // 2  # There are 2 spaces per indentation

        if indentation_level == 3:  # This will be the main category
            if current_main_category is not None:
                combined_data.append((current_main_category, ', '.join(current_subcategories)))

            current_main_category = industry  # Start the new main category
            current_subcategories = []  # Reset subcategories list

        elif indentation_level == 4:  # This will be the subcategory
            current_subcategories.append(industry)

    # Flush the last main category, which has no following level-3 row to trigger it.
    if current_main_category is not None:
        combined_data.append((current_main_category, ', '.join(current_subcategories)))

    # Create new DF from the combined data
    return pd.DataFrame(combined_data, columns=columns)

# Process each CSV file (county) in the input directory
# DataFrame.to_csv does not create missing folders, so make sure the target exists first.
os.makedirs(output_directory, exist_ok=True)

# os.listdir returns entries in arbitrary order; sorting keeps runs reproducible.
for filename in sorted(os.listdir(input_directory)):
    if not filename.endswith('.csv'):
        continue

    file_path = os.path.join(input_directory, filename)
    combined_df = industry_combine(file_path)

    # Save the combined DataFrame to a new CSV file in the output directory
    output_file_path = os.path.join(output_directory, f'combined_{filename}')
    combined_df.to_csv(output_file_path, index=False)

print("Processing complete. Combined files saved in 'Industry_combined_data' directory.")

Processing complete. Combined files saved in 'Industry_combined_data' directory.


In [50]:
# Define paths (input & output)
# NOTE: these assignments rebind the same two names used by the earlier
# combine cell, so the combine loop must not be re-run after this cell
# without re-running its own path definitions first.
# input_directory  : folder scanned for the 'combined_*.csv' files.
# output_directory : folder that receives the 'transformed_<filename>' results.
input_directory = 'data/Industry_combined_data'
output_directory = 'data/Industry_transformed_data'

# Function to transform/split data from categorization done using the previous function
def transform_data(file_path):
    """Split the packed 'Main Category' strings of a combined file into columns.

    Each 'Main Category' value is split on ','. The first piece becomes the
    'Main Category' of the result and the next 15 pieces fill the columns
    '2010_Average' ... '2024_Average' in order. Rows with fewer than 15 values
    are padded with missing values; values beyond the 15th are dropped.
    The split pieces are not converted, so they stay text.

    NOTE(review): the split is a plain ',' split, so an industry name that
    itself contains a comma would shift every value one column to the right.
    Verify against the source data.

    Parameters
    ----------
    file_path : str
        Path of a CSV file that has a 'Main Category' column.

    Returns
    -------
    pandas.DataFrame
        One row per input row (same index), with columns
        ['Main Category', '2010_Average', ..., '2024_Average']. An input file
        with no parsable content yields an empty frame with these columns.
    """
    # Create new column names (one for each year from 2010 to 2024)
    year_columns = [f"{year}_Average" for year in range(2010, 2025)]
    output_columns = ["Main Category"] + year_columns

    # Load the combined data; a completely blank file has nothing to transform.
    try:
        df = pd.read_csv(file_path)
    except pd.errors.EmptyDataError:
        return pd.DataFrame(columns=output_columns)

    # Split the 'Main Category' column into positional columns 0, 1, 2, ...
    categories_split = df['Main Category'].str.split(',', expand=True)

    # Keep positions 0..15 only: reindex adds all-missing columns when fewer
    # pieces exist and drops anything past the 15th value, in one vectorized
    # step instead of growing the frame one row at a time.
    transformed_df = categories_split.reindex(columns=list(range(len(output_columns))))
    transformed_df.columns = output_columns

    return transformed_df

# Process each CSV file (combined_data) in the input directory
# DataFrame.to_csv does not create missing folders, so make sure the target exists first.
os.makedirs(output_directory, exist_ok=True)

# os.listdir returns entries in arbitrary order; sorting keeps runs reproducible.
for filename in sorted(os.listdir(input_directory)):
    if not filename.endswith('.csv'):
        continue

    file_path = os.path.join(input_directory, filename)
    transformed_df = transform_data(file_path)

    # Save the transformed DataFrame to a new CSV file in the output directory
    output_file_path = os.path.join(output_directory, f'transformed_{filename}')
    transformed_df.to_csv(output_file_path, index=False)

print("Transformation complete. Transformed files saved in 'Industry_transformed_data' directory.")

Transformation complete. Transformed files saved in 'Industry_transformed_data' directory.


In [80]:
# CREATE DATABASE TO HOLD ALL THIS DATA !
# Create a list to hold files for all 58 counties
dataframes = []

# Loop through all CSV files in the directory (sorted so the row order of the
# final table does not depend on the file system's listing order)
for county in sorted(glob.glob("data/Industry_transformed_data/*.csv")):

    # Read the CSV file
    county_df = pd.read_csv(county)

    # Extract county name from the filename, assign county ID to be name of county.
    # os.path.basename handles the platform's path separator, so the
    # 'transformed_combined_' prefix is removed on every OS (splitting on "/"
    # and replacing a backslash-joined prefix only worked on Windows).
    file_stem = os.path.splitext(os.path.basename(county))[0]
    prefix = "transformed_combined_"
    if file_stem.startswith(prefix):
        file_stem = file_stem[len(prefix):]
    county_id = file_stem.replace("_county", "").replace("_", " ")
    county_df['County_ID'] = county_id

    # Append the DataFrame to the list
    dataframes.append(county_df)

# pd.concat raises an uninformative ValueError on an empty list; say what is missing.
if not dataframes:
    raise ValueError("No CSV files found in 'data/Industry_transformed_data'.")

# Concatenate all DataFrames into one
combined_df = pd.concat(dataframes, ignore_index=True)

In [82]:
# Write the concatenated county table to disk as JSON; orient="records"
# serializes it as a list with one object per row, keyed by column name.
combined_df.to_json("data/industry_county_json.json", orient="records")