In [12]:
## INDUSTRY BY COUNTY DATA - Alanis Perez

In [26]:
import pandas as pd
import glob
import os
import json

# Ignore all warnings
import warnings
warnings.filterwarnings("ignore")

In [42]:
# Stage 1 paths: the per-county source files are read from `input_directory`
# and the combined category files are written to `output_directory`.
# NOTE: both names are rebound by the transformation cell further down, so
# re-run this cell before re-running the stage 1 loop on a live kernel.
input_directory = 'data/OG_county'
output_directory = 'data/Industry_combined_data'

# Function to combine industry into categories
def industry_combine(file_path):
    """Group an indented industry listing into main categories and subcategories.

    The file is read as plain text, one line at a time. A line's hierarchy
    level is its number of leading whitespace characters divided by two
    (2 spaces per indent, floor division). A level-3 line starts a new main
    category; level-4 lines are collected as subcategories of the most recent
    main category. Lines at any other level, blank lines, and level-4 lines
    that appear before the first main category are ignored.

    Parameters
    ----------
    file_path : str or path-like
        Path of the text/CSV file to read.

    Returns
    -------
    pandas.DataFrame
        One row per main category with columns 'Category' (the stripped
        level-3 line) and 'Subcategories' (the stripped level-4 lines joined
        with ', '). When no level-3 line exists a message is printed and the
        frame is empty but still carries both columns, so it can be written
        to CSV and read back without error.
    """
    combined_data = []
    current_main_category = None
    current_subcategories = []

    with open(file_path, 'r') as file:
        for line in file:
            stripped_line = line.strip()
            if not stripped_line:
                # Blank / whitespace-only lines carry no industry entry.
                continue

            # Count ONLY the leading whitespace. Subtracting the length of the
            # fully stripped line would also count the trailing newline and any
            # trailing spaces, which can push a line into the wrong level.
            leading_spaces = len(line) - len(line.lstrip())
            indentation_level = leading_spaces // 2  # 2 spaces per indent

            if indentation_level == 3:  # Main category
                if current_main_category is not None:
                    combined_data.append(
                        (current_main_category, ', '.join(current_subcategories))
                    )
                current_main_category = stripped_line
                current_subcategories = []  # Also drops any orphan subcategories
            elif indentation_level == 4:  # Subcategory
                current_subcategories.append(stripped_line)

    # Flush the final category, which has no following level-3 line to trigger it.
    if current_main_category is not None:
        combined_data.append((current_main_category, ', '.join(current_subcategories)))

    combined_df = pd.DataFrame(combined_data, columns=['Category', 'Subcategories'])

    if combined_df.empty:
        print("No main categories found.")
    return combined_df

# Process each CSV file (county) in the input directory
# Make sure the destination exists; to_csv does not create missing folders.
os.makedirs(output_directory, exist_ok=True)

# sorted() gives a reproducible processing order (os.listdir order is arbitrary).
for filename in sorted(os.listdir(input_directory)):
    if not filename.endswith('.csv'):
        continue

    file_path = os.path.join(input_directory, filename)
    combined_df = industry_combine(file_path)

    if combined_df.empty:
        # Writing an empty frame would produce a file the next stage cannot parse.
        print(f"Skipping {filename}: no main categories found.")
        continue

    # Save the combined DataFrame to a new CSV file in the output directory
    output_file_path = os.path.join(output_directory, f'combined_{filename}')
    combined_df.to_csv(output_file_path, index=False)
print("Processing complete. Combined files saved in 'Industry_combined_data' directory.")

Processing complete. Combined files saved in 'Industry_combined_data' directory.


In [44]:
# Stage 2 paths: the combined files written by the previous stage are read
# from `input_directory` and the per-year tables are written to
# `output_directory`.
# NOTE: this rebinds the same two names used by the stage 1 cell above.
input_directory = 'data/Industry_combined_data'
output_directory = 'data/Industry_transformed_data'

# Function to transform/split data from categorization done using the previous function
def transform_data(file_path, start_year=2010, end_year=2024):
    """Split each combined 'Category' string into a name plus one column per year.

    Each 'Category' cell is a comma-separated string: the first field is the
    category name and the following fields are the yearly values. The fields
    are spread positionally over the year columns; rows with fewer values than
    years are padded with missing values, and surplus values are dropped.

    Parameters
    ----------
    file_path : str or path-like
        CSV file containing a 'Category' column.
    start_year, end_year : int, optional
        Inclusive range of years used to label the value columns. The defaults
        (2010-2024) give 15 year columns.

    Returns
    -------
    pandas.DataFrame
        Columns 'Category' followed by one column per year (labels are
        strings such as '2010'). Values are the raw split strings; they are
        not converted to numbers here. The row index of the input is kept.
    """
    # Load the combined data
    df = pd.read_csv(file_path)

    # Column labels: one per year in the requested inclusive range
    year_columns = [str(year) for year in range(start_year, end_year + 1)]

    # Split the 'Category' column into positional fields 0, 1, 2, ...
    categories_split = df['Category'].str.split(',', expand=True)

    # reindex keeps field 0 (the name) plus exactly len(year_columns) value
    # fields: missing positions are filled with NaN, extra ones are discarded.
    # This replaces growing the frame one row at a time.
    transformed_df = categories_split.reindex(columns=range(len(year_columns) + 1))
    transformed_df.columns = ["Category"] + year_columns

    return transformed_df

# Process each CSV file (combined_data) in the input directory
# Make sure the destination exists; to_csv does not create missing folders.
os.makedirs(output_directory, exist_ok=True)

# sorted() gives a reproducible processing order (os.listdir order is arbitrary).
for filename in sorted(os.listdir(input_directory)):
    if not filename.endswith('.csv'):
        continue

    file_path = os.path.join(input_directory, filename)
    transformed_df = transform_data(file_path)

    # Save the transformed DataFrame to a new CSV file in the output directory
    output_file_path = os.path.join(output_directory, f'transformed_{filename}')
    transformed_df.to_csv(output_file_path, index=False)

print("Transformation complete. Transformed files saved in 'Industry_transformed_data' directory.")

Transformation complete. Transformed files saved in 'Industry_transformed_data' directory.


In [46]:
# CREATE DATABASE TO HOLD ALL THIS DATA !
# Collect one DataFrame per transformed county file, tag each with its county
# name, then stack them into a single table.
dataframes = []

# sorted() makes the row order of the final table reproducible across runs.
for county in sorted(glob.glob("data/Industry_transformed_data/*.csv")):

    # Read the CSV file
    county_df = pd.read_csv(county)

    # Derive the county name from the file name alone. os.path handles both
    # '/' and '\\' separators where the platform uses them, so the
    # 'transformed_combined_' prefix is removed on every OS (splitting on '/'
    # and replacing a backslash-joined prefix only worked on Windows).
    file_stem = os.path.splitext(os.path.basename(county))[0]
    prefix = "transformed_combined_"
    if file_stem.startswith(prefix):
        file_stem = file_stem[len(prefix):]
    County = file_stem.replace("_county", "").replace("_", " ")
    county_df['County'] = County

    # Append the DataFrame to the list
    dataframes.append(county_df)

# Concatenate all DataFrames into one
combined_df = pd.concat(dataframes, ignore_index=True)

In [48]:
# Strip the double-quote characters left in the category names, then preview
# the first rows of the combined table.
category_without_quotes = combined_df['Category'].str.replace('"', '')
combined_df['Category'] = category_without_quotes
combined_df.head()

Unnamed: 0,Category,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024,County
0,Civilian Unemployment,87333.33,80525.0,70333.33,58625.0,47500.0,39108.33,35791.67,31075.0,25733.33,25050.0,72991.67,49375.0,27458.33,33658.33,51126.39,Alameda
1,Total Farm,733.33,700.0,658.33,550.0,500.0,450.0,475.0,608.33,616.67,666.67,725.0,966.67,850.0,791.67,628.33,Alameda
2,Total Nonfarm,652858.33,661741.67,682141.67,701075.0,720133.33,748983.33,771433.33,789691.67,806833.33,814483.33,751225.0,776116.67,804050.0,812791.67,683590.0,Alameda
3,Mining Logging and Construction,30308.33,30833.33,33283.33,35550.0,37600.0,40875.0,43041.67,45925.0,49041.67,49591.67,46650.0,48675.0,48416.67,47391.67,33515.0,Alameda
4,Manufacturing,60150.0,62141.67,62325.0,64641.67,67508.33,72466.67,75008.33,79941.67,84716.67,85033.33,83658.33,91841.67,98283.33,97750.0,63353.33,Alameda


In [50]:
# Export the combined table as a JSON array with one object per row.
json_output_path = "data/industry_county_json.json"
combined_df.to_json(json_output_path, orient="records")