In [2]:
## INDUSTRY BY COUNTY DATA - Alanis Perez

In [28]:
import pandas as pd
import glob
import os

# Ignore all warnings
import warnings
warnings.filterwarnings("ignore")

In [170]:
# Parse one county file and label each indented industry row with a main category
def indent_count(file_path):
    """Read the industry rows of a county file and tag them by indentation depth.

    Only the lines after the first line containing "Total, All Industries"
    are considered. Each line's depth is its number of leading whitespace
    characters divided by two (the file is assumed to indent with 2 spaces).

    Parameters
    ----------
    file_path : str
        Path of the county file to read as plain text.

    Returns
    -------
    pandas.DataFrame
        Rows at depth 3 or deeper with the columns 'Industry',
        'Indentation Level' and 'Main Category'. 'Main Category' holds the
        text of the most recent depth-4 row (missing until one is seen).
        An empty, column-less DataFrame is returned (and a message printed)
        when the marker line is absent.
    """
    with open(file_path, 'r') as file:
        lines = file.readlines()

    # Position of the first line after the "Total, All Industries" marker
    start_index = next(
        (i + 1 for i, line in enumerate(lines) if "Total, All Industries" in line),
        None,
    )
    if start_index is None:
        print("No main categories found.")
        return pd.DataFrame()  # Return an empty DataFrame if not found

    data = []
    for line in lines[start_index:]:
        stripped_line = line.strip()
        if not stripped_line:
            # Whitespace-only lines carry no industry name; skip them
            continue
        # Measure only the LEADING whitespace. Subtracting the fully stripped
        # length would also count the newline and any trailing spaces, which
        # inflates the level.
        leading_spaces = len(line) - len(line.lstrip())
        data.append((stripped_line, leading_spaces // 2))  # 2 spaces per level

    df = pd.DataFrame(data, columns=['Industry', 'Indentation Level'])

    # Work on an explicit copy so adding a column never touches a view of df
    df_filtered = df[df['Indentation Level'] >= 3].copy()

    # NOTE(review): depth 4 is used as the main category here, whereas
    # industry_combine uses depth 3 - confirm which level is intended.
    is_main = df_filtered['Indentation Level'] == 4
    # Keep the name on main rows, blank elsewhere, then carry it downwards
    df_filtered['Main Category'] = df_filtered['Industry'].where(is_main).ffill()

    return df_filtered

In [172]:
# Spot-check indent_count on a single county file (Alpine)
alpine_path = 'data/OG_county/Alpine_county.csv'
# Template for other counties: <Name>_path = 'data/OG_county/<Name>_county.csv'

# indent_count prints "No main categories found." and returns an empty
# DataFrame when no line of the file contains "Total, All Industries"
alpine_industries = indent_count(alpine_path)
print(alpine_industries)

No main categories found.
Empty DataFrame
Columns: []
Index: []


In [162]:
# Parse one county file into (main category, joined subcategories) rows
def industry_combine(file_path):
    """Group the indented industry rows of a county file by main category.

    Only the lines after the first line containing "Total, All Industries"
    are considered. Each line's depth is its number of leading whitespace
    characters divided by two (the file is assumed to indent with 2 spaces).
    A depth-3 line opens a new main category; every depth-4 line that
    follows is collected as one of its subcategories. Other depths are
    ignored.

    Parameters
    ----------
    file_path : str
        Path of the county file to read as plain text.

    Returns
    -------
    pandas.DataFrame
        One row per main category with the columns 'Main Category' and
        'Subcategories' (subcategory lines joined with ', '). An empty,
        column-less DataFrame is returned (and a message printed) when the
        marker line is absent.
    """
    with open(file_path, 'r') as file:
        lines = file.readlines()

    # Position of the first line after the "Total, All Industries" marker
    start_index = next(
        (i + 1 for i, line in enumerate(lines) if "Total, All Industries" in line),
        None,
    )
    if start_index is None:
        print("No main categories found.")
        return pd.DataFrame()  # Return an empty DataFrame if not found

    combined_data = []
    current_main_category = None
    current_subcategories = []

    for line in lines[start_index:]:
        industry = line.strip()
        if not industry:
            # Whitespace-only lines carry no industry name; skip them
            continue

        # Measure only the LEADING whitespace. Subtracting the fully stripped
        # length would also count the newline and any trailing spaces, which
        # inflates the level.
        indentation_level = (len(line) - len(line.lstrip())) // 2

        if indentation_level == 3:
            # Flush the category we were tracking before starting a new one
            if current_main_category is not None:
                combined_data.append((current_main_category, ', '.join(current_subcategories)))
            current_main_category = industry
            # Also discards depth-4 lines seen before the first main category
            current_subcategories = []
        elif indentation_level == 4:
            current_subcategories.append(industry)

    # The last main category has no successor to trigger its flush
    if current_main_category is not None:
        combined_data.append((current_main_category, ', '.join(current_subcategories)))

    return pd.DataFrame(combined_data, columns=['Main Category', 'Subcategories'])

In [166]:
# Run industry_combine on every county CSV and write one combined file per county
directory_path = 'data/OG_county'
output_directory = 'data/Industry_combined_data'

# DataFrame.to_csv does not create missing folders, so make sure the target exists
os.makedirs(output_directory, exist_ok=True)

# Names of the input files for which industry_combine produced no rows
no_categories_files = []

# os.listdir returns entries in arbitrary order; sort so the run is reproducible
for filename in sorted(os.listdir(directory_path)):
    if not filename.endswith('.csv'):
        continue

    file_path = os.path.join(directory_path, filename)
    combined_data = industry_combine(file_path)

    if combined_data is None or combined_data.empty:  # Nothing usable was parsed
        no_categories_files.append(filename)
        continue

    # One output file per county, named after the input file
    output_file_path = os.path.join(output_directory, f'combined_{filename}')
    combined_data.to_csv(output_file_path, index=False)

# Report which inputs could not be processed
if no_categories_files:
    print("Files with no main categories found:")
    for file in no_categories_files:
        print(file)
else:
    print("All files had main categories found.")

# riverside_path = 'data/OG_county/Riverside_county.csv'
# Riverside_industries_combined = industry_combine(riverside_path)

# # Display the processed DataFrame
# print(Riverside_industries_combined)

# # Save the combined DataFrame to a CSV file
# Riverside_industries_combined.to_csv('data/Industry_combined_data/Riverside_industries_combined.csv', index=False)

No main categories found.
No main categories found.
No main categories found.
No main categories found.
No main categories found.
No main categories found.
No main categories found.
No main categories found.
No main categories found.
No main categories found.
No main categories found.
No main categories found.
No main categories found.
No main categories found.
Files with no main categories found:
Alpine_county.csv
Glenn_county.csv
Imperial_county.csv
Kings_county.csv
Lake_county.csv
Mariposa_county.csv
Mendocino_county.csv
Mono_county.csv
Nevada_county.csv
Plumas_county.csv
SanJoaquin_county.csv
Sierra_county.csv
Solano_county.csv
Sonoma_county.csv


In [140]:
# Split each combined "Main Category" string into a name plus yearly columns
def transform_combined_data(file_path):
    """Expand the comma-separated 'Main Category' column into one column per year.

    Each 'Main Category' value is split on ','. The first piece becomes the
    category name and the following pieces fill the columns '2010_Average'
    through '2024_Average' in order. Rows with fewer than 15 trailing pieces
    are left missing in the remaining year columns; pieces beyond the 15th
    are dropped. The pieces are kept as the strings produced by the split.

    Parameters
    ----------
    file_path : str
        Path of a CSV file that has a 'Main Category' column.

    Returns
    -------
    pandas.DataFrame
        A frame with the columns 'Main Category' and '2010_Average' ...
        '2024_Average', indexed like the input file's rows.
    """
    # Load the combined data
    df = pd.read_csv(file_path)

    # This assumes the format is "Industry Name,numerical values"
    categories_split = df['Main Category'].str.split(',', expand=True)

    year_columns = [f"{year}_Average" for year in range(2010, 2025)]

    # Reindexing the positional columns pads short splits with missing values
    # and truncates long ones in a single vectorised step.
    wanted_positions = list(range(len(year_columns) + 1))
    transformed_df = categories_split.reindex(columns=wanted_positions)
    transformed_df.columns = ["Main Category"] + year_columns

    # The result must be returned: callers assign and print it
    return transformed_df

In [None]:
# Transform the combined Riverside file into one column per year
# NOTE(review): this path is relative to the working directory, while the batch
# cell writes its output under data/Industry_combined_data - confirm the location.
file_path = 'Riverside_industries_combined.csv'
transformed_data = transform_combined_data(file_path)

# Display the transformed DataFrame
print(transformed_data)

# Save the transformed result. The previous version saved
# Riverside_industries_combined, a name that is only defined in commented-out
# code, so the cell raised NameError on a fresh kernel.
if transformed_data is not None:
    os.makedirs('data/Industry_data', exist_ok=True)  # to_csv will not create it
    transformed_data.to_csv('data/Industry_data/Riverside_industries.csv', index=False)
else:
    print("transform_combined_data returned nothing; no file was written.")

In [33]:
# Collect every per-county industry file into one frame tagged by county
dataframes = []

# glob returns matches in arbitrary order; sort so the export is reproducible
for county in sorted(glob.glob("data/Industry_data/*.csv")):
    county_df = pd.read_csv(county)

    # Derive the county ID from the file name alone. basename/splitext handle
    # both "/" and "\\" separators, whereas splitting on "/" left part of the
    # folder name in the ID on Windows.
    county_id = os.path.splitext(os.path.basename(county))[0].replace("_county", "")
    county_df['County_ID'] = county_id

    dataframes.append(county_df)

# pd.concat fails with an unhelpful message on an empty list; say what is wrong
if not dataframes:
    raise FileNotFoundError("No CSV files found in data/Industry_data")

# Concatenate all DataFrames into one
combined_df = pd.concat(dataframes, ignore_index=True)

# Export to JSON, one object per row
combined_df.to_json("data/industry_county.json", orient="records")

# NEXT !!!!!!
# industry_df = pd.read_json('industry_county.json')
# industry_df = industry_df.set_index("County_ID")
# industry_df.head()