In [None]:
import pickle
import pandas as pd
pd.set_option('display.width', 1000)

# Load the raw GDP table that was pickled under ../data/raw.
# NOTE(review): unpickling can execute arbitrary code; only load this file
# if it comes from a trusted source.
with open("../data/raw/gdp.pkl", "rb") as pickle_fh:
    df = pickle.load(pickle_fh)

## Preprocessing

- [x] Make dataset uniform to others
- [x] Define naics/fips filters for cleanup
- [x] Apply filters
- [x] Group by naics and fips

In [None]:
# Names of the yearly value columns, "2017" through "2022", reused by later cells
years = [str(year) for year in range(2017, 2023)]

# Give the industry column the same name the other datasets use
df.rename(columns={"IndustryClassification": "naics"}, inplace=True)

# Columns treated as redundant; they are removed during cleanup
redundant_to_drop = [
    "GeoName",
    "Region",
    "TableName",
    "LineCode",
    "Description",
    "Unit",
]
cols_to_drop = redundant_to_drop

# NAICS codes to inspect, plus a boolean row mask selecting them
naics_filter = ["11", "21", "22", "23", "31-33"]
mask = df["naics"].isin(naics_filter)

# Regex for FIPS codes to exclude ("unknown" and "global" entries).
# The patterns are OR-ed into one expression that the cleanup step applies
# with Series.str.contains, i.e. a regex *search* anywhere in the string,
# so every alternative must carry its own anchors:
#   ^0+$  -> code consisting only of zeros (presumably the global total)
#   999$  -> code ending in 999 (presumably the "unknown" entries)
# A bare "0" would match every code that merely contains a zero digit
# (e.g. 1001 or 6037) and silently drop those rows.
# NOTE(review): codes ending in 000 are not excluded by these patterns;
# if the data contains such aggregate rows, confirm whether they should be.
fips_patterns = ["^0+$", "999$"]
fips_filter = "|".join(fips_patterns)

In [None]:
# Process: remove the redundant columns, then filter rows step by step.
df = df.drop(columns=cols_to_drop)

# Discard rows that have no value in any of the year columns
df = df.dropna(subset=years, how="all")

# Discard rows whose FIPS code (compared as text) matches the exclusion regex
df = df[~df["FIPS"].astype(str).str.contains(fips_filter)]

# Keep only the NAICS codes of interest. The mask is computed on the current
# frame instead of reusing one built before rows were removed, so the step
# does not depend on index alignment with an earlier state of `df`.
df = df[df["naics"].isin(naics_filter)]

# Discard rows whose 2022 value is exactly 0 (a missing 2022 value is kept,
# because NaN != 0 evaluates to True)
df = df[df["2022"] != 0]

# Largest 2022 values first
df = df.sort_values(by=["2022"], ascending=False)

In [None]:
# Sum every year column per NAICS code, ordered by the 2022 total (descending)
df_group_naics = (
    df.groupby(["naics"])[years]
    .sum()
    .sort_values(by=["2022"], ascending=False)
)

# Same aggregation per FIPS code
df_group_fips = (
    df.groupby(["FIPS"])[years]
    .sum()
    .sort_values(by=["2022"], ascending=False)
)

## Plotting GDP

- [x] Visualize growth
- [ ] Normalize data

In [None]:
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter

# Transpose so that years become the rows and each NAICS code a column;
# pandas then draws one line per NAICS code with the years along the X-axis.
df_transposed = df_group_naics.T

def gdp_formatter(value, pos):
    """Render a tick value divided by one million, with one decimal and an "M" suffix.

    The two-argument signature is the one required by matplotlib's
    ``FuncFormatter``: the tick value and the tick position.

    Args:
        value: Numeric tick value to format.
        pos: Tick position; accepted but not used.

    Returns:
        The formatted label, e.g. ``"1.5M"`` for ``1500000``.
    """
    in_millions = value / 1e6
    return f"{in_millions:.1f}M"

# Draw one marked line per NAICS code; keep the Axes pandas returns so the
# remaining calls address it directly instead of going through plt.gca()
ax = df_transposed.plot(kind='line', marker='o', figsize=(10, 6))

# Shorten the Y-axis tick labels with gdp_formatter
ax.yaxis.set_major_formatter(FuncFormatter(gdp_formatter))

# Title, axis labels and legend
ax.set_title('GDP of NAICS Codes Over the Years')
ax.set_xlabel('Year')
ax.set_ylabel('GDP')
ax.legend(title='NAICS Code')

# Render the figure
plt.show()

In [None]:
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter

# Take the first 10 rows of the per-FIPS totals (already ordered by 2022,
# descending, when df_group_fips was built).
# NOTE(review): the name says "five" but ten rows are selected; confirm
# which of the two is intended.
fips_five = df_group_fips.head(10)

# Years become the rows so each FIPS code is drawn as its own line
df_transposed = fips_five.T

# Draw the lines and keep the Axes pandas returns for the calls below
ax = df_transposed.plot(kind='line', marker='o', figsize=(10, 6))

# Shorten the Y-axis tick labels for readability
ax.yaxis.set_major_formatter(FuncFormatter(gdp_formatter))

# Title, axis labels and legend
ax.set_title('GDP of FIPS Codes Over the Years')
ax.set_xlabel('Year')
ax.set_ylabel('GDP')
ax.legend(title='FIPS Code')

# Render the figure
plt.show()