# Data Understanding

In this notebook we try to gain a deeper understanding of the dataset given to us.
It should contain the GDP generated by each industry in each county,
reported per year in thousands of chained 2017 dollars.

## Preparation

Import packages and set globals

In [59]:
import pickle
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

pd.set_option("display.width", 1000)
pd.set_option('display.max_columns', None)
pd.options.display.float_format = '{:.2f}'.format

%matplotlib inline
plt.rcParams['figure.figsize'] = (20, 6)

Importing the gdp.pkl file

In [60]:
# Load the raw GDP table. The context manager closes the file handle again,
# which the bare open() call never did.
# NOTE: pickle.load can execute arbitrary code — only load files from a trusted source.
with open("../data/raw/gdp.pkl", "rb") as gdp_file:
    df = pickle.load(gdp_file)

## Structure

Brief overview of the dataset

### Data Format

In [61]:
# Preview the first 10 rows to see the columns and typical values
df.head(10)

Unnamed: 0,FIPS,GeoName,Region,TableName,LineCode,IndustryClassification,Description,Unit,2017,2018,2019,2020,2021,2022
0,0,United States,,CAGDP9,1,...,All industry total,Thousands of chained 2017 dollars,19612102000.0,20193896000.0,20692087000.0,20234074000.0,21407692000.0,21822037000.0
1,0,United States,,CAGDP9,2,...,Private industries,Thousands of chained 2017 dollars,17156255000.0,17711775000.0,18195752000.0,17761871000.0,18909453000.0,19283116000.0
2,0,United States,,CAGDP9,3,11,"Agriculture, forestry, fishing and hunting",Thousands of chained 2017 dollars,176840000.0,184105000.0,171149000.0,173659000.0,183742000.0,170058000.0
3,0,United States,,CAGDP9,6,21,"Mining, quarrying, and oil and gas extraction",Thousands of chained 2017 dollars,267302000.0,277013000.0,314604000.0,305588000.0,269478000.0,245290000.0
4,0,United States,,CAGDP9,10,22,Utilities,Thousands of chained 2017 dollars,313711000.0,309269000.0,312920000.0,332336000.0,316603000.0,318392000.0
5,0,United States,,CAGDP9,11,23,Construction,Thousands of chained 2017 dollars,840220000.0,863755000.0,882046000.0,856487000.0,888104000.0,827768000.0
6,0,United States,,CAGDP9,12,31-33,Manufacturing,Thousands of chained 2017 dollars,2109718000.0,2213031000.0,2223326000.0,2127060000.0,2248052000.0,2277757000.0
7,0,United States,,CAGDP9,13,"321,327-339",Durable goods manufacturing,Thousands of chained 2017 dollars,1178271000.0,1227629000.0,1241367000.0,1171720000.0,1249509000.0,1296970000.0
8,0,United States,,CAGDP9,25,"311-316,322-326",Nondurable goods manufacturing,Thousands of chained 2017 dollars,931447000.0,985229000.0,981959000.0,955683000.0,999648000.0,985406000.0
9,0,United States,,CAGDP9,34,42,Wholesale trade,Thousands of chained 2017 dollars,1176146000.0,1185896000.0,1191436000.0,1200151000.0,1192824000.0,1146825000.0


In [62]:
# Column dtypes: check which columns were parsed as numbers and which as text
df.dtypes

FIPS                        int64
GeoName                    object
Region                     object
TableName                  object
LineCode                    int64
IndustryClassification     object
Description                object
Unit                       object
2017                      float64
2018                      float64
2019                      float64
2020                      float64
2021                      float64
2022                      float64
dtype: object

### Remove unnecessary data for our task

"Over-aggregated" records via FIPS:
- US-wide GDPs
- FIPS totals
- Private totals

NAICS codes which don't fit our market:
- drop industries not related to metal working
- focus on 11, 21, 22, 23, 31-33

In [63]:
# Drop the over-aggregated records: the US-wide rows (FIPS 0) first, then the
# rows without an industry code ("..."), i.e. the all-industry and private totals
is_not_us_wide = df["FIPS"].ne(0)
df = df[is_not_us_wide]
has_industry_code = df["IndustryClassification"].ne("...")
df = df[has_industry_code]

# Keep only classifications that start with one of the NAICS sectors of interest
naics_prefixes = ("11", "21", "22", "23", "31", "32", "33")
naics_filter = "|".join("^" + prefix for prefix in naics_prefixes)
starts_with_sector = df["IndustryClassification"].str.contains(naics_filter)
df = df.loc[starts_with_sector]
# NOTE(review): the prefix match also keeps combined rows such as "31-33,51" and
# "22,48-49" (they show up in the top-industries table) — confirm this is intended.
# NOTE(review): FIPS values like 1000 and 98000 survive this filter; they look like
# state/region aggregates that would be counted on top of the county rows in any
# sum over areas — TODO confirm and drop them if so.

### Content Description

In [64]:
# Summary statistics for the numeric columns of the filtered frame
df.describe()

Unnamed: 0,FIPS,LineCode,2017,2018,2019,2020,2021,2022
count,31770.0,31770.0,26553.0,26489.0,26249.0,26462.0,26570.0,27115.0
mean,30301.32,34.6,1150981.64,1201118.54,1237101.28,1194253.41,1253288.36,1240593.52
std,15461.83,35.81,12656432.93,13303683.52,13785083.66,13426488.02,14467432.22,14575318.89
min,1000.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,18157.0,10.0,9377.0,9572.0,9810.0,10062.25,11193.75,10048.0
50%,29151.0,12.5,45309.0,46404.0,47052.0,47401.5,52661.0,49690.0
75%,45043.0,87.0,207074.0,214071.0,216414.0,208956.5,215579.25,213971.0
max,98000.0,90.0,746914680.0,797666904.0,859510532.0,877152423.0,986444700.0,1010180851.0


In [65]:
# Number of missing values per column
df.isna().sum()

FIPS                         0
GeoName                      0
Region                       0
TableName                    0
LineCode                     0
IndustryClassification       0
Description                  0
Unit                         0
2017                      5217
2018                      5281
2019                      5521
2020                      5308
2021                      5200
2022                      4655
dtype: int64

## Data Analysis

Top 10 industries by generated value in ***2022***

In [66]:
# Total GDP per industry and year, summed over every remaining area,
# ranked by the 2022 total
year_columns = [str(year) for year in range(2017, 2023)]
industry_keys = ["IndustryClassification", "Description"]

gdp_by_industry = df.groupby(industry_keys)[year_columns].sum()
top_industries = gdp_by_industry.reset_index().sort_values(by="2022", ascending=False)

top_industries.head(10)

Unnamed: 0,IndustryClassification,Description,2017,2018,2019,2020,2021,2022
7,"31-33,51",Manufacturing and information,9280243920.0,9735987237.0,10083971728.0,9947999865.0,10806349751.0,11233345677.0
6,31-33,Manufacturing,6303678118.0,6607936635.0,6642889526.0,6353740767.0,6726617219.0,6809586832.0
9,"321,327-339",Durable goods manufacturing,3528738656.0,3675680699.0,3718305615.0,3512050183.0,3746374584.0,3892390541.0
8,"311-316,322-326",Nondurable goods manufacturing,2786092512.0,2945577109.0,2933192468.0,2850567220.0,2989384313.0,2940975130.0
4,"22,48-49",Transportation and utilities,2663723921.0,2691656290.0,2721750645.0,2620685496.0,2813822670.0,2933333511.0
5,23,Construction,2504011752.0,2575377369.0,2632920205.0,2556998503.0,2651922125.0,2471916270.0
1,1121,Natural resources and mining,1275530786.0,1329960208.0,1399298392.0,1384137916.0,1312505437.0,1200288749.0
3,22,Utilities,913465796.0,898899015.0,909249001.0,962765669.0,921435679.0,939502674.0
2,21,"Mining, quarrying, and oil and gas extraction",801886963.0,830967240.0,943847461.0,917071685.0,780216143.0,707038921.0
0,11,"Agriculture, forestry, fishing and hunting",504642996.0,524387312.0,487246501.0,496316353.0,551243827.0,510315065.0


In [67]:
# Average each industry's yearly totals and re-rank by that average
yearly_totals = top_industries[[str(year) for year in range(2017, 2023)]]
top_industries = (
    top_industries
    .assign(mean_gdp=yearly_totals.mean(axis=1))
    .sort_values(by="mean_gdp", ascending=False)
)

top_industries.head(10)

Unnamed: 0,IndustryClassification,Description,2017,2018,2019,2020,2021,2022,mean_gdp
7,"31-33,51",Manufacturing and information,9280243920.0,9735987237.0,10083971728.0,9947999865.0,10806349751.0,11233345677.0,10181316363.0
6,31-33,Manufacturing,6303678118.0,6607936635.0,6642889526.0,6353740767.0,6726617219.0,6809586832.0,6574074849.5
9,"321,327-339",Durable goods manufacturing,3528738656.0,3675680699.0,3718305615.0,3512050183.0,3746374584.0,3892390541.0,3678923379.67
8,"311-316,322-326",Nondurable goods manufacturing,2786092512.0,2945577109.0,2933192468.0,2850567220.0,2989384313.0,2940975130.0,2907631458.67
4,"22,48-49",Transportation and utilities,2663723921.0,2691656290.0,2721750645.0,2620685496.0,2813822670.0,2933333511.0,2740828755.5
5,23,Construction,2504011752.0,2575377369.0,2632920205.0,2556998503.0,2651922125.0,2471916270.0,2565524370.67
1,1121,Natural resources and mining,1275530786.0,1329960208.0,1399298392.0,1384137916.0,1312505437.0,1200288749.0,1316953581.33
3,22,Utilities,913465796.0,898899015.0,909249001.0,962765669.0,921435679.0,939502674.0,924219639.0
2,21,"Mining, quarrying, and oil and gas extraction",801886963.0,830967240.0,943847461.0,917071685.0,780216143.0,707038921.0,830171402.17
0,11,"Agriculture, forestry, fishing and hunting",504642996.0,524387312.0,487246501.0,496316353.0,551243827.0,510315065.0,512358675.67
