# Data Understanding

## Preparation

Import packages and set globals

In [1]:
import pickle
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

pd.set_option("display.width", 1000)
pd.set_option('display.max_columns', None)
pd.options.display.float_format = '{:.2f}'.format

%matplotlib inline
plt.rcParams['figure.figsize'] = (20, 6)

Import the naics_pattern.pkl file

In [2]:
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
# The context manager closes the file handle as soon as the object is loaded.
with open("../data/raw/naics_pattern.pkl", "rb") as raw_file:
    df = pickle.load(raw_file)

## Structure

Brief overview of the dataset

### Data Format

In [3]:
# Preview the first ten rows to see the available columns and how values are encoded.
df.head(10)

Unnamed: 0,State_GEOID,County_GEOID,FIPS,naics_2,naics,DESCRIPTION,emp_nf,emp,qp1_nf,qp1,ap_nf,ap,est,n<5,n5_9,n10_19,n20_49,n50_99,n100_249,n250_499,n500_999,n1000,n1000_1,n1000_2,n1000_3,n1000_4
0,1,1,1001,11,1133,Logging,G,68,G,1213,G,4563,7,6,N,N,N,N,N,N,N,N,N,N,N,N
1,1,1,1001,21,2123,Nonmetallic Mineral Mining and Quarrying,G,87,G,1224,G,5144,6,N,N,N,N,N,N,N,N,N,N,N,N,N
2,1,1,1001,22,2211,"Electric Power Generation, Transmission and Di...",G,129,G,4418,G,16342,4,N,N,N,N,N,N,N,N,N,N,N,N,N
4,1,1,1001,23,2362,Nonresidential Building Construction,H,69,H,685,H,4184,7,3,N,N,N,N,N,N,N,N,N,N,N,N
5,1,1,1001,23,2371,Utility System Construction,H,65,H,1117,H,4574,4,N,N,N,N,N,N,N,N,N,N,N,N,N
6,1,1,1001,23,2381,"Foundation, Structure, and Building Exterior C...",H,44,G,401,G,1775,12,7,4,N,N,N,N,N,N,N,N,N,N,N
7,1,1,1001,23,2382,Building Equipment Contractors,H,157,G,1514,G,7822,27,15,7,4,N,N,N,N,N,N,N,N,N,N
8,1,1,1001,23,2383,Building Finishing Contractors,H,81,H,753,H,3630,13,7,5,N,N,N,N,N,N,N,N,N,N,N
10,1,1,1001,32,3231,Printing and Related Support Activities,J,33,H,280,H,1334,4,N,N,N,N,N,N,N,N,N,N,N,N,N
11,1,1,1001,42,4238,"Machinery, Equipment, and Supplies Merchant Wh...",J,51,J,867,H,3847,6,3,N,N,N,N,N,N,N,N,N,N,N,N


In [4]:
# List the dtype pandas assigned to every column.
df.dtypes

State_GEOID     object
County_GEOID    object
FIPS             int64
naics_2         object
naics           object
DESCRIPTION     object
emp_nf          object
emp              int64
qp1_nf          object
qp1              int64
ap_nf           object
ap               int64
est              int64
n<5             object
n5_9            object
n10_19          object
n20_49          object
n50_99          object
n100_249        object
n250_499        object
n500_999        object
n1000           object
n1000_1         object
n1000_2         object
n1000_3         object
n1000_4         object
dtype: object

### Remove unnecessary data for our task

Drop the NAICS codes that don't fit our market:
- industries outside metal working are removed
- we keep only the sectors 11, 21, 22, 23 and 31-33

In [5]:
# Keep only the rows whose NAICS code starts with one of the sectors in scope;
# every other row is dropped.
naics_filter = "|".join(
    ["^11", "^21", "^22", "^23", "^31", "^32", "^33"]
)
in_scope = df["naics"].str.contains(naics_filter)
df = df[in_scope]

### Content Description

In [6]:
# Summary statistics for the numeric columns of the filtered frame.
df.describe()

Unnamed: 0,FIPS,emp,qp1,ap,est
count,31204.0,31204.0,31204.0,31204.0,31204.0
mean,30429.05,410.71,7234.32,30209.69,22.23
std,15334.79,1307.14,30006.46,118199.59,81.66
min,1001.0,0.0,0.0,19.0,3.0
25%,18059.0,36.0,381.0,1819.75,4.0
50%,30096.0,106.0,1330.0,6144.0,6.0
75%,42105.0,324.0,4660.25,20522.75,14.0
max,56999.0,45246.0,1396549.0,7032755.0,4935.0


In [7]:
# Count the missing values in each column.
df.isna().sum()

State_GEOID     0
County_GEOID    0
FIPS            0
naics_2         0
naics           0
DESCRIPTION     0
emp_nf          0
emp             0
qp1_nf          0
qp1             0
ap_nf           0
ap              0
est             0
n<5             0
n5_9            0
n10_19          0
n20_49          0
n50_99          0
n100_249        0
n250_499        0
n500_999        0
n1000           0
n1000_1         0
n1000_2         0
n1000_3         0
n1000_4         0
dtype: int64

## Data Analysis

Top 10 industries by ***annual pay***

In [8]:
# Sum ap, qp1 and emp over all rows that share the same industry.
highest_pay = (
    df.groupby(["naics", "DESCRIPTION"])[["ap", "qp1", "emp"]]
    .agg("sum")
)

# Rank the industries by their summed ap, largest first.
highest_ap = highest_pay.sort_values("ap", ascending=False).reset_index()

highest_ap.head(10)

Unnamed: 0,naics,DESCRIPTION,ap,qp1,emp
0,2382,Building Equipment Contractors,149468880,33728909,2171001
1,2211,"Electric Power Generation, Transmission and Di...",57228297,17965188,451638
2,2362,Nonresidential Building Construction,56994622,13047402,657152
3,2381,"Foundation, Structure, and Building Exterior C...",54423066,11455421,901696
4,2371,Utility System Construction,46247985,10155569,567730
5,2383,Building Finishing Contractors,45172746,9752111,811812
6,3330A1,"3331, 3332, 3334, 3339",37247397,9289730,488895
7,3345,"Navigational, Measuring, Electromedical, and C...",36803196,9441390,364204
8,3364,Aerospace Product and Parts Manufacturing,35180914,9310549,345099
9,3261,Plastics Product Manufacturing,32199243,7618233,572426


Top 10 industries by ***pay per person***

AP:

In [9]:
# Summed ap divided by summed emp for each industry; the column is stored on
# highest_pay itself, so later cells see it too.
highest_pay["ap_per_emp"] = highest_pay["ap"].div(highest_pay["emp"])

highest_ap_per_emp = (
    highest_pay
    .sort_values("ap_per_emp", ascending=False)
    .reset_index()
)

highest_ap_per_emp.head(10)

Unnamed: 0,naics,DESCRIPTION,ap,qp1,emp,ap_per_emp
0,2111,Oil and Gas Extraction,12736937,3864511,84064,151.51
1,3361,Motor Vehicle Manufacturing,10499623,2375538,79173,132.62
2,2211,"Electric Power Generation, Transmission and Di...",57228297,17965188,451638,126.71
3,3241,Petroleum and Coal Products Manufacturing,8453662,2232819,72063,117.31
4,2212,Natural Gas Distribution,7548286,2252540,66339,113.78
5,3364,Aerospace Product and Parts Manufacturing,35180914,9310549,345099,101.94
6,3345,"Navigational, Measuring, Electromedical, and C...",36803196,9441390,364204,101.05
7,3344,Semiconductor and Other Electronic Component M...,21295889,5487947,214664,99.21
8,3250A1,"3251, 3252, 3253, 3259",23082038,5849449,236805,97.47
9,3311,Iron and Steel Mills and Ferroalloy Manufacturing,4242424,1080778,44071,96.26


QP1:

In [10]:

# Summed qp1 divided by summed emp for each industry, added as a further
# column on highest_pay.
highest_pay["qp1_per_emp"] = highest_pay["qp1"].div(highest_pay["emp"])

highest_qp1_per_emp = (
    highest_pay
    .sort_values("qp1_per_emp", ascending=False)
    .reset_index()
)

highest_qp1_per_emp.head(10)

Unnamed: 0,naics,DESCRIPTION,ap,qp1,emp,ap_per_emp,qp1_per_emp
0,2111,Oil and Gas Extraction,12736937,3864511,84064,151.51,45.97
1,2211,"Electric Power Generation, Transmission and Di...",57228297,17965188,451638,126.71,39.78
2,2212,Natural Gas Distribution,7548286,2252540,66339,113.78,33.95
3,3241,Petroleum and Coal Products Manufacturing,8453662,2232819,72063,117.31,30.98
4,3361,Motor Vehicle Manufacturing,10499623,2375538,79173,132.62,30.0
5,3364,Aerospace Product and Parts Manufacturing,35180914,9310549,345099,101.94,26.98
6,2122,Metal Ore Mining,2777279,778457,29008,95.74,26.84
7,3345,"Navigational, Measuring, Electromedical, and C...",36803196,9441390,364204,101.05,25.92
8,3344,Semiconductor and Other Electronic Component M...,21295889,5487947,214664,99.21,25.57
9,3250A1,"3251, 3252, 3253, 3259",23082038,5849449,236805,97.47,24.7


Top 10 industries by ***employees per establishment***

In [11]:
# Per industry: summed emp and est, their ratio, ranked by that ratio (largest first).
highest_emp = (
    df.groupby(["naics", "DESCRIPTION"])[["emp", "est"]]
    .sum()
    .assign(emp_per_est=lambda totals: totals["emp"] / totals["est"])
    .sort_values("emp_per_est", ascending=False)
    .reset_index()
)

highest_emp.head(10)

Unnamed: 0,naics,DESCRIPTION,emp,est,emp_per_est
0,3361,Motor Vehicle Manufacturing,79173,117,676.69
1,3311,Iron and Steel Mills and Ferroalloy Manufacturing,44071,152,289.94
2,3364,Aerospace Product and Parts Manufacturing,345099,1310,263.43
3,2122,Metal Ore Mining,29008,156,185.95
4,3313,Alumina and Aluminum Production and Processing,20149,147,137.07
5,3363,Motor Vehicle Parts Manufacturing,464689,3700,125.59
6,3366,Ship and Boat Building,110876,968,114.54
7,3365,Railroad Rolling Stock Manufacturing,5785,51,113.43
8,3336,"Engine, Turbine, and Power Transmission Equipm...",46179,430,107.39
9,3362,Motor Vehicle Body and Trailer Manufacturing,110704,1175,94.22


## Export

Store the top 5 industries of each ranking in top_industries.pkl: load the existing file, add the new columns, and write it back.

Import top_industries.pkl

In [12]:
# Load the existing table so the columns written below are added to it.
# This notebook also writes this file in its last cell, so on a first run it
# may not exist yet; start from an empty frame in that case instead of failing.
try:
    top_industries = pd.read_pickle("../data/processed/top_industries.pkl")
except FileNotFoundError:
    top_industries = pd.DataFrame()

Export the top 5 industries into top_industries.pkl

In [13]:
# Target column -> ranking frame whose five leading NAICS codes are stored.
# Each ranking frame went through reset_index(), so the five values carry the
# index labels 0-4 when they are assigned.
rankings = {
    "by_ap": highest_ap,
    "by_ap_per_emp": highest_ap_per_emp,
    "by_qp1_per_emp": highest_qp1_per_emp,
    "by_emp_per_est": highest_emp,
}
for column, ranking in rankings.items():
    top_industries[column] = ranking["naics"].head(5)

In [14]:
# Write the extended table back to the file it was loaded from.
top_industries.to_pickle("../data/processed/top_industries.pkl")