# Data Understanding

## Preperation

Import packages and set globals

In [1]:
import pickle
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

pd.set_option("display.width", 1000)
pd.set_option('display.max_columns', None)

%matplotlib inline
plt.rcParams['figure.figsize'] = (20, 6)

Import the occupation.pkl file

In [2]:
df = pickle.load(open("../data/raw/naics_occupation.pkl", "rb"))

## Structure

Brief overview of the dataset

### Data Format

In [3]:
df.head(10)

Unnamed: 0,FIPS,State_GEOID,naics,NAICS_TITLE,emp_total_county_naics,OCC_CODE,OCC_TITLE,emp_occupation
0,13073,13,2373,"Highway, Street, and Bridge Construction",27,11-3051,Industrial Production Managers,0.022497
1,13073,13,2381,"Foundation, Structure, and Building Exterior C...",231,11-3051,Industrial Production Managers,0.06749
2,13073,13,2382,Building Equipment Contractors,868,11-3051,Industrial Production Managers,0.134981
3,13073,13,2383,Building Finishing Contractors,281,11-3051,Industrial Production Managers,0.078739
5,13073,13,3211,Sawmills and Wood Preservation,116,11-3051,Industrial Production Managers,0.978609
6,13073,13,3219,Other Wood Product Manufacturing,27,11-3051,Industrial Production Managers,0.258713
7,13073,13,3231,Printing and Related Support Activities,24,11-3051,Industrial Production Managers,0.258713
8,13073,13,3327,"Machine Shops; Turned Product; and Screw, Nut,...",18,11-3051,Industrial Production Managers,0.314955
9,13073,13,3363,Motor Vehicle Parts Manufacturing,58,11-3051,Industrial Production Managers,0.933616
10,13073,13,3370A1,"3371, 3372",108,11-3051,Industrial Production Managers,0.967361


In [4]:
df.dtypes

FIPS                        int64
State_GEOID                 int64
naics                      object
NAICS_TITLE                object
emp_total_county_naics      int64
OCC_CODE                   object
OCC_TITLE                  object
emp_occupation            float64
dtype: object

### Remove unnecessary data for our task

NAICS which don't fit our market:
- not in metal working
- focus on 11, 21, 22, 23, 31-33 

In [5]:
# Remove NAICS as stated
naics_filter = "|".join(["^11", "^21", "^22", "^23", "^31", "^32", "^33"])
df = df.loc[df["naics"].str.contains(naics_filter)]

KeyError: 'IndustryClassification'

### Content Description

In [None]:
df.describe()

Unnamed: 0,FIPS,State_GEOID,emp_total_county_naics,emp_occupation
count,1108336.0,1108336.0,1108336.0,1108336.0
mean,30439.56,30.33609,795.4656,4.883497
std,15314.05,15.29425,12172.43,49.37105
min,1001.0,1.0,1.0,0.0006261349
25%,18063.0,18.0,47.0,0.05300153
50%,30111.0,30.0,134.0,0.2462883
75%,42111.0,42.0,418.0,1.356463
max,56999.0,56.0,1436559.0,26192.83


In [None]:
df.isna().sum()

FIPS                      0
State_GEOID               0
naics                     0
NAICS_TITLE               0
emp_total_county_naics    0
OCC_CODE                  0
OCC_TITLE                 0
emp_occupation            0
dtype: int64

## Data Analysis

Top 10 industries by employment count

In [None]:
highest_employment = df.groupby(["naics", "NAICS_TITLE"])["emp_total_county_naics"].sum()
highest_employment = highest_employment.sort_values(ascending=False)

highest_employment.head(10)

naics   NAICS_TITLE                                             
5613    Employment Services                                         366026816
5413    Architectural, Engineering, and Related Services             48325842
2382    Building Equipment Contractors                               44530977
4238    Machinery, Equipment, and Supplies Merchant Wholesalers      26752088
2381    Foundation, Structure, and Building Exterior Contractors     22879267
3261    Plastics Product Manufacturing                               21951362
3330A1  3331, 3332, 3334, 3339                                       21783057
3363    Motor Vehicle Parts Manufacturing                            20955381
5617    Services to Buildings and Dwellings                          20539859
8111    Automotive Repair and Maintenance                            17402518
Name: emp_total_county_naics, dtype: int64