# Data Understanding

## Preperation

Import packages and set globals

In [70]:
import pickle
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

pd.set_option("display.width", 1000)
pd.set_option('display.max_columns', None)
pd.options.display.float_format = '{:.2f}'.format

%matplotlib inline
plt.rcParams['figure.figsize'] = (20, 6)

Import the occupation.pkl file

In [71]:
df = pd.read_pickle("../data/raw/naics_occupation.pkl")

## Structure

Brief overview of the dataset

### Data Format

In [72]:
df.head(10)

Unnamed: 0,FIPS,State_GEOID,naics,NAICS_TITLE,emp_total_county_naics,OCC_CODE,OCC_TITLE,emp_occupation
0,13073,13,2373,"Highway, Street, and Bridge Construction",27,11-3051,Industrial Production Managers,0.02
1,13073,13,2381,"Foundation, Structure, and Building Exterior C...",231,11-3051,Industrial Production Managers,0.07
2,13073,13,2382,Building Equipment Contractors,868,11-3051,Industrial Production Managers,0.13
3,13073,13,2383,Building Finishing Contractors,281,11-3051,Industrial Production Managers,0.08
5,13073,13,3211,Sawmills and Wood Preservation,116,11-3051,Industrial Production Managers,0.98
6,13073,13,3219,Other Wood Product Manufacturing,27,11-3051,Industrial Production Managers,0.26
7,13073,13,3231,Printing and Related Support Activities,24,11-3051,Industrial Production Managers,0.26
8,13073,13,3327,"Machine Shops; Turned Product; and Screw, Nut,...",18,11-3051,Industrial Production Managers,0.31
9,13073,13,3363,Motor Vehicle Parts Manufacturing,58,11-3051,Industrial Production Managers,0.93
10,13073,13,3370A1,"3371, 3372",108,11-3051,Industrial Production Managers,0.97


In [73]:
df.dtypes

FIPS                        int64
State_GEOID                 int64
naics                      object
NAICS_TITLE                object
emp_total_county_naics      int64
OCC_CODE                   object
OCC_TITLE                  object
emp_occupation            float64
dtype: object

### Remove unnecessary data for our task

NAICS which don't fit our market:
- not in metal working
- focus on 11, 21, 22, 23, 31-33 

In [74]:
# Remove NAICS as stated
naics_filter = "|".join(["^11", "^21", "^22", "^23", "^31", "^32", "^33"])
df = df.loc[df["naics"].str.contains(naics_filter)]

### Content Description

In [75]:
df.describe()

Unnamed: 0,FIPS,State_GEOID,emp_total_county_naics,emp_occupation
count,711429.0,711429.0,711429.0,711429.0
mean,30455.95,30.36,493.68,4.38
std,15347.5,15.33,1435.48,28.72
min,1001.0,1.0,1.0,0.0
25%,18055.0,18.0,49.0,0.06
50%,31065.0,31.0,139.0,0.29
75%,42101.0,42.0,415.0,1.54
max,56999.0,56.0,45246.0,5323.3


In [76]:
df.isna().sum()

FIPS                      0
State_GEOID               0
naics                     0
NAICS_TITLE               0
emp_total_county_naics    0
OCC_CODE                  0
OCC_TITLE                 0
emp_occupation            0
dtype: int64

## Data Analysis

Top 10 industries by ***employment count***

In [77]:
highest_employment = df.groupby(["naics", "NAICS_TITLE"])["emp_total_county_naics"].sum()
highest_employment = highest_employment.sort_values(ascending=False).reset_index()

highest_employment.head(10)

Unnamed: 0,naics,NAICS_TITLE,emp_total_county_naics
0,2382,Building Equipment Contractors,44530977
1,2381,"Foundation, Structure, and Building Exterior C...",22879267
2,3261,Plastics Product Manufacturing,21951362
3,3330A1,"3331, 3332, 3334, 3339",21783057
4,3363,Motor Vehicle Parts Manufacturing,20955381
5,2383,Building Finishing Contractors,15246977
6,3364,Aerospace Product and Parts Manufacturing,14287086
7,3320A1,"3321, 3322, 3325, 3326, 3329",14116687
8,3320A2,"3323, 3323",13579231
9,2371,Utility System Construction,13207061


## Export

Export top 5 industries into top_industries.pkl

Import top_industries.pkl

In [78]:
top_industries = pd.read_pickle("../data/processed/top_industries.pkl")

Export the top 5 industries into top_industries.pkl

In [79]:
top_industries["by_employees"] = highest_employment["naics"].head(5)

In [80]:
pd.to_pickle(top_industries, "../data/processed/top_industries.pkl")

## Top Jobs

Filter out the NAICS we established as being interesting

In [81]:
naics_filter = ["3311", "3330A1", "3361", "3363", "2381"]
df = df[df["naics"].isin(naics_filter)]

Group by occupation, to find out the highest personal per job

In [82]:
top_occupations = df.groupby(["OCC_CODE", "OCC_TITLE"])["emp_occupation"].sum()
top_occupations = top_occupations.sort_values(ascending=False)

In [83]:
top_occupations.head(10)

OCC_CODE  OCC_TITLE                                                                              
51-4121   Welders, Cutters, Solderers, and Brazers                                                  86414.26
51-4031   Cutting, Punching, and Press Machine Setters, Operators, and Tenders, Metal and Plastic   52591.47
51-4041   Machinists                                                                                44634.91
17-2112   Industrial Engineers                                                                      43348.76
17-2141   Mechanical Engineers                                                                      39079.29
49-9071   Maintenance and Repair Workers, General                                                   38633.93
51-2031   Engine and Other Machine Assemblers                                                       37135.57
49-9041   Industrial Machinery Mechanics                                                            36522.61
47-2221   Structural Iron and 