# Data Understanding

## Preperation

Import packages and set globals

In [23]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

pd.set_option("display.width", 1000)
pd.set_option("display.max_columns", None)
pd.options.display.float_format = "{:.2f}".format

%matplotlib inline
plt.rcParams["figure.figsize"] = (20, 6)

In [24]:
gdp_df = pd.read_pickle("../data/raw/gdp.pkl")
occ_df = pd.read_pickle("../data/raw/naics_occupation.pkl")
ptn_df = pd.read_pickle("../data/raw/naics_pattern.pkl")

Based on instruction given to us, we preemptivly drop uninteresting NAICS

In [25]:
naics_filter = "|".join(["^11", "^21", "^22", "^23", "^31", "^32", "^33"])

gdp_df = gdp_df.loc[gdp_df["IndustryClassification"].str.contains(naics_filter)]
occ_df = occ_df.loc[occ_df["naics"].str.contains(naics_filter)]
ptn_df = ptn_df.loc[ptn_df["naics"].str.contains(naics_filter)]

## Structure

Brief overview of the different datasets given to us

### Data Format

In [26]:
gdp_df.head(10)

Unnamed: 0,FIPS,GeoName,Region,TableName,LineCode,IndustryClassification,Description,Unit,2017,2018,2019,2020,2021,2022
2,0,United States,,CAGDP9,3,11,"Agriculture, forestry, fishing and hunting",Thousands of chained 2017 dollars,176840000.0,184105000.0,171149000.0,173659000.0,183742000.0,170058000.0
3,0,United States,,CAGDP9,6,21,"Mining, quarrying, and oil and gas extraction",Thousands of chained 2017 dollars,267302000.0,277013000.0,314604000.0,305588000.0,269478000.0,245290000.0
4,0,United States,,CAGDP9,10,22,Utilities,Thousands of chained 2017 dollars,313711000.0,309269000.0,312920000.0,332336000.0,316603000.0,318392000.0
5,0,United States,,CAGDP9,11,23,Construction,Thousands of chained 2017 dollars,840220000.0,863755000.0,882046000.0,856487000.0,888104000.0,827768000.0
6,0,United States,,CAGDP9,12,31-33,Manufacturing,Thousands of chained 2017 dollars,2109718000.0,2213031000.0,2223326000.0,2127060000.0,2248052000.0,2277757000.0
7,0,United States,,CAGDP9,13,"321,327-339",Durable goods manufacturing,Thousands of chained 2017 dollars,1178271000.0,1227629000.0,1241367000.0,1171720000.0,1249509000.0,1296970000.0
8,0,United States,,CAGDP9,25,"311-316,322-326",Nondurable goods manufacturing,Thousands of chained 2017 dollars,931447000.0,985229000.0,981959000.0,955683000.0,999648000.0,985406000.0
28,0,United States,,CAGDP9,87,1121,Natural resources and mining,Thousands of chained 2017 dollars,444143000.0,460949703.0,487685301.0,483389355.0,455053638.0,416036526.0
30,0,United States,,CAGDP9,89,"22,48-49",Transportation and utilities,Thousands of chained 2017 dollars,949174000.0,966855752.0,971436131.0,936020785.0,1000941475.0,1016052876.0
31,0,United States,,CAGDP9,90,"31-33,51",Manufacturing and information,Thousands of chained 2017 dollars,3119738000.0,3278878278.0,3396542443.0,3342047889.0,3639162084.0,3758807082.0


In [27]:
occ_df.head(10)

Unnamed: 0,FIPS,State_GEOID,naics,NAICS_TITLE,emp_total_county_naics,OCC_CODE,OCC_TITLE,emp_occupation
0,13073,13,2373,"Highway, Street, and Bridge Construction",27,11-3051,Industrial Production Managers,0.02
1,13073,13,2381,"Foundation, Structure, and Building Exterior C...",231,11-3051,Industrial Production Managers,0.07
2,13073,13,2382,Building Equipment Contractors,868,11-3051,Industrial Production Managers,0.13
3,13073,13,2383,Building Finishing Contractors,281,11-3051,Industrial Production Managers,0.08
5,13073,13,3211,Sawmills and Wood Preservation,116,11-3051,Industrial Production Managers,0.98
6,13073,13,3219,Other Wood Product Manufacturing,27,11-3051,Industrial Production Managers,0.26
7,13073,13,3231,Printing and Related Support Activities,24,11-3051,Industrial Production Managers,0.26
8,13073,13,3327,"Machine Shops; Turned Product; and Screw, Nut,...",18,11-3051,Industrial Production Managers,0.31
9,13073,13,3363,Motor Vehicle Parts Manufacturing,58,11-3051,Industrial Production Managers,0.93
10,13073,13,3370A1,"3371, 3372",108,11-3051,Industrial Production Managers,0.97


In [28]:
ptn_df.head(10)

Unnamed: 0,State_GEOID,County_GEOID,FIPS,naics_2,naics,DESCRIPTION,emp_nf,emp,qp1_nf,qp1,ap_nf,ap,est,n<5,n5_9,n10_19,n20_49,n50_99,n100_249,n250_499,n500_999,n1000,n1000_1,n1000_2,n1000_3,n1000_4
0,1,1,1001,11,1133,Logging,G,68,G,1213,G,4563,7,6,N,N,N,N,N,N,N,N,N,N,N,N
1,1,1,1001,21,2123,Nonmetallic Mineral Mining and Quarrying,G,87,G,1224,G,5144,6,N,N,N,N,N,N,N,N,N,N,N,N,N
2,1,1,1001,22,2211,"Electric Power Generation, Transmission and Di...",G,129,G,4418,G,16342,4,N,N,N,N,N,N,N,N,N,N,N,N,N
4,1,1,1001,23,2362,Nonresidential Building Construction,H,69,H,685,H,4184,7,3,N,N,N,N,N,N,N,N,N,N,N,N
5,1,1,1001,23,2371,Utility System Construction,H,65,H,1117,H,4574,4,N,N,N,N,N,N,N,N,N,N,N,N,N
6,1,1,1001,23,2381,"Foundation, Structure, and Building Exterior C...",H,44,G,401,G,1775,12,7,4,N,N,N,N,N,N,N,N,N,N,N
7,1,1,1001,23,2382,Building Equipment Contractors,H,157,G,1514,G,7822,27,15,7,4,N,N,N,N,N,N,N,N,N,N
8,1,1,1001,23,2383,Building Finishing Contractors,H,81,H,753,H,3630,13,7,5,N,N,N,N,N,N,N,N,N,N,N
10,1,1,1001,32,3231,Printing and Related Support Activities,J,33,H,280,H,1334,4,N,N,N,N,N,N,N,N,N,N,N,N,N
54,1,3,1003,21,2123,Nonmetallic Mineral Mining and Quarrying,J,29,J,306,J,1291,3,N,N,N,N,N,N,N,N,N,N,N,N,N


### Info

In [29]:
gdp_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 31780 entries, 2 to 108049
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   FIPS                    31780 non-null  int64  
 1   GeoName                 31780 non-null  object 
 2   Region                  31780 non-null  object 
 3   TableName               31780 non-null  object 
 4   LineCode                31780 non-null  int64  
 5   IndustryClassification  31780 non-null  object 
 6   Description             31780 non-null  object 
 7   Unit                    31780 non-null  object 
 8   2017                    26563 non-null  float64
 9   2018                    26499 non-null  float64
 10  2019                    26259 non-null  float64
 11  2020                    26472 non-null  float64
 12  2021                    26580 non-null  float64
 13  2022                    27125 non-null  float64
dtypes: float64(6), int64(2), object(6)
memory 

In [30]:
occ_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 711429 entries, 0 to 1820150
Data columns (total 8 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   FIPS                    711429 non-null  int64  
 1   State_GEOID             711429 non-null  int64  
 2   naics                   711429 non-null  object 
 3   NAICS_TITLE             711429 non-null  object 
 4   emp_total_county_naics  711429 non-null  int64  
 5   OCC_CODE                711429 non-null  object 
 6   OCC_TITLE               711429 non-null  object 
 7   emp_occupation          711429 non-null  float64
dtypes: float64(1), int64(3), object(4)
memory usage: 48.8+ MB


In [31]:
ptn_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 31204 entries, 0 to 188572
Data columns (total 26 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   State_GEOID   31204 non-null  object
 1   County_GEOID  31204 non-null  object
 2   FIPS          31204 non-null  int64 
 3   naics_2       31204 non-null  object
 4   naics         31204 non-null  object
 5   DESCRIPTION   31204 non-null  object
 6   emp_nf        31204 non-null  object
 7   emp           31204 non-null  int64 
 8   qp1_nf        31204 non-null  object
 9   qp1           31204 non-null  int64 
 10  ap_nf         31204 non-null  object
 11  ap            31204 non-null  int64 
 12  est           31204 non-null  int64 
 13  n<5           31204 non-null  object
 14  n5_9          31204 non-null  object
 15  n10_19        31204 non-null  object
 16  n20_49        31204 non-null  object
 17  n50_99        31204 non-null  object
 18  n100_249      31204 non-null  object
 19  n250_499

## Analysis

### Top Industries



In [32]:
top_industries = pd.DataFrame()

#### GDP

By 2022 gdp

In [33]:
highest_gdp = gdp_df.groupby(["IndustryClassification", "Description"])[["2017", "2018", "2019", "2020", "2021", "2022"]].sum().reset_index()
top_industries["2022_gdp"] = highest_gdp.sort_values(by="2022", ascending=False).reset_index()["IndustryClassification"]

Mean gdp from 2017 - 2022

In [34]:
highest_gdp['mean_gdp'] = highest_gdp[["2017", "2018", "2019", "2020", "2021", "2022"]].mean(axis=1)
top_industries["mean_gdp"] = highest_gdp.sort_values(by="mean_gdp", ascending=False).reset_index()["IndustryClassification"]

#### OCC

By employment

In [35]:
highest_employment = occ_df.groupby(["naics", "NAICS_TITLE"])["emp_total_county_naics"].sum()
top_industries["employment"] = highest_employment.sort_values(ascending=False).reset_index()["naics"]

#### Pattern

By annual pay

In [36]:
highest_pay = ptn_df.groupby(["naics", "DESCRIPTION"])[["ap", "qp1", "emp"]].sum()
top_industries["ap"] = highest_pay.sort_values(by="ap", ascending=False).reset_index()["naics"]

By pay per person

In [37]:
highest_pay["ap_per_emp"] = highest_pay["ap"] / highest_pay["emp"]
top_industries["ap_per_emp"] = highest_pay.sort_values(by="ap_per_emp", ascending=False).reset_index()["naics"]

By establishments

In [38]:
highest_est = ptn_df.groupby(["naics", "DESCRIPTION"])[["emp", "est"]].sum()
highest_est["emp_per_est"] = highest_est["emp"] / highest_est["est"]

top_industries["establishments"] = highest_est.sort_values(by="est", ascending=False).reset_index()["naics"]
top_industries["emp_per_establishments"] = highest_est.sort_values(by="emp_per_est", ascending=False).reset_index()["naics"]

#### Overview

In [39]:
top_industries.head(10)

Unnamed: 0,2022_gdp,mean_gdp,employment,ap,ap_per_emp,establishments,emp_per_establishments
0,"31-33,51","31-33,51",2382,2382,2111,2382,3361
1,31-33,31-33,2381,2211,3361,2383,3311
2,"321,327-339","321,327-339",3261,2362,2211,2381,3364
3,"22,48-49","311-316,322-326",3330A1,2381,3241,2362,2122
4,"311-316,322-326","22,48-49",3363,2371,2212,3231,3313
5,23,23,2383,2383,3364,3327,3363
6,1121,1121,3364,3330A1,3345,2371,3366
7,22,22,3320A1,3345,3344,3399,3365
8,21,21,3320A2,3364,3250A1,3320A2,3336
9,11,11,2371,3261,3311,3370A1,3362


The most interesting branch seems to be ***Durable Good Manufacturing*** as it has the highest GDP, well paid workers and a smaller amount of establishments.
This should lead to easier marketing for premium metalworking tools.

While we could also include the ***Construction*** Branch, they seem less interesting, as they have a lower gdp, and less employees per establishments. Which leads to less volume sold per customer.

Picked NAICS:

1. 3361
2. 3311
3. 3364
4. 3363
5. 3330A1

### Top Occupations

In [40]:
top_occupations = pd.DataFrame()

top_industries = ["3361", "3311", "3364", "3363" "3330A1"]
occ_df_filtered = occ_df[occ_df["naics"].isin(top_industries)]

By employees

In [41]:
top_occupations = occ_df_filtered.groupby(["OCC_CODE", "OCC_TITLE"])["emp_occupation"].sum()
top_occupations = top_occupations.sort_values(ascending=False)

top_occupations.head(10)

OCC_CODE  OCC_TITLE                                                                              
51-4121   Welders, Cutters, Solderers, and Brazers                                                  71896.12
17-2112   Industrial Engineers                                                                      61982.95
51-4041   Machinists                                                                                59657.92
51-4031   Cutting, Punching, and Press Machine Setters, Operators, and Tenders, Metal and Plastic   54575.40
17-2141   Mechanical Engineers                                                                      48488.21
49-9041   Industrial Machinery Mechanics                                                            40033.37
51-2031   Engine and Other Machine Assemblers                                                       39136.84
49-9071   Maintenance and Repair Workers, General                                                   38513.70
51-4081   Multiple Machine Too

Occupations that peak our interest are the ones, that would directly use our products (premium tools for metalworking)

1. 51-4121 Welders, Cutters, Solderers, and Brazers
2. 51-4031 Cutting, Punching, and Press Machine Setters ...
3. 51-4041 Machinists
4. 49-9071 Maintenance and Repair Workers, General
5. 51-2031 Engine and Other Machine Assemblers

In [42]:
top_occupations = ["51-4121", "51-4031", "51-4041", "49-9071", "51-2031"]

## Export

In [43]:
top_picks = pd.DataFrame()

top_picks["naics"] = top_industries
top_picks["occ"] = top_occupations

pd.to_pickle(top_picks, "../data/processed/top_picks.pkl")

Unnamed: 0,naics,occ
0,3311,51-4121
1,3330A1,51-4031
2,3361,51-4041
3,3363,49-9071
4,3364,51-2031
