<a href="https://colab.research.google.com/github/cfcastillo/DS-6-Notebooks/blob/main/3_Education_Capstone_Data_Visualization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Visualization

# Project Definition

The purpose of this project is to identify what factors influence people to choose certain professions or trades. In understanding these factors, we can help colleges like Central New Mexico College (CNM) offer courses that support those professions and better target their marketing to people who are likely to choose those professions.

This project is a supervised classification problem that uses tree-based models to identify the factors that contribute to career choice.

# Data Collection and Cleaning

The data collection and cleaning process is outlined in the notebook titled [1. Education Capstone - Data Collection and Cleaning.ipynb](https://colab.research.google.com/drive/1Y_1b7BmiRF6CSYnoiZqGpfjpbzU4qoFe#scrollTo=Kmxlgo4Wnjgd)


## Column Descriptions

[Here is a summary document showing selected columns.](https://docs.google.com/document/d/1io7TtqebJLtw6FKE7zkbUh26QkG3rEJrZX3Fver9zmU/edit)

# Exploratory Data Analysis (EDA)

EDA can be found in the notebook titled [2. Education Capstone - EDA and Processing.ipynb](https://colab.research.google.com/drive/1Fa18G_kZY8fCEKupjsfICRyeav7dEw7K)

# Data Processing / Models

Data Processing and Model application can be found in the notebook titled [2. Education Capstone - EDA and Processing.ipynb](https://colab.research.google.com/drive/1Fa18G_kZY8fCEKupjsfICRyeav7dEw7K)

# Imports

In [None]:
# grab the imports needed for the project
import pandas as pd
# import glob
# import matplotlib.pyplot as plt
import numpy as np
# import seaborn as sns
# import statsmodels.api as sm

# all
# from sklearn import datasets
# from sklearn import metrics
# from sklearn import preprocessing
# from sklearn import datasets, neighbors
# from sklearn.metrics import classification_report
# from sklearn.neighbors import NearestNeighbors
# import sklearn.model_selection as model_selection
from collections import Counter
# from imblearn.under_sampling import RandomUnderSampler
# from imblearn.over_sampling import RandomOverSampler
# from mlxtend.plotting import plot_decision_regions

# Regression
# from sklearn.linear_model import LinearRegression
# from sklearn.tree import DecisionTreeRegressor 
# from sklearn.ensemble import RandomForestClassifier
# import xgboost as xgb
# from sklearn.model_selection import GridSearchCV
# from sklearn.model_selection import train_test_split
# from sklearn.model_selection import cross_val_score
# from sklearn.metrics import mean_squared_error
# from sklearn.metrics import r2_score
# from sklearn.metrics import accuracy_score
# from sklearn import metrics  

# Visualization
# import graphviz
# from IPython.display import display
# from sklearn import tree
import plotly.express as px
from ipywidgets import interact, Dropdown, interact_manual
import plotly.graph_objs as go

In [None]:
# Mount Google Drive at /content/drive; the data paths used below all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Globals
Team members store the project data under different Google Drive paths. Set the `team_member` global below to the person running this notebook so that the file paths resolve in their environment.

In [None]:
# Expected values are: ellie, amy, cecilia (matched case-insensitively)
team_member = 'ellie'

# Root drive path for each team member's Google Drive layout.
_ROOT_DRIVES = {
    'amy': '/content/drive/MyDrive/',
    'ellie': '/content/drive/MyDrive/',
    'cecilia': '/content/drive/MyDrive/Student Folder - Cecilia/Projects/',
}

# Fail fast on a typo instead of silently falling back to one member's path
# and failing later with a confusing file-not-found error.
_member_key = team_member.strip().lower()
if _member_key not in _ROOT_DRIVES:
  raise ValueError(
      f"Unknown team_member {team_member!r}; expected one of {sorted(_ROOT_DRIVES)}")

root_drive = _ROOT_DRIVES[_member_key]

In [None]:
# Load the trend extract that contains every survey year in a single CSV.
trend_csv_path = root_drive + 'Capstone/Data/FinalData/Trends/asec_trend_v2.csv'
df_viz = pd.read_csv(trend_csv_path)

## Cleaning

In [None]:
# filter data to features of interest
keep_cols = ['A_DTOCC','GEDIV','GESTFIPS','HHINC','H_TENURE','A_SEX','PRDTRACE','PRCITSHP','A_HGA','A_GRSWK','HRSWK','PEARNVAL','A_CLSWKR','A_MARITL','HEA','PEINUSYR', 'DATA_YEAR']
df_viz = df_viz[keep_cols]

# class-of-worker codes to drop
wkr = [0,8]

# Drop the unwanted worker classes and rows where A_DTOCC = 0 (Not in universe
# or Armed Forces). A boolean mask is used instead of np.where positions:
# DataFrame.drop expects index *labels*, and once rows have been removed the
# labels no longer line up with positions, so positional indices would drop
# the wrong rows or raise KeyError.
in_scope = (~df_viz['A_CLSWKR'].isin(wkr)) & (df_viz['A_DTOCC'] != 0)
df_viz = df_viz.loc[in_scope].copy()

#create dictionary to convert A_DTOCC codes to string descriptions
occ_dict = {1: 'Management',
            2: 'Business & Financial Operations',
            3: 'Computer & Mathematical Science',
            4: 'Architecture & Engineering',
            5: 'Life, Physical, & Social Science',
            6: 'Community & Social Service',
            7: 'Legal',
            8: 'Education, Training, & Library',
            9: 'Arts, Design, Entertainment, Sports, & Media',
            10: 'Healthcare Practitioner & Technical',
            11: 'Healtcare Support',  # spelling kept as-is; code 11 is filtered out below
            12: 'Protective Service',
            13: 'Food Preparation & Serving Related',
            14: 'Building & Grounds Cleaning & Maintenance',
            15: 'Personal Care & Service',
            16: 'Sales & Related',
            17: 'Office & Administrative Support',
            18: 'Farming, Fishing, & Forestry',
            19: 'Construction & Extraction',
            20: 'Installation, Maintenance, & Repair',
            21: 'Production',
            22: 'Transportation & Material Moving',
            23: 'Armed Forces'}

#add column to df_viz with A_DTOCC codes converted to string descriptions
# (codes missing from occ_dict become NaN rather than a stray Python object)
df_viz['occ_string'] = df_viz['A_DTOCC'].map(occ_dict)

# drop rows which were not highly populated (determined earlier with 3000 threshold)
low_count_occ = [3,4,5,6,7,9,11,12,18,20,23]
df_viz = df_viz[~df_viz['A_DTOCC'].isin(low_count_occ)]

#Import state codes
df_states = pd.read_csv(root_drive + 'Capstone/Data/Codes/FIPS_STATE_CODES.csv')

#Combine dataframes to get state names (left join keeps every survey row)
df_viz = pd.merge(df_viz, df_states, how='left', left_on='GESTFIPS', right_on='FIPS_STATE')

Nationally, what are the most popular occupation categories from years 2012 to 2021?

In [None]:
#@title
# Fixed colour per occupation so a category keeps the same colour across
# years and states (values are the Plotly "Dark24" qualitative palette).
OCC_COLORS = {
    'Management': '#2E91E5',
    'Business & Financial Operations': '#E15F99',
    'Computer & Mathematical Science': '#1CA71C',
    'Architecture & Engineering': '#FB0D0D',
    'Life, Physical, & Social Science': '#DA16FF',
    'Community & Social Service': '#222A2A',
    'Legal': '#B68100',
    'Education, Training, & Library': '#750D86',
    'Arts, Design, Entertainment, Sports, & Media': '#EB663B',
    'Healthcare Practitioner & Technical': '#511CFB',
    'Healtcare Support': '#00A08B',
    'Protective Service': '#FB00D1',
    'Food Preparation & Serving Related': '#FC0080',
    'Building & Grounds Cleaning & Maintenance': '#B2828D',
    'Personal Care & Service': '#6C7C32',
    'Sales & Related': '#778AAE',
    'Office & Administrative Support': '#862A16',
    'Farming, Fishing, & Forestry': '#A777F1',
    'Construction & Extraction': '#620042',
    'Installation, Maintenance, & Repair': '#1616A7',
    'Production': '#DA60CA',
    'Transportation & Material Moving': '#6C4516',
    'Armed Forces': '#0D2A63'}

# dropna: states that found no match in the left merge are NaN, and sorted()
# cannot compare NaN with strings.
state_o = df_viz['USPS_STATE'].dropna().unique()
state_o_s = Dropdown(options = sorted(state_o))

year_o = df_viz['DATA_YEAR'].unique()
year_o_s = Dropdown(options = sorted(year_o))

@interact(Year = year_o_s, State = state_o_s)
def pie(Year=2012, State = 'NM'):
  """Show a pie chart of occupation counts for the given year and state.

  Year: value compared against df_viz['DATA_YEAR'].
  State: value compared against df_viz['USPS_STATE'].
  Displays the figure and returns None.
  """
  selected = df_viz[(df_viz['DATA_YEAR'] == Year) & (df_viz['USPS_STATE'] == State)]

  # One row per occupation with its number of records in the selection.
  occ_counts = (selected['occ_string']
                .value_counts()
                .sort_index()
                .rename_axis('occ_string')
                .reset_index(name='count'))

  # color= must be given for color_discrete_map to be applied to the slices.
  fig = px.pie(occ_counts, values='count', names='occ_string', color='occ_string',
               color_discrete_map=OCC_COLORS,
               title=f'Occupations in the US by Year and State: {Year}, {State}')
  fig.show()

# NOTE: switching interact to interact_manual was tried and did not solve the issue of the plot shrinking on re-run.

interactive(children=(Dropdown(description='Year', options=(2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 20…

In [None]:
#@title
# create subset of visualization dataframe with columns of interest
subset_df = df_viz[['DATA_YEAR', 'occ_string']].copy()

# Record counts per year (rows) and occupation (columns). A year with no
# records for an occupation gets a count of 0 instead of being dropped.
occ_counts_df = pd.crosstab(subset_df['DATA_YEAR'], subset_df['occ_string'])

# Each occupation's share of that year's records, in percent.
occ_percent_df = occ_counts_df.div(occ_counts_df.sum(axis=1), axis=0) * 100

# Plot occupations in order of first appearance in the data.
occ_order = [occ for occ in subset_df['occ_string'].dropna().unique()
             if occ in occ_counts_df.columns]

# final_df layout: <occupation>, <occupation> Percent, ..., Data Year
final_df = pd.DataFrame(index=occ_counts_df.index)
for value in occ_order:
  final_df[value] = occ_counts_df[value]
  final_df[f'{value} Percent'] = occ_percent_df[value]
final_df['Data Year'] = final_df.index

# go.Scatter replaces the deprecated go.Line; the default mode is left
# untouched so the traces render as before.
fig = go.Figure()
for value in occ_order:
  col = f'{value} Percent'
  fig.add_trace(go.Scatter(x=final_df.index, y=final_df[col], name=col))
fig.update_layout(title='National Trends in Occupation', xaxis_title='Year', yaxis_title = 'Occupation as Percent')
fig.show()


plotly.graph_objs.Line is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.scatter.Line
  - plotly.graph_objs.layout.shape.Line
  - etc.




In [None]:
#@title
def _occupation_percent_by_year(records):
  """Return each occupation's percent share of records per DATA_YEAR.

  records: DataFrame with 'DATA_YEAR' and 'occ_string' columns.
  Returns a DataFrame indexed by DATA_YEAR with one '<occupation> Percent'
  column per occupation, in order of first appearance. A year with no
  records for an occupation gets 0 rather than being dropped. An empty
  input yields an empty DataFrame.
  """
  if records.empty:
    return pd.DataFrame()
  counts = pd.crosstab(records['DATA_YEAR'], records['occ_string'])
  percent = counts.div(counts.sum(axis=1), axis=0) * 100
  first_seen = [occ for occ in records['occ_string'].dropna().unique()
                if occ in percent.columns]
  return percent[first_seen].add_suffix(' Percent')

@interact(State = state_o_s)
def line_state(State = 'NM'):
  """Show a line chart of occupation shares per year for one state.

  State: value compared against df_viz['USPS_STATE'].
  Displays the figure and returns None. A state with no rows produces an
  empty chart instead of an error.
  """
  state_records = df_viz.loc[df_viz['USPS_STATE'] == State, ['DATA_YEAR', 'occ_string']]
  percent_df = _occupation_percent_by_year(state_records)

  # go.Scatter replaces the deprecated go.Line; the default mode is left
  # untouched so the traces render as before.
  fig = go.Figure()
  for col in percent_df.columns:
    fig.add_trace(go.Scatter(x=percent_df.index, y=percent_df[col], name=col))
  fig.update_layout(title='State Trends in Occupation', xaxis_title='Year', yaxis_title = 'Occupation as Percent')
  fig.show()

interactive(children=(Dropdown(description='State', index=32, options=('AK', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT…