In [86]:
import pandas as pd

data = pd.read_table('../data/onet_data.txt')

In [87]:
def intround(n: int, sigfigs: int) -> int:
    n = str(n)
    return n[:sigfigs] + ('0' * (len(n)-(sigfigs)))


def adjust_data(data: pd.DataFrame) -> pd.DataFrame:
    data = data.drop('Shown in My Next Move', axis=1)
    data = data.rename(columns={'O*NET-SOC Code':'soc_code', 'Reported Job Title': 'job_title'})
    
    # break down soc codes to usefull pieces
    # https://www.bls.gov/soc/2018/soc_2018_class_and_coding_structure.pdf
    data['soc_code_split'] = data['soc_code'].str.split('-')
    data['major_group'] = data['soc_code_split'].apply(lambda x: int(x[0]))
    data['occ_number'] = data['soc_code_split'].apply(lambda x: int(float(x[1])))
    data['minor_group'] = data['soc_code_split'].apply(lambda x: intround(int(float(x[1])), 1))
    
    # mapping provided from above pdf
    data['high_level_groups'] = data.major_group.map({
        11: 1, 12:1, 13:1,
        15:2, 16:2, 17:2, 18:2, 19:2,
        21:3, 22:3, 23:3, 24:3, 25:3, 26:3, 27:3,
        29:4,
        31:5, 32:5, 33:5, 34:5, 35:5, 36:5, 37:5, 38:5, 39:5,
        41:6,
        43:7,
        45:8,
        47:9,
        49:10,
        51:11,
        53:12,
        55:13})
    return data

data = adjust_data(data)

In [89]:
job_titles = pd.read_table('../data/onet_job_titles.txt')
job_titles = job_titles.drop('Description', axis=1)
job_titles = job_titles.rename(columns={'O*NET-SOC Code':'soc_code', 'Title': 'soc_title'})

In [122]:
len(data.major_group.unique())

22

In [90]:
data = pd.merge(data, job_titles)

In [77]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import Pipeline
import numpy as np

def get_random_grid(default=False):
    return {
        "n_estimators": [1200],
        "min_samples_split": [5],
        "min_samples_leaf": [1],
        "max_features": ["sqrt"],
        "max_depth": [30],
    }

classifier = RandomizedSearchCV(
                estimator=RandomForestClassifier(random_state=42),
                param_distributions=get_random_grid(),
                n_iter=45,
                cv=10,
                verbose=2,
                random_state=42,
                n_jobs=-1,
            )


vectorizer = TfidfVectorizer(analyzer="word", stop_words="english", strip_accents="ascii", ngram_range=(1,2))
vecotrized_string = vectorizer.fit_transform(data['job_title'])
classifier.fit(vecotrized_string, data['high_level_agg_groups'])
probability = classifier.predict_proba(vecotrized_string)


prediction = [classifier.classes_[np.where(x == max(x))][0] for x in probability]


Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:   55.7s remaining:   55.7s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   56.0s finished


In [96]:
def tfidf_random_forest_classifier(data, X_var, Y_var):
    vectorizer = TfidfVectorizer(analyzer="word", stop_words="english", strip_accents="ascii")
    vecotrized_string = vectorizer.fit_transform(data[X_var])
    classifier.fit(vecotrized_string, data[Y_var])
    probability = classifier.predict_proba(vecotrized_string)
    prediction = [classifier.classes_[np.where(x == max(x))][0] for x in probability]
    return vectorizer, classifier, prediction

vectorizer, model, prediction = tfidf_random_forest_classifier(data, 'job_title', 'major_group')

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:   34.9s remaining:   34.9s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   35.3s finished


In [108]:
from sklearn.pipeline import Pipeline
def get_random_grid(default=False):
    return {
        "n_estimators": [300, 1200],
        "min_samples_split": [3, 15],
        "min_samples_leaf": [1],
        "max_features": ["sqrt"],
        "max_depth": [25, 100],
    }


def create_pipeline():
    vectorizer = TfidfVectorizer(analyzer="word", stop_words="english", strip_accents="ascii")
    classifier = RandomizedSearchCV(
        estimator=RandomForestClassifier(random_state=42),
        param_distributions=get_random_grid(),
        n_iter=45,
        cv=10,
        verbose=2,
        random_state=42,
        n_jobs=-1)
    pipe = Pipeline([('tfidf', vectorizer), ('random_forest', classifier)])
    return pipe




In [109]:
pipe.fit(data['job_title'], data['major_group'])

Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:   38.4s remaining:   38.4s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   38.7s finished


Pipeline(steps=[('tfidf',
                 TfidfVectorizer(stop_words='english', strip_accents='ascii')),
                ('random_forest',
                 RandomizedSearchCV(cv=10,
                                    estimator=RandomForestClassifier(random_state=42),
                                    n_iter=45, n_jobs=-1,
                                    param_distributions={'max_depth': [30],
                                                         'max_features': ['sqrt'],
                                                         'min_samples_leaf': [1],
                                                         'min_samples_split': [5],
                                                         'n_estimators': [1200]},
                                    random_state=42, verbose=2))])

In [112]:
t= pipe['random_forest']

In [117]:
pipe.predict(data['job_title'])

array([11, 11, 11, ..., 51, 51, 51])

In [114]:
sum(data['major_group'] == prediction) / len(data)

0.5457805670738541

In [76]:
from sklearn.metrics import confusion_matrix

conf_mat = confusion_matrix(data['high_level_agg_groups'], prediction)
pd.DataFrame(conf_mat)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,695,180,11,1,13,5,6,0,1,2,2,0
1,48,1282,51,3,4,0,0,0,0,6,12,0
2,20,178,1017,2,1,0,7,0,0,0,12,1
3,18,168,24,596,14,0,2,0,0,0,0,0
4,44,237,21,4,615,1,4,0,0,3,11,0
5,33,72,16,0,4,89,9,0,0,0,1,0
6,34,183,37,0,22,2,297,0,0,1,40,1
7,18,89,2,0,11,0,0,10,2,0,34,2
8,4,167,0,0,4,0,0,0,282,11,67,3
9,4,153,3,2,4,0,0,0,9,326,11,0


In [73]:
classifier.get_params()

{'cv': 10,
 'error_score': nan,
 'estimator__bootstrap': True,
 'estimator__ccp_alpha': 0.0,
 'estimator__class_weight': None,
 'estimator__criterion': 'gini',
 'estimator__max_depth': None,
 'estimator__max_features': 'auto',
 'estimator__max_leaf_nodes': None,
 'estimator__max_samples': None,
 'estimator__min_impurity_decrease': 0.0,
 'estimator__min_impurity_split': None,
 'estimator__min_samples_leaf': 1,
 'estimator__min_samples_split': 2,
 'estimator__min_weight_fraction_leaf': 0.0,
 'estimator__n_estimators': 100,
 'estimator__n_jobs': None,
 'estimator__oob_score': False,
 'estimator__random_state': 42,
 'estimator__verbose': 0,
 'estimator__warm_start': False,
 'estimator': RandomForestClassifier(random_state=42),
 'iid': 'deprecated',
 'n_iter': 45,
 'n_jobs': -1,
 'param_distributions': {'n_estimators': [500],
  'min_samples_split': [5],
  'min_samples_leaf': [1],
  'max_features': ['sqrt'],
  'max_depth': [50]},
 'pre_dispatch': '2*n_jobs',
 'random_state': 42,
 'refit': Tr