# Imports

In [25]:
import pandas as pd
import numpy as np
import glob
import re
import string
from bs4 import BeautifulSoup

# sklearn
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score

# nltk
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 

# visualization 
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yhadad\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\yhadad\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [26]:
df = pd.read_csv('datasets\processed_data.csv')

In [27]:
df.head()

Unnamed: 0,Description,JobTitle
0,"['avis', 'budget', 'group', 'actionpacked', 'h...",Automotive Technician
1,"['position', 'licensed', 'practical', 'nurse',...",Nurse Practitioner (NP)
2,"['service', 'technician', 'every', 'employee',...",Automotive Technician
3,"['avis', 'budget', 'group', 'actionpacked', 'h...",Automotive Technician
4,"['job', 'purpose', 'nurse', 'practitioner', 'p...",Nurse Practitioner (NP)


In [29]:
X_train, X_test, y_train, y_test = train_test_split(df['Description'], df['JobTitle'], test_size=0.3,
                                                    random_state=42, stratify=df['JobTitle'])

## RandomForestClassifier

In [30]:
rf = RandomForestClassifier()

pipline = Pipeline(steps=[
    ('bow', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('model', rf)
])

In [31]:
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 1000, num = 5)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(2, 15, num = 5)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

On each iteration, the algorithm will choose a difference combination of the features. Altogether, there are 5 * 2 * 6 * 3 * 3 * 2 = 1080 settings! However, the benefit of a random search is that we are not trying every combination, but selecting at random to sample a wide range of values.

In [32]:
# hyperparameter tuning
def grid_search():
    param_grid = {
        'model__n_estimators': n_estimators,
        'model__max_features': max_features,
        'model__max_depth': max_depth,
        'model__min_samples_split': min_samples_split,
        'model__min_samples_leaf': min_samples_leaf,
        'model__bootstrap': bootstrap
    }

    search = RandomizedSearchCV(pipline, param_grid, cv=5, n_jobs=-1)
    search.fit(X_train,y_train)

    print("Best parameter (CV score=%0.3f):" % search.best_score_)
    print(search.best_params_)

In [33]:
import time

start = time.time()

grid_search()

end = time.time()
print('execution time in minutes: ', (end - start)/60) 

Best parameter (CV score=0.780):
{'model__n_estimators': 800, 'model__min_samples_split': 10, 'model__min_samples_leaf': 2, 'model__max_features': 'auto', 'model__max_depth': None, 'model__bootstrap': True}
execution time in minutes:  19.17095280488332


In [45]:
# adding best hyperparameter

rf = RandomForestClassifier(min_samples_split = 10, max_depth=None, n_estimators = 800, min_samples_leaf = 2,
                            max_features = 'auto',bootstrap = True)

pipline = Pipeline(steps=[
    ('bow', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('model', rf)
])

In [46]:
pipline.fit(X_train,y_train)

Pipeline(steps=[('bow', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('model',
                 RandomForestClassifier(min_samples_leaf=2,
                                        min_samples_split=10,
                                        n_estimators=800))])

In [47]:
predictions = pipline.predict(X_test)

In [48]:
print(classification_report(y_test, predictions))

                                                precision    recall  f1-score   support

                                    Accountant       0.50      0.22      0.31         9
     Accounts Payable or Receivable Specialist       0.60      0.88      0.71        17
                           Addiction Counselor       0.71      0.62      0.67         8
                      Administrative Assistant       0.51      0.82      0.63        22
                            Analytical Chemist       1.00      0.25      0.40         4
                              Anesthesiologist       0.90      0.56      0.69        16
                                  Art Director       1.00      0.60      0.75         5
                                     Assembler       0.62      0.83      0.71        84
                  Assistant Restaurant Manager       0.65      0.65      0.65        31
                       Assistant Store Manager       0.57      0.81      0.67        16
                          Autis

  _warn_prf(average, modifier, msg_start, len(result))
