# References

http://jalammar.github.io/illustrated-bert/
https://mccormickml.com/2019/07/22/BERT-fine-tuning/

In [7]:
import numpy as np
import pandas as pd
import time
import datetime
import gc
import random
from nltk.corpus import stopwords
import re

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler,random_split
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

import transformers
from transformers import BertForSequenceClassification, BertConfig,BertTokenizer,get_linear_schedule_with_warmup
from torch.optim import AdamW

from tensorflow.keras.utils import to_categorical

In [8]:
## load data
df = pd.read_csv("raw_data/job_postings_data/postings.csv")
map = pd.read_csv("raw_data/job_postings_data/mappings/industries.csv")
industries = pd.read_csv("raw_data/job_postings_data/jobs/job_industries.csv")

In [9]:
industry_map = {
    "Technology": [
        "Defense and Space Manufacturing", "Computer Hardware Manufacturing",
        "Software Development", "Computer Networking Products",
        "Technology, Information and Internet",
        "Telecommunications", "IT Services and IT Consulting",
        "Internet Marketplace Platforms", "Blockchain Services",
        "Desktop Computing Software Products", "IT System Custom Software Development",
        "Data Infrastructure and Analytics", "Social Networking Platforms",
        "Business Intelligence Platforms", "Digital Accessibility Services",
        "Internet News", "Internet Publishing","Technology and Software","Technology, Information and Media",
        "Information Technology and Services", "Computer Software", "Computer Networking","Computer and Network Security",
        "IT System Data Services","IT System Data Services","IT System Testing and Evaluation",
        "Information Services","Computer Games","Computer Hardware","Computer Networking Products",

    ],
    "Manufacturing": [
        "Consumer Electronics", "Medical Equipment Manufacturing",
        "Apparel Manufacturing", "Footwear Manufacturing",
        "Textile Manufacturing", "Furniture and Home Furnishings Manufacturing",
        "Beverage Manufacturing", "Pharmaceutical Manufacturing",
        "Sporting Goods Manufacturing", "Tobacco Manufacturing",
        "Plastics and Rubber Product Manufacturing", "Packaging and Containers Manufacturing",
        "Glass, Ceramics and Concrete Manufacturing", "Metal Valve, Ball, and Roller Manufacturing",
        "Robot Manufacturing", "Industrial Automation",
        "Transportation Equipment Manufacturing", "Oil and Gas",
        "Shipbuilding", "Chemical Manufacturing", "Mining",
        "Agricultural Chemical Manufacturing", "Paint, Coating, and Adhesive Manufacturing",
        "Electric Lighting Equipment Manufacturing", "Meat Products Manufacturing",
        "Wood Product Manufacturing", "Food and Beverage Manufacturing",
        "Machinery Manufacturing", "Construction Hardware Manufacturing",
        "Primary Metal Manufacturing", "Fabricated Metal Products",
        "HVAC and Refrigeration Equipment Manufacturing",
        "Engines and Power Transmission Equipment Manufacturing","Motor Vehicle Manufacturing",
        "Aerospace and Defense Manufacturing", "Electrical Equipment Manufacturing",
        "Aviation and Aerospace Component Manufacturing","Information Technology & Services",
        "Agriculture, Construction, Mining Machinery Manufacturing","Motor Vehicle Parts Manufacturing",
        "Renewable Energy Equipment Manufacturing","Semiconductor Manufacturing",
        "Magnetic and Optical Media Manufacturing",
        "Communications Equipment Manufacturing", "Audio and Video Equipment Manufacturing",
        "Renewable Energy Semiconductor Manufacturing","Mattress and Blinds Manufacturing",
        "Household and Institutional Furniture Manufacturing","Abrasives and Nonmetallic Minerals Manufacturing",
        "Industrial Machinery Manufacturing","Appliances, Electrical, and Electronics Manufacturing",
        "Automation Machinery Manufacturing", "Computers and Electronics Manufacturing","Plastics Manufacturing",

    ],
    "Healthcare and Biotechnology": [
        "Medical Practices", "Hospitals and Health Care",
        "Biotechnology Research", "Mental Health Care",
        "Medical Device", "Veterinary Services", "Nursing Homes and Residential Care Facilities",
        "Animal Feed Manufacturing", "Physical, Occupational and Speech Therapists",
        "Alternative Medicine", "Personal Care Product Manufacturing",
        "Cosmetics", "Pharmaceutical Manufacturing", "Dentists",
        "Medical and Diagnostic Laboratories", "Home Health Care Services","Health and Human Services",
        "Healthcare Services and Hospitals","Biotechnology","Pharmaceuticals",
        "Medical Devices","Healthcare Information Technology","Public Health","Hospitals",

    ],
    "Legal and Consulting Services": [
        "Law Practice", "Legal Services", "Business Consulting and Services",
        "Government Relations Services", "Strategic Management Services",
        "Alternative Dispute Resolution", "Public Policy Offices",
        "Environmental Services", "Operations Consulting",
    ],
    "Finance, Banking, Insurance and Accounting": [
        "Banking", "Insurance", "Real Estate",
        "Investment Banking", "Investment Management", "Capital Markets",
        "Venture Capital and Private Equity Principals", "Mortgage Services",
        "Credit Intermediation", "Loan Brokers", "Pension Funds",
        "Funds and Trusts", "Trusts and Estates","Accounting"
    ],
    "Real Estate, Property Management, and Construction": [
         "Leasing Non-residential Real Estate","Custruction","Real Estate",
        "Property Management",
    ],
    "Consumer Goods and Retail": [
        "Retail Apparel and Fashion", "Retail Groceries", "Retail Luxury Goods and Jewelry",
        "Online and Mail Order Retail", "Retail Motor Vehicles", "Retail Office Supplies and Gifts",
        "Retail Recyclable Materials & Used Merchandise", "Food and Beverage Retail",
        "Sporting Goods Manufacturing", "Retail Musical Instruments",
        "Retail Books and Printed News", "Retail Florists", "Tobacco Manufacturing",
        "Wholesale Import and Export", "Wholesale Luxury Goods and Jewelry",
        "Wholesale Food and Beverage", "Wholesale Chemical and Allied Products",
        "Wholesale Raw Farm Products","Retail","Retail Health and Personal Care Products",
        "Retail Pharmacies"
    ],
    "Entertainment and Media": [
        "Entertainment Providers", "Movies, Videos, and Sound",
        "Broadcast Media Production and Distribution", "Performing Arts",
        "Gambling Facilities and Casinos", "Artists and Writers", "Online Audio and Video Media",
        "Museums, Historical Sites, and Zoos", "Spectator Sports",
        "Golf Courses and Country Clubs", "Amusement Parks and Arcades",
        "Animation and Post-production", "Media Production", "Online Media",
        "Writers and Editors", "Theater Companies","Performing Arts and Spectator Sports",
    ],
    "Transportation and Logistics": [
        "Freight and Package Transportation", "Truck Transportation",
        "Rail Transportation", "Airlines and Aviation", "Urban Transit Services",
        "Transportation/Trucking/Railroad", "Pipeline Transportation",
        "Warehousing and Storage", "Ground Passenger Transportation",'Transportation, Logistics, Supply Chain and Storage',

    ],
    "Education and Research": [
        "Primary and Secondary Education", "Higher Education",
        "Education Administration Programs", "Research Services",
        "Think Tanks", "Technical and Vocational Training",
        "Non-profit Organizations", "Philanthropic Fundraising Services","Education",
        "E-Learning Providers","Education Management"
    ],
    "Government and Public Administration": [
        "Government Administration", "Public Safety", "Legislative Offices",
        "International Affairs", "Military and International Affairs",
        "Administration of Justice", "Public Policy Offices",
        "Courts of Law", "Correctional Institutions", "Housing Programs","Armed Forces",
        "Law Enforcement","Public Administration","Public Safety","International Affairs",

    ],
    "Environmental and Renewable Energy": [
        "Environmental Services", "Horticulture", "Renewables & Environment",
        "Solar Electric Power Generation", "Climate Data and Analytics",
        "Wind Electric Power Generation", "Climate Technology Product Manufacturing",
        "Conservation Programs",
    ],
    "Construction and Real Estate Development": [
        "Building Construction", "Residential Building Construction",
        "Nonresidential Building Construction", "Utility System Construction",
        "Specialty Trade Contractors", "Architecture and Planning",
        "Surveying and Mapping Services", "Civil Engineering","Construction and Real Estate Development",
        "Wholesale Building Materials","Construction","Water, Waste, Steam, and Air Conditioning Services",
    ],
    "Hospitality, Travel, and Food Service" : [
        "Restaurants and Food Service", "Hospitality", "Food Production",
        "Food and Beverage Manufacturing", "Food and Beverage Retail","Restaurants",
        "Food Production", "Food & Beverages", "Travel Arrangements","Food and Beverage Services",
        "Bed-and-Breakfasts, Hostels, Homestays","Wineries", "Caterers","Events Services",

    ],
    "Ambiguous or Placeholder Entries": [
        "nan", "Programs", "Non-descriptive placeholders from the list"
    ]
}

In [10]:
def flatten(xss):
    return [x for xs in xss for x in xs]

list=industry_map.values()
industry_list=len(flatten(list))
# Create a reverse mapping from industry names to their corresponding keys
reverse_industry_map = {industry: key for key, industries in industry_map.items() for industry in industries}
reverse_industry_map["nan"]
# Replace the values in map.industry_name with their "corresponding keys
map["sub_industry_name"]=map["industry_name"]
map["industry_name"]=map['industry_name'].map(lambda x : reverse_industry_map[x] if x in reverse_industry_map.keys() else x)
df = df.merge(industries.merge(map,how="left", on ="industry_id").set_index("industry_id"),how="left", on ="job_id")
big_industries=df.industry_name.value_counts()[df.industry_name.value_counts()>3000].index.tolist()
df = df[df['industry_name'].isin(big_industries)]
df= df.dropna(subset=["description"])

In [39]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device
device = torch.device("cpu")

In [None]:
## cleaning of the text data
sw = stopwords.words('english')

def clean_text(text):

    text = text.lower()

    text = re.sub(r"[^a-zA-Z?.!,¿]+", " ", text) # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")

    text = re.sub(r"http\S+", "",text) #Removing URLs
    #text = re.sub(r"http", "",text)

    html=re.compile(r'<.*?>')

    text = html.sub(r'',text) #Removing html tags

    punctuations = '@#!?+&*[]-%.:/();$=><|{}^' + "'`" + '_'
    for p in punctuations:
        text = text.replace(p,'') #Removing punctuations

    text = [word.lower() for word in text.split() if word.lower() not in sw]

    text = " ".join(text) #removing stopwords

    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text) #Removing emojis

    return text

In [17]:
df['description'] = df['description'].apply(lambda x: clean_text(x))

In [19]:

encoder= LabelEncoder()
encoder.fit(df["industry_name"])
df["labels"]= encoder.transform(df["industry_name"])
#y

In [23]:
tweets = df.description.values
labels = df.labels.to_numpy()
labels


array([6, 7, 6, ..., 2, 8, 8])

In [24]:
# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [25]:
print(' Original: ', tweets[0])

# Print the sentence split into tokens.
print('Tokenized: ', tokenizer.tokenize(tweets[0]))

# Print the sentence mapped to token ids.
print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(tweets[0])))

 Original:  national exemplar accepting applications assistant restaurant manager offer highly competitive wages, healthcare, paid time off, complimentary dining privileges bonus opportunities serious, professional, long standing neighborhood restaurant years service looking long term fit best class organization apply please send resumes pardom nationalexemplarcom
Tokenized:  ['national', 'ex', '##em', '##pl', '##ar', 'accepting', 'applications', 'assistant', 'restaurant', 'manager', 'offer', 'highly', 'competitive', 'wages', ',', 'healthcare', ',', 'paid', 'time', 'off', ',', 'compliment', '##ary', 'dining', 'privileges', 'bonus', 'opportunities', 'serious', ',', 'professional', ',', 'long', 'standing', 'neighborhood', 'restaurant', 'years', 'service', 'looking', 'long', 'term', 'fit', 'best', 'class', 'organization', 'apply', 'please', 'send', 'resume', '##s', 'par', '##dom', 'nationale', '##x', '##em', '##pl', '##ar', '##com']
Token IDs:  [2120, 4654, 6633, 24759, 2906, 10564, 5097,

In [26]:
max_len = 0

# For every sentence...
for sent in tweets:

    # Tokenize the text and add `[CLS]` and `[SEP]` tokens.
    input_ids = tokenizer.encode(sent, add_special_tokens=True)

    # Update the maximum sentence length.
    max_len = max(max_len, len(input_ids))
max_len = min(max_len,512)
print('Max sentence length: ', max_len)

Token indices sequence length is longer than the specified maximum sequence length for this model (589 > 512). Running this sequence through the model will result in indexing errors


Max sentence length:  512


In [27]:
input_ids = []
attention_masks = []

# For every tweet...
for tweet in tweets:
    # `encode_plus` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    #   (5) Pad or truncate the sentence to `max_length`
    #   (6) Create attention masks for [PAD] tokens.
    encoded_dict = tokenizer.encode_plus(
                        tweet,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = max_len,           # Pad & truncate all sentences.
                        padding = "max_length",
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                        truncation=True,
                   )

    # Add the encoded sentence to the list.
    input_ids.append(encoded_dict['input_ids'])

    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])

# Convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

# Print sentence 0, now as a list of IDs.
print('Original: ', tweets[0])
print('Token IDs:', input_ids[0])

Original:  national exemplar accepting applications assistant restaurant manager offer highly competitive wages, healthcare, paid time off, complimentary dining privileges bonus opportunities serious, professional, long standing neighborhood restaurant years service looking long term fit best class organization apply please send resumes pardom nationalexemplarcom
Token IDs: tensor([  101,  2120,  4654,  6633, 24759,  2906, 10564,  5097,  3353,  4825,
         3208,  3749,  3811,  6975, 12678,  1010,  9871,  1010,  3825,  2051,
         2125,  1010, 19394,  5649,  7759, 14310,  6781,  6695,  3809,  1010,
         2658,  1010,  2146,  3061,  5101,  4825,  2086,  2326,  2559,  2146,
         2744,  4906,  2190,  2465,  3029,  6611,  3531,  4604, 13746,  2015,
        11968,  9527, 17360,  2595,  6633, 24759,  2906,  9006,   102,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,

In [28]:

# Combine the training inputs into a TensorDataset.
dataset = TensorDataset(input_ids, attention_masks, labels)

# Create a 90-10 train-validation split.

# Calculate the number of samples to include in each set.
train_size = int(0.7 * len(dataset))
#val_size = int(0.2 * len(dataset))
val_size = len(dataset)  - train_size

# Divide the dataset by randomly selecting samples.
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

94,101 training samples
40,329 validation samples


In [29]:

# The DataLoader needs to know our batch size for training, so we specify it
# here. For fine-tuning BERT on a specific task, the authors recommend a batch
# size of 16 or 32.
batch_size = 32

# Create the DataLoaders for our training and validation sets.
# We'll take training samples in random order.
train_dataloader = DataLoader(
            train_dataset,  # The training samples.
            sampler = RandomSampler(train_dataset), # Select batches randomly
            batch_size = batch_size # Trains with this batch size.
        )

# For validation the order doesn't matter, so we'll just read them sequentially.
validation_dataloader = DataLoader(
            val_dataset, # The validation samples.
            sampler = SequentialSampler(val_dataset), # Pull out batches sequentially.
            batch_size = batch_size # Evaluate with this batch size.
        )

In [40]:

# Load BertForSequenceClassification, the pretrained BERT model with a single
# linear classification layer on top.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 12, # The number of output labels--2 for binary classification.
                    # You can increase this for multi-class tasks.
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

# if device == "cuda:0":
# # Tell pytorch to run this model on the GPU.
#     model = model.cuda()
model = model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [42]:
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8, # args.adam_epsilon  - default is 1e-8.

                )

# Fine tuning the model

In [None]:

# Number of training epochs.
epochs = 4

# Total number of training steps is [number of batches] x [number of epochs].
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer=optimizer,
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

In [44]:
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [45]:
def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [46]:
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
training_stats = []

# Measure the total training time for the whole run.
total_t0 = time.time()

# For each epoch...
for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================
    # Perform one full pass over the training set.
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')
    # Measure how long the training epoch takes.
    t0 = time.time()
    total_train_loss = 0
    model.train()
    for step, batch in enumerate(train_dataloader):
        # Unpack this training batch from our dataloader.
        #
        # As we unpack the batch, we'll also copy each tensor to the device using the
        # `to` method.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: labels
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        optimizer.zero_grad()
        output = model(b_input_ids,
                             token_type_ids=None,
                             attention_mask=b_input_mask,
                             labels=b_labels)
        loss = output.loss
        total_train_loss += loss.item()
        # Perform a backward pass to calculate the gradients.
        loss.backward()
        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        # Update parameters and take a step using the computed gradient.
        # The optimizer dictates the "update rule"--how the parameters are
        # modified based on their gradients, the learning rate, etc.
        optimizer.step()
        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)

    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)
    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))
    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.
    print("")
    print("Running Validation...")
    t0 = time.time()
    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    model.eval()
    # Tracking variables
    total_eval_accuracy = 0
    best_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0
    # Evaluate data for one epoch
    for batch in validation_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        # Tell pytorch not to bother with constructing the compute graph during
        # the forward pass, since this is only needed for backprop (training).
        with torch.no_grad():
            output= model(b_input_ids,
                                   token_type_ids=None,
                                   attention_mask=b_input_mask,
                                   labels=b_labels)
        loss = output.loss
        total_eval_loss += loss.item()
        # Move logits and labels to CPU if we are using GPU
        logits = output.logits
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        # Calculate the accuracy for this batch of test sentences, and
        # accumulate it over all batches.
        total_eval_accuracy += flat_accuracy(logits, label_ids)
    # Report the final accuracy for this validation run.
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))
    # Calculate the average loss over all of the batches.
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    # Measure how long the validation run took.
    validation_time = format_time(time.time() - t0)
    if avg_val_accuracy > best_eval_accuracy:
        torch.save(model, 'bert_model')
        best_eval_accuracy = avg_val_accuracy
    #print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    #print("  Validation took: {:}".format(validation_time))
    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )
print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))


Training...


KeyboardInterrupt: 

# Loading the best model

In [None]:
model = torch.load('bert_model')