In [11]:
# import libraries
import numpy as np
import pandas as pd

import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation notices from the
# libraries imported in this notebook are hidden as well.
warnings.filterwarnings('ignore')

In [12]:
# Load the preprocessed train and test CSV files (train first, then test).
train, test = (
    pd.read_csv(csv_name)
    for csv_name in ('train_preprocessed.csv', 'test_preprocessed.csv')
)

In [13]:
# Show the first rows of both frames, train first.
display(train.head(), test.head())

Unnamed: 0,Sentence_id,New_Sentence,Type
0,GERRES15609,Author and Review architecture and other techn...,Responsibility
1,PHERES15784,Should be able to develop custom dynamic shape...,Responsibility
2,GERREQ10457,Experience in working cross with a larger Engi...,Requirement
3,GERSKL27235,"Previous business experience, including but no...",Skill
4,HONSSK18415,Delivering fast and right the first time.',SoftSkill


Unnamed: 0,Sentence_id,New_Sentence
0,UAERES16346,Collaborate across all of DB&T practices and o...
1,COGREQ15586,Strong technology expertise in Identity and Ac...
2,UAEREQ12722,Strong knowledge on Service Virtualization.'
3,COGSKL29155,Architect scalable data processing and analyti...
4,PHERES12551,"Map client organization, build outstanding rel..."


In [14]:
# Remove the Sentence_id column from both frames (in place).
del train['Sentence_id']

# Keep the test ids around: they are needed again for the submission file.
Sentence_id = test.pop('Sentence_id')

In [15]:
# Distinct label values present in the training data.
train['Type'].unique()

array(['Responsibility', 'Requirement', 'Skill', 'SoftSkill', 'Education',
       'Experience'], dtype=object)

In [16]:
# Map each label string in the Type column to an integer class id (in place).
type_to_id = {
    'Responsibility': 0,
    'Requirement': 1,
    'Skill': 2,
    'SoftSkill': 3,
    'Education': 4,
    'Experience': 5,
}
train.replace({'Type': type_to_id}, inplace=True)

In [17]:
# import transformers and tokenizers
from tqdm.auto import tqdm
import tensorflow as tf
from transformers import BertTokenizer

In [18]:
# Instantiate the tokenizer from the 'bert-base-cased' checkpoint.
bert_checkpoint = 'bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)

In [19]:
# Drop, in place, every training row that has a missing value in any column.
train.dropna(axis=0, how='any', inplace=True)

In [20]:
# Pre-allocate zero-filled (len(train), 256) buffers for token ids and attention masks.
token_matrix_shape = (len(train), 256)
X_input_ids = np.zeros(token_matrix_shape)
X_attn_masks = np.zeros(token_matrix_shape)

In [21]:
def generate_training_data(train, ids, masks, tokenizer):
    """Tokenize every sentence and store the encodings row by row.

    Parameters
    ----------
    train : frame-like object with a 'New_Sentence' column. Despite the name,
        the notebook passes both the train and the test frame.
    ids, masks : pre-allocated 2-D arrays with one row per sentence and 256
        columns. They are filled in place.
    tokenizer : object exposing ``encode_plus`` whose result carries
        ``input_ids`` and ``attention_mask``.

    Returns
    -------
    tuple
        The same ``ids`` and ``masks`` objects that were passed in.
    """
    sentences = train['New_Sentence']
    # Wrap the iterable itself (not the enumerate generator) so tqdm knows the
    # total and can render a real progress bar with an ETA.
    progress = tqdm(sentences, total=len(sentences), desc='tokenizing')
    for i, text in enumerate(progress):
        tokenized_text = tokenizer.encode_plus(
            text,
            max_length=256,
            truncation=True,
            padding='max_length',
            add_special_tokens=True,
            return_tensors='tf'
        )
        # Rows are addressed by position, so a non-contiguous frame index is fine.
        ids[i, :] = tokenized_text.input_ids
        masks[i, :] = tokenized_text.attention_mask
    return ids, masks

In [22]:
# Fill the pre-allocated buffers with token ids and attention masks for the training sentences.
X_input_ids, X_attn_masks = generate_training_data(
    train=train,
    ids=X_input_ids,
    masks=X_attn_masks,
    tokenizer=tokenizer,
)

0it [00:00, ?it/s]

In [24]:
# Shape of the array of distinct label values (i.e. the number of classes).
train['Type'].unique().shape

(6,)

In [25]:
# Create a zero array for the labels: one row per training sentence, one column per class.
labels = np.zeros(shape=(len(train), 6))
labels.shape

(59002, 6)

In [26]:
# One-hot encode: in each row, set the column given by that row's class id to 1.
row_positions = np.arange(len(train))
class_ids = train['Type'].values
labels[row_positions, class_ids] = 1

In [27]:
# Build a tf.data pipeline that yields one (input_ids, attention_mask, label) triple per sentence.
tensor_slices = (X_input_ids, X_attn_masks, labels)
dataset = tf.data.Dataset.from_tensor_slices(tensor_slices)

In [28]:
def datasetMapFunction(input_ids, attn_masks, labels):
    """Repack a flat (ids, masks, labels) triple as (feature_dict, labels).

    The feature dict uses the keys 'input_ids' and 'attention_mask'.
    """
    features = dict(input_ids=input_ids, attention_mask=attn_masks)
    return features, labels

In [29]:
# create the dataset by mapping input ids, attention masks and labels
dataset = dataset.map(datasetMapFunction)

# split dataset into train and validation
# The shuffle must produce the SAME order on every pass. With the default
# reshuffle_each_iteration=True, take()/skip() would select different elements
# on each epoch and validation samples would leak into training. The buffer
# spans the whole dataset so the split is a true random sample rather than a
# locally shuffled prefix/suffix of the file order.
dataset = dataset.shuffle(
    len(train), seed=42, reshuffle_each_iteration=False
).batch(16, drop_remainder=True)

p = 0.8
train_size = int((len(train)//16)*p)

# Disjoint, stable splits. Only the training batches are re-ordered between
# epochs; this shuffle sits after take(), so it cannot pull in validation batches.
train_dataset = dataset.take(train_size).shuffle(256, seed=42)
val_dataset = dataset.skip(train_size)

In [30]:
# import BERt model
from transformers import TFBertModel

In [25]:
# Load the pretrained BERT base (cased) model.
bert_checkpoint = 'bert-base-cased'
model = TFBertModel.from_pretrained(bert_checkpoint)

Downloading:   0%|          | 0.00/502M [00:00<?, ?B/s]

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [26]:
# Build the classifier: BERT followed by a two-layer dense head.
# Two int32 inputs of length 256, named after the feature-dict keys
# ('input_ids', 'attention_mask').
input_ids = tf.keras.layers.Input(name='input_ids', shape=(256,), dtype='int32')
attn_masks = tf.keras.layers.Input(name='attention_mask', shape=(256,), dtype='int32')

# Take element 1 of the BERT output
# (presumably the pooled sentence representation - confirm against the transformers docs).
bert_outputs = model.bert(input_ids, attention_mask=attn_masks)
bert_embds = bert_outputs[1]

# 512-unit ReLU layer, then a 6-way softmax output.
hidden_dense = tf.keras.layers.Dense(512, activation='relu', name='intermediate_layer')
intermediate_layer = hidden_dense(bert_embds)
softmax_dense = tf.keras.layers.Dense(6, activation='softmax', name='output_layer')
output_layer = softmax_dense(intermediate_layer)

# NOTE: `model` is rebound from the BERT model to the Keras classifier here,
# so this cell reads `model.bert` before the name changes meaning.
model = tf.keras.Model(inputs=[input_ids, attn_masks], outputs=output_layer)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 256)]        0                                            
__________________________________________________________________________________________________
attention_mask (InputLayer)     [(None, 256)]        0                                            
__________________________________________________________________________________________________
bert (TFBertMainLayer)          TFBaseModelOutputWit 108310272   input_ids[0][0]                  
                                                                 attention_mask[0][0]             
__________________________________________________________________________________________________
intermediate_layer (Dense)      (None, 512)          393728      bert[0][1]                   

In [27]:
# Configure training: Adam optimizer, categorical cross-entropy on the
# one-hot labels, and categorical accuracy reported under the name 'accuracy'.
acc = tf.keras.metrics.CategoricalAccuracy(name='accuracy')
loss_func = tf.keras.losses.CategoricalCrossentropy()
optim = tf.keras.optimizers.Adam(learning_rate=1e-5, decay=1e-6)

model.compile(
    loss=loss_func,
    optimizer=optim,
    metrics=[acc],
)

In [None]:
# Fit for 5 epochs, evaluating on the validation split after each epoch.
history = model.fit(
    x=train_dataset,
    epochs=5,
    validation_data=val_dataset,
)

Epoch 1/5


2022-11-13 18:46:44.644012: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 2/5
Epoch 3/5
 106/2949 [>.............................] - ETA: 22:54 - loss: 0.4166 - accuracy: 0.8384

In [None]:
# Persist the model with Model.save (the whole model, not only its weights).
model.save(filepath='ford-sentence-classifiaction')

In [None]:
# Restore the saved model from disk.
model = tf.keras.models.load_model(filepath='ford-sentence-classifiaction')

In [None]:
# prepare test data
# Use test-specific buffers so the training arrays (X_input_ids / X_attn_masks)
# are not overwritten by this cell.
test_input_ids = np.zeros((len(test), 256))
test_attn_masks = np.zeros((len(test), 256))

test_input_ids, test_attn_masks = generate_training_data(test, test_input_ids, test_attn_masks, tokenizer)

# create a data pipeline for test data
test_dataset = tf.data.Dataset.from_tensor_slices((test_input_ids, test_attn_masks))

def testDatasetMapFunction(input_ids, attn_masks):
    """Pack one (ids, masks) pair into the feature dict the model expects.

    Named differently from the 3-argument training mapper so that defining it
    does not shadow that function.
    """
    return {
        'input_ids': input_ids,
        'attention_mask': attn_masks
    }

# create the dataset by mapping input ids and attention masks, then batch it:
# the model was built on (None, 256) inputs, so predict() needs batched
# elements, not single rank-1 rows. No drop_remainder here - every test row
# must get a prediction, and batching keeps the original row order.
test_dataset = test_dataset.map(testDatasetMapFunction).batch(16)

In [None]:
# Run the model on the test pipeline to get the class-probability rows.
pred = model.predict(x=test_dataset)

In [None]:
# Collapse each probability row to the index of its highest-scoring class.
pred = pred.argmax(axis=1)

# Class names, listed so that position i is the name for class id i
# (same order as the label encoding used for training).
classes = [
    'Responsibility',
    'Requirement',
    'Skill',
    'SoftSkill',
    'Education',
    'Experience',
]

In [None]:
# Build the submission file from the sample template.
submission = pd.read_csv('sample_submission.csv')

# Restore the test ids, then store the predicted class names.
submission['Sentence_id'] = Sentence_id
predicted_types = [classes[class_idx] for class_idx in pred]
submission['Type'] = predicted_types

submission.to_csv('submission.csv', index=False)