In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from sklearn.model_selection import train_test_split
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [2]:
from google.colab import files

# Upload a single file
uploaded = files.upload()

Saving UpdatedResumeDataSet.csv to UpdatedResumeDataSet.csv


In [3]:
resume_data = pd.read_csv('UpdatedResumeDataSet.csv')

In [4]:
def cleanResume(resumeText):
    resumeText = re.sub(r'https?://\S+', ' ', resumeText)  # remove URLs
    resumeText = re.sub(r'RT|cc', ' ', resumeText)  # remove RT and cc
    resumeText = re.sub(r'#\S+', '', resumeText)  # remove hashtags
    resumeText = re.sub(r'@\S+', ' ', resumeText)  # remove mentions
    resumeText = re.sub(r'[%s]' % re.escape("""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', resumeText)  # remove punctuations
    resumeText = re.sub(r'[^a-zA-Z0-9\s]', '', resumeText)  # remove non-alphanumeric characters except whitespace
    resumeText = re.sub(r'\s+', ' ', resumeText)  # remove extra whitespace
    return resumeText.strip().lower()


In [5]:
resume_data['cleaned_resume'] = resume_data.Resume.apply(lambda x: cleanResume(x))

In [6]:
requiredText = resume_data['cleaned_resume']
requiredTarget = resume_data['Category']

In [7]:
label_mapping = {label: idx for idx, label in enumerate(set(requiredTarget))}
labels_numerical = [label_mapping[label] for label in requiredTarget]

In [8]:
# Convert text_data and labels_numerical to numpy arrays
text_data_array = np.array(requiredText)
labels_array = np.array(labels_numerical)

In [9]:
# Tokenize text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_data_array)
vocab_size = len(tokenizer.word_index) + 1

# Convert text data to sequences and pad them to a maximum length
max_length = 500  # Define your desired maximum sequence length
sequences = tokenizer.texts_to_sequences(text_data_array)
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels_array, test_size=0.2, random_state=42, stratify=labels_array)


In [17]:
def build_Transformer(vocab_size, max_length, num_layers, num_heads, embed_dim, feed_forward_dim, dropout_rate, num_classes):
    inputs = tf.keras.layers.Input(shape=(max_length,))
    embedding_layer = tf.keras.layers.Embedding(vocab_size, embed_dim, input_length=max_length)
    x = embedding_layer(inputs)

    for _ in range(num_layers):
        x = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim // num_heads)(x, x)
        x = tf.keras.layers.Dropout(dropout_rate)(x)
        x = tf.keras.layers.LayerNormalization(epsilon=1e-6)(x)

        x = tf.keras.layers.Dense(feed_forward_dim, activation='gelu')(x)
        x = tf.keras.layers.Dropout(dropout_rate)(x)
        x = tf.keras.layers.LayerNormalization(epsilon=1e-6)(x)

    x = tf.keras.layers.GlobalAveragePooling1D()(x)
    outputs = tf.keras.layers.Dense(num_classes, activation='softmax')(x)

    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    return model

In [36]:
num_layers = 4
num_heads = 16
embed_dim = 128
feed_forward_dim = 512
dropout_rate = 0.1
num_classes = len(np.unique(labels_array))  # Number of unique classes


In [37]:
from keras.callbacks import ModelCheckpoint

model = build_Transformer(vocab_size, max_length, num_layers, num_heads, embed_dim, feed_forward_dim, dropout_rate, num_classes)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
checkpoint = ModelCheckpoint('best_model.h5', monitor='val_accuracy', save_best_only=True, mode='max', verbose=1)

In [39]:
model.fit(X_train, y_train, epochs=50, batch_size=16, validation_split=0.2,callbacks=[checkpoint]) # running this two times i.e. 100 epochs


Epoch 1/50
Epoch 1: val_accuracy improved from 0.85065 to 0.87662, saving model to best_model.h5
Epoch 2/50
Epoch 2: val_accuracy did not improve from 0.87662
Epoch 3/50
Epoch 3: val_accuracy improved from 0.87662 to 0.88961, saving model to best_model.h5
Epoch 4/50
Epoch 4: val_accuracy did not improve from 0.88961
Epoch 5/50
Epoch 5: val_accuracy did not improve from 0.88961
Epoch 6/50
Epoch 6: val_accuracy did not improve from 0.88961
Epoch 7/50
Epoch 7: val_accuracy did not improve from 0.88961
Epoch 8/50
Epoch 8: val_accuracy did not improve from 0.88961
Epoch 9/50
Epoch 9: val_accuracy did not improve from 0.88961
Epoch 10/50
Epoch 10: val_accuracy did not improve from 0.88961
Epoch 11/50
Epoch 11: val_accuracy did not improve from 0.88961
Epoch 12/50
Epoch 12: val_accuracy did not improve from 0.88961
Epoch 13/50
Epoch 13: val_accuracy did not improve from 0.88961
Epoch 14/50
Epoch 14: val_accuracy improved from 0.88961 to 0.90909, saving model to best_model.h5
Epoch 15/50
Epoch

<keras.src.callbacks.History at 0x7c0eb0192fe0>

In [40]:
from keras.models import load_model

best_model = load_model('best_model.h5')

# Evaluate the best model on the test data
test_loss, test_accuracy = best_model.evaluate(X_test, y_test)



In [41]:
test_accuracy

0.984455943107605