### Dependencies

In [3]:

import time
import zipfile

import nltk
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from gensim.models import word2vec, Word2Vec
from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, roc_auc_score, recall_score, precision_score, f1_score, classification_report
from torch.utils import data
from torch.utils.data import DataLoader
from tqdm import tqdm

### Dataset

#### Load Dataset

In [4]:
base_dir = "../../"
data_dir = base_dir + "data/"
model_dir = base_dir + "model/"

with zipfile.ZipFile(data_dir + "jobs.csv.zip", 'r') as zip_ref:
    zip_ref.extractall(data_dir)
job_set = pd.read_csv(data_dir + "jobs.csv")
user_set = pd.read_csv(data_dir + "users.csv")
dataset = pd.read_csv(data_dir + "dataset.csv")
work_history = pd.read_csv(data_dir + "history.csv")
ranking_data = pd.read_csv(data_dir + "ranking.csv")
jobs_segment_file = data_dir + "jobs_segment.csv"

embedding_path = model_dir + "embedding.pt"
model_path = model_dir + "word2vec.model"
trained_model_path = model_dir + "textCNN_ckpt.model"

#### Text Preprocessing

In [5]:
nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('wordnet')

punctuation = list('。，？！：%&~（）、；“”&|,.?!:%&~();""#@【】/-\'$+*`[]{}()')
stop_words = stopwords.words("english")
stop_words.extend(["n't", "wo", "'m", "'s", "'ve", "'d", "'ll", "``", "''", "--", "..."])
stop_words.extend(punctuation)
wordnet_lematizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /Users/alper/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /Users/alper/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/alper/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /Users/alper/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
def pretreatment(comment):
    '''
    remove punctuations, numbers and urls
    lower case conversion
    remove stop words
    lemmatization
    '''

    token_words = word_tokenize(comment)
    token_words = [w.lower() for w in token_words]
    token_words = [w for w in token_words if w not in stop_words]
    token_words = pos_tag(token_words)
    cleaned_word = []
    for word, tag in token_words:
        if word.isdigit():
            continue
        if tag.startswith('NN'):
            word_lematizer = wordnet_lematizer.lemmatize(word, pos='n')  # n for noun
        elif tag.startswith('VB'):
            word_lematizer = wordnet_lematizer.lemmatize(word, pos='v')  # v for verb
        elif tag.startswith('JJ'):
            word_lematizer = wordnet_lematizer.lemmatize(word, pos='a')  # a for adjective
        elif tag.startswith('R'):
            word_lematizer = wordnet_lematizer.lemmatize(word, pos='r')  # r for pronoun
        else:
            word_lematizer = wordnet_lematizer.lemmatize(word)
        cleaned_word.append(word_lematizer)

    return cleaned_word

#### Load Job File

In [7]:
segment = []
job_set = job_set.fillna("")
job_set["word"] = job_set.Title + job_set.Description + job_set.Requirements
for content in tqdm(job_set["word"].values):
    segment.append(pretreatment(content))
job_set["text"] = segment
job_set.to_csv(jobs_segment_file, index=False)

100%|██████████| 115684/115684 [09:52<00:00, 195.28it/s]


### Training

#### Training Classes and Functions

In [8]:
class Preprocess():
    def __init__(self, sentences, sen_len, w2v_path):
        '''
        param: sentences: the list of corpus
               sen_len: the max length of each sentence
               w2v_path: the path storing word emnbedding model
        '''

        self.w2v_path = w2v_path
        self.sentences = sentences
        self.sen_len = sen_len
        self.idx2word = []
        self.word2idx = {}
        self.embedding_matrix = []
        if w2v_path:
            self.embedding = Word2Vec.load(self.w2v_path)
            self.embedding_dim = self.embedding.vector_size
        else:
            self.embedding = None
            self.embedding_dim = None

    def add_embedding(self, word):
        self.word2idx[word] = len(self.word2idx)
        self.idx2word.append(word)
        if self.embedding_matrix is None:
            vector = torch.zeros(100)
            self.embedding_matrix = vector.unsqueeze(0)
        else:
            vector = torch.zeros(self.embedding_matrix.size(1))
            self.embedding_matrix = torch.cat([self.embedding_matrix, vector.unsqueeze(0)], dim=0)
        #vector = torch.empty(1, self.embedding_dim)
        #torch.nn.init.uniform_(vector)

    def make_embedding(self, load=True):
        print("Get embedding ...")
        if load:
            print("loading word2vec model ...")
            self.embedding = Word2Vec.load(self.w2v_path)
            self.embedding_matrix = []
            for word in self.embedding.wv.index_to_key:
                self.word2idx[word] = len(self.word2idx)
                self.idx2word.append(word)
                self.embedding_matrix.append(self.embedding.wv[word])
            self.embedding_matrix = torch.tensor(self.embedding_matrix)
        else:
            raise NotImplementedError
        self.add_embedding("<PAD>")
        self.add_embedding("<UNK>")
        print("total words: {}".format(len(self.embedding_matrix)))
        return self.embedding_matrix

    def pad_sentence(self, sentence):
        if len(sentence) > self.sen_len:
            sentence = sentence[:self.sen_len]
        else:
            pad_len = self.sen_len - len(sentence)
            for _ in range(pad_len):
                sentence.append(self.word2idx['<PAD>'])
        assert len(sentence) == self.sen_len
        return sentence

    def sentence_word2idx(self):
        '''
        change words in sentences into idx in embedding_matrix
        '''
        sentence_list = []
        for i, sen in enumerate(self.sentences):
            sentence_idx = []
            for word in sen:
                if (word in self.word2idx.keys()):
                    sentence_idx.append(self.word2idx[word])
                else:
                    sentence_idx.append(self.word2idx['<UNK>'])
            sentence_idx = self.pad_sentence(sentence_idx)
            sentence_list.append(sentence_idx)
        return torch.LongTensor(sentence_list)

    def labels_to_tensor(self, y):
        return torch.LongTensor(y)


class TextCNN(nn.Module):
    def __init__(self, channels, kernel_size, pool_size, dim, method='max'):
        super(TextCNN, self).__init__()
        self.net1 = nn.Sequential(
            nn.Conv2d(1, channels, kernel_size[0]),
            nn.BatchNorm2d(channels),
            nn.ReLU(),
            nn.MaxPool2d(pool_size)
        )
        self.net2 = nn.Sequential(
            nn.Conv2d(channels, channels, kernel_size[1]),
            nn.BatchNorm2d(channels),
            nn.ReLU(),
            nn.AdaptiveMaxPool2d((1, dim))
        )
        self.sigmoid = nn.Sigmoid()
        if method is 'max':
            self.pool = nn.AdaptiveMaxPool2d((1, dim))
        elif method is 'mean':
            self.pool = nn.AdaptiveAvgPool2d((1, dim))
        else:
            raise ValueError('method {} not exist'.format(method))

    def forward(self, x):
        x = self.net1(x)
        x = self.net2(x).squeeze(2)
        x = self.pool(x).squeeze(1)
        x = self.sigmoid(x)
        return x


class MLP(nn.Module):
    def __init__(self, input_size, output_size, dropout):
        super(MLP, self).__init__()
        self.net = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(input_size, input_size),
            nn.ReLU(),
            nn.Linear(input_size, output_size),
            nn.Sigmoid()

        )

    def forward(self, x):
        x = self.net(x)
        return x


class PJFNN(nn.Module):
    def __init__(self, embedding, input_dim, channels=1, dropout=0.5, fix_embedding=True):
        super(PJFNN, self).__init__()
        self.dim = embedding.size(1)
        self.user_dim = input_dim
        self.channels = channels
        self.embedding = nn.Embedding(embedding.size(0), embedding.size(1))
        self.embedding.weight = nn.Parameter(embedding)
        self.embedding.weight.requires_grad = False if fix_embedding else True
        # self.emb = nn.Embedding.from_pretrained(
        #    torch.from_numpy(np.load( os.path.join(args['dataset']['path'], 'emb.npy') )),
        #    freeze=False,
        #    padding_idx=0
        #)

        # self.geek_layer = TextCNN(
        #     channels=args['dataset']['max_sent_num']['geek'],
        #     kernel_size=[(5, 1), (3, 1)],
        #     pool_size=(2, 1),
        #     dim=dim,
        #     method='max'
        # )
        self.user_layer = MLP(self.user_dim, 64, dropout=dropout)
        self.linear_transform = nn.Linear(200, 64)
        self.job_layer = TextCNN(
            channels=self.channels,
            kernel_size=[(5, 1), (5, 1)],
            pool_size=(2, 1),
            dim=200,
            method='mean'
        )

        self.mlp = MLP(
            input_size=128,
            output_size=1,
            dropout=dropout
        )

    def forward(self, job, user):
        job = self.embedding(job)
        job = job.unsqueeze(1)
        job = self.job_layer(job)
        user = self.user_layer(user)
        job = self.linear_transform(job)
        x = torch.cat((user, job), dim=1)
        x = self.mlp(x).squeeze(1)
        return x


class PJFNN_LSTM(nn.Module):
    def __init__(self, embedding, input_dim, hidden_dim=256, num_layers=1, dropout=0., fix_embedding=True):
        super(PJFNN_LSTM, self).__init__()
        self.embedding_dim = embedding.size(1)
        self.user_dim = input_dim
        self.embedding = nn.Embedding(embedding.size(0), embedding.size(1))
        self.embedding.weight = nn.Parameter(embedding)
        self.embedding.weight.requires_grad = False if fix_embedding else True
        # self.emb = nn.Embedding.from_pretrained(
        #    torch.from_numpy(np.load( os.path.join(args['dataset']['path'], 'emb.npy') )),
        #    freeze=False,
        #    padding_idx=0
        #)

        # self.geek_layer = TextCNN(
        #     channels=args['dataset']['max_sent_num']['geek'],
        #     kernel_size=[(5, 1), (3, 1)],
        #     pool_size=(2, 1),
        #     dim=dim,
        #     method='max'
        # )
        self.user_layer = MLP(self.user_dim, 64, dropout=dropout)
        self.linear_transform = nn.Linear(256, 64)
        self.job_layer = nn.LSTM(self.embedding_dim, hidden_dim, num_layers=num_layers, batch_first=True,
                                 dropout=dropout)

        self.classifier = nn.Sequential(nn.Dropout(dropout), nn.Linear(128, 1), nn.Sigmoid())

    def forward(self, job, user):
        job = self.embedding(job)
        job, _ = self.job_layer(job, None)
        job = job[:, -1, :]
        user = self.user_layer(user)
        job = self.linear_transform(job)
        x = torch.cat((user, job), dim=1)
        x = self.classifier(x).squeeze(1)
        return x


class JobUserDataset(data.Dataset):
    '''
    Expected data shape like:(data_num, data_len)
    '''

    def __init__(self, job, user, label):
        self.job = job
        self.user = user
        self.label = label

    def __getitem__(self, idx):
        if self.label is None: return self.job[idx], self.user[idx]
        return self.job[idx], self.user[idx], self.label[idx]

    def __len__(self):
        return min(len(self.job), len(self.user), len(self.label) if self.label is not None else float('inf'))

  if method is 'max':
  elif method is 'mean':


In [9]:
def training(batch_size, n_epoch, lr, train, valid, model, device, model_name, model_dir):
    # summary model parameters
    total = sum(p.numel() for p in model.parameters())
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)

    print("\nstart training, total parameter:{}, trainable:{}\n".format(total, trainable))
    #model.cuda()
    model.to(device)
    model.train()
    criterion = nn.BCELoss()
    t_batch = len(train)
    print("batch size:{}, epoch:{}, t_batch:{}".format(batch_size, n_epoch, t_batch))
    v_batch = len(valid)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    total_loss, total_acc, best_acc = 0, 0, 0
    train_losses, val_losses = [], []
    train_acc, val_acc = [], []
    pred_label = []
    y_label = []

    for epoch in range(n_epoch):
        start_time = time.time()
        total_loss, total_acc = 0.0, 0
        # training
        print("epoch-{} training has started.".format(epoch))
        for i, (jobs, users, labels) in enumerate(train):
            jobs = jobs.to(device)
            users = users.to(torch.float32)
            users = users.to(device)
            labels = labels.to(device)
            labels = labels.to(torch.float32)
            model.zero_grad()
            optimizer.zero_grad()
            outputs = model(jobs, users)
            #print(f"Raw model outputs: {outputs}")
            #outputs = torch.sigmoid(outputs)
            #print(f"Sigmoid model outputs: {outputs}")
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            pred_label.extend([0 if i < 0.5 else 1 for i in list(outputs.cpu().detach().numpy())])
            y_label.extend(list(labels.cpu().detach().numpy()))
        train_losses.append(total_loss / t_batch)
        train_acc.append(accuracy_score(y_label, pred_label))
        print('[ Epoch{}: {}/{}] '.format(
            epoch + 1, i + 1, t_batch))
        print('\nTrain | Loss:{:.5f} Time:{:.6f}'.format(total_loss / t_batch, time.time() - start_time))

        print("epoch-{} evaluation has started.".format(epoch))
        # evaluation
        model.eval()
        with torch.no_grad():
            pred_label = []
            y_label = []
            total_loss, total_acc = 0.0, 0
            for i, (jobs, users, labels) in enumerate(valid):
                jobs = jobs.to(device)
                users = users.to(torch.float32)
                users = users.to(device)
                labels = labels.to(torch.float32)
                labels = labels.to(device)
                outputs = model(jobs, users)
                #outputs = torch.sigmoid(outputs)
                loss = criterion(outputs, labels)
                total_loss += loss.item()
                pred_label.extend([0 if i < 0.5 else 1 for i in list(outputs.cpu().detach().numpy())])
                y_label.extend(list(labels.cpu().detach().numpy()))
            # print('\nVal | Loss:{:.5f} Time:{:.6f}'.format(total_loss/v_batch, time.time()-start_time))
            val_losses.append(total_loss / v_batch)
            total_acc = accuracy_score(y_label, pred_label)
            val_acc.append(total_acc)
            print('\nVal | ACC:{:.5f} Time:{:.6f}'.format(total_acc, time.time() - start_time))
            if total_acc > best_acc:
                best_acc = total_acc
                torch.save(model, "{}/{}_ckpt.model".format(model_dir, model_name))
                print('save model with acc {:.3f}'.format(total_acc))

        print("epoch-{} has finished.".format(epoch))
        print('------------------------------------------------------')
        model.train()
    return train_losses, val_losses, train_acc, val_acc


def test(model, loader):
    predictions = []
    y_labels = []
    pred = []
    model.eval()
    for i, (jobs, users, labels) in enumerate(loader):
        jobs = jobs.to(device)
        users = users.to(device, dtype=torch.float32)
        labels = labels.to(device, dtype=torch.float32)
        outputs = model(jobs, users)
        pred.extend(list(outputs.cpu().detach().numpy()))
        predictions.extend([0 if i < 0.5 else 1 for i in list(outputs.cpu().detach().numpy())])
        y_labels.extend(list(labels.cpu().detach().numpy()))
        report = classification_report(y_labels, predictions, digits=4)
    report = report.splitlines()
    columns = ['class'] + report[0].split()
    col_1, col_2, col_3, col_4, col_5 = [], [], [], [], []
    for row in report[1:]:
        if len(row.split()) != 0:
            row = row.split()
            if len(row) < 5:
                col_1.append(row[0])
                col_2.append('')
                col_3.append('')
                col_4.append(row[1])
                col_5.append(row[2])
            elif len(row) > 5:
                col_1.append(row[0] + ' ' + row[1])
                col_2.append(row[2])
                col_3.append(row[3])
                col_4.append(row[4])
                col_5.append(row[5])
            else:
                col_1.append(row[0])
                col_2.append(row[1])
                col_3.append(row[2])
                col_4.append(row[3])
                col_5.append(row[4])
    result = pd.DataFrame()
    col_1.append("overall")
    col_2.append(precision_score(y_labels, predictions))
    col_3.append(recall_score(y_labels, predictions))
    col_4.append(f1_score(y_labels, predictions))
    col_5.append(roc_auc_score(y_labels, pred))
    result[columns[0]] = col_1
    result[columns[1]] = col_2
    result[columns[2]] = col_3
    result[columns[3]] = col_4
    result[columns[4]] = col_5
    print("——————Test——————")
    #     print(result)
    return result

#### Train Model

In [10]:
w2v_model = word2vec.Word2Vec(job_set.text.values, vector_size=200, window=5, min_count=2, workers=8, epochs=10, sg=1)
w2v_model.save(model_path)
w2v_model = Word2Vec.load(model_path)
word_history_tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, max_features=50, stop_words='english')
word_history_tf_matrix = word_history_tf.fit_transform(work_history.groupby("UserID").JobTitle.sum().values)

### Build Dataset

In [11]:
train_user = user_set[user_set.Split == "Train"].UserID.values
test_user = user_set[user_set.Split == "Test"].UserID.values
train_data = dataset[dataset.UserID.isin(train_user)]
test_data = dataset[dataset.UserID.isin(test_user)]
job_set = pd.read_csv(jobs_segment_file)
text = []
for i in job_set.text:
    temp = i[1:-1].split(',')
    text.append([t.strip()[1:-1] for t in temp])
job_set["text"] = text

In [12]:
groups = train_data.groupby("UserID")
job_train = []
user_train = np.zeros((1, 58))
Y_train = []
for u_id, group in tqdm(groups):
    user = user_set[user_set.UserID == u_id][
        ["DegreeType", "WorkHistoryCount", "TotalYearsExperience", "CurrentlyEmployed",
         "ManagedOthers", "ManagedHowMany"]]
    u_idx = user.index.values[0]
    user_feature = np.concatenate((user.values, word_history_tf_matrix[u_idx, :].toarray()), axis=1)
    job_id_list = group.JobID.values
    jobs = job_set[job_set.JobID.isin(job_id_list)]
    job_train.extend(jobs.text.values.tolist())
    user_feature = user_feature.repeat(len(jobs), axis=0)
    user_feature = np.concatenate((user_feature, group[["State", "City"]].values), axis=1)
    user_train = np.concatenate((user_train, user_feature), axis=0)
    Y_train.extend(group.label.values.tolist())

100%|██████████| 18486/18486 [00:40<00:00, 452.74it/s]


In [13]:
groups = test_data.groupby("UserID")
job_test = []
user_test = np.zeros((1, 58))
Y_test = []
for u_id, group in tqdm(groups):
    user = user_set[user_set.UserID == u_id][
        ["DegreeType", "WorkHistoryCount", "TotalYearsExperience", "CurrentlyEmployed",
         "ManagedOthers", "ManagedHowMany"]]
    u_idx = user.index.values[0]
    user_feature = np.concatenate((user.values, word_history_tf_matrix[u_idx, :].toarray()), axis=1)
    job_id_list = group.JobID.values
    jobs = job_set[job_set.JobID.isin(job_id_list)]
    job_test.extend(jobs.text.values.tolist())
    user_feature = user_feature.repeat(len(jobs), axis=0)
    user_feature = np.concatenate((user_feature, group[["State", "City"]].values), axis=1)
    user_test = np.concatenate((user_test, user_feature), axis=0)
    Y_test.extend(group.label.values.tolist())

100%|██████████| 260/260 [00:00<00:00, 760.75it/s]


In [14]:
train_len = len(job_train)
job_train.extend(job_test)
Y_train.extend(Y_test)
sen_len = 200
preprocess = Preprocess(job_train, sen_len, w2v_path=model_path)
embedding = preprocess.make_embedding(load=True)
x = preprocess.sentence_word2idx()
y = preprocess.labels_to_tensor(Y_train)
torch.save(embedding, embedding_path)

Get embedding ...
loading word2vec model ...


  self.embedding_matrix = torch.tensor(self.embedding_matrix)


total words: 108123


In [15]:
train_x = x[:70000]
train_y = y[:70000]
val_x = x[70000:70680]
val_y = y[70000:70680]
test_x = x[70680:]
test_y = y[70680:]
train_user = torch.from_numpy(user_train[1:70001])
val_user = torch.from_numpy(user_train[70001:])
test_user = torch.from_numpy(user_test[1:])

train_user = torch.nan_to_num(train_user)
val_user = torch.nan_to_num(val_user)
test_user = torch.nan_to_num(test_user)

train_dataset = JobUserDataset(train_x, train_user, train_y)
val_dataset = JobUserDataset(val_x, val_user, val_y)
test_dataset = JobUserDataset(test_x, test_user, test_y)
# torch.save(train_dataset,"train.dataset")
# torch.save(val_dataset,"val.dataset")
# torch.save(test_dataset, "test.dataset")

# train_dataset = torch.load("train.dataset")
# val_dataset = torch.load("val.dataset")
# test_dataset = torch.load("test.dataset")
# embedding = torch.load(embedding_path)
batch_size = 32
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=False)
val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)


### Train and Test

In [16]:
fix_embedding = False
input_dim = train_dataset[0][1].shape[0]
model = PJFNN(embedding, input_dim, dropout=0.7, channels=32, fix_embedding=fix_embedding)
#epoch = 10
epoch = 2
lr = 0.0005
#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("mps") if torch.backends.mps.is_built() else torch.device("cpu")

train_losses, val_losses, train_acc, val_acc = training(batch_size, epoch, lr, train_loader, val_loader, model, device,
                                                        "textCNN", model_dir)


start training, total parameter:21666775, trainable:21666775

batch size:32, epoch:2, t_batch:2188
epoch-0 training has started.
[ Epoch1: 2188/2188] 

Train | Loss:0.69497 Time:146.327729
epoch-0 evaluation has started.

Val | ACC:0.48653 Time:146.668194
save model with acc 0.487
epoch-0 has finished.
------------------------------------------------------
epoch-1 training has started.
[ Epoch2: 2188/2188] 

Train | Loss:0.69360 Time:144.505541
epoch-1 evaluation has started.

Val | ACC:0.50449 Time:144.811056
save model with acc 0.504
epoch-1 has finished.
------------------------------------------------------


In [17]:
test(torch.load(trained_model_path), test_loader)

  test(torch.load(trained_model_path), test_loader)


——————Test——————


Unnamed: 0,class,precision,recall,f1-score,support
0,0.0,0.5032,0.9119,0.6485,522.0
1,1.0,0.5208,0.0962,0.1623,520.0
2,accuracy,,,0.5048,1042.0
3,macro avg,0.512,0.504,0.4054,1042.0
4,weighted avg,0.512,0.5048,0.4059,1042.0
5,overall,0.520833,0.096154,0.162338,0.522985


### Recommendation

In [18]:
groups = ranking_data.groupby("UserID")
job_rank = []
user_rank = np.zeros((1, 58))
for u_id, group in tqdm(groups):
    user = user_set[user_set.UserID == u_id][
        ["DegreeType", "WorkHistoryCount", "TotalYearsExperience", "CurrentlyEmployed", "ManagedOthers",
         "ManagedHowMany"]]
    u_idx = user.index.values[0]
    user_feature = np.concatenate((user.values, word_history_tf_matrix[u_idx, :].toarray()), axis=1)
    job_id_list = group.JobID.values
    jobs = job_set[job_set.JobID.isin(job_id_list)]
    job_rank.extend(jobs.text.values.tolist())
    user_feature = user_feature.repeat(len(jobs), axis=0)
    user_feature = np.concatenate((user_feature, group[["State", "City"]].values), axis=1)
    user_rank = np.concatenate((user_rank, user_feature), axis=0)

100%|██████████| 260/260 [00:00<00:00, 327.05it/s]


In [19]:
sen_len = 200
preprocess = Preprocess(job_rank, sen_len, w2v_path=model_path)
embedding = preprocess.make_embedding(load=True)
rank_x = preprocess.sentence_word2idx()
rank_dataset = JobUserDataset(rank_x, user_rank, None)
rank_loader = DataLoader(dataset=rank_dataset, batch_size=100, shuffle=False)
num_user = len(ranking_data.UserID.unique())
m = torch.load(trained_model_path)

Get embedding ...
loading word2vec model ...
total words: 108123


  m = torch.load(trained_model_path)


### Evaluation

#### Evaluation Functions

In [20]:
def test_hit_rate(model, k, rank_loader):
    hit = 0
    num_users = 0
    model.eval()
    for jobs, users in rank_loader:
        jobs = jobs.to(device)
        users = users.to(torch.float32)
        users = users.to(device)
        outputs = model(jobs, users)
        pred = outputs.cpu().detach().numpy()
        if pred.size == 0:
            continue
        a = -np.sort(-pred)
        matches = np.argwhere(a == pred[0])
        if matches.size == 0:
            continue
        idx = np.argwhere(a == pred[0])[0][0]
        if idx <= k - 1:
            hit += 1
        num_users += 1

    return hit / num_user


def test_ndcg(model, k, rank_loader):
    ndcg_sum = 0
    num_users = 0
    model.eval()

    for jobs, users in rank_loader:
        jobs = jobs.to(device)
        users = users.to(torch.float32).to(device)
        outputs = model(jobs, users)
        predictions = outputs.cpu().detach().numpy()

        if len(predictions) == 0:
            continue

        relevance = [1 if pred == predictions[0] else 0 for pred in predictions]
        sorted_relevance = [relevance[i] for i in np.argsort(-predictions)]

        # DCG
        dcg = 0
        for i in range(k):
            if i < len(sorted_relevance):
                dcg += sorted_relevance[i] / np.log2(i + 2)  # log2(i + 2) because of 1-based indexing

        # IDCG
        ideal_relevance = sorted(sorted_relevance, reverse=True)
        idcg = 0
        for i in range(k):
            if i < len(ideal_relevance):
                idcg += ideal_relevance[i] / np.log2(i + 2)

        # NDCG
        ndcg = dcg / idcg if idcg > 0 else 0
        ndcg_sum += ndcg
        num_users += 1

    return ndcg_sum / num_users if num_users > 0 else 0

#### Evaluation Results

In [21]:
ndcg_5 = test_ndcg(model, 5, rank_loader)
ndcg_10 = test_ndcg(model, 10, rank_loader)

print("\n" + "-" * 30)
print(f"{'Metric':<15}{'Score':<10}")
print("-" * 30)
print(f"{'nDCG@5':<15}{ndcg_5:<10.4f}")
print(f"{'nDCG@10':<15}{ndcg_10:<10.4f}")
print("-" * 30)


------------------------------
Metric         Score     
------------------------------
nDCG@5         0.1980    
nDCG@10        0.2101    
------------------------------


In [22]:
hits_5 = test_hit_rate(model, 5, rank_loader)
hits_10 = test_hit_rate(model, 10, rank_loader)

print("\n" + "-" * 30)
print(f"{'Metric':<15}{'Score':<10}")
print("-" * 30)
print(f"{'hits@5':<15}{hits_5:<10.4f}")
print(f"{'hits@10':<15}{hits_10:<10.4f}")
print("-" * 30)


------------------------------
Metric         Score     
------------------------------
hits@5         0.2308    
hits@10        0.2692    
------------------------------
