In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import string
import pickle
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler

In [2]:
# Placeholders for the three tables; they are filled in by the pickle-loading cell below.
users = jobs = applied = None

In [3]:
def _load_pickle(path):
    """Open `path` in binary mode and return the unpickled object."""
    # NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Load the pre-built user, job and application tables
# (presumably pandas DataFrames, given the `.join` calls further down).
users = _load_pickle("Data/users_df.p")
jobs = _load_pickle("Data/jobs_df.p")
applied = _load_pickle("Data/applied_df.p")

In [5]:
# Attach user features, then job features, to every application row.
# `join(..., on=col)` matches `col` against the *index* of the right-hand table, and any
# clashing right-hand column gets a "_" suffix; the duplicated id columns are dropped.
# NOTE(review): this assumes `users` / `jobs` are indexed by UserID / JobID — confirm.
with_users = applied.join(users, on="UserID", rsuffix="_")
with_users = with_users.drop(columns=["UserID_"])
with_jobs = with_users.join(jobs, on="JobID", rsuffix="_")
merged = with_jobs.drop(columns=["JobID_"])

In [6]:
# Hold out 10% of the merged rows as the test set (fixed seed for reproducibility).
train, test = train_test_split(
    merged,
    test_size=0.1,
    random_state=0,
)

In [7]:
# Carve a validation set (10%) out of the remaining training rows.
# NOTE: `train` is rebound here, so re-running this cell on its own shrinks the
# training set again; re-run the previous split cell first.
train, val = train_test_split(
    train,
    test_size=0.1,
    random_state=0,
)

In [8]:
# Separate the "Applied" target column from the feature columns for each split.
X_train = train.drop(columns=["Applied"])
y_train = train["Applied"]

X_val = val.drop(columns=["Applied"])
y_val = val["Applied"]

X_test = test.drop(columns=["Applied"])
y_test = test["Applied"]

In [9]:
# Convert the training split to NumPy arrays. The first two feature columns are
# sliced off (presumably the UserID / JobID identifiers — TODO confirm column order).
train_matrix = np.array(X_train)
X_train_arr = train_matrix[:, 2:]
y_train_arr = np.array(y_train)

In [10]:
# Number of training rows at this point in the notebook.
len(X_train_arr)

923

In [11]:
# Convert the validation split to NumPy arrays. The first two feature columns are
# sliced off (presumably the UserID / JobID identifiers — TODO confirm column order).
val_matrix = np.array(X_val)
X_val_arr = val_matrix[:, 2:]
y_val_arr = np.array(y_val)

In [12]:
# Convert the test split to NumPy arrays. The first two feature columns are
# sliced off (presumably the UserID / JobID identifiers — TODO confirm column order).
test_matrix = np.array(X_test)
X_test_arr = test_matrix[:, 2:]
y_test_arr = np.array(y_test)

In [21]:
# Randomly oversample the training rows (seeded) and rebind the training arrays to the
# resampled versions. Only the training split is resampled in this cell.
resampler = RandomOverSampler(random_state=0)
balanced_X, balanced_y = resampler.fit_resample(X_train_arr, y_train_arr)
X_train_arr, y_train_arr = balanced_X, balanced_y

In [14]:
# Number of training rows at this point in the notebook (placed after the oversampling cell).
len(X_train_arr)

940

In [15]:
# Group sizes for the ranker: each split is described as one single query group that
# contains all of its rows. (`query_test` is not used in the cells visible here.)
query_train = [len(X_train_arr)]
query_val = [len(X_val_arr)]
query_test = [len(X_test_arr)]

In [73]:
# Gradient-boosted (gbdt) LightGBM ranker with a fixed seed and a learning rate of 0.2.
model = lgb.LGBMRanker(
    boosting_type="gbdt",
    learning_rate=0.2,
    random_state=0,
)

In [100]:
# Fit the ranker with the training rows as one query group and the validation rows as
# the evaluation group. NDCG is reported on the validation group at cut-offs 5, 25 and
# 103 (103 presumably equals the number of validation rows — TODO confirm).
#
# Early stopping (patience of 50 rounds) is requested through the `lgb.early_stopping`
# callback instead of the `early_stopping_rounds` keyword: that keyword was deprecated
# and then removed from `fit()` in LightGBM 4.0, whereas the callback is accepted by
# both older and newer releases.
model.fit(
    X=X_train_arr,
    y=y_train_arr,
    group=query_train,
    eval_set=[(X_val_arr, y_val_arr)],
    eval_group=[query_val],
    eval_at=[103, 5, 25],
    callbacks=[lgb.early_stopping(stopping_rounds=50)],
)

[1]	valid_0's ndcg@5: 0.66084	valid_0's ndcg@25: 0.518089	valid_0's ndcg@103: 0.824312
Training until validation scores don't improve for 50 rounds
[2]	valid_0's ndcg@5: 0.50874	valid_0's ndcg@25: 0.572804	valid_0's ndcg@103: 0.840211
[3]	valid_0's ndcg@5: 0.383566	valid_0's ndcg@25: 0.458429	valid_0's ndcg@103: 0.801628
[4]	valid_0's ndcg@5: 0.529635	valid_0's ndcg@25: 0.504864	valid_0's ndcg@103: 0.815173
[5]	valid_0's ndcg@5: 0.470365	valid_0's ndcg@25: 0.494701	valid_0's ndcg@103: 0.827069
[6]	valid_0's ndcg@5: 0.345191	valid_0's ndcg@25: 0.441304	valid_0's ndcg@103: 0.794196
[7]	valid_0's ndcg@5: 0.485229	valid_0's ndcg@25: 0.487545	valid_0's ndcg@103: 0.821538
[8]	valid_0's ndcg@5: 0.470365	valid_0's ndcg@25: 0.494347	valid_0's ndcg@103: 0.828607
[9]	valid_0's ndcg@5: 0.470365	valid_0's ndcg@25: 0.48723	valid_0's ndcg@103: 0.821605
[10]	valid_0's ndcg@5: 0.485229	valid_0's ndcg@25: 0.491367	valid_0's ndcg@103: 0.824262
[11]	valid_0's ndcg@5: 0.485229	valid_0's ndcg@25: 0.526509	v

[93]	valid_0's ndcg@5: 0.684352	valid_0's ndcg@25: 0.676586	valid_0's ndcg@103: 0.886836
[94]	valid_0's ndcg@5: 0.514771	valid_0's ndcg@25: 0.613828	valid_0's ndcg@103: 0.849682
[95]	valid_0's ndcg@5: 0.654809	valid_0's ndcg@25: 0.687112	valid_0's ndcg@103: 0.877382
[96]	valid_0's ndcg@5: 0.639945	valid_0's ndcg@25: 0.657411	valid_0's ndcg@103: 0.875272
[97]	valid_0's ndcg@5: 0.654809	valid_0's ndcg@25: 0.661637	valid_0's ndcg@103: 0.877537
[98]	valid_0's ndcg@5: 0.786014	valid_0's ndcg@25: 0.667223	valid_0's ndcg@103: 0.880999
[99]	valid_0's ndcg@5: 0.654809	valid_0's ndcg@25: 0.663937	valid_0's ndcg@103: 0.879687
[100]	valid_0's ndcg@5: 0.786014	valid_0's ndcg@25: 0.695868	valid_0's ndcg@103: 0.883384
Did not meet early stopping. Best iteration is:
[57]	valid_0's ndcg@5: 0.868795	valid_0's ndcg@25: 0.653365	valid_0's ndcg@103: 0.889758


LGBMRanker(learning_rate=0.2, random_state=0)

In [101]:
# Score the training rows and binarise the raw ranker outputs: positive scores become 1,
# negative scores become 0, and a score of exactly 0 is left unchanged.
train_scores = model.predict(X_train_arr)
preds = np.where(train_scores > 0, 1.0, np.where(train_scores < 0, 0.0, train_scores))

# Accuracy as a percentage: 1 - (sum of |prediction - label|) / (number of rows).
# (Assumes the labels are 0/1 so that each mismatch contributes exactly 1.)
train_error_sum = np.sum(np.abs(preds - y_train_arr))
train_accuracy = (1 - train_error_sum / y_train_arr.shape[0]) * 100
print(f"Train Accuracy: %{train_accuracy}")

Train Accuracy: %96.59574468085106


In [102]:
# Score the validation rows and binarise the raw ranker outputs: positive scores become 1,
# negative scores become 0, and a score of exactly 0 is left unchanged.
val_scores = model.predict(X_val_arr)
preds = np.where(val_scores > 0, 1.0, np.where(val_scores < 0, 0.0, val_scores))

# Accuracy as a percentage: 1 - (sum of |prediction - label|) / (number of rows).
# (Assumes the labels are 0/1 so that each mismatch contributes exactly 1.)
val_error_sum = np.sum(np.abs(preds - y_val_arr))
val_accuracy = (1 - val_error_sum / y_val_arr.shape[0]) * 100
print(f"Validation Accuracy: %{val_accuracy}")

Validation Accuracy: %56.310679611650485


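The validation accuracy above (about 56%) is our estimate of how the model will perform on unseen data, so we expect the test accuracy to fall in a similar range. We now combine the training and validation sets, refit the model on the combined data, and evaluate it on the test set.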

In [103]:
# Append the validation rows to the training rows: features are joined
# row-wise, labels are joined end-to-end.
X_train_set = np.concatenate((X_train_arr, X_val_arr), axis=0)
y_train_set = np.concatenate((y_train_arr, y_val_arr), axis=0)

In [104]:
# Refit the same estimator on the combined train + validation rows, treated as a single
# query group. This call passes no eval set and no early-stopping arguments.
model.fit(X=X_train_set, y=y_train_set, group=[X_train_set.shape[0]])

# Score the held-out test rows and binarise the raw ranker outputs: positive scores
# become 1, negative scores become 0, and a score of exactly 0 is left unchanged.
test_scores = model.predict(X_test_arr)
preds = np.where(test_scores > 0, 1.0, np.where(test_scores < 0, 0.0, test_scores))

# Accuracy as a percentage: 1 - (sum of |prediction - label|) / (number of rows).
# (Assumes the labels are 0/1 so that each mismatch contributes exactly 1.)
test_error_sum = np.sum(np.abs(preds - y_test_arr))
test_accuracy = (1 - test_error_sum / y_test_arr.shape[0]) * 100
print(f"Test Accuracy: %{test_accuracy}")

Test Accuracy: %59.64912280701755


In [106]:
# Persist the fitted ranker (the version refit on train + validation) to disk.
with open("model.p", "wb") as model_file:
    pickle.dump(model, model_file)