# Perform the ranking tasks

- randomly select 20 candidate jobs and rank them.

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
import random
import warnings
warnings.filterwarnings("ignore")

from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score, roc_auc_score, recall_score, precision_score, f1_score, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB

In [2]:
def show_result(y_true, y_prob):
    y_prediction = [0 if i<=0.5 else 1 for i in y_prob]
    report = classification_report(y_true,y_prediction,digits=4)
    report = report.splitlines()
    columns = ['class'] + report[0].split()
    col_1, col_2, col_3, col_4, col_5 = [], [], [], [], []
    for row in report[1:]:
        if len(row.split()) != 0:
            row = row.split()
            if len(row) < 5:
                col_1.append(row[0])
                col_2.append('')
                col_3.append('')
                col_4.append(row[1])
                col_5.append(row[2])
            elif len(row) > 5:
                col_1.append(row[0] + ' ' + row[1])
                col_2.append(row[2])
                col_3.append(row[3])
                col_4.append(row[4])
                col_5.append(row[5])
            else:
                col_1.append(row[0])
                col_2.append(row[1])
                col_3.append(row[2])
                col_4.append(row[3])
                col_5.append(row[4])
    col_1.append("overall")
    col_2.append(precision_score(y_true, y_prediction))
    col_3.append(recall_score(y_true, y_prediction))
    col_4.append(f1_score(y_true, y_prediction))
    col_5.append(roc_auc_score(y_true, y_prob))
    result = pd.DataFrame()
    result[columns[0]] = col_1
    result[columns[1]] = col_2
    result[columns[2]] = col_3
    result[columns[3]] = col_4
    result[columns[4]] = col_5
    print("——————Test——————")
    print(result)

In [3]:
user_set = pd.read_csv("user_set_cleaned.csv")
job_set = pd.read_csv("job_set_cleaned.csv")
work_history = pd.read_csv("work_history_cleaned.csv")
dataset = pd.read_csv("dataset_cleaned.csv")

In [4]:
X_train = np.load("X_train.npy")
Y_train = np.load("Y_train.npy")
X_test = np.load("X_test.npy")
Y_test = np.load("Y_test.npy")

# 1. Build datasets

In [5]:
# about 1 min
job_set = job_set.fillna(" ")
job_set["word"] = job_set.Title + job_set.Description + job_set.Requirements
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=5, max_features=100, stop_words='english')
tfidf_matrix = tf.fit_transform(job_set['word'])

In [7]:
word_history_tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=1, max_features=50, stop_words='english')
word_history_tf_matrix = word_history_tf.fit_transform(work_history.groupby("UserID").JobTitle.sum().values)

In [8]:
test_user = user_set[user_set.Split=="Test"].UserID.values
test_data = dataset[dataset.UserID.isin(test_user)]

In [32]:
ranking_data = pd.DataFrame(columns = ["UserID","JobID","label", "City", "State"])
job_id = job_set.JobID.unique().tolist()
groups = test_data.groupby("UserID")
user_ids = []
job_ids = []
labels = []
City = []
State = []
for idx, group in tqdm(groups):
    size = 99
    exist_job = group.JobID.unique().tolist()
    candidate_job = [i for i in job_id if i not in exist_job ]
    sample_job = random.sample(range(0,len(candidate_job)),size)
    user_ids.extend([idx] * (size+1))
    job_ids.append(exist_job[0])
    job_ids.extend([candidate_job[i] for i in sample_job])
    labels.append(1)
    labels.extend([0] * (size))
    City.append(group.City.values[0])
    State.append(group.State.values[0])
    jobs = job_set[job_set.JobID.isin([candidate_job[i] for i in sample_job])]
    
    City.extend([0 if i!=group.City.values[0] else a for i in jobs.City.values.tolist()])
    State.extend([0 if i!=group.State.values[0] else a for i in jobs.State.values.tolist()])
    
ranking_data.UserID = user_ids
ranking_data.JobID = job_ids
ranking_data.label = labels
ranking_data.City = City
ranking_data.State = State
ranking_data.to_csv("ranking_data.csv",index=False)

100%|██████████| 260/260 [00:01<00:00, 171.06it/s]


# 2. Define the evaluation function

In [10]:
def test_hit_rate(model, N):
    hit = 0
    groups = ranking_data.groupby("UserID")
    for u_id, group in tqdm(groups):
        X = np.zeros((1,158))
        user = user_set[user_set.UserID==u_id][["DegreeType", "WorkHistoryCount", "TotalYearsExperience", "CurrentlyEmployed", "ManagedOthers", "ManagedHowMany"]]
        u_idx = user.index.values[0]
        user_feature = np.concatenate((user.values, word_history_tf_matrix[u_idx,:].toarray()),axis=1)
        job_id_list = group.JobID.values
        jobs = job_set[job_set.JobID.isin(job_id_list)]
        j_idx = jobs.index.values
        f = []
        for i in j_idx:
            feature = np.concatenate((user_feature, tfidf_matrix[i,:].toarray()), axis=1).reshape(156,).tolist()
            f.append(feature)
        feature = np.concatenate((group[["City","State"]].values, np.array(f)),axis=1)
        X = np.concatenate((X, feature), axis=0)
        result = model.predict_proba(X[1:])
#         result = model.predict(X[1:])
        a = -np.sort(-result[:,1])
        idx = np.argwhere(a==result[0,1])[0][0]
        if idx <= N-1:
            hit += 1
    return hit/len(test_user)

# 3. Test models
- Random Forest

In [11]:
rf = RandomForestClassifier(random_state=0)
rf.fit(X_train,Y_train)
y_pred = rf.predict_proba(X_test)
show_result(Y_test, y_pred[:,1])

——————Test——————
          class precision    recall  f1-score   support
0             0    0.6379    0.6452    0.6415       527
1             1    0.6411    0.6338    0.6374       527
2      accuracy                        0.6395      1054
3     macro avg    0.6395    0.6395    0.6395      1054
4  weighted avg    0.6395    0.6395    0.6395      1054
5       overall  0.641075  0.633776  0.637405  0.699677


In [12]:
test_hit_rate(rf,1), test_hit_rate(rf,5), test_hit_rate(rf,10), test_hit_rate(rf,20)

100%|██████████| 260/260 [00:02<00:00, 105.67it/s]
100%|██████████| 260/260 [00:02<00:00, 115.16it/s]
100%|██████████| 260/260 [00:02<00:00, 116.92it/s]
100%|██████████| 260/260 [00:02<00:00, 117.56it/s]


(0.019230769230769232, 0.1, 0.18846153846153846, 0.36923076923076925)

- Linear Regression

In [21]:
def test_hit_rate_linearRegr(model, N):
    hit = 0
    groups = ranking_data.groupby("UserID")
    for u_id, group in tqdm(groups):
        X = np.zeros((1,158))
        user = user_set[user_set.UserID==u_id][["DegreeType", "WorkHistoryCount", "TotalYearsExperience", "CurrentlyEmployed", "ManagedOthers", "ManagedHowMany"]]
        u_idx = user.index.values[0]
        user_feature = np.concatenate((user.values, word_history_tf_matrix[u_idx,:].toarray()),axis=1)
        job_id_list = group.JobID.values
        jobs = job_set[job_set.JobID.isin(job_id_list)]
        j_idx = jobs.index.values
        f = []
        for i in j_idx:
            feature = np.concatenate((user_feature, tfidf_matrix[i,:].toarray()), axis=1).reshape(156,).tolist()
            f.append(feature)
        feature = np.concatenate((group[["City","State"]].values, np.array(f)),axis=1)
        X = np.concatenate((X, feature), axis=0)
#         result = model.predict_proba(X[1:])
        result = model.predict(X[1:])
        a = -np.sort(-result)
        idx = np.argwhere(a==result[0])[0][0]
        if idx <= N-1:
            hit += 1
    return hit/len(test_user)

In [24]:
linear_r = LinearRegression()
print("Linear Regression X_train:{} - Y_train:{}".format(X_train.shape, Y_train.shape))
linear_r.fit(X_train,Y_train)
y_pred = linear_r.predict(X_test)
show_result(Y_test, y_pred)

Linear Regression X_train:(70668, 158) - Y_train:(70668,)


ValueError: Input X contains NaN.
LinearRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [273]:
test_hit_rate_linearRegr(linear_r,1), test_hit_rate_linearRegr(linear_r,5), test_hit_rate_linearRegr(linear_r,10), test_hit_rate_linearRegr(linear_r, 20)

100%|██████████| 260/260 [00:04<00:00, 64.74it/s]
100%|██████████| 260/260 [00:03<00:00, 65.60it/s]
100%|██████████| 260/260 [00:03<00:00, 68.62it/s]
100%|██████████| 260/260 [00:03<00:00, 67.53it/s]


(0.007692307692307693, 0.1, 0.1576923076923077, 0.28846153846153844)

- Logistic Regression

In [25]:
lr = LogisticRegression()
lr.fit(X_train,Y_train)
y_pred = lr.predict_proba(X_test)
show_result(Y_test, y_pred[:,1])

ValueError: Input X contains NaN.
LogisticRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [271]:
test_hit_rate(lr,1), test_hit_rate(lr,5), test_hit_rate(lr,10), test_hit_rate(lr,20)

100%|██████████| 260/260 [00:03<00:00, 66.40it/s]
100%|██████████| 260/260 [00:03<00:00, 65.50it/s]
100%|██████████| 260/260 [00:03<00:00, 66.03it/s]
100%|██████████| 260/260 [00:04<00:00, 61.75it/s]


(0.007692307692307693, 0.09230769230769231, 0.1423076923076923, 0.3)

- Decision Tree

In [26]:
dt = DecisionTreeClassifier(max_leaf_nodes=1500,random_state=0)
dt.fit(X_train,Y_train)
y_pred = dt.predict_proba(X_test)
show_result(Y_test, y_pred[:,1])

——————Test——————
          class precision    recall  f1-score   support
0             0    0.5958    0.5427    0.5680       527
1             1    0.5801    0.6319    0.6049       527
2      accuracy                        0.5873      1054
3     macro avg    0.5880    0.5873    0.5865      1054
4  weighted avg    0.5880    0.5873    0.5865      1054
5       overall  0.580139  0.631879  0.604905  0.610791


In [27]:
test_hit_rate(dt,1), test_hit_rate(dt,5), test_hit_rate(dt,10), test_hit_rate(dt,20)

100%|██████████| 260/260 [00:01<00:00, 248.00it/s]
100%|██████████| 260/260 [00:00<00:00, 299.75it/s]
100%|██████████| 260/260 [00:00<00:00, 296.31it/s]
100%|██████████| 260/260 [00:00<00:00, 303.41it/s]


(0.007692307692307693,
 0.06538461538461539,
 0.11538461538461539,
 0.2653846153846154)

- Naive Bayes

In [28]:
nb = GaussianNB()
nb.fit(X_train,Y_train)
y_pred = nb.predict_proba(X_test)
show_result(Y_test, y_pred[:,1])

ValueError: Input X contains NaN.
GaussianNB does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [29]:
test_hit_rate(nb,1), test_hit_rate(nb,5), test_hit_rate(nb,10), test_hit_rate(nb,20)

  0%|          | 0/260 [00:00<?, ?it/s]


AttributeError: 'GaussianNB' object has no attribute 'class_prior_'

- AdaBoost

In [30]:
ada = AdaBoostClassifier(random_state=0)
ada.fit(X_train,Y_train)
y_pred = ada.predict_proba(X_test)
show_result(Y_test, y_pred[:,1])

ValueError: Input X contains NaN.
AdaBoostClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [293]:
test_hit_rate(ada,1), test_hit_rate(ada,5), test_hit_rate(ada,10), test_hit_rate(ada,20)

100%|██████████| 260/260 [00:05<00:00, 44.07it/s]
100%|██████████| 260/260 [00:06<00:00, 38.27it/s]
100%|██████████| 260/260 [00:05<00:00, 43.52it/s]
100%|██████████| 260/260 [00:06<00:00, 40.42it/s]


(0.019230769230769232,
 0.12307692307692308,
 0.21153846153846154,
 0.3269230769230769)

- Gradient Boosting

In [31]:
gbdt = GradientBoostingClassifier(max_depth=10, random_state=0, verbose=1)
gbdt.fit(X_train,Y_train)
y_pred = gbdt.predict_proba(X_test)
show_result(Y_test, y_pred[:,1])

ValueError: Input X contains NaN.
GradientBoostingClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [285]:
test_hit_rate(gbdt,1),  test_hit_rate(gbdt,5), test_hit_rate(gbdt,10), test_hit_rate(gbdt,20)

100%|██████████| 260/260 [00:04<00:00, 59.66it/s]
100%|██████████| 260/260 [00:04<00:00, 64.58it/s]
100%|██████████| 260/260 [00:03<00:00, 68.54it/s]
100%|██████████| 260/260 [00:03<00:00, 68.10it/s]


(0.038461538461538464, 0.15, 0.23461538461538461, 0.40384615384615385)