In [1]:
import os
import pandas as pd
import numpy as np

In [55]:
class Job:
    """A single job posting together with its category vector."""

    def __init__(self, job_id, name, company, desc):
        """Store the posting's id, title, company and description.

        The category vector starts out as None and is filled in later
        through set_cats.
        """
        self.job_id, self.name = job_id, name
        self.company, self.desc = company, desc
        self.cats = None

    def set_cats(self, cats):
        """Attach the category vector to this posting."""
        self.cats = cats

    def __str__(self):
        """Use the job title as the printable form."""
        return self.name

In [57]:
# Category labels in the order they occupy the category vector
# (index 0 = "Machine Learning", ..., index 8 = "Computer Communications").
CATEGORY_NAMES = [
    "Machine Learning",
    "Computer Vision",
    "Computer Security",
    "Software Engineering",
    "Algorithms",
    "Statistics",
    "Web Development",
    "Systems Programming",
    "Computer Communications",
]

# Each file in "jobs" holds: line 1 = title, line 2 = company,
# line 3 = categories, remaining lines = description.
jobs = []
# os.listdir returns entries in arbitrary order; sorting makes the job_id
# assigned to each file reproducible across runs and machines.
for f_name in sorted(os.listdir("jobs")):
    f_path = os.path.join("jobs", f_name)
    # Skip sub-directories (e.g. notebook checkpoint folders) that cannot be opened as files.
    if not os.path.isfile(f_path):
        continue
    with open(f_path, "r") as f:
        name = f.readline().rstrip("\n")
        company = f.readline().rstrip("\n")
        cats = f.readline().rstrip("\n")
        desc = f.read().replace("\n", " ")
    job_id = len(jobs)
    job = Job(job_id, name, company, desc)
    # Membership is a substring test on the raw category line.
    cat_l = np.array([1.0 if label in cats else 0.0 for label in CATEGORY_NAMES])
    job.set_cats(cat_l)
    jobs.append(job)

In [58]:
class User:
    """Synthetic job seeker whose profile is sampled at random on creation."""

    def __init__(self, user_id):
        """Store user_id and draw every other attribute from np.random.

        skills is a length-9 vector of 0/1 flags, degree and majors are
        0 or 1, uni lies in 0..2 and exp lies in 0..10.
        """
        self.user_id = user_id
        # The draw order is fixed so a seeded np.random reproduces the same user.
        self.skills = np.random.randint(low=0, high=2, size=9)
        self.degree = np.random.randint(low=0, high=2)
        self.majors = np.random.randint(low=0, high=2)
        self.uni = np.random.randint(low=0, high=3)
        self.exp = np.random.randint(low=0, high=11)

    def calculate_apply(self, job_cat):
        """Sample a 0/1 decision on whether this user applies to a job.

        The probability of applying is one minus the mean absolute
        difference between job_cat and this user's skills vector.
        """
        mismatch = np.abs(job_cat - self.skills).mean()
        return np.random.binomial(1, 1 - mismatch)

In [59]:
# Thirty synthetic users with ids 0..29.
users = [User(uid) for uid in range(30)]

In [65]:
# One row per (user, job) pair holding [user id, job id, sampled 0/1 decision].
# The row count comes from the actual list lengths, so the cell keeps working
# when the number of users or job files changes.
applied = np.zeros(shape=(len(users) * len(jobs), 3))
i = 0
for user in users:
    for job in jobs:
        applied[i, :] = np.array([user.user_id, job.job_id, user.calculate_apply(job.cats)])
        i += 1

In [70]:
# Label the three columns of the (user, job, decision) matrix.
applied_df = pd.DataFrame(
    applied,
    columns=["UserID", "JobID", "Applied"],
)

In [86]:
# Total of the Applied column for each user.
applied_df.groupby("UserID")["Applied"].sum()

UserID
0.0     23.0
1.0     21.0
2.0     26.0
3.0     16.0
4.0     24.0
5.0     23.0
6.0     21.0
7.0     22.0
8.0     17.0
9.0     16.0
10.0    24.0
11.0    19.0
12.0    17.0
13.0    16.0
14.0     7.0
15.0    13.0
16.0    21.0
17.0    16.0
18.0    22.0
19.0    19.0
20.0    22.0
21.0    23.0
22.0    19.0
23.0    18.0
24.0     9.0
25.0    19.0
26.0    15.0
27.0    17.0
28.0    18.0
29.0    21.0
Name: Applied, dtype: float64

In [87]:
import pickle

In [90]:
# Flatten every user into one numeric row:
# column 0 = user id, 1-9 = skills, 10 = degree, 11 = majors, 12 = uni, 13 = exp.
# The row count follows len(users) instead of a hard-coded 30.
users_arr = np.zeros(shape=(len(users), 14))
for i, user in enumerate(users):
    users_arr[i, 0] = user.user_id
    users_arr[i, 1:10] = user.skills
    users_arr[i, 10] = user.degree
    users_arr[i, 11] = user.majors
    users_arr[i, 12] = user.uni
    users_arr[i, 13] = user.exp

In [133]:
# Column labels: the id, nine skill flags, then the four profile fields.
users_df = pd.DataFrame(
    data=users_arr,
    columns=[
        "UserID",
        *("Skill_" + str(k) for k in range(9)),
        "Degree",
        "Majors",
        "University",
        "Experience",
    ],
)

In [135]:
# Persist the user feature frame to disk.
with open("users_df.p", "wb") as out_file:
    pickle.dump(users_df, out_file)

In [94]:
# Persist the raw user matrix and the application frame, in that order.
for target, payload in (("users_arr.p", users_arr), ("applied_df.p", applied_df)):
    with open(target, "wb") as out_file:
        pickle.dump(payload, out_file)

In [96]:
# Persist the Job and User object lists, in that order.
# pickle stores instances by class reference, so Job and User must be
# defined before these files are loaded again.
for target, payload in (("jobs_pylist.p", jobs), ("users_pylist.p", users)):
    with open(target, "wb") as out_file:
        pickle.dump(payload, out_file)

In [103]:
# Parse the GloVe text file: each line is a token followed by its vector components.
embeddings = {}
# The encoding is given explicitly so the read does not depend on the platform default.
with open("wordvec/glove.6B.50d.txt", "r", encoding="utf-8") as f:
    for line in f:
        parts = line.split()
        # np.float was removed from NumPy; np.float64 is the same dtype.
        embeddings[parts[0]] = np.array(parts[1:], dtype=np.float64)

In [105]:
# Cache the parsed embedding dictionary on disk.
with open("embeddings.p", "wb") as out_file:
    pickle.dump(embeddings, out_file)

In [108]:
import string

In [109]:
def doc2vec(inp, embeddings, dim=50):
    """
    Convert a string into one vector by averaging its word embeddings.

    Inputs:
    @inp: Input string to be converted into vector
    @embeddings: Dictionary keeping all the embeddings for vocabulary
    @dim: Length of the embedding vectors (defaults to 50)

    Outputs:
    Returns the mean embedding over all whitespace-separated words of inp,
    after punctuation is stripped and the text is lower-cased. Words missing
    from embeddings contribute a zero vector but still count towards the
    mean. An input with no words yields an all-zero vector.
    """
    inp = inp.translate(str.maketrans('', '', string.punctuation))
    words = inp.lower().split()
    inp_val = np.zeros((dim,), dtype=np.float64)
    # Guard against dividing by zero for empty or punctuation-only input.
    if not words:
        return inp_val
    for w in words:
        vec = embeddings.get(w)
        # An unknown word must add nothing; falling back to inp_val itself
        # would add the running sum to itself and double it.
        if vec is not None:
            inp_val += vec
    return inp_val / len(words)

In [125]:
# One row per job, sized from the loaded job list instead of a hard-coded count:
# 1 id column plus 100 embedding columns.
job_features_arr = np.zeros((len(jobs), 101))

In [126]:
# Column 0 holds the job id, columns 1-50 the title vector and the
# remaining columns the description vector.
for row, job in enumerate(jobs):
    title_vec = doc2vec(job.name, embeddings)
    desc_vec = doc2vec(job.desc, embeddings)
    job_features_arr[row, 0] = job.job_id
    job_features_arr[row, 1:51] = title_vec
    job_features_arr[row, 51:] = desc_vec

In [128]:
# Label the id column and the 100 embedding columns em_0 .. em_99.
job_features_df = pd.DataFrame(
    data=job_features_arr,
    columns=["JobID", *("em_" + str(k) for k in range(100))],
)

In [132]:
# Persist the job feature frame to disk.
with open("job_features_df.p", "wb") as out_file:
    pickle.dump(job_features_df, out_file)