# Generate top popular jobs for top-N recommendation

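Prepare:
- top popular jobs (the most frequently interacted-with jobs that together account for 15% of all interactions): used to reduce the time needed to construct (UserId, JobID) pairs for recommendation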

In [1]:
import pandas as pd
import numpy as np

In [2]:
import pickle

In [3]:
%%time
# Load clean job data
job_set = pd.read_csv("./data_processed/jobset_clean.csv")

CPU times: user 20.4 s, sys: 2.74 s, total: 23.2 s
Wall time: 23.3 s




In [4]:
%%time
# Load the dataset from Step 3
user_set = pd.read_csv("./data_interim/user_set_cleaned.csv")
dataset = pd.read_csv("./data_interim/dataset_cleaned.csv")
work_history = pd.read_csv('./data_interim/work_history_cleaned.csv')

CPU times: user 581 ms, sys: 119 ms, total: 701 ms
Wall time: 706 ms


## Create a list of top popular jobs 
based on cumulative frequency in the interaction data

In [5]:
# Count the rows of `dataset` per JobID and rank jobs from most to least frequent.
# groupby does not modify `dataset`, so no defensive copy of the frame is needed,
# and reset_index(name=...) labels the count column directly instead of renaming
# the implicit `0` column in place afterwards.
popular_jobs = (
    dataset.groupby(['JobID']).size()
    .sort_values(ascending=False)
    .reset_index(name='count_job')
)

In [6]:
# Preview the five most frequent jobs.
popular_jobs.head(5)

Unnamed: 0,JobID,count_job
0,900797,45
1,1050711,41
2,608463,39
3,601126,37
4,802205,36


In [7]:
# Relative frequency of each job: its row count divided by the total number of
# rows in `dataset`.
# A vectorized column division replaces the row-wise apply(axis=1); the values
# are the same, but no Python-level function is called for every row.
total = len(dataset)
popular_jobs['freq'] = popular_jobs['count_job'] / total

In [8]:
# Preview the first five rows, now including the `freq` column.
popular_jobs.head(5)

Unnamed: 0,JobID,count_job,freq
0,900797,45,7.8e-05
1,1050711,41,7.1e-05
2,608463,39,6.7e-05
3,601126,37,6.4e-05
4,802205,36,6.2e-05


In [9]:
# Running total of `freq`, accumulated in the frame's current row order.
popular_jobs['cum_freq'] = popular_jobs['freq'].cumsum()

In [10]:
# Display the ranked frame with its derived columns (count_job, freq, cum_freq).
popular_jobs

Unnamed: 0,JobID,count_job,freq,cum_freq
0,900797,45,0.000078,0.000078
1,1050711,41,0.000071,0.000148
2,608463,39,0.000067,0.000216
3,601126,37,0.000064,0.000279
4,802205,36,0.000062,0.000342
...,...,...,...,...
380560,437042,1,0.000002,0.999993
380561,437039,1,0.000002,0.999995
380562,437037,1,0.000002,0.999997
380563,437034,1,0.000002,0.999998


In [11]:
# Keep the rows whose cumulative frequency is at most 0.15.
top15_jobs = popular_jobs.loc[popular_jobs['cum_freq'].le(0.15)]

In [12]:
# Write the selected jobs to the interim data folder, with a header row and
# without the index column.
top15_jobs.to_csv('./data_interim/top15_jobs.csv', index=False)

In [13]:
# Number of jobs that made the cutoff.
top15_jobs.shape[0]

14721