# Feature engineering: TF-IDF, Location matching

- Source: [Repository PJFNN](https://github.com/doslim/Job-Recommendation-PJFNN) 
- Input: Pre-processed data in step 3 
    - user_set_cleaned.csv (label encoded)
    - work_history_cleaned.csv
    - dataset_cleaned.csv (with 3 binary columns for matching city, state, country)
    
- Summary:
    - TF-IDF on jobs data (Corpus: job title + description + requirement)
    - TF-IDF on work_history data (Corpus: job titles linked to one user) => work_history_cleaned.csv
    - Construct matric vector (combining user profile + TFIDF user history + jobs TFIDF + binary matching columns) 
    
- Output:
    - X_train_159.npy
    - Y_train_159.npy
    - X_test.npy


In [1]:
import caffeine
caffeine.on(display=False)

In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

In [3]:
# Load pre-processed data
path = "./data_processed/"
user_set = pd.read_csv(path +"users_clean.csv")
job_set = pd.read_csv(path + "jobset_clean.csv")
work_history = pd.read_csv(path + "work_history.csv") 
dataset = pd.read_csv(path + "dataset.csv") #negative sampling application data

  job_set = pd.read_csv(path + "jobset_clean.csv")


In [4]:
user_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 258260 entries, 0 to 258259
Data columns (total 15 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   UserID                258260 non-null  int64  
 1   WindowID              258260 non-null  int64  
 2   Split                 258260 non-null  object 
 3   City                  258260 non-null  object 
 4   State                 257917 non-null  object 
 5   Country               258260 non-null  object 
 6   ZipCode               257094 non-null  object 
 7   DegreeType            258260 non-null  object 
 8   Major                 258260 non-null  object 
 9   GraduationDate        203034 non-null  object 
 10  WorkHistoryCount      258260 non-null  int64  
 11  TotalYearsExperience  258260 non-null  float64
 12  CurrentlyEmployed     258260 non-null  object 
 13  ManagedOthers         258260 non-null  object 
 14  ManagedHowMany        258260 non-null  int64  
dtype

In [5]:
job_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1050509 entries, 0 to 1050508
Data columns (total 11 columns):
 #   Column        Non-Null Count    Dtype 
---  ------        --------------    ----- 
 0   JobID         1050509 non-null  int64 
 1   WindowID      1050509 non-null  int64 
 2   Title         1050509 non-null  object
 3   Description   1050289 non-null  object
 4   Requirements  1047776 non-null  object
 5   City          1050507 non-null  object
 6   State         1050508 non-null  object
 7   Country       1050506 non-null  object
 8   Zip5          658844 non-null   object
 9   StartDate     1050509 non-null  object
 10  EndDate       1050500 non-null  object
dtypes: int64(2), object(9)
memory usage: 88.2+ MB


In [6]:
work_history.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1144557 entries, 0 to 1144556
Data columns (total 5 columns):
 #   Column    Non-Null Count    Dtype 
---  ------    --------------    ----- 
 0   UserID    1144557 non-null  int64 
 1   WindowID  1144557 non-null  int64 
 2   Split     1144557 non-null  object
 3   Sequence  1144557 non-null  int64 
 4   JobTitle  1144557 non-null  object
dtypes: int64(3), object(2)
memory usage: 43.7+ MB


In [7]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2081340 entries, 0 to 2081339
Data columns (total 3 columns):
 #   Column  Dtype
---  ------  -----
 0   UserID  int64
 1   JobID   int64
 2   label   int64
dtypes: int64(3)
memory usage: 47.6 MB


## 1. TF-IDF on job details
- Job detail corpus: job_set["word"]: combine job Title, Description, and Requirement

In [4]:
# Create corpus by joining job Title, Description, and Requirement
job_set = job_set.fillna(" ")
job_set["word"] = job_set.Title + job_set.Description + job_set.Requirements

In [5]:
job_set.word.head()

0    Security Engineer Technical LeadSecurity Clear...
1    SAP Business Analyst   WMNO Corp. to Corp resu...
2    P T HUMAN RESOURCES ASSISTANT     P T HUMAN RE...
3    Route Delivery DriversCITY BEVERAGES Come to w...
4    HousekeepingI make  sure every part of their d...
Name: word, dtype: object

In [6]:
import time
start = time.time()
    
# TF-IDF on the combination of Title & Description & Requirements
tf = TfidfVectorizer(analyzer='word',
                     ngram_range=(1, 2),
                     min_df=5, 
                     max_features=100, 
                     stop_words='english')
tfidf_matrix = tf.fit_transform(job_set['word'])

end = time.time()
print('Computing time: ', end - start)

Computing time:  574.4752089977264


In [7]:
# Export this matrix
from scipy import sparse

sparse.save_npz("./data_interim_tfidf/tfidf_matrix.npz", tfidf_matrix)

In [8]:
tfidf_matrix

<1050509x100 sparse matrix of type '<class 'numpy.float64'>'
	with 31754477 stored elements in Compressed Sparse Row format>

In [9]:
# Check loaded matrix
your_matrix_back = sparse.load_npz("./data_interim_tfidf/tfidf_matrix.npz")

In [10]:
your_matrix_back

<1050509x100 sparse matrix of type '<class 'numpy.float64'>'
	with 31754477 stored elements in Compressed Sparse Row format>

## 2. Clean user data and TF-IDF on user history
Filter out users with more than 10 applications

In [11]:
# Input: application data after negative sampling
temp = sorted(dict(dataset.UserID.value_counts()).items(), key=lambda x: x[1], reverse=True)
exclude_user_id = [i[0] for i in temp if i [1]>=10]
len(exclude_user_id)

56661

In [12]:
# check value of temp (user_id, #applications)
temp

[(1127206, 2186),
 (1382297, 1964),
 (39567, 1244),
 (991504, 998),
 (33263, 748),
 (948207, 684),
 (621858, 682),
 (466070, 660),
 (179511, 654),
 (1407339, 622),
 (978655, 588),
 (863767, 574),
 (874238, 568),
 (853630, 564),
 (904557, 556),
 (379548, 544),
 (196770, 542),
 (402913, 518),
 (239403, 508),
 (422035, 486),
 (592989, 478),
 (1212335, 476),
 (124421, 476),
 (879484, 474),
 (393136, 470),
 (689323, 470),
 (987700, 468),
 (1377241, 462),
 (851263, 458),
 (1312055, 458),
 (986466, 454),
 (693722, 450),
 (779720, 444),
 (936020, 442),
 (412328, 440),
 (144296, 430),
 (885703, 426),
 (284621, 422),
 (1448156, 420),
 (1275839, 414),
 (523805, 406),
 (693755, 404),
 (554592, 400),
 (1224627, 398),
 (770351, 398),
 (694972, 396),
 (198051, 386),
 (1171425, 384),
 (1302531, 378),
 (480969, 372),
 (86625, 372),
 (211877, 370),
 (1372608, 366),
 (1162218, 366),
 (552608, 360),
 (1080768, 358),
 (985258, 358),
 (921822, 354),
 (1070463, 352),
 (610663, 352),
 (1259366, 348),
 (656865

In [13]:
# Drop application where user_id is in exclude list
dataset = dataset[~dataset.UserID.isin(exclude_user_id)] 

In [14]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 579626 entries, 0 to 2081339
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype
---  ------  --------------   -----
 0   UserID  579626 non-null  int64
 1   JobID   579626 non-null  int64
 2   label   579626 non-null  int64
dtypes: int64(3)
memory usage: 17.7 MB


### select data in ```work_history, user_set```

In [15]:
user_id = dataset.UserID.unique()
work_history = work_history[work_history.UserID.isin(user_id)]
user_set = user_set[user_set.UserID.isin(user_id)]
user_set.reset_index(drop=True, inplace=True)

### drop duplicates in ```work_history```

In [16]:
work_history.duplicated(subset=['Sequence']).value_counts()

True     683435
False       119
dtype: int64

In [17]:
work_history = work_history.drop(columns=["Sequence"]).drop_duplicates()

In [18]:
work_history.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 628173 entries, 8 to 1144555
Data columns (total 4 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   UserID    628173 non-null  int64 
 1   WindowID  628173 non-null  int64 
 2   Split     628173 non-null  object
 3   JobTitle  628173 non-null  object
dtypes: int64(2), object(2)
memory usage: 24.0+ MB


In [19]:
word_history_tf = TfidfVectorizer(analyzer='word',
                                  ngram_range=(1, 2),
                                  min_df=0, 
                                  max_features=50, 
                                  stop_words='english')

In [20]:
# Get all the title that one user have applied
work_history.groupby("UserID").JobTitle.sum().values

array(['Front End Supervisor/CashierCustomer Service Representative',
       'CookMechanicFile Clerk',
       'Pennsylvania MentorStudent WorkerInternship in Adoption UnitStudent Worker - Continuing EducationSales Associate',
       ...,
       'Front Office ManagerAt Your Service SupervisorFront Desk SupervisorOperations SupervisorReservations CoordinatorFront Desk Agent',
       'Preschool TeacherSalesCustomer Service',
       'ElectricianTechnicianSupervisorElectrician (trainee)'],
      dtype=object)

In [21]:
# Apply TF-IDF on all previous job title of a user
word_history_tf_matrix = word_history_tf \
                        .fit_transform(work_history \
                                       .groupby("UserID") \
                                       .JobTitle.sum().values)

In [22]:
word_history_tf_matrix

<152292x50 sparse matrix of type '<class 'numpy.float64'>'
	with 327656 stored elements in Compressed Sparse Row format>

In [26]:
# Export this matrix
from scipy import sparse
sparse.save_npz("./data_interim_tfidf/work_history_tf_matrix.npz", word_history_tf_matrix)

In [27]:
# Check loaded matrix
your_matrix_back = sparse.load_npz("./data_interim_tfidf/work_history_tf_matrix.npz")

In [28]:
your_matrix_back

<152292x50 sparse matrix of type '<class 'numpy.float64'>'
	with 327656 stored elements in Compressed Sparse Row format>

## 3. Encode categorical data for user profile

In [23]:
user_set.info() # 152,292

In [24]:
user_set.Country.value_counts()

In [25]:
user_set[user_set['Country'] != "US"].count()

In [26]:
# Change compare to original repository: do not filter only for user in the US 
# => Keep column "Country"
user_set = user_set.drop(columns=["ZipCode","Major","GraduationDate","WindowID"])

In ```user_set``` 
- label encoding for ```DegreeType```
- one-hot encoding for ```State```
- binary labels for Currently ```Employed/ManagedOthers```

In [27]:
# user_set = pd.get_dummies(user_set, columns=["State"])
user_set.replace({"CurrentlyEmployed":{"Yes":1,"No":0}}, inplace=True)
user_set.replace({"ManagedOthers":{"Yes":1,"No":0}}, inplace=True)
user_set.replace({"DegreeType":{"None":0,"High School":1, "Vocational":2, "Associate's":3, "Bachelor's":4, "Master's":5, "PhD":6}}, 
                 inplace=True)

In [28]:
user_set

## 4. Create feature for location matching
Add binary columns in "dataset" to match city, state, and country between users and jobs

- add binary labels into the dataset, indicating that whether the user and job are in the same country/city/state. (add country match)

### Try using User defined function and apply on dataset (lambda)

In [29]:
def get_city_match(user_id, job_id):
    user = user_set[user_set['UserID'] == user_id].iloc[0]
    job = job_set[job_set['JobID'] == job_id].iloc[0]
    return float(user['City'] == job['City'])

In [30]:
def get_state_match(user_id, job_id):
    user = user_set[user_set['UserID'] == user_id].iloc[0]
    job = job_set[job_set['JobID'] == job_id].iloc[0]
    return float(user['State'] == job['State'])

In [31]:
def get_country_match(user_id, job_id):
    user = user_set[user_set['UserID'] == user_id].iloc[0]
    job = job_set[job_set['JobID'] == job_id].iloc[0]
    return float(user['Country'] == job['Country'])

In [32]:
import time
start = time.time()
dataset['City'] = dataset.apply(lambda x: get_city_match(x.UserID, x.JobID), axis=1)
end = time.time()
print('Computing time: ', end - start)

In [33]:
import time
start = time.time()
dataset['State'] = dataset.apply(lambda x: get_state_match(x.UserID, x.JobID), axis=1)
end = time.time()
print('Computing time: ', end - start)

In [34]:
import time
start = time.time()
dataset['Country'] = dataset.apply(lambda x: get_country_match(x.UserID, x.JobID), axis=1)
end = time.time()
print('Computing time: ', end - start)

In [35]:
# Check output
dataset.head()

## Export data

In [36]:
job_set.info()

In [37]:
job_set.head()

In [38]:
dataset.drop_duplicates(inplace=True, ignore_index=False)

In [39]:
user_set.to_csv("./data_interim/user_set_cleaned.csv", index=False)
dataset.to_csv("./data_interim/dataset_cleaned.csv", index=False)
work_history.to_csv("./data_interim/work_history_cleaned.csv", index=False)

## 4. Build interaction data as vectors (WARNING: LONG TIME COMPUTATION)
Build the training set and testing set combining all information:
- User profile
- Job details (TF-IDF)
- User history (TF-IDF)
- Location matching

In [40]:
# Load the datasetp from Step 3
user_set = pd.read_csv("./data_interim/user_set_cleaned.csv")
dataset = pd.read_csv("./data_interim/dataset_cleaned.csv")
work_history = pd.read_csv('./data_interim/work_history_cleaned.csv')

In [41]:
train_user = user_set[user_set.Split=="Train"].UserID.values
test_user = user_set[user_set.Split=="Test"].UserID.values
train_data = dataset[dataset.UserID.isin(train_user)]
test_data = dataset[dataset.UserID.isin(test_user)]

In [42]:
groups = train_data.groupby("UserID")

In [43]:
groups.head()

Unnamed: 0,UserID,JobID,label,City,State,Country
0,7,309823,1,1.0,1.0,1.0
1,7,703889,1,1.0,1.0,1.0
2,7,566574,0,0.0,0.0,1.0
3,7,481216,0,0.0,0.0,1.0
4,9,809208,1,1.0,1.0,1.0
...,...,...,...,...,...,...
579620,1472085,209482,0,0.0,0.0,1.0
579621,1472090,209535,1,1.0,1.0,1.0
579622,1472090,254881,1,1.0,1.0,1.0
579623,1472090,999294,0,0.0,0.0,1.0


In [44]:
dataset.duplicated().any().sum()

0

## Start building vectors
2 versions:
- including Country match (vector size: 159)
- excluding Country match (vector size: 158, similar to PJFNN)

### Version with Country match (vector size: 159)

In [45]:
# Version with Country match (vector size: 159)
groups = train_data.groupby("UserID")
X_train = np.zeros((1,159))
Y_train = []
for u_id, group in tqdm(groups):
#     print('u_id:', u_id)
    user = user_set[user_set.UserID==u_id][["DegreeType", "WorkHistoryCount", 
                                            "TotalYearsExperience", "CurrentlyEmployed", 
                                            "ManagedOthers", "ManagedHowMany"]]
    u_idx = user.index.values[0]
    user_feature = np.concatenate((user.values, word_history_tf_matrix[u_idx,:].toarray()),axis=1)
    job_id_list = group.JobID.values
    jobs = job_set[job_set.JobID.isin(job_id_list)]
    j_idx = jobs.index.values
    f = []
    for i in j_idx:
        feature = np.concatenate((user_feature, tfidf_matrix[i,:].toarray()), axis=1).reshape(156,).tolist()
        f.append(feature)
        
#     print('combine user_feature and embeded job', len(f))
#     print('shape of vector f:', np.array(f).shape)
    
#     print('group[["City","State", "Country"]].values', group[["City","State", "Country"]].values.shape)
    
    feature = np.concatenate((group[["City","State","Country"]].values, np.array(f)),axis=1)
    
#     print('Shape of feature vector: ', feature.shape)
#     print('Shape of X_train: ', X_train.shape)
    
    X_train = np.concatenate((X_train, feature), axis=0)
    Y_train.extend(group.label.values.tolist())

100%|█████████████████████████████████| 148576/148576 [9:58:30<00:00,  4.14it/s]


In [46]:
X_train.shape, len(Y_train)

((563890, 159), 563889)

In [47]:
# Test data: Version with Country match (vector size: 159)
groups = test_data.groupby("UserID")
X_test = np.zeros((1,159))
Y_test = []
for u_id, group in tqdm(groups):
    user = user_set[user_set.UserID==u_id][["DegreeType", "WorkHistoryCount", 
                                            "TotalYearsExperience", "CurrentlyEmployed", 
                                            "ManagedOthers", "ManagedHowMany"]]
    u_idx = user.index.values[0]
    user_feature = np.concatenate((user.values, word_history_tf_matrix[u_idx,:].toarray()),axis=1)
    job_id_list = group.JobID.values
    jobs = job_set[job_set.JobID.isin(job_id_list)]
    j_idx = jobs.index.values
    f = []
    for i in j_idx:
        feature = np.concatenate((user_feature, tfidf_matrix[i,:].toarray()), axis=1).reshape(156,).tolist()
        f.append(feature)
    feature = np.concatenate((group[["City","State", "Country"]].values, np.array(f)),axis=1)
    X_test = np.concatenate((X_test, feature), axis=0)
    Y_test.extend(group.label.values.tolist())

100%|███████████████████████████████████████| 3716/3716 [00:48<00:00, 76.02it/s]


In [48]:
X_test.shape, len(Y_test)

((15737, 159), 15736)

In [49]:
# Export train and test dataset to folder data_interim_tfidf
np.save("./data_interim_tfidf/X_train_159.npy",X_train[1:,])
np.save("./data_interim_tfidf/Y_train_159.npy",np.array(Y_train))
np.save("./data_interim_tfidf/X_test_159.npy",X_test[1:,])
np.save("./data_interim_tfidf/Y_test_159.npy",np.array(Y_test))

## Export all vectorizer model to reuse

In [51]:
import pickle

# For jobs: tf
tf_path = './data_interim_tfidf/job_tf.pickle'
pickle.dump(tf, open(tf_path, "wb"))

# For work_history: word_history_tf
work_history_tf_path = './data_interim_tfidf/work_history_tf.pickle'
pickle.dump(word_history_tf, open(work_history_tf_path, "wb"))


In [52]:
tfidf_matrix # jobs

<1050509x100 sparse matrix of type '<class 'numpy.float64'>'
	with 31754477 stored elements in Compressed Sparse Row format>

In [53]:
word_history_tf_matrix # work history

<152292x50 sparse matrix of type '<class 'numpy.float64'>'
	with 327656 stored elements in Compressed Sparse Row format>