# Discretize numeric features in user profile (for FM and EBM models)
- Input: LDA dataset for users (user_history_lda.csv) and jobs (jobset_lda.csv)
- Output: folder: data_interim
    - users_fm.csv: contain userID and categorical information
    - jobs_fm.csv: contain categorical features of jobs

In [1]:
import caffeine
caffeine.on(display=False)

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder
from sklearn import metrics

## Discretize features in user data
- Input: user data after lda
- Output: user data with only categorical features


In [3]:
users = pd.read_csv("./data_interim_lda/user_history_lda.csv")

In [4]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 152292 entries, 0 to 152291
Data columns (total 13 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   UserID                152292 non-null  int64  
 1   Split                 152292 non-null  object 
 2   City                  152292 non-null  object 
 3   State                 152078 non-null  object 
 4   Country               152292 non-null  object 
 5   DegreeType            152292 non-null  int64  
 6   WorkHistoryCount      152292 non-null  int64  
 7   TotalYearsExperience  152292 non-null  float64
 8   CurrentlyEmployed     152292 non-null  int64  
 9   ManagedOthers         152292 non-null  int64  
 10  ManagedHowMany        152292 non-null  int64  
 11  JobTitle              152292 non-null  object 
 12  WorkHistoryTopic      152292 non-null  int64  
dtypes: float64(1), int64(7), object(5)
memory usage: 15.1+ MB


In [5]:
users.head()

Unnamed: 0,UserID,Split,City,State,Country,DegreeType,WorkHistoryCount,TotalYearsExperience,CurrentlyEmployed,ManagedOthers,ManagedHowMany,JobTitle,WorkHistoryTopic
0,80,Train,Williamstown,NJ,US,1,5,11.0,1,1,5,"Auto Publishing/Electro Mechanical Technician,...",9
1,123,Train,Baton Rouge,LA,US,4,1,9.0,1,0,0,Lead Hostess and Takeout Server,0
2,162,Train,Long Beach,CA,US,5,10,25.0,0,0,0,Student AssistantGraduate AssistantTreasurerOw...,0
3,178,Train,Greenville,SC,US,1,6,35.0,0,1,4,CHEMICAL MANAGERChemical Management Company Sp...,0
4,344,Train,Newport News,VA,US,1,3,7.0,1,0,0,Restaurant ServerCashier,0


### Discretize WorkHistoryCount
Using similar scale as RecSys data
- WorkHistoryLevel: 
    - 0 = less than 1 jobs
    - 1 = 1-2 entries
    - 2 = 3-4 entries
    - 3 = 5 or more entries

In [6]:
users['WorkHistoryCount'].describe()

count    152292.000000
mean          4.728600
std           2.255243
min           1.000000
25%           3.000000
50%           4.000000
75%           6.000000
max         120.000000
Name: WorkHistoryCount, dtype: float64

In [7]:
# Discretize: Number of previous jobs 
users['WorkHistoryLevel'] = pd.cut(x = users['WorkHistoryCount'], bins = [0,1,3,5,120], 
                        labels = [0,1,2,3])

In [8]:
users.head()

Unnamed: 0,UserID,Split,City,State,Country,DegreeType,WorkHistoryCount,TotalYearsExperience,CurrentlyEmployed,ManagedOthers,ManagedHowMany,JobTitle,WorkHistoryTopic,WorkHistoryLevel
0,80,Train,Williamstown,NJ,US,1,5,11.0,1,1,5,"Auto Publishing/Electro Mechanical Technician,...",9,2
1,123,Train,Baton Rouge,LA,US,4,1,9.0,1,0,0,Lead Hostess and Takeout Server,0,0
2,162,Train,Long Beach,CA,US,5,10,25.0,0,0,0,Student AssistantGraduate AssistantTreasurerOw...,0,3
3,178,Train,Greenville,SC,US,1,6,35.0,0,1,4,CHEMICAL MANAGERChemical Management Company Sp...,0,3
4,344,Train,Newport News,VA,US,1,3,7.0,1,0,0,Restaurant ServerCashier,0,1


### Discretize: TotalYearsExperience
Using the similar scale from RecSys data:
- SeniorLevel: based on TotalYearsExperience
    - 0 = less than 1 year
    - 1 = 1-3 years
    - 2 = 3-5 years
    - 3 = 5-10 years
    - 4 = 10-15 years
    - 5 = 16-20 years
    - 6 = more than 20 years


In [9]:
users['TotalYearsExperience'].describe()

count    152292.000000
mean         12.576800
std           8.905711
min           0.000000
25%           6.000000
50%          11.000000
75%          16.000000
max         112.000000
Name: TotalYearsExperience, dtype: float64

In [10]:
# Discretize: TotalYearsExperience
users['SeniorLevel'] = pd.cut(x = users['TotalYearsExperience'], bins = [0,1,3,5,10,15,20,120], 
                        labels = [0,1,2,3,4,5,6])

In [11]:
users.head()

Unnamed: 0,UserID,Split,City,State,Country,DegreeType,WorkHistoryCount,TotalYearsExperience,CurrentlyEmployed,ManagedOthers,ManagedHowMany,JobTitle,WorkHistoryTopic,WorkHistoryLevel,SeniorLevel
0,80,Train,Williamstown,NJ,US,1,5,11.0,1,1,5,"Auto Publishing/Electro Mechanical Technician,...",9,2,4
1,123,Train,Baton Rouge,LA,US,4,1,9.0,1,0,0,Lead Hostess and Takeout Server,0,0,3
2,162,Train,Long Beach,CA,US,5,10,25.0,0,0,0,Student AssistantGraduate AssistantTreasurerOw...,0,3,6
3,178,Train,Greenville,SC,US,1,6,35.0,0,1,4,CHEMICAL MANAGERChemical Management Company Sp...,0,3,6
4,344,Train,Newport News,VA,US,1,3,7.0,1,0,0,Restaurant ServerCashier,0,1,3


In [12]:
# Check feature "ManagedHowMany": more than 60% are 0 => drop this feature
users.ManagedHowMany.value_counts()

0       107215
5         4603
10        4323
4         3019
3         2934
         ...  
417          1
278          1
241          1
683          1
3258         1
Name: ManagedHowMany, Length: 297, dtype: int64

In [13]:
# drop all unused columns
drop_cols = ['City', 'State', 'Country', 'WorkHistoryCount', 'TotalYearsExperience', 'ManagedHowMany', 'JobTitle']
users.drop(columns=drop_cols, axis=1, inplace=True)

In [14]:
users.head()

Unnamed: 0,UserID,Split,DegreeType,CurrentlyEmployed,ManagedOthers,WorkHistoryTopic,WorkHistoryLevel,SeniorLevel
0,80,Train,1,1,1,9,2,4
1,123,Train,4,1,0,0,0,3
2,162,Train,5,0,0,0,3,6
3,178,Train,1,0,1,0,3,6
4,344,Train,1,1,0,0,1,3


In [15]:
# Export this data for Factorization Machines model
users.to_csv('./data_interim/users_fm.csv', index=False, header=True)

# Discretize features in job data


In [19]:
# Load LDA dataset for job
jobs = pd.read_csv("./data_interim_lda/jobset_lda.csv")

  jobs = pd.read_csv("./cb12-interim/jobset_lda.csv")


In [20]:
jobs.head()

Unnamed: 0,JobID,WindowID,Title,Description,Requirements,City,State,Country,Zip5,StartDate,EndDate,ReqTopic,DescTopic,TitTopic
0,1,1,Security Engineer Technical Lead,Security Clearance Required: Top Secret Job N...,SKILL SET Network Security tools: Webdefend We...,Washington,DC,US,20531.0,2012-03-07 13:17:01.643,2012-04-06 23:59:59,5,15,0
1,4,1,SAP Business Analyst WM,NO Corp. to Corp resumes are being considered ...,WHAT YOU NEED: Four year college degreeMinimum...,Charlotte,NC,US,28217.0,2012-03-21 02:03:44.137,2012-04-20 23:59:59,18,15,0
2,7,1,P T HUMAN RESOURCES ASSISTANT,P T HUMAN RESOURCES ASSISTANT 1-2 ye...,Please refer to the Job Description to view th...,Winter Park,FL,US,32792.0,2012-03-02 16:36:55.447,2012-04-01 23:59:59,15,18,0
3,8,1,Route Delivery Drivers,CITY BEVERAGES Come to work for the best in th...,Please refer to the Job Description to view th...,Orlando,FL,US,,2012-03-03 09:01:10.077,2012-04-02 23:59:59,15,7,0
4,9,1,Housekeeping,I make sure every part of their day is magica...,Please refer to the Job Description to view th...,Orlando,FL,US,,2012-03-03 09:01:11.88,2012-04-02 23:59:59,15,7,0


In [23]:
drop_cols = ['WindowID', 'Title', 'Description', 
             'Country','Requirements', 'City', 'State', 'Zip5',
            'StartDate', 'EndDate']
jobs.drop(columns=drop_cols, axis=1, inplace=True)

In [24]:
jobs.to_csv('./data_interim/jobs_fm.csv', index=False, header=True)