In [4]:
### Imports

import numpy as np
import pandas as pd
from pprint import pprint
from sklearn.preprocessing import OneHotEncoder
from collections import Counter

In [5]:
input_folder = '../archive/'

In [6]:
### @author selfadri
### Load the datasets into pandas dataframes from csv

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
datasets = {}
for dirname, _, filenames in os.walk(input_folder):
    for filename in filenames:
        if filename.split('.')[-1] != 'csv':
            continue
        datasets['_'.join((dirname.split('postings')[-1].replace('/',''),filename.split('.')[-2])).strip(input_folder).lstrip('_')] = pd.read_csv(os.path.join(dirname, filename))

pprint([*datasets.keys()])

['job_postings',
 'maps_skills',
 'maps_industries',
 'ompany_details_company_industries',
 'ompany_details_company_specialities',
 'ompany_details_companies',
 'ompany_details_employee_counts',
 'job_details_benefits',
 'job_details_salaries',
 'job_details_job_industries',
 'job_details_job_skills']


In [7]:
### @author selfadri

# Calculate labels, which indicate salary, from any available min-med-max values
untrimmed_y = np.mean(datasets['job_postings'].iloc[:,4:7], axis=1)

# Clean dataset to remove unwanted samples
cond = untrimmed_y.notna() & (datasets['job_postings']['pay_period'] != "ONCE")
y = untrimmed_y[cond]
X = datasets['job_postings'][cond]

# Adjust for the `pay_period`
assert np.all(np.unique(X.iloc[:,7]) == ['HOURLY', 'MONTHLY', 'WEEKLY', 'YEARLY'])
y[X['pay_period'] == "YEARLY"] *= 1
y[X['pay_period'] == "MONTHLY"] *= 12
y[X['pay_period'] == "WEEKLY"] *= 50
y[X['pay_period'] == "HOURLY"] *= 40 * 50

y

0         63000.0
3         44540.0
4        240895.0
8         38600.0
11        61000.0
           ...   
33231    100000.0
33233    100000.0
33236     39500.0
33244     48880.0
33245     57500.0
Length: 13351, dtype: float64

In [8]:
### Some samples do not have the salary data needed for our labels.
print(f'{(untrimmed_y.size - y.size) / untrimmed_y.size * 100 :.2f} % of samples are lost due to not having salary :)')
print(f'{y.size} samples remain.')
print('Unfortunately, this is normal for LinkedIn.')
print('Maybe we could make a binary classifier for whether or not the employer would post the salary :)')

59.84 % of samples are lost due to not having salary :)
13351 samples remain.
Unfortunately, this is normal for LinkedIn.
Maybe we could make a binary classifier for whether or not the employer would post the salary :)


In [9]:
### @author selfadri

# Just USD
print(np.unique(datasets['job_details_salaries']['currency']))

# Not here, nothing new to learn from the job_details_salaries csv
print(datasets['job_details_salaries'][datasets['job_details_salaries']['job_id'] == datasets['job_postings'].iloc[2]['job_id']])

# Here's the benefit types recorded
pprint([*np.unique(np.asarray(datasets['job_details_benefits']['type'], str))])

# Here's the work types
pprint([*np.unique(X['work_type'])])

# Here's the titles
print(pd.DataFrame(np.unique(X['title'])))
pprint(str(Counter(X['title']))[:1000])

# Here's the locations
pprint(str(Counter(X['location']))[:1000])

# Only a few job postings have skill description paragraphs
print(y.size - Counter(np.array(X['skills_desc'] ,str))['nan'])

# There are a handful of job types, plenty of each category
print(Counter(datasets['job_details_job_skills'].iloc[:,1]))

['USD']
Empty DataFrame
Columns: [salary_id, job_id, max_salary, med_salary, min_salary, pay_period, currency, compensation_type]
Index: []
['401(k)',
 'Child care support',
 'Commuter benefits',
 'Dental insurance',
 'Disability insurance',
 'Medical insurance',
 'Paid maternity leave',
 'Paid paternity leave',
 'Pension plan',
 'Student loan assistance',
 'Tuition assistance',
 'Vision insurance']
['CONTRACT', 'FULL_TIME', 'INTERNSHIP', 'OTHER', 'PART_TIME', 'TEMPORARY']
                                                      0
0                   Research Associate II, Pathobiology
1                                      Account Manager 
2      Airport Senior Project Manager – Large Termin...
3                               Applications Developer 
4         Associate Director Marketing Asset Management
...                                                 ...
9414  in-house Contracting Counsel (junior-mid level...
9415                 oracle SCM TECHNO CLOUD FUNCTIONAL
9416              

### Features
We will consider 10 features for now.
- Experience Level
- Job Title (Later, embed this)
- Work Type (full time, part time, intern, etc.)
- Location
- Skills
- Job Industry
- Company Industry
- Company Employee Count
- Benefits
- Company LinkedIn Follower Count
- Remote Work Allowed

### Scope
The location is always in the United States, and the currency is always measured in USD.

### Labels (Salary)
We extracted our labels

In [24]:
X_trim=X.copy()
X_trim=X_trim.drop(columns=['job_id', 'description', 'max_salary', 'med_salary',
                  'min_salary', 'pay_period', 'applies', 'original_listed_time',
                  'views', 'job_posting_url', 'application_url', 'application_type',
                  'expiry', 'closed_time', 'listed_time','posting_domain', 'sponsored', 'currency',
                   'compensation_type', 'scraped'])
X_trim.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13351 entries, 0 to 33245
Data columns (total 8 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   company_id                  13129 non-null  float64
 1   title                       13351 non-null  object 
 2   formatted_work_type         13351 non-null  object 
 3   location                    13351 non-null  object 
 4   remote_allowed              2098 non-null   float64
 5   formatted_experience_level  9789 non-null   object 
 6   skills_desc                 105 non-null    object 
 7   work_type                   13351 non-null  object 
dtypes: float64(2), object(6)
memory usage: 938.7+ KB


In [25]:
from sklearn import svm
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

In [26]:
enc = OneHotEncoder(sparse_output=False)
enc.fit(X_trim)

In [28]:
X_train, X_test, y_train, y_test = train_test_split(enc.transform(X_trim), y, test_size=0.2, random_state=42)
X_train.shape

(10680, 17230)

In [None]:
svr = svm.SVR()
svr.fit(X_train, y_train)
svr.score(X_test, y_test)
