In [None]:
### Imports

import numpy as np
import pandas as pd
from pprint import pprint
from sklearn.preprocessing import OneHotEncoder
from collections import Counter

In [None]:
# Root directory that the loading cell walks recursively looking for CSV files.
input_folder = "../archive/"

In [None]:
### @author selfadri
### Load the datasets into pandas dataframes from csv

# Every CSV file found under `input_folder` (searched recursively) is read into
# `datasets`, keyed by "<sub-directory>_<file name without extension>"; files
# directly in the root are keyed by their bare name (e.g. 'job_postings').

import os

datasets = {}
for dirname, _, filenames in os.walk(input_folder):
    # Path of this directory relative to the input root. Using relpath instead
    # of str.strip(input_folder): strip() removes a *set of characters* from
    # both ends, which silently ate leading/trailing letters of some keys
    # (e.g. 'company_details_...' became 'ompany_details_...').
    rel_dir = os.path.relpath(dirname, input_folder)
    if rel_dir == os.curdir:
        prefix = ''
    else:
        # Keep the historical behaviour of dropping everything up to and
        # including a 'postings' marker in the path, and of removing the
        # path separators between nested directory names.
        prefix = rel_dir.split('postings')[-1].replace(os.sep, '').replace('/', '')

    for filename in filenames:
        # splitext is safe for names without any dot, where the previous
        # filename.split('.')[-2] raised IndexError.
        stem, extension = os.path.splitext(filename)
        if extension.lower() != '.csv':
            continue
        key = '_'.join((prefix, stem)).lstrip('_')
        datasets[key] = pd.read_csv(os.path.join(dirname, filename))

pprint([*datasets.keys()])

In [None]:
### @author selfadri

# Calculate labels, which indicate salary, from any available min-med-max values
# (columns 4..6 of job_postings; rows with no salary value at all become NaN).
untrimmed_y = np.mean(datasets['job_postings'].iloc[:,4:7], axis=1)

# Clean dataset to remove unwanted samples: rows without any salary value and
# rows whose pay period is the one-off "ONCE".
cond = untrimmed_y.notna() & (datasets['job_postings']['pay_period'] != "ONCE")
X = datasets['job_postings'][cond]

# Multipliers that turn a per-period amount into a yearly one. The weekly and
# hourly factors keep the notebook's existing convention (presumably 50 working
# weeks per year and 40 hours per week).
annualization_factors = {'YEARLY': 1, 'MONTHLY': 12, 'WEEKLY': 50, 'HOURLY': 40 * 50}

# Every remaining sample must have a pay period we know how to convert.
# This is a subset check: a period that is simply absent from the data is fine,
# while an unknown or missing (NaN) period fails with a readable message.
unexpected_periods = set(X['pay_period'].unique()) - set(annualization_factors)
assert not unexpected_periods, f'Unhandled pay_period values: {unexpected_periods}'

# Adjust for the `pay_period` in one vectorised step (no in-place masked writes).
y = untrimmed_y[cond] * X['pay_period'].map(annualization_factors)

y

In [None]:
### Some samples do not have the salary data needed for our labels.
# Report how many rows were dropped between the raw labels and the cleaned ones.
total_count = untrimmed_y.size
kept_count = y.size
lost_percent = (total_count - kept_count) / total_count * 100

print(f'{lost_percent:.2f} % of samples are lost due to not having salary :)')
print(f'{kept_count} samples remain.')
print('Unfortunately, this is normal for LinkedIn.')
print('Maybe we could make a binary classifier for whether or not the employer would post the salary :)')

In [None]:
### @author selfadri

salaries_df = datasets['job_details_salaries']
postings_df = datasets['job_postings']

# Just USD
print(np.unique(salaries_df['currency']))

# Not here, nothing new to learn from the job_details_salaries csv
# (look up the salary rows belonging to the third job posting as a spot check)
probe_job_id = postings_df.iloc[2]['job_id']
print(salaries_df[salaries_df['job_id'] == probe_job_id])

# Here's the benefit types recorded
benefit_types = np.asarray(datasets['job_details_benefits']['type'], str)
pprint(list(np.unique(benefit_types)))

# Here's the work types
pprint(list(np.unique(X['work_type'])))

# Here's the titles
print(pd.DataFrame(np.unique(X['title'])))
title_counts = Counter(X['title'])
pprint(str(title_counts)[:1000])

# Here's the locations
location_counts = Counter(X['location'])
pprint(str(location_counts)[:1000])

# Only a few job postings have skill description paragraphs
# (missing values become the string 'nan' when cast to str, so count those)
skills_as_text = np.array(X['skills_desc'], str)
print(y.size - Counter(skills_as_text)['nan'])

# Frequency of each value in the second column of the job_skills table
# (presumably the skill category of each posting - TODO confirm)
print(Counter(datasets['job_details_job_skills'].iloc[:, 1]))

### @author selfadri



### Features
We will consider 11 features for now.
- Experience Level
- Job Title (Later, embed this)
- Work Type (full time, part time, intern, etc.)
- Location
- Skills
- Job Industry
- Company Industry
- Company Employee Count
- Benefits
- Company LinkedIn Follower Count
- Remote Work Allowed

### Scope
The location is always in the United States, and the currency is always measured in USD.

### Labels (Salary)
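We extracted our labels by averaging whichever of the minimum, median, and maximum salary values are available for each posting, and then scaling that average to a yearly amount according to the posting's pay period.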

In [None]:
from sklearn import svm
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

In [None]:

# One hot encode the categorical data
#
# Only the listed categorical columns are encoded. Fitting the encoder on all
# of X also one-hot encoded the salary columns the labels are computed from
# (label leakage), plus unique-per-row columns such as job_id, which made the
# dense matrix very large.
# Wrapping the encoder in a ColumnTransformer keeps `enc.fit(X)` and
# `enc.transform(X)` working with the full dataframe: all other columns are
# dropped inside the transformer.
from sklearn.compose import ColumnTransformer

# NOTE(review): extend this list with the remaining planned categorical
# features once their column names are confirmed.
categorical_features = ['title', 'work_type', 'location']

enc = ColumnTransformer(
    [('onehot', OneHotEncoder(sparse_output=False), categorical_features)],
    remainder='drop',
)
enc.fit(X)

In [None]:
# Sanity check: dimensions of the encoded feature matrix (rows, encoded columns).
np.shape(enc.transform(X))

In [None]:
# Hold out 20 % of the encoded samples for evaluation; the fixed seed keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    enc.transform(X),
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Baseline model: support vector regression with scikit-learn's default settings.
# NOTE(review): the targets are raw yearly salaries; SVR defaults are sensitive
# to the scale of the target, so scaling y may be worth trying - TODO confirm.
svr = svm.SVR()
svr.fit(X=X_train, y=y_train)

In [None]:
# Evaluate on the held-out split (for regressors, `score` is the R^2 coefficient).
svr.score(X=X_test, y=y_test)