In [1]:
import pandas as pd

In [2]:
indeed = pd.read_csv('indeed_results.csv')

In [3]:
def categorise(x):
    if x <= indeed.salary_avg_USD.quantile(0.25):
        return '0_25'
    elif x > indeed.salary_avg_USD.quantile(0.25) and x <= indeed.salary_avg_USD.quantile(0.5):
        return '25_50'
    elif x > indeed.salary_avg_USD.quantile(0.5) and x <= indeed.salary_avg_USD.quantile(0.75):
        return '50_75'
    elif x > indeed.salary_avg_USD.quantile(0.75):
        return '75_100'
    else:
        return np.nan

indeed.insert(12, 'target', indeed['salary_avg_USD'].apply(categorise))

### Create a classification model to predict High/Low salary. 


- Start by ONLY using the location as a feature.
- Use at least two different classifiers you find suitable.
- Remember that scaling your features might be necessary.
- Display the coefficients/feature importances and write a short summary of what they mean.
- Create a few new variables in your dataframe to represent interesting features of a job title (e.g. whether 'Senior' or 'Manager' is in the title).
- Incorporate other text features from the title or summary that you believe will predict the salary.
- Then build new classification models including also those features. Do they add any value?
- Tune your models by testing parameter ranges, regularization strengths, etc. Discuss how that affects your models.
- Discuss model coefficients or feature importances as applicable.

In [4]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold, StratifiedKFold
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# set X,y
X = indeed['search_city']
y = indeed['target']

# dummify X
X_dum = pd.get_dummies(X, drop_first=True)

# scale
scaler = StandardScaler()
Xs = scaler.fit_transform(X_dum)

# GridSearchCV with LRCV:
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
lrcv = LogisticRegressionCV(solver='liblinear', multi_class='ovr', cv=kf, max_iter=10000)
lrcv_params = {'penalty': ['l1', 'l2'],
              'Cs':[np.logspace(-30,-15,10)]}
gs_lrcv = GridSearchCV(lrcv, lrcv_params, cv=kf, n_jobs=2, verbose=1)
gs_lrcv.fit(Xs, y)
print('Best GS LRCV params:',gs_lrcv.best_params_)
best_gs_lrcv = gs_lrcv.best_estimator_
print('Best GS LRCV C_:',best_gs_lrcv.C_)
print('Best Training LRCVScore:', gs_lrcv.best_score_)

# GridSearchCV with KNN:
knn = KNeighborsClassifier(n_jobs=2)
knn_params = {'n_neighbors': [3,5,7,9,11,21,41,51],
             'weights': ['uniform', 'distance'],
             'metric': ['euclidean', 'manhattan']}
gs_knn = GridSearchCV(knn, knn_params, cv=kf, n_jobs=2, verbose=1)
gs_knn.fit(Xs,y)
print('Best GS KNN params:', gs_knn.best_params_)
best_gs_knn = gs_knn.best_estimator_
print('Best Training KNN CVScore:', gs_knn.best_score_)

Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  10 out of  10 | elapsed:    4.2s finished


Best GS LRCV params: {'Cs': array([1.00000000e-30, 4.64158883e-29, 2.15443469e-27, 1.00000000e-25,
       4.64158883e-24, 2.15443469e-22, 1.00000000e-20, 4.64158883e-19,
       2.15443469e-17, 1.00000000e-15]), 'penalty': 'l2'}
Best GS LRCV C_: [1.00000000e-30 2.15443469e-17 1.00000000e-30 2.15443469e-17]
Best Training LRCVScore: 0.2797573734001113
Fitting 5 folds for each of 32 candidates, totalling 160 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    3.3s


Best GS KNN params: {'metric': 'euclidean', 'n_neighbors': 51, 'weights': 'distance'}
Best Training KNN CVScore: 0.33644741235392317


[Parallel(n_jobs=2)]: Done 160 out of 160 | elapsed:   11.4s finished


In [5]:
from sklearn.feature_extraction.text import CountVectorizer

cvec = CountVectorizer(stop_words='english', token_pattern='[A-Za-z]+')
cvec.fit(indeed['title'])

# Lets check the length of our data that is in a vectorized state
len(cvec.get_feature_names())

# Transform training data
cvec.transform(indeed['title'])
features_df = pd.DataFrame(data=cvec.transform(indeed['title']).sum(axis=0),columns=cvec.get_feature_names())
features_df

Unnamed: 0,abap,abatement,academic,academy,access,account,accountant,accounting,accounts,ackg,...,workday,workflow,workforce,working,workplace,writer,x,year,years,zoho
0,1,2,1,1,2,43,11,2,4,1,...,3,1,2,2,3,2,1,5,5,1


In [6]:
cvec.get_feature_names()

['abap',
 'abatement',
 'academic',
 'academy',
 'access',
 'account',
 'accountant',
 'accounting',
 'accounts',
 'ackg',
 'activity',
 'actuary',
 'acute',
 'ad',
 'admin',
 'administration',
 'administrative',
 'administrator',
 'admissions',
 'adobe',
 'adoption',
 'adtech',
 'advanced',
 'advertising',
 'advisor',
 'advisory',
 'adwords',
 'affairs',
 'afrikaans',
 'agency',
 'agent',
 'agile',
 'ai',
 'air',
 'algorithm',
 'algorithms',
 'alliance',
 'alteryx',
 'amazon',
 'american',
 'aml',
 'ams',
 'analysis',
 'analyst',
 'analystics',
 'analysts',
 'analytic',
 'analytics',
 'angular',
 'angularjs',
 'antislavery',
 'ap',
 'api',
 'app',
 'application',
 'applications',
 'applied',
 'apprentice',
 'apprenticeship',
 'approach',
 'approvals',
 'apps',
 'architect',
 'architecture',
 'area',
 'arena',
 'arti',
 'artificia',
 'artificial',
 'arts',
 'assessor',
 'asset',
 'assistant',
 'assistants',
 'associate',
 'associates',
 'assurance',
 'attorney',
 'attorneys',
 'audienc

In [39]:
features_df.T.sort_values(0,ascending=False)

Unnamed: 0,0
data,263
scientist,156
analyst,61
senior,54
engineer,46
learning,29
science,28
machine,27
lead,23
statistician,21


In [56]:
X = cvec.transform(indeed['title'])
y = indeed.target

lr = LogisticRegression(solver='lbfgs', multi_class='ovr')
print(cross_val_score(lr,X,y,cv=kf).mean())
lr.fit(X,y)
pd.DataFrame(dict(coef=lr.coef_[0], features=cvec.get_feature_names())).sort_values('coef',ascending=False).head(10)

0.60251758607923


Unnamed: 0,coef,features
140,2.280853,junior
115,1.962096,graduate
5,1.312997,analyst
199,1.26864,processing
169,1.143872,medical
216,1.101223,research
12,0.791815,associate
79,0.757162,economist
78,0.757162,economics
69,0.735808,development


In [57]:
pd.DataFrame(dict(coef=lr.coef_[0], features=cvec.get_feature_names())).sort_values('coef',ascending=False).tail(10)

Unnamed: 0,coef,features
64,-0.848776,demographer
120,-0.860431,head
3,-0.892901,ai
22,-0.922054,big
101,-0.971389,fintech
217,-1.00298,researcher
249,-1.129191,sr
207,-1.144819,quant
147,-1.297855,lead
227,-1.323485,science


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

tvec = TfidfVectorizer(stop_words='english', token_pattern='[A-Za-z]+')
tvec.fit(indeed['title'])

features_df = pd.DataFrame(data=tvec.transform(indeed['title']).sum(axis=0),columns=tvec.get_feature_names())
features_df

Unnamed: 0,abap,abatement,academic,academy,access,account,accountant,accounting,accounts,ackg,...,workday,workflow,workforce,working,workplace,writer,x,year,years,zoho
0,0.691032,1.550656,0.551581,0.789963,0.693609,27.02706,7.38456,1.258637,2.20865,0.746929,...,1.794249,0.564229,1.178876,0.964438,1.866761,1.203654,0.490107,2.03805,3.12685,0.746254


In [72]:
features_df.T.sort_values(0,ascending=False)

Unnamed: 0,0
data,72.917347
scientist,65.371810
analyst,25.044565
senior,24.267396
engineer,18.563478
lead,15.277072
science,14.203877
learning,11.747049
statistician,11.097278
junior,11.016916


In [75]:
X = tvec.transform(indeed['title'])
y = indeed.target

lr = LogisticRegression(solver='lbfgs', multi_class='ovr')
print(cross_val_score(lr,X,y,cv=kf).mean())
lr.fit(X,y)
pd.DataFrame(dict(coef=lr.coef_[0], features=tvec.get_feature_names())).sort_values('coef',ascending=False).head(20)

0.5806368011847465


Unnamed: 0,coef,features
140,2.448364,junior
5,2.034652,analyst
115,1.974762,graduate
216,1.629114,research
199,1.376908,processing
169,1.103129,medical
12,0.890519,associate
186,0.716999,operations
78,0.690302,economics
79,0.690302,economist


In [76]:
pd.DataFrame(dict(coef=lr.coef_[0], features=tvec.get_feature_names())).sort_values('coef',ascending=False).tail(20)

Unnamed: 0,coef,features
171,-0.508795,ml
122,-0.55616,healthcare
156,-0.576932,m
84,-0.615134,engineer
180,-0.65895,nlp
241,-0.681899,software
158,-0.687112,machine
64,-0.732901,demographer
101,-0.766099,fintech
201,-0.778635,product


In [77]:
# features to create: head, lead, manager

In [102]:
indeed.head(1)

Unnamed: 0,title,company,rating,search_city,location,salary,salary_freq,isRange,salary_range_min,salary_range_max,salary_avg,salary_avg_USD,target,age,summary
0,Data Scientist,TheMathCompany Inc.,,Chicago,"Chicago, IL","$80,000 - $110,000 a year",year,Range,80000,110000,95000.0,95000.0,50_75,10 days ago,We are looking for passionate individuals to h...


In [111]:
indeed.insert(1, 'isDS', indeed['title'].apply(lambda x: 1 if 'data science' in x.lower() or 'data scientist' in x.lower() else 0))
indeed.insert(2, 'isDA', indeed['title'].apply(lambda x: 1 if 'data analyst' in x.lower() or 'analytics' in x.lower() else 0))
indeed.insert(3, 'isDE', indeed['title'].apply(lambda x: 1 if 'data engineer' in x.lower() else 0))
indeed.insert(4, 'isML', indeed['title'].apply(lambda x: 1 if 'ml' in x.lower() or 'machine learning' in x.lower() else 0))
indeed.insert(5, 'isAI', indeed['title'].apply(lambda x: 1 if 'ai' in x.lower() or 'artifical intelligence' in x.lower() else 0))
indeed.insert(6, 'isBI', indeed['title'].apply(lambda x: 1 if 'bi' in x.lower() or 'business intelligence' in x.lower() else 0))

indeed.insert(7, 'isHead', indeed['title'].apply(lambda x: 1 if 'head' in x.lower() else 0))
indeed.insert(8, 'isLead', indeed['title'].apply(lambda x: 1 if 'lead' in x.lower() else 0))
indeed.insert(9, 'isManager', indeed['title'].apply(lambda x: 1 if 'mgr' in x.lower() or 'manager' in x.lower() else 0))
indeed.insert(10, 'isSenior', indeed['title'].apply(lambda x: 1 if 'sr' in x.lower() or 'senior' in x.lower() else 0))
indeed.insert(11, 'isJunior', indeed['title'].apply(lambda x: 1 if 'jr' in x.lower() or 'junior' in x.lower() else 0))
indeed.insert(12, 'isGraduate', indeed['title'].apply(lambda x: 1 if 'grad' in x.lower() or 'graduate' in x.lower() else 0))

indeed.head(1)

Unnamed: 0,title,isDS,isDA,isDE,isML,isAI,isBI,isHead,isLead,isManager,...,salary,salary_freq,isRange,salary_range_min,salary_range_max,salary_avg,salary_avg_USD,target,age,summary
0,Data Scientist,1,0,0,0,0,0,0,0,0,...,"$80,000 - $110,000 a year",year,Range,80000,110000,95000.0,95000.0,50_75,10 days ago,We are looking for passionate individuals to h...


In [117]:
indeed.columns

Index(['title', 'isDS', 'isDA', 'isDE', 'isML', 'isAI', 'isBI', 'isHead',
       'isLead', 'isManager', 'isSenior', 'isJunior', 'isGraduate', 'company',
       'rating', 'search_city', 'location', 'salary', 'salary_freq', 'isRange',
       'salary_range_min', 'salary_range_max', 'salary_avg', 'salary_avg_USD',
       'target', 'age', 'summary'],
      dtype='object')

In [129]:
# regressing with title features, plus location:

X = pd.concat([indeed.loc[:,'isDS':'isGraduate'],indeed.loc[:,'search_city']],axis=1)
X_dum = pd.get_dummies(X, columns=['search_city'], drop_first=True)
y = indeed.target

lr = LogisticRegression(solver='lbfgs', multi_class='ovr')
print(cross_val_score(lr,X_dum,y,cv=kf).mean())

0.47693446871529055


In [115]:
tvec.fit(indeed['summary'])
X = tvec.transform(indeed['summary'])
y = indeed.target

lr = LogisticRegression(solver='lbfgs', multi_class='ovr')
print(cross_val_score(lr,X,y,cv=kf).mean())
lr.fit(X,y)
pd.DataFrame(dict(coef=lr.coef_[0], features=tvec.get_feature_names())).sort_values('coef',ascending=False).head(10)

0.4878193261754905


Unnamed: 0,coef,features
70,1.219381,analysis
911,0.927822,junior
1350,0.912383,project
1125,0.851495,new
739,0.761609,graduate
396,0.749649,customer
405,0.71414,data
414,0.712336,day
73,0.709726,analyst
1859,0.648992,visualisation


In [116]:
pd.DataFrame(dict(coef=lr.coef_[0], features=tvec.get_feature_names())).sort_values('coef',ascending=False).tail(10)

Unnamed: 0,coef,features
1274,-0.478755,platform
1376,-0.48801,python
1062,-0.527155,ml
32,-0.568752,advanced
167,-0.686214,benefits
1503,-0.707369,salary
943,-0.908318,learning
981,-1.062702,machine
1523,-1.355284,science
1526,-1.490732,scientist


In [133]:
# regressing with the addition of NLP on summary field:
pd.set_option('display.max_columns',100)
indeed.head()

Unnamed: 0,title,isDS,isDA,isDE,isML,isAI,isBI,isHead,isLead,isManager,isSenior,isJunior,isGraduate,company,rating,search_city,location,salary,salary_freq,isRange,salary_range_min,salary_range_max,salary_avg,salary_avg_USD,target,age,summary
0,Data Scientist,1,0,0,0,0,0,0,0,0,0,0,0,TheMathCompany Inc.,,Chicago,"Chicago, IL","$80,000 - $110,000 a year",year,Range,80000,110000,95000.0,95000.0,50_75,10 days ago,We are looking for passionate individuals to h...
1,AI/Machine learning engineer,0,0,0,1,1,0,0,0,0,0,0,0,HGS Digital,,Chicago,"Chicago, IL","$85,000 - $110,000 a year",year,Range,85000,110000,97500.0,97500.0,50_75,2 days ago,We are looking for a Artificial intelligence a...
2,B2B/B2G Artificial Intelligence - Machine Lear...,0,0,0,1,0,0,0,0,0,0,0,0,hoytNIVA,,Chicago,"Chicago, IL 60606 (Loop area)","$75,000 - $175,000 a year",year,Range,75000,175000,125000.0,125000.0,75_100,9 days ago,"Active Security Clearances Preferred (Secret, ..."
3,Statistician,0,0,0,0,0,0,0,0,0,0,0,0,US Department of Health And Human Services,4.1,Phoenix,"Phoenix, AZ","$66,311 - $103,328 a year",year,Range,66311,103328,84819.5,84819.5,50_75,Today,If you are selected for this position you will...
4,Data Architect,0,0,0,0,0,0,0,0,0,0,0,0,Arizona State University,4.3,Phoenix,"Tempe, AZ","$135,000 - $165,000 a year",year,Range,135000,165000,150000.0,150000.0,75_100,29 days ago,"Under administrative direction, performs work ..."
