In [1]:
from bs4 import BeautifulSoup
import urllib2
import pickle
from time import sleep
import requests
import re
import pandas as pd
import numpy as np

# Question 1

In [2]:
pickle_in = open("indeed_data_df.pickle","rb")
indeed_data_df = pickle.load(pickle_in)

In [3]:
indeed_data_df['Employer'] = indeed_data_df['Employer'].map(lambda x: 'Employer not stated' if x == '' else x)

### Cleaning scraped data

Getting rid of job titles that are obviously not data science related.

In [4]:
mask = indeed_data_df['Job Title'].str.contains('CLERK')
indeed_data_df = indeed_data_df[mask == False]

mask_2 = indeed_data_df['Job Title'].str.contains('Clerk')
indeed_data_df = indeed_data_df[mask_2 == False]

mask_3 = indeed_data_df['Job Title'].str.contains('Data Entry')
indeed_data_df = indeed_data_df[mask_3 == False]

mask_4 = indeed_data_df['Salary'].str.contains('hour')
indeed_data_df = indeed_data_df[mask_4 == False]

mask_5 = indeed_data_df['Job Title'].str.contains('Recep')
indeed_data_df = indeed_data_df[mask_5 == False]

mask_6 = indeed_data_df['Job Title'].str.contains('Admin Assistant')
indeed_data_df = indeed_data_df[mask_6 == False]

mask_7 = indeed_data_df['Job Title'].str.contains('Agent')
indeed_data_df = indeed_data_df[mask_7 == False]

mask_9 = indeed_data_df['Job Title'].str.contains('Customer Service')
indeed_data_df = indeed_data_df[mask_9 == False]

mask_10 = indeed_data_df['Job Title'].str.contains('ADMIN SUPPORT ASSISTANT')
indeed_data_df = indeed_data_df[mask_10 == False]

mask_11 = indeed_data_df['Job Title'].str.contains('LIBRARY ADMINISTRATOR')
indeed_data_df = indeed_data_df[mask_11 == False]

mask_12 = indeed_data_df['Job Title'].str.contains('Consumer Careline Executives')
indeed_data_df = indeed_data_df[mask_12 == False]


In [5]:
indeed_data_df.drop(531,axis=0,inplace=True)

### Because one of the salaries stated is annual instead of monthly, it has to be isolated and dealth with.

Created a new column where payment terms of each job are given a tag. 0 if salary is not stated, 1 if salary stated is monthly salary and 2 if salary stated is annual salary.

In [6]:
indeed_data_df['Payment Term'] = indeed_data_df['Salary'].map(lambda x: 0 if x == 'Salary not stated' 
                                                               else 1 if 'month' in x 
                                                               else 2 if 'year' in x 
                                                               else x)

### Cleaning salary column

In [7]:
indeed_data_df['Salary'] = indeed_data_df['Salary'].str.replace('a month','')
indeed_data_df['Salary'] = indeed_data_df['Salary'].str.replace('a year','')
indeed_data_df['Salary'] = indeed_data_df['Salary'].str.replace('$','')
indeed_data_df['Salary'] = indeed_data_df['Salary'].str.replace('- ','')
indeed_data_df['Salary'] = indeed_data_df['Salary'].str.replace(',','')

indeed_data_df['Salary'] = indeed_data_df['Salary'].map(lambda x: str(x)[:-1])

In [8]:
indeed_data_df['Salary']

0      Salary not state
6             2500 3500
9      Salary not state
11     Salary not state
13     Salary not state
14     Salary not state
15     Salary not state
17     Salary not state
20                 2500
21     Salary not state
23            5000 8500
26     Salary not state
27     Salary not state
28                 3500
29     Salary not state
30            2600 2800
32     Salary not state
33     Salary not state
34     Salary not state
36            3500 4500
38     Salary not state
39     Salary not state
40     Salary not state
41     Salary not state
42     Salary not state
43     Salary not state
44     Salary not state
45            3000 5000
46            4000 7000
47     Salary not state
             ...       
966    Salary not state
967    Salary not state
968    Salary not state
969    Salary not state
970    Salary not state
971    Salary not state
972    Salary not state
973    Salary not state
974    Salary not state
975    Salary not state
976    Salary no

### Salaries are stated in different formats. Some are stated as a range while others have a single value.

The for-loops below seek to convert salaries with a **single value** into floats for easier handling.

In [9]:
salary_list = []

for i in indeed_data_df['Salary'].values:
    salary_list.append(i)

In [10]:
# Converting salaries with single value to float

for index,item in enumerate(salary_list):
    if len(item) == 4:
        salary_list[index] = float(item)

The for-loops below seek to convert salaries that are stated as ranges into floats such that their **mean** may be tabulated for use as the dependent variable.

In [11]:
# Converting salaries that range between 2 4-figure numbers to float and 
# calculating mean monthly salary.

for index,item in enumerate(salary_list):
    try:
        if len(item) == 9:
            salary_list[index] = item.split()
    except:
        continue

for index,item in enumerate(salary_list):
    if type(item) == list:
        for x,y in enumerate(item):
            item[x] = float(y)

for index,item in enumerate(salary_list):
    if type(item) == list:
        salary_list[index] = (item[0] + item[1])/2

In [12]:
# Converting salaries that range between 2 non-4-figure numbers to float 
# and calculating mean monthly salary.

for index,item in enumerate(salary_list):
    try:
        if len(item) == 10:
            salary_list[index] = item.split()
        elif len(item) == 8:
            salary_list[index] = item.split()
    except:
        continue

for index,item in enumerate(salary_list):
    if type(item) == list:
        for x,y in enumerate(item):
            item[x] = float(y)

for index,item in enumerate(salary_list):
    if type(item) == list:
        salary_list[index] = (item[0] + item[1])/2

In [13]:
# Converting annual salary to float and calculating the mean monthly salary.

for index,item in enumerate(salary_list):
    try:
        if len(item) == 11:
            salary_list[index] = item.split()
    except:
        continue

for index,item in enumerate(salary_list):
    if type(item) == list:
        for x,y in enumerate(item):
            item[x] = float(y)

for index,item in enumerate(salary_list):
    if type(item) == list:
        salary_list[index] = ((item[0]/12) + item[1]/12)/2

In [14]:
# Converting list of salaries to dataframe for concatenation with main dataframe

salary_list_df = pd.DataFrame(salary_list,columns=['Monthly Salary'])

test_df = indeed_data_df.reset_index(drop=True)

indeed_data_df = pd.concat([test_df,salary_list_df],axis=1)

indeed_data_df.drop(['Salary','Payment Term'], axis=1,inplace=True)

### Cleaning Job Description

In [15]:
# Removing \n from job description
indeed_data_df['Job Description'] = indeed_data_df['Job Description'].str.replace('\n', '. ')

In [16]:
# Removing 'Job Summary' from the start of each job description

for i in indeed_data_df.iterrows():
    indeed_data_df.ix[i[0],'Job Description'] = re.sub('Job Summary','',i[1][3])

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  after removing the cwd from sys.path.


In [17]:
# Removing 'Job Description' from the start of each job description

for i in indeed_data_df.iterrows():
    indeed_data_df.ix[i[0],'Job Description'] = re.sub('Job Description','',i[1][3])

In [18]:
for i in indeed_data_df.iterrows():
    indeed_data_df.ix[i[0],'Job Description'] = re.sub('"',"'",i[1][3])

### Splitting main dataframe into 2. 

One with salary values and one without. The one with salary values will be used to train a model to predict salaries for the dataframe without salary values.

Salary values will be divided into 2 groups. 0 (for normal salary) and 1 (for high salary). The divide between normal and high will be the median of all salary data which have been obtained.

In [19]:
#Splitting into 2 dataframes

salary_df = indeed_data_df[indeed_data_df['Monthly Salary'] != 'Salary not state']

no_salary_df = indeed_data_df[indeed_data_df['Monthly Salary'] == 'Salary not state']

In [20]:
# Finding the median salary

np.median(salary_df['Monthly Salary'].values)

3000.0

In [21]:
# Creating new column to show whether salary is normal or high. Values in this
# column will be used as the dependent variable in model building.

indeed_data_df['Salary Range'] = indeed_data_df['Monthly Salary'].map(lambda x: 0 if x == 'Salary not state' else 1 if x <= 3000 else 2)

model_data = indeed_data_df.drop('Monthly Salary',axis=1)

In [22]:
salary_df = model_data[model_data['Salary Range'] != 0]

no_salary_df = model_data[model_data['Salary Range'] == 0]

# Model Building

Count Vectorizer and then **logistic regression** vs **random forest**.

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import KFold, cross_val_score, train_test_split


In [24]:
cols = list(salary_df.columns)
cols.remove('Salary Range')

In [25]:
# model = make_pipeline(TfidfVectorizer(stop_words='english',
#                                       sublinear_tf=True,
#                                       max_df=0.5,
#                                       max_features=1000),
#                       LogisticRegression(),
#                       )
# model.fit(data_train['data'], y_train)
# y_pred = model.predict(data_test['data'])
# print accuracy_score(y_test, y_pred)
# print "Number of features:", len(model.steps[0][1].get_feature_names())

### Setting up countvectorizer

In [26]:
# X is essentially the job description column but vectorized

cvec = CountVectorizer(stop_words='english',
                       ngram_range=(2,4),
                       max_df=1.0,
                       max_features=5000)

cvec.fit(salary_df['Job Description'])


CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=5000, min_df=1,
        ngram_range=(2, 4), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [27]:
len(cvec.get_feature_names())

5000

In [28]:
X_train = pd.DataFrame(cvec.transform(salary_df['Job Description']).todense(),
                       columns=cvec.get_feature_names())

In [29]:
word_counts = X_train.sum(axis=0)
word_counts.sort_values(ascending = False).head(20)

000 00                         32
500 00                         29
00 monthrequired               27
type timesalary                26
monthrequired experience       23
00 monthrequired experience    23
data centre                    18
00 month                       18
job type                       15
00 500                         13
00 500 00                      13
able work                      12
00 000                         12
based client instructions      12
00 000 00                      12
based client                   12
client instructions            12
type contractsalary            11
working experience             11
data center                    11
dtype: int64

In [30]:
y_train = salary_df['Salary Range']

### Model Building

Logistic Regression

In [31]:
# Logistic Regression

from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
kf_shuffle = KFold(n_splits=5,shuffle=True)
scores = cross_val_score(lr, X_train, y_train, cv=kf_shuffle)

In [32]:
print np.mean(scores)

0.8


Random Forest

In [33]:
from sklearn.ensemble import RandomForestClassifier
rtc = RandomForestClassifier(random_state=1,n_jobs=-1)
rtc_scores = cross_val_score(rtc, X_train, y_train, cv=kf_shuffle)

In [34]:
print np.mean(rtc_scores)

0.636363636364


### Prediction

Since logistic regression scored better, it will be used to predict salaries for the other job postings.

In [35]:
X_test = pd.DataFrame(cvec.transform(no_salary_df['Job Description']).todense(),
                       columns=cvec.get_feature_names())

In [36]:
model = lr.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [37]:
len(y_pred)

805

# Question 2

### Data scientist position vs non-data scientist position

In [38]:
indeed_data_df['Job Level'] = indeed_data_df['Job Title'].map(lambda x: 1 if x.find('scientist') > 0 
                                                                    else 1 if x.find('Scientist') > 0 
                                                                    else 0)

In [39]:
# Data is imbalanced. Got unicode error when appling resampling techniques.

indeed_data_df['Job Level'].value_counts()

0    783
1     77
Name: Job Level, dtype: int64

In [40]:
print "Baseline : ", 783./(783.+77.)

Baseline :  0.910465116279


In [41]:
X_2 = indeed_data_df['Job Description']
y_2 = indeed_data_df['Job Level']

In [42]:
X2_train, X2_test, y2_train, y2_test = train_test_split(X_2, y_2, test_size=0.3)

### TFIDF Vectorizer 

Because we are trying to predict a job position based on the job description, it makes sense to ignore words that occur throughout job descriptions. We are interested in words in the job description that are specific to the position of a data scientist. 

Since TFIDF imposes a penalty on words that appear more frequently, it seems like a better choice than count vectorizer as certain words may be 

In [43]:
tvect = TfidfVectorizer(stop_words='english',sublinear_tf=True,max_df=0.9,
                        max_features=5000,norm='l1',ngram_range=(1,5))

tvect.fit(X2_train)

X2_train = pd.DataFrame(tvect.transform(X2_train).todense(),
                       columns=tvect.get_feature_names())

X2_test = pd.DataFrame(tvect.transform(X2_test).todense(),
                       columns=tvect.get_feature_names())


In [44]:
len(y2_train)

602

In [45]:
len(X2_test)

258

In [46]:
model_lr_tvect = lr.fit(X2_train,y2_train)
y2_pred = model_lr_tvect.predict(X2_test)

In [47]:
len(y2_pred)

258

In [48]:
print accuracy_score(y2_test, y2_pred)

0.926356589147


In [49]:
tvect.get_feature_names()

[u'00',
 u'00 000',
 u'00 000 00',
 u'00 200',
 u'00 200 00',
 u'00 500',
 u'00 500 00',
 u'00 month',
 u'00 monthrequired',
 u'00 monthrequired experience',
 u'000',
 u'000 00',
 u'000 00 month',
 u'000 employees',
 u'000 employees major',
 u'000 employees major financial',
 u'000 employees major financial centers',
 u'10',
 u'10 years',
 u'10 years experience',
 u'100',
 u'100 compliance',
 u'11',
 u'12',
 u'12 months',
 u'15',
 u'15 years',
 u'18',
 u'20',
 u'200',
 u'200 00',
 u'2008',
 u'2008 2012',
 u'2012',
 u'2014',
 u'2017',
 u'2018',
 u'24',
 u'24x7',
 u'30',
 u'3rd',
 u'3rd party',
 u'40',
 u'50',
 u'50 countries',
 u'50 countries want',
 u'50 countries want things',
 u'50 countries want things offer',
 u'500',
 u'500 00',
 u'500 00 month',
 u'60',
 u'60 000',
 u'60 000 employees',
 u'60 000 employees major',
 u'60 000 employees major financial',
 u'600',
 u'600 00',
 u'80',
 u'800',
 u'800 00',
 u'aa',
 u'aasp',
 u'abbott',
 u'abilities',
 u'ability',
 u'ability analyze',
 