In [1]:
from pandas import Series, DataFrame
import nltk
import pandas as pd
from patsy import dmatrices
%pylab inline
import warnings
warnings.filterwarnings('ignore')
import random

Populating the interactive namespace from numpy and matplotlib


In [2]:
#Reading training data
data = pd.read_csv('data/Train_rev1.csv')
data.head()

Unnamed: 0,Id,Title,FullDescription,LocationRaw,LocationNormalized,ContractType,ContractTime,Company,Category,SalaryRaw,SalaryNormalized,SourceName
0,12612628,Engineering Systems Analyst,Engineering Systems Analyst Dorking Surrey Sal...,"Dorking, Surrey, Surrey",Dorking,,permanent,Gregory Martin International,Engineering Jobs,20000 - 30000/annum 20-30K,25000,cv-library.co.uk
1,12612830,Stress Engineer Glasgow,Stress Engineer Glasgow Salary **** to **** We...,"Glasgow, Scotland, Scotland",Glasgow,,permanent,Gregory Martin International,Engineering Jobs,25000 - 35000/annum 25-35K,30000,cv-library.co.uk
2,12612844,Modelling and simulation analyst,Mathematical Modeller / Simulation Analyst / O...,"Hampshire, South East, South East",Hampshire,,permanent,Gregory Martin International,Engineering Jobs,20000 - 40000/annum 20-40K,30000,cv-library.co.uk
3,12613049,Engineering Systems Analyst / Mathematical Mod...,Engineering Systems Analyst / Mathematical Mod...,"Surrey, South East, South East",Surrey,,permanent,Gregory Martin International,Engineering Jobs,25000 - 30000/annum 25K-30K negotiable,27500,cv-library.co.uk
4,12613647,"Pioneer, Miser Engineering Systems Analyst","Pioneer, Miser Engineering Systems Analyst Do...","Surrey, South East, South East",Surrey,,permanent,Gregory Martin International,Engineering Jobs,20000 - 30000/annum 20-30K,25000,cv-library.co.uk


In [3]:
data.dtypes

Id                     int64
Title                 object
FullDescription       object
LocationRaw           object
LocationNormalized    object
ContractType          object
ContractTime          object
Company               object
Category              object
SalaryRaw             object
SalaryNormalized       int64
SourceName            object
dtype: object

### B1
#### Model with numeric columns only

In [4]:
data_s = data[['LocationNormalized','ContractType','ContractTime','Category','SalaryNormalized']]
print(data_s.shape)
data_s.head()

(244768, 5)


Unnamed: 0,LocationNormalized,ContractType,ContractTime,Category,SalaryNormalized
0,Dorking,,permanent,Engineering Jobs,25000
1,Glasgow,,permanent,Engineering Jobs,30000
2,Hampshire,,permanent,Engineering Jobs,30000
3,Surrey,,permanent,Engineering Jobs,27500
4,Surrey,,permanent,Engineering Jobs,25000


In [5]:
#Checking NA in Contract Type
data_s.ContractType.value_counts(dropna=False)

NaN          179326
full_time     57538
part_time      7904
Name: ContractType, dtype: int64

Since ~73% of ContractType is missing, we will not be using this column for our classification. Replacing NaN with "Full Time" will bias the data

In [6]:
data_s = data[['LocationNormalized','ContractTime','Category','SalaryNormalized']]
print(data_s.shape)
df = data_s.dropna()
df.shape

(244768, 4)


(180863, 4)

By dropping all rows with missing values, we lost about 65K rows (~26%). We will be using the clean dataset going forward

In [7]:
p=np.percentile(df['SalaryNormalized'],75)
def target(t):
    if t>p:
        return 1
    else:
        return 0
    
df['target'] = df['SalaryNormalized'].map(target)

#### Get a list of cities with highest cost of living.

In [8]:
#Get the top 10 highest CoL data from https://abcfinance.co.uk/blog/the-true-cost-of-living-in-uk-cities/
high_cost = ['London','Milton Keynes','Bath','Reading','Aberdeen','Cambridge','Oxford','Portsmouth','Edinburgh','York']

In [9]:
def location_class(s):
    if s in high_cost:
        return 'high'
    else:
        return 'low'
df['location_class'] = df['LocationNormalized'].map(location_class)

Now, get the dummies for each variables.

In [10]:
df.columns.values

array(['LocationNormalized', 'ContractTime', 'Category',
       'SalaryNormalized', 'target', 'location_class'], dtype=object)

In [11]:
categorical_columns = ['ContractTime', 'Category','location_class']
data_dummies = pd.get_dummies(df[categorical_columns],
                            prefix=categorical_columns,
                            columns=categorical_columns)
dummy_column_names = data_dummies.columns.values
dummy_column_names

array(['ContractTime_contract', 'ContractTime_permanent',
       'Category_Accounting & Finance Jobs', 'Category_Admin Jobs',
       'Category_Charity & Voluntary Jobs', 'Category_Consultancy Jobs',
       'Category_Creative & Design Jobs',
       'Category_Customer Services Jobs',
       'Category_Domestic help & Cleaning Jobs',
       'Category_Energy, Oil & Gas Jobs', 'Category_Engineering Jobs',
       'Category_Graduate Jobs', 'Category_HR & Recruitment Jobs',
       'Category_Healthcare & Nursing Jobs',
       'Category_Hospitality & Catering Jobs', 'Category_IT Jobs',
       'Category_Legal Jobs', 'Category_Logistics & Warehouse Jobs',
       'Category_Maintenance Jobs', 'Category_Manufacturing Jobs',
       'Category_Other/General Jobs',
       'Category_PR, Advertising & Marketing Jobs',
       'Category_Part time Jobs', 'Category_Property Jobs',
       'Category_Retail Jobs', 'Category_Sales Jobs',
       'Category_Scientific & QA Jobs', 'Category_Social work Jobs',
       'C

In [12]:
df2 = pd.concat([df, data_dummies], axis=1)
df2.columns.values

array(['LocationNormalized', 'ContractTime', 'Category',
       'SalaryNormalized', 'target', 'location_class',
       'ContractTime_contract', 'ContractTime_permanent',
       'Category_Accounting & Finance Jobs', 'Category_Admin Jobs',
       'Category_Charity & Voluntary Jobs', 'Category_Consultancy Jobs',
       'Category_Creative & Design Jobs',
       'Category_Customer Services Jobs',
       'Category_Domestic help & Cleaning Jobs',
       'Category_Energy, Oil & Gas Jobs', 'Category_Engineering Jobs',
       'Category_Graduate Jobs', 'Category_HR & Recruitment Jobs',
       'Category_Healthcare & Nursing Jobs',
       'Category_Hospitality & Catering Jobs', 'Category_IT Jobs',
       'Category_Legal Jobs', 'Category_Logistics & Warehouse Jobs',
       'Category_Maintenance Jobs', 'Category_Manufacturing Jobs',
       'Category_Other/General Jobs',
       'Category_PR, Advertising & Marketing Jobs',
       'Category_Part time Jobs', 'Category_Property Jobs',
       'Category_Ret

Now, build Naive Bayes model.

Creating equation

In [13]:
formula = 'target ~ 0 + {}'.format(' + '.join(['Q("{}")'.format(x) for x in dummy_column_names]))

In [14]:
Y, X = dmatrices(formula, df2, return_type='dataframe')
y = Y['target'].values

Since there is no validation dataset, we will create a testing/training sample here

In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [17]:
from sklearn import naive_bayes
model = naive_bayes.BernoulliNB()
model.fit(X_train, y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [18]:
#Checking training accuracy
from sklearn import metrics
prediction_train = model.predict(X_test)
print(metrics.accuracy_score(y_test, prediction_train))

0.7600213789417424


The accuracy of using numerical variables are 76.06% on the training data

In [19]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, prediction_train))

[[38624  2058]
 [10963  2614]]


#### Model with text columns only

Creating the dataset for classification

In [20]:
data.columns.values

array(['Id', 'Title', 'FullDescription', 'LocationRaw',
       'LocationNormalized', 'ContractType', 'ContractTime', 'Company',
       'Category', 'SalaryRaw', 'SalaryNormalized', 'SourceName'],
      dtype=object)

In [21]:
data_s = data[['FullDescription','SalaryNormalized']]

Creating target variable

In [22]:
p=np.percentile(data_s['SalaryNormalized'],75)
def target(t):
    if t>p:
        return 'high'
    else:
        return 'low'
    
data_s['target'] = data_s['SalaryNormalized'].map(target)

Taking a sample of 2500 rows

In [23]:
random.seed(99)
#sample = random.sample(range(len(data_s)),0.7 * len(data_s))
sample = random.sample(range(len(data_s)),2500)
df = data_s.loc[sample,:]
print(df.shape)

(2500, 3)


In [24]:
df.head()

Unnamed: 0,FullDescription,SalaryNormalized,target
105900,"Passionate about making lives better, Bupa is ...",24500,low
99812,Category Manager Milton Keynes High profile r...,50000,high
52448,The Company: Our client enjoys a high profile ...,27500,low
157139,PHP Web Developer A Leading digital agency ar...,14999,low
46871,"Due to continued growth, this new opportunity ...",42500,low


Cleaning the job description by:
1. Removing punctuation
2. Getting rid of stop words
3. Removing Numbers
4. Stripping excess whitespace

In [25]:
import re
from nltk.corpus import stopwords
#removing punctuation
df['job_des'] = df.FullDescription.apply(lambda x:re.sub(r'[^a-zA-z\s]', ' ', x.lower()))

In [26]:
#remove white spaces
df['job_des'] = df.job_des.apply(lambda x:re.sub(r'\s+', ' ', x))

In [27]:
#remove stopwords
stop = set(stopwords.words('english'))
df['job_des_clean'] = df.job_des.apply(lambda x: [word for word in x.split() if word not in stop])
df.head()

Unnamed: 0,FullDescription,SalaryNormalized,target,job_des,job_des_clean
105900,"Passionate about making lives better, Bupa is ...",24500,low,passionate about making lives better bupa is d...,"[passionate, making, lives, better, bupa, dedi..."
99812,Category Manager Milton Keynes High profile r...,50000,high,category manager milton keynes high profile ro...,"[category, manager, milton, keynes, high, prof..."
52448,The Company: Our client enjoys a high profile ...,27500,low,the company our client enjoys a high profile i...,"[company, client, enjoys, high, profile, fasci..."
157139,PHP Web Developer A Leading digital agency ar...,14999,low,php web developer a leading digital agency are...,"[php, web, developer, leading, digital, agency..."
46871,"Due to continued growth, this new opportunity ...",42500,low,due to continued growth this new opportunity h...,"[due, continued, growth, new, opportunity, ari..."


Creating training data for Naive Bayes

Using just first 2000 words as features

In [28]:
job_des_all = df['job_des_clean'].sum()

In [29]:
all_words = nltk.FreqDist(job_des_all)
word_features = list(all_words)[:2000]
len(word_features)

2000

In [30]:
def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features

In [31]:
df2 = df[['target','job_des_clean']]
t = list(zip(df2.melt('target').value,df2.melt('job_des_clean').value))

In [32]:
featuresets = [(document_features(x[0]), x[1]) for x in t]

In [33]:
train_set, test_set = featuresets[2000:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [34]:
print(nltk.classify.accuracy(classifier, test_set))

0.796


In [35]:
classifier.show_most_informative_features(5)

Most Informative Features
         contains(risks) = True             high : low    =     13.9 : 1.0
    contains(definition) = True             high : low    =     13.9 : 1.0
  contains(demonstrated) = True             high : low    =      9.8 : 1.0
    contains(investment) = True             high : low    =      9.7 : 1.0
       contains(options) = True             high : low    =      8.8 : 1.0
