In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

data_csv = 'data/training_data_20171002_no_job_description.csv'

df = pd.read_csv(data_csv)

In [2]:
df.head(10)

Unnamed: 0,Id,Title,LocationNormalized,ContractTime,SalaryNormalized,latitude,longitude,country,Birmingham_onehot,Glasgow_onehot,...,citrix_onehot,tsql_onehot,ssis_onehot,ruby_onehot,sas_onehot,ssrs_onehot,python_onehot,seo_onehot,tcp_onehot,css3_onehot
0,69974398,Technical Support Supervisor,abbots langley,permanent,25000,51.706638,-0.417807,england,0,0,...,0,0,1,0,0,0,0,0,0,0
1,71084222,"Workshop Technician, Hertfordshire, Motor Trad...",abbots langley,permanent,23000,51.706638,-0.417807,england,0,0,...,0,0,0,0,0,0,0,0,0,0
2,67619031,Mechanical Design Engineer,aberaeron,permanent,35000,52.243177,-4.260015,wales,0,0,...,0,0,0,0,0,0,0,0,0,0
3,69894639,Software Team Lead Critical safety/ C,aberaeron,permanent,55000,52.243177,-4.260015,wales,0,0,...,0,0,1,0,0,0,0,0,0,0
4,71525354,Mechanical Design Engineer,aberaeron,permanent,35000,52.243177,-4.260015,wales,0,0,...,0,0,0,0,0,0,0,0,0,0
5,68509706,ICT Support Officer,aberdare,permanent,22958,51.713353,-3.445555,wales,1,0,...,0,0,0,0,0,0,0,0,0,0
6,64130329,NET Developer Music Streaming Aberdeen,aberdeen,permanent,40000,57.145245,-2.091374,scotland,0,0,...,0,0,0,0,0,0,0,0,0,0
7,64796854,Contract SharePoint Developer,aberdeen,contract,86400,57.145245,-2.091374,scotland,0,0,...,0,0,0,0,0,0,0,0,0,0
8,65171361,Java Developer Aberdeen Salary Negotiable,aberdeen,permanent,35000,57.145245,-2.091374,scotland,0,0,...,0,0,0,0,0,0,0,0,0,0
9,65437063,SharePoint Developer,aberdeen,contract,70800,57.145245,-2.091374,scotland,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
import numpy
from matplotlib import pyplot
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.text import text_to_word_sequence
from sklearn.preprocessing import MinMaxScaler
import nltk
from sklearn.preprocessing import MinMaxScaler

# Change text columns to numbers
locationNormalized = set(df['LocationNormalized'])
locationNormalized_dict = {x:float(i) for i,x in enumerate(locationNormalized)}
df['LocationNormalized'] = df['LocationNormalized'].map(locationNormalized_dict)

ContractTime = set(df['ContractTime'])
ContractTime_dict = {x:float(i) for i,x in enumerate(ContractTime)}
df['ContractTime'] = df['ContractTime'].map(ContractTime_dict)

country = set(df['country'])
country_dict = {x:float(i) for i,x in enumerate(country)}
df['country'] = df['country'].map(country_dict)

# target column
y = df['SalaryNormalized']

# tokenize Title text, 
# convert titles to number sequences,
# and separate each word into its own dataframe column (df_titles_to_df)
X = df['Title']
remove_words = ["\*\*\*k"," to "," and "," in "," the "," of "," on "," or ", " for "," a "]
for wd in remove_words:
    X = X.str.replace(wd,"")

wordcount = 4500
tokenizer = Tokenizer(num_words=wordcount, filters='!"#$€%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', split=" ",)
tokenizer.fit_on_texts(X)
vocab_size = len(tokenizer.word_index) 
word_index = tokenizer.word_index
#print(word_index)
word_index
reverse_word_index = {v: k for k, v in word_index.items()}

XT_unscaled = tokenizer.texts_to_sequences(X)
dft = pd.DataFrame({'Title_num':XT_unscaled})

df_titles_to_df = dft['Title_num'].apply(pd.Series)
df_titles_to_df.fillna(0, inplace=True) 

# Combine new title token columns and old columns, remove what's not needed
df_new = pd.concat([df.drop('Id', 1).drop('Title', 1).drop('SalaryNormalized',1), df_titles_to_df.iloc[:,:10]], axis=1)

# Normalize the numeric values
min_max_scaler = MinMaxScaler() 
x = df_new.values #returns a numpy array
x_scaled = min_max_scaler.fit_transform(x)
df_normalized = pd.DataFrame(x_scaled, columns = df_new.columns)

print(df_normalized.head())

X_train, X_test, y_train, y_test = train_test_split(df_normalized, y, test_size=0.2, random_state=42)

Using TensorFlow backend.


   LocationNormalized  ContractTime  latitude  longitude  country  \
0            0.688670           1.0  0.214085   0.760591      1.0   
1            0.688670           1.0  0.214085   0.760591      1.0   
2            0.758621           1.0  0.276724   0.337001      0.0   
3            0.758621           1.0  0.276724   0.337001      0.0   
4            0.758621           1.0  0.276724   0.337001      0.0   

   Birmingham_onehot  Glasgow_onehot  Leeds_onehot  Liverpool_onehot  \
0                0.0             0.0           0.0               0.0   
1                0.0             0.0           0.0               0.0   
2                0.0             0.0           0.0               1.0   
3                0.0             0.0           0.0               1.0   
4                0.0             0.0           0.0               1.0   

   London_onehot ...          0         1         2         3         4  \
0            1.0 ...   0.003779  0.001780  0.140089  0.000000  0.000000   
1 

In [4]:
# summarize size
print("Training data: ")
print(df_new.shape)
print(y.shape)

Training data: 
(38483, 54)
(38483,)


## Build the Model using Keras

In [5]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import LSTM
from keras import backend as K

# fix random seed for reproducibility
seed = 7
numpy.random.seed(seed)

# to be used to calculate R2
def get_r2(y_true, y_pred):
    SS_res =  K.sum(K.square( y_true-y_pred )) 
    SS_tot = K.sum(K.square( y_true - K.mean(y_true) ) ) 
    return ( 1 - SS_res/(SS_tot + K.epsilon()) ) * 100

#create the model
model = Sequential()
model.add(Dense(650, input_dim=54, activation='relu'))
model.add(Dense(400, activation='relu'))
model.add(Dense(130, activation='relu'))
model.add(Dense(1, activation='linear'))
model.compile(loss='MSE', optimizer='Adamax', metrics=[get_r2])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 650)               35750     
_________________________________________________________________
dense_2 (Dense)              (None, 400)               260400    
_________________________________________________________________
dense_3 (Dense)              (None, 130)               52130     
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 131       
Total params: 348,411
Trainable params: 348,411
Non-trainable params: 0
_________________________________________________________________
None


In [6]:
## Fit the model
# Changing data to numpy arrays
X_train_np = np.array(X_train)
y_train_np = np.array(y_train)
X_test_np = np.array(X_test)
y_test_np = np.array(y_test)

model.fit(X_train_np, y_train_np, validation_data=(X_test_np, y_test_np), epochs=110, batch_size=256, verbose=2)
# Final evaluation of the model
scores = model.evaluate(X_test_np, y_test_np, verbose=0)

Train on 30786 samples, validate on 7697 samples
Epoch 1/110
1s - loss: 1622771454.9148 - get_r2: -3.9387e+02 - val_loss: 497293313.3595 - val_get_r2: -4.9597e+01
Epoch 2/110
1s - loss: 391360447.3659 - get_r2: -1.8344e+01 - val_loss: 343826032.6382 - val_get_r2: -2.9624e+00
Epoch 3/110
1s - loss: 322542796.6686 - get_r2: 2.5215 - val_loss: 314410788.0743 - val_get_r2: 5.8324
Epoch 4/110
1s - loss: 305735579.9712 - get_r2: 7.6117 - val_loss: 302499941.8890 - val_get_r2: 9.4303
Epoch 5/110
1s - loss: 295810349.5085 - get_r2: 10.5338 - val_loss: 292756550.3838 - val_get_r2: 12.2744
Epoch 6/110
1s - loss: 287200132.3324 - get_r2: 13.1828 - val_loss: 283707658.9528 - val_get_r2: 14.9717
Epoch 7/110
1s - loss: 280000945.6932 - get_r2: 15.3178 - val_loss: 276482105.0217 - val_get_r2: 17.0993
Epoch 8/110
1s - loss: 274357721.3300 - get_r2: 16.9394 - val_loss: 271607649.7877 - val_get_r2: 18.5119
Epoch 9/110
1s - loss: 270664205.4898 - get_r2: 18.0451 - val_loss: 267919225.3086 - val_get_r2: 1

1s - loss: 255235994.8382 - get_r2: 22.7728 - val_loss: 255117625.5767 - val_get_r2: 23.4045
Epoch 79/110
1s - loss: 254808808.2198 - get_r2: 22.8427 - val_loss: 255559654.2196 - val_get_r2: 23.2545
Epoch 80/110
1s - loss: 254902761.3986 - get_r2: 22.7911 - val_loss: 254786175.7318 - val_get_r2: 23.5127
Epoch 81/110
1s - loss: 254608019.7388 - get_r2: 22.8629 - val_loss: 254989405.8527 - val_get_r2: 23.4263
Epoch 82/110
1s - loss: 254220730.8735 - get_r2: 23.0238 - val_loss: 256199193.8137 - val_get_r2: 23.1365
Epoch 83/110
1s - loss: 254217956.7658 - get_r2: 23.0306 - val_loss: 254770891.1295 - val_get_r2: 23.5430
Epoch 84/110
1s - loss: 253891595.2986 - get_r2: 23.1797 - val_loss: 254191883.3998 - val_get_r2: 23.6821
Epoch 85/110
1s - loss: 253578762.9639 - get_r2: 23.3199 - val_loss: 254628967.0594 - val_get_r2: 23.5788
Epoch 86/110
1s - loss: 253622831.6819 - get_r2: 23.2305 - val_loss: 254894142.0460 - val_get_r2: 23.4365
Epoch 87/110
1s - loss: 253349772.1343 - get_r2: 23.1298 - 

Current best: 
Epoch 100/100  
1s - loss: 249541056.5842 - get_r2: 24.3083 - val_loss: 252095737.9883 - val_get_r2: 24.2949

## Saving the Model

In [8]:
from keras.models import load_model

model.save('models/salary_model_20171015.h5') 