In [1]:
# Numerical / tabular stack used throughout the notebook.
import numpy as np
import pandas as pd

# NOTE(review): this silences every warning for the whole session, including
# deprecation notices from pandas / scikit-learn. Consider narrowing the filter.
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt

import seaborn as sns
sns.set()  # switch matplotlib to seaborn's default styling

# Render inline figures as SVG instead of the default raster format.
%config InlineBackend.figure_format = 'svg'

In [14]:
# Load the gzip-compressed training and (mini) test CSV files in one pass;
# the first path becomes `df`, the second `df_test`.
df, df_test = (
    pd.read_csv(csv_path, compression='gzip')
    for csv_path in (
        '../data/salary-train.csv.gz',
        '../data/salary-test-mini.csv.gz',
    )
)

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.feature_extraction import DictVectorizer

In [18]:
# How many training rows fall into each normalized location (most frequent first).
df['LocationNormalized'].value_counts()

UK                     9980
London                 7563
South East London      2862
The City               1677
Manchester              833
Leeds                   810
Birmingham              760
Central London          624
West Midlands           612
Surrey                  554
Reading                 540
Bristol                 507
Nottingham              451
Aberdeen                446
Sheffield               437
Milton Keynes           398
Hampshire               388
Belfast                 378
East Sheen              377
Oxford                  370
Berkshire               365
Newcastle Upon Tyne     360
Kent                    337
Liverpool               319
Cambridge               295
Leicester               295
Oxfordshire             275
West Yorkshire          268
North West London       264
Northampton             264
                       ... 
Preston Brook             1
Blantyre                  1
Pelaw                     1
Channel Islands           1
Halesworth          

In [24]:
# Number of training rows with no ContractTime: sum the boolean null mask
# instead of materialising the filtered frame.
int(df['ContractTime'].isnull().sum())

15582

In [25]:
# Distribution of the non-missing contract types; value_counts() drops NaN by
# default, so the missing rows are not listed here.
df['ContractTime'].value_counts()

permanent    37169
contract      7249
Name: ContractTime, dtype: int64

In [26]:
# Replace missing contract types with the explicit string category 'nan' so the
# column holds only strings when it is later fed to DictVectorizer.
# Assign the result back instead of calling fillna(..., inplace=True) on the
# column selection: that form is chained assignment through an inplace method,
# which no longer updates `df` once pandas Copy-on-Write is active.
df['ContractTime'] = df['ContractTime'].fillna('nan')

In [27]:
# Column dtypes of the training frame.
df.dtypes

FullDescription       object
LocationNormalized    object
ContractTime          object
SalaryNormalized       int64
dtype: object

In [28]:
# Preview the first rows before any text normalisation is applied.
df.head()

Unnamed: 0,FullDescription,LocationNormalized,ContractTime,SalaryNormalized
0,International Sales Manager London ****k ****...,London,permanent,33000
1,An ideal opportunity for an individual that ha...,London,permanent,50000
2,Online Content and Brand Manager// Luxury Reta...,South East London,permanent,40000
3,A great local marketleader is seeking a perman...,Dereham,permanent,22500
4,Registered Nurse / RGN Nursing Home for Young...,Sutton Coldfield,,20355


In [29]:
# Lower-case the job descriptions with the vectorized string accessor rather
# than a per-row Python lambda; unlike the lambda, .str.lower() passes missing
# values through instead of raising AttributeError on them.
df['FullDescription'] = df['FullDescription'].str.lower()

In [31]:
# Replace each character that is not a letter or digit with one space, so the
# descriptions contain only alphanumeric tokens separated by whitespace.
df['FullDescription'] = df['FullDescription'].replace(
    to_replace='[^a-zA-Z0-9]',
    value=' ',
    regex=True,
)

In [32]:
# Preview the first rows after lower-casing and punctuation removal.
df.head()

Unnamed: 0,FullDescription,LocationNormalized,ContractTime,SalaryNormalized
0,international sales manager london k ...,London,permanent,33000
1,an ideal opportunity for an individual that ha...,London,permanent,50000
2,online content and brand manager luxury reta...,South East London,permanent,40000
3,a great local marketleader is seeking a perman...,Dereham,permanent,22500
4,registered nurse rgn nursing home for young...,Sutton Coldfield,,20355


In [35]:
# TF-IDF text encoder; min_df=5 ignores terms that occur in fewer than 5 documents.
vectorizer = TfidfVectorizer(min_df=5)

In [40]:
# Learn the vocabulary / IDF weights from the training descriptions and encode them.
X_train_tfidf = vectorizer.fit_transform(df['FullDescription'])

In [41]:
# Encoder for the categorical columns; it is fitted on row dicts in the next cell.
enc = DictVectorizer()

In [43]:
# Turn the two categorical columns into one dict per row and fit the
# DictVectorizer on them, producing the encoded categorical feature matrix.
X_train_categ = enc.fit_transform(
    df[['LocationNormalized', 'ContractTime']].to_dict(orient='records')
)

In [44]:
from scipy.sparse import hstack

In [48]:
# Join the feature blocks column-wise: TF-IDF text features first, then categoricals.
X_train = hstack([X_train_tfidf, X_train_categ])

In [49]:
# Ridge (L2-regularised linear) regression with regularisation strength alpha=1
# and a fixed random_state.
reg_model = Ridge(alpha=1, random_state=241)

In [50]:
# Regression target: the normalized salary column of the training frame.
Y_train = df['SalaryNormalized']

In [51]:
# Fit the ridge model on the stacked text + categorical training features.
reg_model.fit(X_train, Y_train)

Ridge(alpha=1, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=241, solver='auto', tol=0.001)

In [52]:
# Preview the test frame; its SalaryNormalized column is empty (see output below).
df_test.head()

Unnamed: 0,FullDescription,LocationNormalized,ContractTime,SalaryNormalized
0,We currently have a vacancy for an HR Project ...,Milton Keynes,contract,
1,A Web developer opportunity has arisen with an...,Manchester,permanent,


In [53]:
# Lower-case the *test* descriptions. The source column must come from df_test:
# reading it from the training frame `df` would overwrite the test text with
# index-aligned training descriptions.
df_test['FullDescription'] = df_test['FullDescription'].str.lower()

In [56]:
# Re-load the test set so this cell starts from the raw file and can be re-run
# regardless of what earlier cells did to df_test.
df_test = pd.read_csv('../data/salary-test-mini.csv.gz', compression='gzip')

# Same text normalisation as the training frame: lower-case, then replace every
# non-alphanumeric character with a space.
df_test['FullDescription'] = df_test['FullDescription'].str.lower()
df_test['FullDescription'] = df_test['FullDescription'].replace('[^a-zA-Z0-9]', ' ', regex = True)

# Mirror the training-time imputation of ContractTime. Without it, a missing
# test value would not map to the 'nan' category the encoder learned during
# fitting and would simply be dropped by enc.transform.
df_test['ContractTime'] = df_test['ContractTime'].fillna('nan')

# Use transform (not fit_transform) so the test rows are encoded with the
# vocabulary and feature mapping learned from the training data.
X_test_tfidf = vectorizer.transform(df_test['FullDescription'])
X_test_categ = enc.transform(df_test[['LocationNormalized', 'ContractTime']].to_dict('records'))

# Column order must match X_train: TF-IDF features first, then categoricals.
X_test = hstack([X_test_tfidf, X_test_categ])

In [58]:
# Predicted normalized salary for each row of the test set.
reg_model.predict(X_test)

array([56555.61500155, 37188.32442618])