### Salary prediction from job listing

In [5]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.cross_validation import cross_val_score, train_test_split
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

from matplotlib import pyplot as plt
% matplotlib inline

In [6]:
# Load the labelled job-listing training data from the gadsdata GitHub mirror.
train = pd.read_csv(
    "https://raw.githubusercontent.com/ajschumacher/gadsdata/master/salary/train.csv"
)

In [7]:
# Peek at the first three rows to see which columns are available.
train.head(3)

Unnamed: 0,Id,Title,FullDescription,LocationRaw,LocationNormalized,ContractType,ContractTime,Company,Category,SalaryRaw,SalaryNormalized,SourceName
0,12612628,Engineering Systems Analyst,Engineering Systems Analyst Dorking Surrey Sal...,"Dorking, Surrey, Surrey",Dorking,,permanent,Gregory Martin International,Engineering Jobs,20000 - 30000/annum 20-30K,25000,cv-library.co.uk
1,12612830,Stress Engineer Glasgow,Stress Engineer Glasgow Salary **** to **** We...,"Glasgow, Scotland, Scotland",Glasgow,,permanent,Gregory Martin International,Engineering Jobs,25000 - 35000/annum 25-35K,30000,cv-library.co.uk
2,12612844,Modelling and simulation analyst,Mathematical Modeller / Simulation Analyst / O...,"Hampshire, South East, South East",Hampshire,,permanent,Gregory Martin International,Engineering Jobs,20000 - 40000/annum 20-40K,30000,cv-library.co.uk


In [5]:
# Count listings per ContractType value; by default value_counts leaves
# rows with a missing ContractType out of the tally.
train.ContractType.value_counts()

full_time    2978
part_time     578
Name: ContractType, dtype: int64

#### Constructing Bags of Words for Title and Description:

In [None]:
# One bag-of-words vectorizer per text column.

# Titles: unigrams and bigrams, English stop words removed, min_df of 5.
vect_title = CountVectorizer(
    stop_words="english",
    min_df=5,
    ngram_range=(1, 2),
)

# Descriptions: unigrams only, English stop words removed, min_df of 10.
vect_descp = CountVectorizer(
    stop_words="english",
    min_df=10,
    ngram_range=(1, 1),
)

# Locations: library defaults.
vect_loc = CountVectorizer()

In [None]:
# There is one missing title; replace it with an empty string so the
# vectorizer receives text for every row.
train["Title"] = train["Title"].fillna("")

In [None]:
# Learn each vocabulary and build the matching document-term count matrix.
X_title = vect_title.fit_transform(train["Title"])
X_descp = vect_descp.fit_transform(train["FullDescription"])
X_loc = vect_loc.fit_transform(train["LocationRaw"])

In [None]:
# Display the shapes of the three matrices (title, description, location).
(X_title.shape, X_descp.shape, X_loc.shape)

In [None]:
# Regression target: the normalized salary column.
y = train["SalaryNormalized"]

#### Constructing Dummy vars for Company, SourceName, Category, ContractType:

In [None]:
# Keep only the categorical columns that will be one-hot encoded.
sm_train = train.loc[:, ["Company", "SourceName", "Category", "ContractType"]]

In [None]:
# One-hot encode the categorical columns and keep the result as a plain ndarray.
X_other = pd.get_dummies(sm_train).values
X_other.shape

#### Now we're ready to do the modelling:

In [None]:
# Design matrix: densified title counts next to the one-hot columns.
# The description bag of words is left out because it performs badly here.
# (Alternative tried: the one-hot columns alone, i.e. X = X_other.)
X = np.concatenate([X_title.toarray(), X_other], axis=1)

In [None]:
# NOTE: this rebinds X, replacing the title + one-hot matrix built in the
# previous cell. From here on X is the description count matrix reweighted
# by a freshly fitted TfidfTransformer.
X = TfidfTransformer().fit_transform(X_descp) 

In [None]:
# Shape of the current design matrix.
X.shape

In [None]:
# Split into train and test parts; random_state is fixed so the split is repeatable.
X_train, X_test, y_train, y_test  = train_test_split(X, y, random_state = 1)

In [None]:
# Ridge regression with regularization strength alpha = 3.
model_linear = linear_model.Ridge(alpha = 3)

In [None]:
# Fit on the training part of the split.
model_linear.fit(X_train, y_train)

In [None]:
# Root-mean-squared error of the fitted model on the held-out part.
np.sqrt(metrics.mean_squared_error(y_test, model_linear.predict(X_test)))

So the best result is an RMSE of 10228, using Ridge with alpha = 3 (Lasso gives a similar score) and bigrams on the whole title.

In [None]:
# Number of non-zero coefficients in the fitted model. np.count_nonzero does
# the count inside NumPy instead of iterating over the array in Python.
# (An earlier Lasso run left only 1482 coefficients non-zero.)
np.count_nonzero(model_linear.coef_)

#### TF-IDF versus raw document-term matrix (DTM) on description:

In [None]:
# Baseline for the comparison: ridge regression on the description matrix as-is.
X = X_descp
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

model_linear = linear_model.Ridge(alpha=3)
model_linear.fit(X_train, y_train)

In [None]:
# Held-out RMSE for the model fitted in the previous cell.
np.sqrt(metrics.mean_squared_error(y_test, model_linear.predict(X_test)))

In [None]:
# Re-vectorize the descriptions with TF-IDF weighting and rebind X to the new
# matrix. Without the rebinding, X would keep pointing at whatever matrix it
# held before this cell, and the split/fit below would not use TF-IDF at all.
X_descp = TfidfVectorizer().fit_transform(train.FullDescription)
X = X_descp

# Same split seed and same model settings as the baseline, for a fair comparison.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
model_linear = linear_model.Ridge(alpha=3)
model_linear.fit(X_train, y_train)

In [None]:
# Held-out RMSE for the model fitted in the previous cell.
np.sqrt(metrics.mean_squared_error(y_test, model_linear.predict(X_test)))

So Tfidf makes a _huge_ difference in this case. Also the model fits much faster. I wonder why.

### TF-IDF directly:

In [None]:
# Build TF-IDF matrices for all three text columns, each with its own
# default-configured vectorizer.
X_descp = TfidfVectorizer().fit_transform(train["FullDescription"])
X_title = TfidfVectorizer().fit_transform(train["Title"])
X_loc = TfidfVectorizer().fit_transform(train["LocationRaw"])


In [None]:
# Shape of the description TF-IDF matrix.
X_descp.shape

In [None]:
# Stack every feature group column-wise into one dense design matrix:
# title TF-IDF, one-hot categoricals, description TF-IDF and location TF-IDF.
# NOTE(review): densifying the description matrix may need a lot of memory.
X = np.hstack((X_title.toarray(),
               X_other,
               X_descp.toarray(),
               X_loc.toarray()))

In [None]:
# Split the combined feature matrix (seed 4) and fit a ridge model with alpha = 3.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=4)

model_linear = linear_model.Ridge(alpha=3)
model_linear.fit(X_train, y_train)

In [None]:
# Held-out RMSE for the model fitted in the previous cell.
np.sqrt(metrics.mean_squared_error(y_test, model_linear.predict(X_test)))

In [None]:
# Held-out mean absolute error for the same model.
metrics.mean_absolute_error(y_test, model_linear.predict(X_test))

### What if we had a real holdout set: testing tf-idf

In [8]:
# Split the full labelled frame into a training part and a validation part.
# NOTE: this rebinds sm_train, which earlier in the notebook named the
# categorical-column subset of train.
sm_train, val = train_test_split(train, random_state = 3)

In [9]:
# TF-IDF over unigrams and bigrams of the description. The vocabulary is
# learned from the training part only and then applied to the validation part.
vect = TfidfVectorizer(
    stop_words="english",
    ngram_range=(1, 2),
    min_df=1,
)
X_train = vect.fit_transform(sm_train["FullDescription"])
X_test = vect.transform(val["FullDescription"])

In [10]:
# Shape of the training TF-IDF matrix.
X_train.shape

(7500, 377876)

In [11]:
# Targets for the training and validation parts.
y_train = sm_train["SalaryNormalized"]
y_val = val["SalaryNormalized"]

In [12]:
# Ridge regression with alpha = 0.1.
model = linear_model.Ridge(alpha=0.1)

In [13]:
# Fit on the training part only.
model.fit(X_train, y_train)

Ridge(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [14]:
# Predict salaries for the validation rows.
preds = model.predict(X_test)

In [15]:
# Validation RMSE.
np.sqrt(metrics.mean_squared_error(y_val, preds))

9248.2423450850783

In [None]:
# Peek at the first rows of the training part of the split.
sm_train.head(3)

### Evaluating the final description-only model on the test set:

# Load the test listings (the solution.csv file) from the same GitHub mirror.
test = pd.read_csv(
    "https://raw.githubusercontent.com/ajschumacher/gadsdata/master/salary/solution.csv"
)

In [235]:
# Refit the TF-IDF vectorizer on every training description, then apply the
# learned vocabulary to the test descriptions.
vect = TfidfVectorizer(
    stop_words="english",
    ngram_range=(1, 2),
    min_df=1,
)
X_train = vect.fit_transform(train["FullDescription"])
X_test = vect.transform(test["FullDescription"])

In [236]:
# Shape of the full training TF-IDF matrix.
X_train.shape

(10000, 456715)

In [None]:
# Element-wise comparison of the training matrix against zero; the result is
# only displayed, never stored.
X_train != 0

In [237]:
# Targets for the full training set and for the test set.
y_train = train["SalaryNormalized"]
y_test = test["SalaryNormalized"]

In [238]:
# Fit the ridge model (alpha = 0.1) on all training listings and predict the
# test listings.
model = linear_model.Ridge(alpha=0.1)
model.fit(X_train, y_train)
preds = model.predict(X_test)

# Test-set RMSE; as the cell's last expression it is what the notebook displays.
np.sqrt(metrics.mean_squared_error(y_test, preds))