In [37]:
import pandas as pd
import numpy as np
import scipy as sp
from sklearn import linear_model
from sklearn import feature_extraction

In [38]:
# Load the training set and the small test set from CSV files
# located relative to the current working directory.
train, test = map(pd.read_csv, ('salary-train.csv', 'salary-test-mini.csv'))

In [39]:
# Text preprocessing, step 1: lower-case every job description
# in both the training and the test frame.
train['FullDescription'], test['FullDescription'] = (
    frame['FullDescription'].str.lower() for frame in (train, test)
)

In [40]:
# Text preprocessing, step 2: turn every character that is not an ASCII
# letter or digit into a single space, in both frames.
train['FullDescription'], test['FullDescription'] = (
    frame['FullDescription'].replace(to_replace='[^a-zA-Z0-9]', value=' ', regex=True)
    for frame in (train, test)
)

In [41]:
# TF-IDF features of the job descriptions.
# The vectoriser is fitted on the training descriptions only (min_df=5) and the
# fitted vocabulary is then applied, unchanged, to the test descriptions.
encoder_tf = feature_extraction.text.TfidfVectorizer(min_df=5)
train_tf, test_tf = (
    encoder_tf.fit_transform(train['FullDescription']),  # evaluated first: fits the encoder
    encoder_tf.transform(test['FullDescription']),       # evaluated second: reuses the fit
)

In [42]:
# Replace missing categorical values with the explicit label 'nan' in BOTH frames.
#
# Why frame-level fillna: calling `fillna(..., inplace=True)` on a column
# selection (`train[col]`) mutates a temporary Series; under pandas
# Copy-on-Write that leaves the parent frame unchanged (and older versions
# warn about it). Building new frames is also safe to re-run.
#
# Why the test frame too: the categorical encoder downstream is fitted on
# train, where missing values become the string 'nan'. Leaving float NaN in
# test would encode missing values differently from training.
train, test = (
    frame.fillna({'LocationNormalized': 'nan', 'ContractTime': 'nan'})
    for frame in (train, test)
)

In [43]:
# Vectorise the two categorical columns with DictVectorizer.
# Each row is first turned into a {column: value} dict; the encoder is fitted
# on the training records and then reused as-is for the test records.
encoder_dv = feature_extraction.DictVectorizer()
train_records = train[['LocationNormalized', 'ContractTime']].to_dict(orient='records')
test_records = test[['LocationNormalized', 'ContractTime']].to_dict(orient='records')
train_cat = encoder_dv.fit_transform(train_records)
test_cat = encoder_dv.transform(test_records)

In [47]:
# Build the final design matrices: text features first, categorical features
# second, concatenated column-wise as sparse matrices.
train_matrix, test_matrix = (
    sp.sparse.hstack(feature_blocks)
    for feature_blocks in ([train_tf, train_cat], [test_tf, test_cat])
)

In [50]:
# Fit a ridge regression (alpha=1.0) on the stacked training features,
# using the 'SalaryNormalized' column as the target.
reg = linear_model.Ridge(alpha=1.0).fit(X=train_matrix, y=train['SalaryNormalized'])
reg  # last expression: display the fitted estimator as the cell output

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [51]:
# Answer to the question: predicted salaries for the rows of the test set,
# printed in the same order as the test frame.
predictions = reg.predict(X=test_matrix)
print(predictions)

[ 56569.24475536  37184.23346545]
