# In [33]:
import pandas
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.feature_extraction import DictVectorizer
from scipy.sparse import hstack

# Read the training set and the small test sample from the working directory.
data_train, data_test = map(
    pandas.read_csv,
    ('salary-train.csv', 'salary-test-mini.csv'),
)

def prepare_data(data):
    """Return a cleaned copy of *data* ready for vectorization.

    Every non-alphanumeric character in string cells (in any column) is
    replaced with a space, ``FullDescription`` and ``LocationNormalized`` are
    lower-cased, and missing ``LocationNormalized`` / ``ContractTime`` values
    become the literal string ``'nan'``. ``ContractTime`` keeps its original
    letter case. The frame passed in is not modified.

    Args:
        data: DataFrame with at least the columns ``FullDescription``,
            ``LocationNormalized`` and ``ContractTime``.

    Returns:
        A new DataFrame with the cleaned columns.
    """
    # ``replace`` returns a new frame, so the caller's frame stays untouched.
    data = data.replace(r'[^a-zA-Z0-9]', ' ', regex=True)
    data['FullDescription'] = data['FullDescription'].str.lower()
    # Fill missing values *before* lower-casing: calling ``.lower()`` on a
    # NaN float raises AttributeError. Results are assigned back explicitly
    # because ``fillna(inplace=True)`` on a column selection is chained
    # assignment and does not reliably update the parent frame.
    data['LocationNormalized'] = (
        data['LocationNormalized'].fillna('nan').str.lower()
    )
    data['ContractTime'] = data['ContractTime'].fillna('nan')
    return data

# Run the same cleaning over both frames so their features are comparable.
data_train, data_test = prepare_data(data_train), prepare_data(data_test)


def _categorical_records(frame):
    """Return the location and contract columns of *frame* as row dicts."""
    return frame[['LocationNormalized', 'ContractTime']].to_dict(orient='records')


# Both vectorizers are fitted on the training frame only and then reused,
# unchanged, to transform the test frame.
tfidf = TfidfVectorizer(min_df=5)
v_dict = DictVectorizer()
X_train_descr = tfidf.fit_transform(data_train['FullDescription'])
X_train_dict = v_dict.fit_transform(_categorical_records(data_train))
# Text features and categorical features side by side in one sparse matrix.
X_train_vect = hstack((X_train_descr, X_train_dict))

Y_train = data_train['SalaryNormalized']
ridge = Ridge(alpha=1).fit(X_train_vect, Y_train)

X_test_descr = tfidf.transform(data_test['FullDescription'])
X_test_dict = v_dict.transform(_categorical_records(data_test))
X_test_vect = hstack((X_test_descr, X_test_dict))

# Y_test holds the model's predictions for the test rows; print the first two.
Y_test = ridge.predict(X_test_vect)
print('{:0.2f} {:0.2f}'.format(*Y_test[:2]))

# Output: 56552.35 37190.87
