In [46]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from scipy.sparse import hstack

# 1. Load the training set and the small test set (CSV files in the working directory)
data_train, data_test = (
    pd.read_csv(csv_path)
    for csv_path in ('salary-train.csv', 'salary-test-mini.csv')
)

# 2.1-2.2 Normalize the job descriptions: lowercase the text, then turn every
# character that is not a Latin letter or a digit into a single space
data_train['FullDescription'] = (
    data_train['FullDescription']
    .str.lower()
    .replace('[^a-zA-Z0-9]', ' ', regex=True)
)
data_test['FullDescription'] = (
    data_test['FullDescription']
    .str.lower()
    .replace('[^a-zA-Z0-9]', ' ', regex=True)
)

# 2.3 TF-IDF features of the job descriptions. min_df=5 drops every term that
# occurs in fewer than 5 training documents (it is a document-frequency
# threshold, not a "top 5 words" limit). The vocabulary is learned on the
# training set only and then re-used for the test set.
vectorizer = TfidfVectorizer(min_df=5)
X_train_vect = vectorizer.fit_transform(data_train['FullDescription'])
X_test_vect = vectorizer.transform(data_test['FullDescription'])

# 2.4 Replace missing categorical values with the string 'nan'.
# Assign the filled columns back instead of calling
# `frame[col].fillna(..., inplace=True)`: that chained in-place form works on a
# temporary column object, is deprecated in pandas 2.x and leaves the frame
# unchanged under Copy-on-Write. The test frame gets the same treatment so a
# missing value is encoded identically in both sets.
categorical_cols = ['LocationNormalized', 'ContractTime']
data_train[categorical_cols] = data_train[categorical_cols].fillna('nan')
data_test[categorical_cols] = data_test[categorical_cols].fillna('nan')

# 2.5 One-hot encode the categorical columns. Each row becomes a
# {column: value} dict and DictVectorizer creates one binary
# "column=value" feature per distinct string value seen during fit;
# the result is a sparse matrix.
enc = DictVectorizer()
X_train_categ = enc.fit_transform(data_train[categorical_cols].to_dict('records'))
X_test_categ = enc.transform(data_test[categorical_cols].to_dict('records'))

# 2.6 Stack the text and categorical features column-wise (both are sparse,
# so the scipy.sparse hstack is used) and pick the regression target.
X_train = hstack([X_train_vect, X_train_categ])
X_test = hstack([X_test_vect, X_test_categ])
y_train = data_train['SalaryNormalized']

# 3. Fit a ridge regression (alpha=1.0) on the stacked sparse features;
# fit() returns the estimator itself, so it can be chained on construction
clf = Ridge(alpha=1.0).fit(X_train, y_train)

# 4. Predict salaries for the test rows and print them rounded to 2 decimals
y_test = clf.predict(X_test)
print(y_test.round(2))

[56557.44 37205.61]
