In [79]:
import numpy as np
import pandas as pd
import csv

from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from sklearn.linear_model import Ridge

In [80]:
# Read the labelled training set and the small test set; both CSV files are
# looked up relative to the current working directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("salary-train.csv", "salary-test-mini.csv")
)

In [81]:
# Preview the first ten training rows.
train.head(10)

Unnamed: 0,FullDescription,LocationNormalized,ContractTime,SalaryNormalized
0,International Sales Manager London ****k ****...,London,permanent,33000
1,An ideal opportunity for an individual that ha...,London,permanent,50000
2,Online Content and Brand Manager// Luxury Reta...,South East London,permanent,40000
3,A great local marketleader is seeking a perman...,Dereham,permanent,22500
4,Registered Nurse / RGN Nursing Home for Young...,Sutton Coldfield,,20355
5,Sales and Marketing Assistant will provide adm...,Crawley,,22500
6,Vacancy Ladieswear fashion Area Manager / Regi...,UK,permanent,32000
7,Reference: LR/JAN/**** Our client is one of th...,Bristol,permanent,30000
8,Sponsorship Manager London The Company A marke...,Central London,permanent,31500
9,"About Barclays Barclays moves, lends, invests ...",South East London,permanent,42499


In [82]:
# Display the whole test frame.
test

Unnamed: 0,FullDescription,LocationNormalized,ContractTime,SalaryNormalized
0,We currently have a vacancy for an HR Project ...,Milton Keynes,contract,
1,A Web developer opportunity has arisen with an...,Manchester,permanent,


In [83]:
# Normalise the description text of both frames: every character that is not
# a Latin letter or a digit becomes a space, then the text is lower-cased.
train['FullDescription'] = (
    train['FullDescription']
    .replace('[^a-zA-Z0-9]', ' ', regex=True)
    .str.lower()
)
test['FullDescription'] = (
    test['FullDescription']
    .replace('[^a-zA-Z0-9]', ' ', regex=True)
    .str.lower()
)

In [84]:
# TF-IDF features for the description text. min_df=5 drops terms that occur
# in fewer than five training documents. The vectorizer is fitted on the
# training descriptions only; the test descriptions are transformed with the
# already-fitted vectorizer.
tfidf = TfidfVectorizer(min_df=5)
train_description, test_description = (
    tfidf.fit_transform(train['FullDescription']),
    tfidf.transform(test['FullDescription']),
)

In [85]:
# Replace missing categorical values with the literal string 'nan' so that
# "missing" becomes a category of its own for the one-hot encoding.
#
# The result is assigned back instead of calling fillna(..., inplace=True) on
# a column selection: that chained in-place form updates a temporary object
# under pandas Copy-on-Write and leaves the frame unchanged.
#
# The test frame gets the same treatment as the training frame; otherwise a
# missing value in test would stay a float NaN and would not be mapped to the
# 'nan' category learned from the training data.
categorical_columns = ['LocationNormalized', 'ContractTime']
train[categorical_columns] = train[categorical_columns].fillna('nan')
test[categorical_columns] = test[categorical_columns].fillna('nan')

In [86]:
# One-hot encoding of the two categorical columns, each with its own
# DictVectorizer. The encoders are fitted on the training rows; the test rows
# are only transformed with the fitted encoders.
enc_location, enc_contract = DictVectorizer(), DictVectorizer()

# Each row becomes a single-key dict, e.g. {'LocationNormalized': 'London'}.
train_location_records = train[['LocationNormalized']].to_dict('records')
train_contract_records = train[['ContractTime']].to_dict('records')
train_location = enc_location.fit_transform(train_location_records)
train_contract = enc_contract.fit_transform(train_contract_records)

test_location_records = test[['LocationNormalized']].to_dict('records')
test_contract_records = test[['ContractTime']].to_dict('records')
test_location = enc_location.transform(test_location_records)
test_contract = enc_contract.transform(test_contract_records)

In [87]:
# Training feature matrix: text, location and contract-time features placed
# side by side as one sparse matrix.
train_X = hstack((train_description, train_location, train_contract))

# Test feature matrix, assembled in the same column order as the training one.
test_X = hstack((test_description, test_location, test_contract))

# Regression target: the normalised salary column of the training frame.
train_y = train['SalaryNormalized']

In [88]:
# Create the ridge regression model (alpha=1, fixed random_state) and fit it
# on the training features and salaries. fit() returns the estimator itself,
# so `clf` is the fitted model.
clf = Ridge(alpha=1, random_state=241).fit(train_X, train_y)
# Show the fitted estimator as the cell output.
clf

Ridge(alpha=1, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=241, solver='auto', tol=0.001)

In [89]:
# Predict salaries for the test rows with the fitted model.
clf.predict(X=test_X)

array([56555.61500155, 37188.32442618])