In [56]:
from sklearn import datasets
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import scale
from sklearn.model_selection import GridSearchCV
import math
from sklearn.metrics import roc_auc_score
from scipy.spatial.distance import euclidean
from functools import reduce
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, precision_recall_curve
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.feature_extraction import DictVectorizer
from scipy.sparse import hstack

In [57]:
# Load the labelled training set and the small unlabelled test set.
train = pd.read_csv("salary-train.csv")
test = pd.read_csv("salary-test-mini.csv")
# `df` is kept as a second name for the very same DataFrame object as
# `train`; later cells use both names interchangeably.
df = train
df.head()

Unnamed: 0,FullDescription,LocationNormalized,ContractTime,SalaryNormalized
0,International Sales Manager London ****k ****...,London,permanent,33000
1,An ideal opportunity for an individual that ha...,London,permanent,50000
2,Online Content and Brand Manager// Luxury Reta...,South East London,permanent,40000
3,A great local marketleader is seeking a perman...,Dereham,permanent,22500
4,Registered Nurse / RGN Nursing Home for Young...,Sutton Coldfield,,20355


In [58]:
# Preview the first rows of the test frame; it has the same columns as the
# training frame.
test.head()

Unnamed: 0,FullDescription,LocationNormalized,ContractTime,SalaryNormalized
0,We currently have a vacancy for an HR Project ...,Milton Keynes,contract,
1,A Web developer opportunity has arisen with an...,Manchester,permanent,


In [59]:
# Size of the test frame as a (rows, columns) tuple.
test.shape

(2, 4)

In [60]:
# Lower-case the free-text description and the location in both frames so
# that tokens and categories compare case-insensitively.
# The vectorised `.str.lower()` is used instead of a Python-level
# `apply(lambda s: s.lower())`: it leaves missing values as NaN rather than
# raising AttributeError on them (NaN is a float), which matters because
# missing values are only replaced in a later cell.
for frame in (train, test):
    for column in ('FullDescription', 'LocationNormalized'):
        frame[column] = frame[column].str.lower()

In [61]:
# `df` and `train` are the same DataFrame object, so this preview shows the
# lower-cased text columns.
df.head()

Unnamed: 0,FullDescription,LocationNormalized,ContractTime,SalaryNormalized
0,international sales manager london ****k ****...,london,permanent,33000
1,an ideal opportunity for an individual that ha...,london,permanent,50000
2,online content and brand manager// luxury reta...,south east london,permanent,40000
3,a great local marketleader is seeking a perman...,dereham,permanent,22500
4,registered nurse / rgn nursing home for young...,sutton coldfield,,20355


In [62]:
# Replace every character that is not an ASCII letter or digit with a single
# space in the job descriptions of both frames, so that the tokenizer in the
# next step only ever sees words separated by whitespace.
non_alnum = '[^a-zA-Z0-9]'
for frame in (train, test):
    frame['FullDescription'] = frame['FullDescription'].replace(non_alnum, ' ', regex=True)

train.head()

Unnamed: 0,FullDescription,LocationNormalized,ContractTime,SalaryNormalized
0,international sales manager london k ...,london,permanent,33000
1,an ideal opportunity for an individual that ha...,london,permanent,50000
2,online content and brand manager luxury reta...,south east london,permanent,40000
3,a great local marketleader is seeking a perman...,dereham,permanent,22500
4,registered nurse rgn nursing home for young...,sutton coldfield,,20355


In [63]:
# TF-IDF features for the job descriptions. min_df=5 drops terms that occur
# in fewer than five training documents.
vectorizer = TfidfVectorizer(min_df=5)
# The vocabulary and IDF weights are learned from the training texts only ...
TfidFeatures_train = vectorizer.fit_transform(train['FullDescription'])
# ... and then re-used unchanged to encode the test texts.
TfidFeatures_test = vectorizer.transform(test['FullDescription'])

TfidFeatures_train

<60000x22861 sparse matrix of type '<class 'numpy.float64'>'
	with 8365759 stored elements in Compressed Sparse Row format>

In [64]:
# Replace missing categorical values with the literal string 'nan' so that
# "missing" is one-hot encoded as a category of its own.
#
# The filled column is assigned back instead of calling
# `fillna(..., inplace=True)` on a column selection: that chained in-place
# form is deprecated in pandas 2.x and leaves the frame unmodified once
# Copy-on-Write is enabled.
#
# The test frame gets exactly the same treatment. Otherwise a missing test
# value stays a float NaN, which the dict-based encoder treats as a numeric
# feature it never saw during fitting and silently drops, instead of mapping
# it to the 'nan' category learned from the training data.
for frame in (train, test):
    for column in ('LocationNormalized', 'ContractTime'):
        frame[column] = frame[column].fillna('nan')

In [65]:
# One-hot encode the two categorical columns. Each row is turned into a
# {column: value} dict, and DictVectorizer creates one indicator feature per
# distinct (column, string value) pair seen while fitting.
categorical_columns = ['LocationNormalized', 'ContractTime']
train_records = train[categorical_columns].to_dict(orient='records')
test_records = test[categorical_columns].to_dict(orient='records')

enc = DictVectorizer()
# Fit the encoding on the training rows, then apply the same mapping to test.
X_train_categ = enc.fit_transform(train_records)
X_test_categ = enc.transform(test_records)
X_train_categ

<60000x1766 sparse matrix of type '<class 'numpy.float64'>'
	with 120000 stored elements in Compressed Sparse Row format>

In [66]:
# Build the final design matrices by placing the TF-IDF text features and the
# one-hot categorical features side by side (column-wise, kept sparse).
train_blocks = (TfidFeatures_train, X_train_categ)
test_blocks = (TfidFeatures_test, X_test_categ)
X_train = hstack(train_blocks)
X_test = hstack(test_blocks)

In [67]:
# Ridge regression on the combined sparse features; the seed is fixed so the
# fit is reproducible.
clf = Ridge(alpha=1.0, random_state=241)
# Target: the normalised salary column (`df` is the same object as `train`).
y_train = df['SalaryNormalized']
clf.fit(X_train, y_train)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=241, solver='auto', tol=0.001)

In [69]:
# Sanity check: train and test matrices must have the same number of columns
# before the fitted model can be applied to the test rows.
for features in (X_train, X_test):
    print(features.shape)
# Predicted salaries, one value per test row.
pred = clf.predict(X_test)
pred

(60000, 24627)
(2, 24627)


array([ 56555.61500155,  37188.32442618])

In [71]:
# Write the first two predictions to the answer file, rounded to two decimal
# places and separated by a single space, with no trailing newline.
with open("liner_regress_res.txt", "w") as f:
    answer = "%.2f %.2f" % (pred[0], pred[1])
    f.write(answer)