In [1]:
# Pandas --> Data Frames
import pandas as pd

# numpy --> scientific computing with Python
import numpy as np
np.set_printoptions(threshold=np.nan)

# matplotlib --> graphing library
from matplotlib import pyplot as plt
%matplotlib inline

# Seaborn --> makes matplotlib prettier
import seaborn as sb

# sklearn for linear regression and cross-validation
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.cross_validation import train_test_split

# statsmodels for linear regression
import statsmodels.api as sm

# statsmodels formula api
import statsmodels.formula.api as smf

# patsy dmatrices to create model matrix
from patsy import dmatrices
from patsy import ModelDesc, EvalEnvironment

# needed for cleaning nan's from dictionaries
from math import isnan

# Cross-validating models
from sklearn.cross_validation import cross_val_score, KFold

# Text Processing
from sklearn.feature_extraction.text import CountVectorizer
from  sklearn.feature_extraction import DictVectorizer

In [9]:
data_original = pd.read_csv("/Users/carriesmith/Dropbox/GA/datascience/salary_prediction/data/train.csv")
data = data_original

data.columns = map(str.lower, data.columns)
data['title'] = data['title'].str.lower()

# required
cols = data.columns
data[cols[cols != 'salarynormalized']] = data[cols[cols != 'salarynormalized']].fillna("missing")

countVec = CountVectorizer( stop_words = 'english' , ngram_range=(1,3), 
                     max_features=100, # number of features
                     min_df=.01, # only use words that appear at least some times (integer = absolute count)
                     max_df=.95)  # ignore words that appear too frequently

X, y = countVec.fit_transform(data.title).todense(), data.salarynormalized
print countVec.get_feature_names()

[u'account', u'administrator', u'advisor', u'analyst', u'assistant', u'business', u'care', u'care assistant', u'care worker', u'chef', u'chef partie', u'commercial', u'consultant', u'customer', u'deputy', u'deputy manager', u'design', u'design engineer', u'developer', u'development', u'engineer', u'executive', u'general', u'health', u'home', u'home manager', u'hotel', u'job', u'jobs', u'lead', u'leader', u'live', u'london', u'maintenance', u'manager', u'mechanical', u'nurse', u'nursing', u'nursing home', u'officer', u'partie', u'practitioner', u'project', u'project manager', u'quality', u'recruitment', u'recruitment consultant', u'registered', u'registered nurse', u'required', u'rgn', u'rgn rmn', u'rmn', u'sales', u'sales executive', u'senior', u'service', u'services', u'social', u'social worker', u'software', u'staff', u'staff nurse', u'support', u'support worker', u'teacher', u'team', u'technical', u'technician', u'web', u'worker', u'worker job', u'workers']


In [13]:
for alpha in [1,3,10,30,100,300,1000,3000,10000]:
    print "alpha %10.4f  OLS %.4f Lasso %.4f Ridge %.4f" % \
        (alpha, 
         -cross_val_score(LinearRegression(), X, y, cv=10, scoring='mean_absolute_error').mean(),
         -cross_val_score(Lasso(alpha), X, y, cv=10, scoring='mean_absolute_error').mean(), 
         -cross_val_score(Ridge(alpha), X, y, cv=10, scoring='mean_absolute_error').mean())

alpha     1.0000  OLS 9438.0864 Lasso 9435.8421 Ridge 9435.3113
alpha     3.0000  OLS 9438.0842 Lasso 9433.0650 Ridge 9432.4581
alpha    10.0000  OLS 9438.0847 Lasso 9418.2906 Ridge 9430.4209
alpha    30.0000  OLS 9438.0864 Lasso 9497.7535 Ridge 9457.7983
alpha   100.0000  OLS 9438.0846 Lasso 10077.5488 Ridge 9613.2979
alpha   300.0000  OLS 9438.0842 Lasso 10899.3512 Ridge 9985.7530
alpha  1000.0000  OLS 9438.0851 Lasso 11755.4470 Ridge 10643.6871
alpha  3000.0000  OLS 9438.0855 Lasso 11853.8843 Ridge 11233.5997
alpha 10000.0000  OLS 9438.0851 Lasso 11853.8843 Ridge 11621.4967


In [6]:
countVec = CountVectorizer( stop_words = 'english' , ngram_range=(1,3), 
                     max_features=100, # number of features
                     min_df=.1, # only use words that appear at least some times (integer = absolute count)
                     max_df=.95)  # ignore words that appear too frequently

X, y = countVec.fit_transform(data.fulldescription).todense(), data.salarynormalized
print countVec.get_feature_names()

[u'ability', u'able', u'apply', u'area', u'available', u'based', u'benefits', u'business', u'candidate', u'candidates', u'care', u'career', u'client', u'clients', u'communication', u'company', u'contact', u'currently', u'customer', u'cv', u'day', u'days', u'design', u'develop', u'development', u'employment', u'engineering', u'ensure', u'environment', u'essential', u'excellent', u'experience', u'experience working', u'experienced', u'general', u'good', u'health', u'high', u'home', u'hours', u'ideally', u'include', u'including', u'industry', u'information', u'job', u'join', u'key', u'knowledge', u'leading', u'level', u'looking', u'management', u'manager', u'need', u'needs', u'new', u'nurse', u'nursing', u'nursing home', u'offer', u'opportunities', u'opportunity', u'people', u'position', u'professional', u'project', u'projects', u'provide', u'qualified', u'quality', u'recruitment', u'registered', u'required', u'requirements', u'responsible', u'rgn', u'role', u'salary', u'sales', u'senior'

In [8]:
for n in xrange(-3, 5):
    alpha = 10 ** n
    print "alpha %10.4f  OLS %.4f Lasso %.4f Ridge %.4f" % \
        (alpha, 
         -cross_val_score(LinearRegression(), X, y, cv=10, scoring='mean_absolute_error').mean(),
         -cross_val_score(Lasso(alpha), X, y, cv=10, scoring='mean_absolute_error').mean(), 
         -cross_val_score(Ridge(alpha), X, y, cv=10, scoring='mean_absolute_error').mean())

alpha     0.0010  OLS 9957.5947 Lasso 9957.5933 Ridge 9957.5945
alpha     0.0100  OLS 9957.5947 Lasso 9957.5813 Ridge 9957.5928
alpha     0.1000  OLS 9957.5947 Lasso 9957.4607 Ridge 9957.5756
alpha     1.0000  OLS 9957.5947 Lasso 9956.2592 Ridge 9957.4046
alpha    10.0000  OLS 9957.5947 Lasso 9944.8019 Ridge 9955.7088
alpha   100.0000  OLS 9957.5947 Lasso 9891.1458 Ridge 9940.0381
alpha  1000.0000  OLS 9957.5947 Lasso 10598.0135 Ridge 9861.9096
alpha 10000.0000  OLS 9957.5947 Lasso 11853.8843 Ridge 10070.8745
