## Libraries

In [1]:
# Pandas --> Data Frames
import pandas as pd

# numpy --> scientific computing with Python
import numpy as np
np.set_printoptions(threshold=np.nan)

# matplotlib --> graphing library
from matplotlib import pyplot as plt
%matplotlib inline

# Seaborn --> makes matplotlib prettier
import seaborn as sb

# sklearn for linear regression and cross-validation
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.cross_validation import train_test_split

# statsmodels for linear regression
import statsmodels.api as sm

# statsmodels formula api
import statsmodels.formula.api as smf

# patsy dmatrices to create model matrix
from patsy import dmatrices
from patsy import ModelDesc, EvalEnvironment

# needed for cleaning nan's from dictionaries
from math import isnan

# Cross-validating models
from sklearn.cross_validation import cross_val_score, KFold

# Text Processing
from sklearn.feature_extraction.text import CountVectorizer
from  sklearn.feature_extraction import DictVectorizer


## Import Data

In [2]:
data_original = pd.read_csv("/Users/carriesmith/Dropbox/GA/datascience/salary_prediction/data/train.csv")
data = data_original

## Data Prep

### Lower Case

In [4]:
data.columns = map(str.lower, data.columns)
data['title'] = data['title'].str.lower()

### NaN to "Missing"

In [5]:
# required
cols = data.columns
data[cols[cols != 'salarynormalized']] = data[cols[cols != 'salarynormalized']].fillna("missing")

### Reducing Number of Categories

In [6]:
# required
def reduceCats(data, test_data, feature, top_n, verbose = False):
    val_counts = data[feature].value_counts(dropna = False)
    common_cats = val_counts.head(n = top_n).index.values
    data[feature + '_r'] = np.where( data[feature].isin(common_cats), data[feature], 'other')
    test_data[feature + '_r'] = np.where( test_data[feature].isin(common_cats), test_data[feature], 'other')
    if (verbose):
        print data[feature + '_r'].value_counts(dropna = False)
    return data, test_data

#### Managers

In [7]:
# required
data['manager_r'] = np.where( data['title'].str.contains("manager"), 1, 0)
data.loc[data['manager_r']==1,:].head()

Unnamed: 0,id,title,fulldescription,locationraw,locationnormalized,contracttype,contracttime,company,category,salaryraw,salarynormalized,sourcename,manager_r
29,25556432,"general manager funky, cool restaurant concep...","Senior General Manager – Funky, Cool, Casual R...",West London London South East,East Sheen,missing,missing,Bee Recruitment London Ltd,Hospitality & Catering Jobs,"From 28,000 to 35,000 per annum",31500,caterer.com,1
30,25892031,mice sales and marketing manager,"Our client, a national hotel chain is seeking ...",Buckinghamshire South East,Buckinghamshire,missing,missing,Chess Partnership,Hospitality & Catering Jobs,"30,000 per annum plus bonus and package",30000,caterer.com,1
34,27527077,business development manager,"The Company: Our client are a full service, cr...","Tyne Wear, North East",Newcastle Upon Tyne,missing,permanent,Asset Appointments,IT Jobs,18000 - 24000/annum plus commission OTE 50-60k,21000,cv-library.co.uk,1
48,29571506,"deputy manager nevill crest and gun, eridge g...",We are looking for an experienced Deputy Manag...,"Eridge Green, East Sussex",UK,missing,permanent,Brunning & Price,Travel Jobs,"Salary package around 24,000",24000,leisurejobs.com,1
63,31559397,field sales engineer / sales account manager ...,Due to expansion there is now an opportunity f...,"Burgess Hill, West Sussex",Burgess Hill,missing,permanent,MatchBox Recruiting Ltd,Sales Jobs,"40000 - 45000/annum 40-45k + bonus, travel, phone",42500,cv-library.co.uk,1


## Train Test Split

In [8]:
data, final_test = train_test_split(data, test_size = 0.2)

## Exploring Reducing the Max # Categories

We may gain a little extra predictive value in having more levels to categorical variables, but large sparce model matrices slow down model time significantly. As I intend to explore interactions between variables, cutting back the number of categories will keep this from getting cumbersome. There is a possibility that combinations (interactions) between dropped categories and other variables could be valuable, and doing this does risk losing visibility on those.

fig,axes = plt.subplots(ncols = 4)
no = 0
for feature in ('title', 'locationnormalized','company', 'sourcename'):
    print feature

    score_levels = []
    
    for top_n in range(5,200,5):
           
        if len(data[feature].value_counts(dropna = False).index.values) < top_n:
            break
        
        data, final_test = reduceCats(data, final_test, feature, top_n)

        formula = 'salarynormalized ~ ' + feature + '_r'
        y, X = dmatrices(formula, data = data, return_type = "dataframe")

        keep_score = []
        for _ in range(20):
            X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.8)
            model = LinearRegression(fit_intercept=False)
            model.fit(X_train,y_train)
            keep_score.append(model.score(X_test,y_test))

        score_levels.append(np.mean(keep_score))
    
    f = pd.DataFrame(score_levels, index = range(5, 5*len(score_levels)+5, 5)).plot(ylim=(-.05,.40), ax=axes[no], legend=False, figsize = (16,4))
    f = axes[no].set_title(feature)
    f = axes[no].set_xlabel('degree'), axes[no].set_ylabel('$R^2$')
    
    no+=1

## Build Models

In [9]:
def form_builder(keep):
    formula = "salarynormalized ~ "
    if (len(keep) == 1):
        formula = formula + keep[0]
    else:
        formula = formula + keep[0]
        for feature in keep[1:]:
            formula = formula + ' + ' + feature
    return formula

### One Variable Forward Selection

### Forward Selection with all 2-Way Interactions

In [10]:
ex_feats =  ['title_r','contracttype', 'contracttime', 'category', 'manager_r', \
    'locationnormalized_r', 'company_r', 'sourcename_r']

def form_builder_all(features, degree):
    formula = "salarynormalized ~ ( " + features[0]
    
    for feature in features[1:]:
        formula = formula + " + " + feature
        
    formula = formula + ") ** " + str(degree)
    
    return formula

form_builder_all(ex_feats, 2)

'salarynormalized ~ ( title_r + contracttype + contracttime + category + manager_r + locationnormalized_r + company_r + sourcename_r) ** 2'

In [11]:
%%time

formula = 'salarynormalized ~ ( sourcename_r + locationnormalized_r + manager_r + category + contracttime + company_r) ** 2'
print formula

y, X = dmatrices(formula, data=data, return_type='dataframe')

print 'Mean Absolute Error: ', cross_val_score(Ridge(alpha=10), X, y, cv=6, scoring='mean_absolute_error').mean()
print 'R^2: ', cross_val_score(Ridge(alpha=10), X, y, cv=6, scoring='r2').mean()

salarynormalized ~ ( sourcename_r + locationnormalized_r + manager_r + category + contracttime + company_r) ** 2


NameError: name 'sourcename_r' is not defined

In [66]:
%%time

print cross_val_score(Ridge(alpha=10), X, y, cv=6, scoring='mean_absolute_error')
print cross_val_score(Ridge(alpha=10), X, y, cv=6, scoring='r2')

[-8587.96548753 -8960.98954898 -8440.57337022 -8256.53953192 -8557.46679567
 -9087.89725554]
[ 0.39609285  0.38644629  0.32915627  0.37656613  0.34979594  0.32712661]
CPU times: user 10min 36s, sys: 11.5 s, total: 10min 47s
Wall time: 3min 20s


## Explore Tweaking # Levels for Locations, Company and Source

Didn't really have time to run this section.

In [68]:
formula = 'salarynormalized ~ ( sourcename_r + locationnormalized_r + manager_r + category + contracttime + company_r) ** 2'

for levs in range(20, 100, 10):
    data, final_test = reduceCats(data, final_test, 'locationnormalized', levs, verbose=False)
    y, X = dmatrices(formula, data=data, return_type='dataframe')
    print 'Levels ', levs, '   Mean Absolute Error: ', cross_val_score(Ridge(alpha=10), X, y, cv=6, scoring='mean_absolute_error').mean()

 Levels  20    Mean Absolute Error:  -8699.85956048
Levels  30    Mean Absolute Error:  -8672.28889365
Levels  40    Mean Absolute Error:  -8643.28716359
Levels  50    Mean Absolute Error:  -8640.12080561
Levels  60    Mean Absolute Error:  -8648.57199831
Levels  70    Mean Absolute Error:  -8635.69071585
Levels  80    Mean Absolute Error:  -8633.73541311
Levels  90    Mean Absolute Error:  -8630.23032584
Levels  100    Mean Absolute Error:  -8633.01549499
Levels  110    Mean Absolute Error:  -8637.44610502


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


KeyboardInterrupt: 

In [69]:
data, final_test = reduceCats(data, final_test, 'locationnormalized', 80, verbose=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [None]:
for n_loc in range(20, 200, 20):
    data, final_test = reduceCats(data, final_test, 'company', levs, verbose=False)
    y, X = dmatrices(formula, data=data, return_type='dataframe')
    print 'Levels ', levs, '   Mean Absolute Error: ', cross_val_score(Ridge(alpha=10), X, y, cv=6, scoring='mean_absolute_error').mean()

In [None]:
for n_loc in range(10, 40, 5):
    data, final_test = reduceCats(data, final_test, 'source', levs, verbose=False)
    y, X = dmatrices(formula, data=data, return_type='dataframe')
    print 'Levels ', levs, '   Mean Absolute Error: ', cross_val_score(Ridge(alpha=10), X, y, cv=6, scoring='mean_absolute_error').mean()

## Tweaking Ridge Alpha

In [74]:
data, final_test = reduceCats(data, final_test, 'locationnormalized', 80, verbose=False)
data, final_test = reduceCats(data, final_test, 'company', 60, verbose=False)
data, final_test = reduceCats(data, final_test, 'sourcename', 20, verbose=False)

formula = 'salarynormalized ~ ( sourcename_r + locationnormalized_r + manager_r + category + contracttime + company_r) ** 2'

for alpha_exp in range(-2, 4):
    y, X = dmatrices(formula, data=data, return_type='dataframe')
    alpha = 10 ** alpha_exp
    print 'Alpha: ', alpha, '   Mean Absolute Error: ', cross_val_score(Ridge(alpha=alpha), X, y, cv=6, scoring='mean_absolute_error').mean()

 Alpha:  0.01    Mean Absolute Error:  -10042.1991796
Alpha:  0.1    Mean Absolute Error:  -9484.64025919
Alpha:  1    Mean Absolute Error:  -8888.36244308
Alpha:  10    Mean Absolute Error:  -8633.73541311
Alpha:  100    Mean Absolute Error:  -8850.81653232
Alpha:  1000    Mean Absolute Error:  -9629.72003758


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


## Getting Crazy 3-Way Interactions

These don't seem to be helping anything.

## Count Vectorizer Title and Full Description

In [15]:
countVec = CountVectorizer( stop_words = 'english' , ngram_range=(1,3), 
                     max_features=100, # number of features
                     min_df=.01, # only use words that appear at least some times (integer = absolute count)
                     max_df=.95)  # ignore words that appear too frequently

X_title, y = countVec.fit_transform(data.title).todense(), data.salarynormalized
print countVec.get_feature_names()

[u'account', u'administrator', u'advisor', u'analyst', u'assistant', u'business', u'care', u'care assistant', u'care worker', u'chef', u'chef partie', u'commercial', u'consultant', u'coordinator', u'customer', u'deputy', u'deputy manager', u'design', u'developer', u'development', u'engineer', u'executive', u'financial', u'general', u'health', u'home', u'home manager', u'hotel', u'job', u'jobs', u'lead', u'leader', u'live', u'london', u'maintenance', u'manager', u'mechanical', u'nurse', u'nurse rgn', u'nursing', u'nursing home', u'officer', u'partie', u'practitioner', u'project', u'project manager', u'quality', u'recruitment', u'recruitment consultant', u'registered', u'registered nurse', u'required', u'rgn', u'rgn rmn', u'rmn', u'sales', u'sales executive', u'senior', u'service', u'services', u'social', u'social worker', u'software', u'staff', u'staff nurse', u'support', u'support worker', u'teacher', u'team', u'technical', u'technician', u'web', u'worker', u'worker job', u'workers']


In [16]:
print 'Mean Absolute Error: ', cross_val_score(Ridge(alpha=10), X_title, y, cv=10, scoring='mean_absolute_error').mean()

Mean Absolute Error:  -9288.97376724


## Combining Both

In [12]:
data, final_test = reduceCats(data, final_test, 'locationnormalized', 60, verbose=False)
data, final_test = reduceCats(data, final_test, 'company', 60, verbose=False)
data, final_test = reduceCats(data, final_test, 'sourcename', 20, verbose=False)

df = pd.concat([data, final_test])

print data.shape
print final_test.shape
print df.shape

(8000, 16)
(2000, 16)
(10000, 16)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [30]:
formula = 'salarynormalized ~ ( sourcename_r + locationnormalized_r + manager_r + category + contracttime + company_r) ** 2'
y, X = dmatrices(formula, data=df, return_type='dataframe')

In [31]:
X_train = X.loc[range(0,8000),:]
y_train = y.loc[range(0,8000),:]

X_all_train = np.hstack([X_title, X_train])

model = Ridge(alpha=10)
model.fit(X_all_train, y_train)

# # X_test_all = hstack(X_test_title, X_test)
# # print 'Mean Absolute Error: ', cross_val_score(Ridge(alpha=10), X_test_all, y, cv=10, scoring='mean_absolute_error').mean()

Ridge(alpha=10, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, solver='auto', tol=0.001)

In [50]:
print X_title.shape
print X_train.shape
print X_all_train.shape

(8000, 75)
(8000, 10454)
(8000, 10529)


In [55]:
X_test = X.loc[range(8000,10000),:]
y_test = y.loc[range(8000,10000),:]

X_test_title = countVec.transform(final_test.title).todense()

In [56]:
print X_test_title.shape
print X_test.shape

(2000, 75)
(2000, 10454)


In [57]:
X_test_all = np.hstack([X_test_title, X_test])

In [58]:
model.score(X_test_all, y_test)

0.35986031062692991

In [62]:
abs(model.predict(X_test_all)-y_test).sum()/len(y_test)

salarynormalized    9914.564988
dtype: float64

Well, that isn't good at all.