In [1]:
import pandas as pd
import numpy as np
import random
import re

from math import log

import sklearn
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, Imputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error

import matplotlib
import matplotlib.pylab as plt
%matplotlib inline

Load the jobs dataset

In [2]:
# Raw job postings; later cells read the 'Title' and 'Salary' columns.
jobs_path = 'jobs.csv'
df = pd.read_csv(jobs_path)

Shuffle Dataset and copy subset of dataset for IDF calculation

In [3]:
# Seed both random sources so that Restart & Run All reproduces the same
# shuffle here and the same random.sample() draw inside compute_idf.
SEED = 42
random.seed(SEED)

# Shuffle every row (frac=1) with a fixed seed.
df = df.sample(frac=1, random_state=SEED)

# The first 1200 shuffled rows are used to build the IDF vocabulary.
df_train_tfidf = df[:1200].copy()

Split job titles into words

In [4]:
# A token is a run of two or more word characters, so one-letter words are
# dropped. The raw string keeps the backslashes literal (same pattern as
# before, without the invalid-escape warning of a plain string).
finder = re.compile(r'\w\w+')

# One lower-cased token list per job title in the IDF subset.
corpus = [finder.findall(title.lower()) for title in df_train_tfidf['Title']]

Define function to translate a list of words into a dictionary of word counts

In [5]:
def list_2_dict(doc):
    """Count how many times each word occurs in ``doc``.

    Parameters
    ----------
    doc : iterable
        The words of one document, e.g. a list of tokens.

    Returns
    -------
    dict
        Maps each distinct word to its number of occurrences. Empty input
        yields an empty dict.
    """
    counts = {}
    # Single pass over the document instead of one doc.count() scan per
    # distinct word.
    for word in doc:
        counts[word] = counts.get(word, 0) + 1
    return counts

Translate job titles into dictionaries of word counts and collect the vocabulary

In [6]:
# Replace each token list with its {word: count} dictionary.
corpus = [list_2_dict(doc) for doc in corpus]

# Vocabulary = union of the keys of every document dictionary (iterating a
# dict yields its keys). set().union(*...) needs no `reduce`, which is not a
# builtin on Python 3, and returns an empty set for an empty corpus.
words = set().union(*corpus)

Print the total number of distinct words in the training set vocabulary

In [7]:
# Number of distinct words found in the training titles.
vocabulary_size = len(words)
print('Vocabulary set length: ' + str(vocabulary_size))

Vocabulary set length: 1361


Define function for computing IDF and returning N words with highest IDF

In [8]:
def compute_idf(corpus, words, min_frequency=3, n=30):
    """Compute smoothed IDF scores and select ``n`` high-IDF words.

    Parameters
    ----------
    corpus : list
        One container per document; only membership (``word in doc``) is
        used, so dictionaries keyed by word work.
    words : iterable
        Candidate vocabulary to score.
    min_frequency : int, optional
        Words found in fewer than this many documents get no IDF score.
    n : int, optional
        Number of words to select.

    Returns
    -------
    idf : dict
        ``log(len(corpus) / (1 + document_frequency))`` for every word that
        met ``min_frequency``.
    words : list
        If at least ``n`` words share the maximum IDF, a random sample of
        ``n`` of them; otherwise the last ``n`` entries of the list sorted
        by ``(IDF, word)``.

    Notes
    -----
    Prints the ten highest- and ten lowest-IDF words as a side effect.
    """
    n_docs = float(len(corpus))

    idf = {}
    for word in words:
        # Document frequency: number of documents containing the word.
        doc_freq = sum(1 for doc in corpus if word in doc)
        if doc_freq >= min_frequency:
            idf[word] = log(n_docs / (1 + doc_freq))

    # Ascending by IDF, ties broken alphabetically by word.
    ranked = sorted([score, word] for word, score in idf.items())

    #Print Top & Bottom 10 Words
    print("Top 10 Words:")
    print(pd.DataFrame(data=ranked[-10:], columns=['IDF','Word']))
    print("Bottom 10 Words:")
    print(pd.DataFrame(data=ranked[:10], columns=['IDF','Word']))

    # Compute the maximum once rather than once per element. .items() works
    # on both Python 2 and 3, unlike .iteritems().
    max_idf = max(idf.values()) if idf else None
    top_idf = [word for word, score in idf.items() if score >= max_idf]

    # If >= N words with max IDF, select random N subset
    if len(top_idf) >= n:
        words = random.sample(top_idf, n)
    # Else select top N
    else:
        words = [word for _, word in ranked[-n:]]

    return idf, words

In [9]:
# Keep words present in at least 20 training titles and select 50 of them.
idf, words = compute_idf(corpus, words, min_frequency=20, n=50)

Top 10 Words:
        IDF            Word
0  3.831980       paralegal
1  3.871201       executive
2  3.912023       assurance
3  3.954583     maintenance
4  3.954583      production
5  3.954583            york
6  3.999034       developer
7  3.999034  superintendent
8  4.045554         atlanta
9  4.045554              sr
Bottom 10 Words:
        IDF        Word
0  0.051293         job
1  0.053049          in
2  1.215146     manager
3  2.148434      dallas
4  2.286056    engineer
5  2.525729       sales
6  2.525729      senior
7  2.601440     quality
8  2.671236  accountant
9  2.733368  restaurant


Generate features for dataset - tf-idf for top 50 words

In [10]:
# Tokenise every title in the full dataset and count the words in each one.
corpus = [list_2_dict(finder.findall(title.lower())) for title in df['Title']]

# Column order of the feature matrix: the words selected by compute_idf.
X_words = list(words)

# Start from zeros rather than np.ndarray(shape), which hands back
# uninitialised memory.
X = np.zeros((len(corpus), len(X_words)))

# tf-idf: a word's count in the title times its IDF weight.
for i, doc in enumerate(corpus):
    for j, word in enumerate(X_words):
        X[i, j] = doc.get(word, 0) * idf[word]

X = pd.DataFrame(data=X, columns=X_words)

In [11]:
# Regression target: an independent copy of the Salary column.
y = df.loc[:, 'Salary'].copy()

In [12]:
# The first 1200 rows (the same count used for the IDF subset) form the
# training set; the remaining rows are held out for testing.
n_train = 1200
X_train, X_test = X[:n_train], X[n_train:]
y_train, y_test = y[:n_train], y[n_train:]

Create a linear regression model, fit it on the training set and predict on the test set

In [13]:
# fit() returns the estimator itself, so construction and fitting can be chained.
model = LinearRegression().fit(X_train, y_train)

# Predicted salaries for the held-out rows.
y_pred = model.predict(X_test)

Calculate and print RMSE

In [14]:
# Root-mean-squared error of the held-out predictions.
squared_errors = (y_test - y_pred) ** 2
RMSE = (squared_errors.sum() / len(y_test)) ** 0.5

# print() as a function call works on both Python 2 and 3 and matches the
# other print calls in this notebook.
print(RMSE)

17814.8755071


Create a dataframe of the words and their corresponding model coefficients. 
Sort by coefficient value in descending order.

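Words associated with the highest salaries have the largest positive coefficients, while words associated with the lowest salaries have the largest negative coefficients. Note that an extreme coefficient, such as the value of roughly 2.7e16 shown for "vegas" below, most likely reflects a collinear or otherwise degenerate feature column rather than a real salary effect.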

In [15]:
# Pair each feature word with its fitted coefficient. Both Series carry the
# same default integer index, so they line up row by row.
coef_series = pd.Series(model.coef_.tolist(), name='Coefficient')
words_series = pd.Series(X_words, name='Word')
model_coefficients = pd.concat([words_series, coef_series], axis=1)

# Largest positive coefficients first; displayed as the cell's output.
model_coefficients.sort_values(by='Coefficient', ascending=False)

Unnamed: 0,Word,Coefficient
33,vegas,2.684912e+16
7,accountant,5041.252
38,automotive,2831.048
47,superintendent,2714.624
24,new,2603.108
11,analyst,2564.242
6,quality,2507.591
46,developer,1900.633
44,production,1839.495
41,executive,1767.626
