## IMPORT Library

In [6]:
import pandas as pd
import numpy as np
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from scipy import sparse

## Preparing

In [7]:
job = input('enter job description: ').split(' ')
def get_and_clean_data():
    data = pd.read_csv('./sw_dev_usa.csv')
    description = data['job_description']
    cleaned_description = description.apply(lambda s: s.translate(str.maketrans('', '', string.punctuation + u'\xa0')))
    cleaned_description = cleaned_description.apply(lambda s: s.lower())
    cleaned_description = cleaned_description.apply(lambda s: s.translate(str.maketrans(string.whitespace, ' '*len(string.whitespace), '')))
    cleaned_description = cleaned_description.drop_duplicates()
    return cleaned_description

enter job description: job code cloud aws dog


In [8]:
n = 5
cleaned_description = get_and_clean_data()
cleaned_description = cleaned_description[:n]
cleaned_description.head()

0    the chosen sr software developer will be part ...
1    position c lead software developer location mi...
2    senior software developer hoboken nj starts as...
3    our client a multinational publishing and educ...
4    position c lead software developer location ph...
Name: job_description, dtype: object

In [9]:
def preProcess(s):
    ps = PorterStemmer()
    s = word_tokenize(s)
    stopwords_set = set(stopwords.words())
    stop_dict = {s:1 for s in stopwords_set}
    s = [w for w in s if w not in stop_dict]
    s = [ps.stem(w) for w in s]
    s = ' '.join(s)
    return s

In [10]:
def displayTopFive(df):
    df_dict = df.to_dict()
    tmp = dict()
    for item in df_dict:
        tmp[item] = df_dict[item][0]
        for ind in df_dict[item]:
            tmp[item] += df_dict[item][ind]
    return sorted(tmp.items(), key=lambda x: x[1], reverse=True)[:5]

## TF Ranking

In [11]:
vectorizer = CountVectorizer(preprocessor=preProcess, ngram_range=(1, 2))
vectorizer.fit_transform(cleaned_description)
x = vectorizer.transform(job)
print(x.toarray())

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [12]:
x.data = np.log10(x.data + 1)
print(x.toarray())

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [13]:
tf = pd.DataFrame(x.toarray(), columns=vectorizer.get_feature_names())
tf.head()

Unnamed: 0,110000,110000 excel,18000,18000 client,1983,1983 aerotek,250,250 nonfranchis,300000,300000 contract,...,year aerotek,year current,year experi,year nodej,year profession,year relev,yield,yield competit,zaur,zaur xml
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
displayTopFive(tf)

[('job', 0.6020599913279624),
 ('aw', 0.3010299956639812),
 ('code', 0.3010299956639812),
 ('110000', 0.0),
 ('110000 excel', 0.0)]

## TF-IDF

In [15]:
vectorizer.fit(cleaned_description)
x = vectorizer.transform(job)
idf = n / (x.tocoo() > 0).sum(0)
x.data = np.log10(x.data + 1)
x.data = x.multiply(np.log10(idf))
x.data.toarray()

  idf = n / (x.tocoo() > 0).sum(0)


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [16]:
tfidf = pd.DataFrame(x.data.toarray(), columns=vectorizer.get_feature_names())
tfidf.head()

Unnamed: 0,110000,110000 excel,18000,18000 client,1983,1983 aerotek,250,250 nonfranchis,300000,300000 contract,...,year aerotek,year current,year experi,year nodej,year profession,year relev,yield,yield competit,zaur,zaur xml
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
displayTopFive(tfidf)

[('job', 0.42082187474904936),
 ('aw', 0.21041093737452468),
 ('code', 0.21041093737452468),
 ('110000', 0.0),
 ('110000 excel', 0.0)]

## BM25

In [18]:
class BM25(object):
    def __init__(self, b=0.75, k1=1.6):
        self.vectorizer = TfidfVectorizer(norm=None, smooth_idf=False)
        self.b = b
        self.k1 = k1

    def fit(self, X):
        """ Fit IDF to documents X """
        self.vectorizer.fit(X)
        y = super(TfidfVectorizer, self.vectorizer).transform(X)
        self.avdl = y.sum(1).mean()

    def transform(self, q, X):
        """ Calculate BM25 between query q and documents X """
        b, k1, avdl = self.b, self.k1, self.avdl

        # apply CountVectorizer
        X = super(TfidfVectorizer, self.vectorizer).transform(X)
        len_X = X.sum(1).A1
        q, = super(TfidfVectorizer, self.vectorizer).transform([q])
        assert sparse.isspmatrix_csr(q)

        # convert to csc for better column slicing
        X = X.tocsc()[:, q.indices]
        denom = X + (k1 * (1 - b + b * len_X / avdl))[:, None]
        # idf(t) = log [ n / df(t) ] + 1 in sklearn, so it need to be coneverted
        # to idf(t) = log [ n / df(t) ] with minus 1
        idf = self.vectorizer._tfidf.idf_[None, q.indices] - 1.
        numer = X.multiply(np.broadcast_to(idf, X.shape)) * (k1 + 1)                                                          
        return (numer / denom).sum(1).A1

In [19]:
bm25 = BM25()
bm25.fit(cleaned_description)
bm25.transform(''.join(w for w in job), cleaned_description)

array([0., 0., 0., 0., 0.])