In [1]:
import pandas as pd
import numpy as np
import string

In [2]:
def get_and_clean_data():
    data = pd.read_csv('./sw_dev_usa.csv')
    description = data['job_description']
    cleaned_description = description.apply(lambda s: s.translate(str.maketrans('', '', string.punctuation + u'\xa0')))
    cleaned_description = cleaned_description.apply(lambda s: s.lower())
    cleaned_description = cleaned_description.apply(lambda s: s.translate(str.maketrans(string.whitespace, ' '*len(string.whitespace), '')))
    cleaned_description = cleaned_description.drop_duplicates()
    return cleaned_description

In [3]:
cleaned_description = get_and_clean_data()
cleaned_description.head()

0    the chosen sr software developer will be part ...
1    position c lead software developer location mi...
2    senior software developer hoboken nj starts as...
3    our client a multinational publishing and educ...
4    position c lead software developer location ph...
Name: job_description, dtype: object

In [4]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

## Bag of word

In [5]:
def preProcess(s):
    ps = PorterStemmer()
    s = word_tokenize(s)
    stopwords_set = set(stopwords.words())
    stop_dict = {s:1 for s in stopwords_set}
    s = [w for w in s if w not in stop_dict]
    s = [ps.stem(w) for w in s]
    s = ' '.join(s)
    return s

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
def sk_vectorize(cleaned_description):
    vectorizer = CountVectorizer(preprocessor=preProcess)
    vectorizer.fit_transform(cleaned_description)
    query = vectorizer.transform(['good at java and python'])
    print(query)
    print(vectorizer.inverse_transform(query))

In [7]:
sk_vectorize(cleaned_description)

  (0, 15571)	1
  (0, 18608)	1
  (0, 26294)	1
[array(['good', 'java', 'python'], dtype='<U182')]


In [8]:
vectorizer = CountVectorizer(preprocessor=preProcess, ngram_range=(1, 2))
x = vectorizer.fit_transform(cleaned_description)
print(vectorizer.get_feature_names())

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



## TF-IDF

In [15]:
n = 5
cleaned_description = cleaned_description[:n]
vectorizer = CountVectorizer(preprocessor=preProcess)
x = vectorizer.fit_transform(cleaned_description)
print(x.toarray())

[[0 1 1 ... 5 1 0]
 [1 0 0 ... 1 0 1]
 [0 0 0 ... 2 0 0]
 [0 0 0 ... 0 0 0]
 [1 0 0 ... 1 0 1]]


In [17]:
idf = n / (x.tocoo() > 0).sum(0)
x.data = np.log10(x.data + 1)
x.data = x.multiply(np.log10(idf))

In [24]:
x.data.toarray()

array([[0.        , 0.0798834 , 0.0798834 , ..., 0.02422447, 0.0798834 ,
        0.        ],
       [0.04547949, 0.        , 0.        , ..., 0.01107558, 0.        ,
        0.04547949],
       [0.        , 0.        , 0.        , ..., 0.01641812, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.04547949, 0.        , 0.        , ..., 0.01107558, 0.        ,
        0.04547949]])

In [25]:
print(pd.DataFrame(x.data.toarray(), columns=vectorizer.get_feature_names()))

     110000     18000      1983       250    300000        34       510  \
0  0.000000  0.079883  0.079883  0.079883  0.079883  0.000000  0.000000   
1  0.045479  0.000000  0.000000  0.000000  0.000000  0.000000  0.045479   
2  0.000000  0.000000  0.000000  0.000000  0.000000  0.079883  0.000000   
3  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
4  0.045479  0.000000  0.000000  0.000000  0.000000  0.000000  0.045479   

      62304      8000  8882376835  ...    within   without  work     would  \
0  0.079883  0.079883    0.079883  ...  0.079883  0.045479   0.0  0.045479   
1  0.000000  0.000000    0.000000  ...  0.000000  0.000000   0.0  0.000000   
2  0.000000  0.000000    0.000000  ...  0.000000  0.045479   0.0  0.000000   
3  0.000000  0.000000    0.000000  ...  0.000000  0.000000   0.0  0.045479   
4  0.000000  0.000000    0.000000  ...  0.000000  0.000000   0.0  0.000000   

      write    writer       xml      year     yield      zaur  
0  0.037585  0.0

In [27]:
arr = np.array([[100, 90, 5], [200, 200, 200], [200, 300, 10], [50, 0, 200]])
arr = np.log10(arr+1)

DH = arr[:, 0]
CD = arr[:, 1]
DC = arr[:, 2]

print(np.dot(DH, CD))
print(np.dot(CD, DC))
print(np.dot(DH, DC))

14.939885194377618
9.410303606094942
13.195777686137449


In [28]:
norm_DH = DH/np.sqrt(np.sum(DH**2))
norm_CD = CD/np.sqrt(np.sum(CD**2))
norm_DC = DC/np.sqrt(np.sum(DC**2))

print(norm_CD, norm_DC, norm_DH)
print(np.transpose([norm_CD, norm_DC, norm_DH]))

[0.50107052 0.58909611 0.63395119 0.        ] [0.22188161 0.65673203 0.29694213 0.65673203] [0.47854338 0.54990145 0.54990145 0.40769231]
[[0.50107052 0.22188161 0.47854338]
 [0.58909611 0.65673203 0.54990145]
 [0.63395119 0.29694213 0.54990145]
 [0.         0.65673203 0.40769231]]


In [29]:
print(np.dot(norm_DH, norm_CD))
print(np.dot(norm_DH, norm_DC))
print(np.dot(norm_CD, norm_DC))

0.9123394651809295
0.8983513789958276
0.6863034317623423


## Builting tf-idf

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(preprocessor=preProcess)
x = vectorizer.fit_transform(cleaned_description)
print(pd.DataFrame(x.toarray(), columns=vectorizer.get_feature_names()))

     110000     18000      1983       250    300000        34       510  \
0  0.000000  0.045564  0.045564  0.045564  0.045564  0.000000  0.000000   
1  0.074481  0.000000  0.000000  0.000000  0.000000  0.000000  0.074481   
2  0.000000  0.000000  0.000000  0.000000  0.000000  0.081564  0.000000   
3  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
4  0.074785  0.000000  0.000000  0.000000  0.000000  0.000000  0.074785   

      62304      8000  8882376835  ...    within   without      work  \
0  0.045564  0.045564    0.045564  ...  0.045564  0.036761  0.065135   
1  0.000000  0.000000    0.000000  ...  0.000000  0.000000  0.043990   
2  0.000000  0.000000    0.000000  ...  0.000000  0.065805  0.155462   
3  0.000000  0.000000    0.000000  ...  0.000000  0.000000  0.043996   
4  0.000000  0.000000    0.000000  ...  0.000000  0.000000  0.044169   

      would     write    writer       xml      year     yield      zaur  
0  0.036761  0.061030  0.045564  0.000000 

## BM25

In [35]:
from scipy import sparse
class BM25(object):
    def __init__(self, b=0.75, k1=1.6):
        self.vectorizer = TfidfVectorizer(norm=None, smooth_idf=False)
        self.b = b
        self.k1 = k1

    def fit(self, X):
        """ Fit IDF to documents X """
        self.vectorizer.fit(X)
        y = super(TfidfVectorizer, self.vectorizer).transform(X)
        self.avdl = y.sum(1).mean()

    def transform(self, q, X):
        """ Calculate BM25 between query q and documents X """
        b, k1, avdl = self.b, self.k1, self.avdl

        # apply CountVectorizer
        X = super(TfidfVectorizer, self.vectorizer).transform(X)
        len_X = X.sum(1).A1
        q, = super(TfidfVectorizer, self.vectorizer).transform([q])
        assert sparse.isspmatrix_csr(q)

        # convert to csc for better column slicing
        X = X.tocsc()[:, q.indices]
        denom = X + (k1 * (1 - b + b * len_X / avdl))[:, None]
        # idf(t) = log [ n / df(t) ] + 1 in sklearn, so it need to be coneverted
        # to idf(t) = log [ n / df(t) ] with minus 1
        idf = self.vectorizer._tfidf.idf_[None, q.indices] - 1.
        numer = X.multiply(np.broadcast_to(idf, X.shape)) * (k1 + 1)                                                          
        return (numer / denom).sum(1).A1

In [36]:
bm25 = BM25()
bm25.fit(cleaned_description)
print(bm25.transform('aws github', cleaned_description))

[0.         0.         1.89200257 2.11154135 0.        ]
