In [1]:
%matplotlib inline

In [2]:
import collections
import json
import re
import unicodedata

import leveldb
import numpy
import pandas
import scipy
import sklearn
from IPython.display import display
from tqdm import tqdm_notebook as tqdm

# NOTE(review): `parser` and `bm25` look like project-local modules — confirm.
import bm25
import parser

In [3]:
# Open the LevelDB store 'indeed.leveldb'; later cells use it as a cache for
# lemmatized descriptions under keys such as 'train/<i>/lemmatize'.
db = leveldb.LevelDB('indeed.leveldb')

In [4]:
# Read the training set: one (tag, description) pair per data line.
tags_train = []
descs_train = []
with open('indeed_ml_dataset/train.tsv') as f:
    next(f, None)  # the first line is the header row; skip it
    for line in f:
        # The project parser splits a raw line into its tag part and its
        # free-text description.
        tag, desc = parser.split_tag_and_description(line)
        tags_train.append(tag)
        descs_train.append(desc)

In [5]:
# Read the test set: every line after the header is one raw description
# (kept verbatim, including its trailing newline).
with open('indeed_ml_dataset/test.tsv') as f:
    next(f, None)  # the first line is the header row; skip it
    descs_test = list(f)

In [6]:
# Show the parsed training tags as a table (one row per training example,
# one boolean column per tag in the output below).
pandas.DataFrame(tags_train)

Unnamed: 0,1-year-experience-needed,2-4-years-experience-needed,5-plus-years-experience-needed,associate-needed,bs-degree-needed,full-time-job,hourly-wage,licence-needed,ms-or-phd-needed,part-time-job,salary,supervising-job
0,False,False,True,False,False,False,False,True,False,False,False,True
1,False,True,False,False,False,True,False,False,False,False,True,False
2,False,False,False,False,False,False,False,False,False,True,False,False
3,False,False,False,False,False,False,False,True,False,False,False,False
4,False,False,True,False,True,True,False,False,False,False,False,True
5,False,False,True,True,False,False,False,False,False,False,False,False
6,False,False,True,False,False,True,False,False,True,False,False,False
7,False,False,True,False,True,False,False,False,False,False,False,False
8,False,True,False,False,True,False,False,False,False,False,False,False
9,False,False,False,False,False,True,True,False,False,False,False,False


In [7]:
def clean(s):
    """Normalize a raw description and remove characters we do not want.

    Steps, in order:
      1. Apply Unicode NFKC normalization.
      2. Replace U+2010 (HYPHEN) with the ASCII '-'.
      3. Collapse every run of whitespace, of the listed invisible /
         replacement characters, and of CJK / kana characters into one space.
      4. Drop every character for which ``unicodedata.name`` knows no name.
      5. Strip leading and trailing whitespace.

    Returns the cleaned string.
    """
    normalized = unicodedata.normalize('NFKC', s).replace('\u2010', '-')

    # Unicode blocks removed wholesale, written as regex character ranges.
    blocks = [
        '\u4E00-\u9FFF',  # CJK UNIFIED IDEOGRAPHS
        '\u3040-\u309F',  # HIRAGANA
        '\u30A0-\u30FF',  # KATAKANA
        '\u3000-\u303F',  # CJK SYMBOLS AND PUNCTUATION
    ]
    undesirable = '[\xad\u202A\u200B\u202C\uFEFF\uFFFC\uFFFD{}\\s]+'.format(''.join(blocks))
    collapsed = re.sub(undesirable, ' ', normalized)

    # unicodedata.name returns the supplied default (None) for characters that
    # have no name; those characters are dropped.
    named_only = ''.join(
        c for c in collapsed if unicodedata.name(c, None) is not None
    )
    return named_only.strip()

# def clean(s):
#     s = unicodedata.normalize('NFKC', s)
#     s = re.sub('[\u0000-\u0020\u007F-\uFFFF]+', ' ', s)
#     return s.strip()

In [8]:
# Survey which characters outside printable ASCII (0x20-0x7E) survive clean()
# in the training descriptions, and print each one with its code point.
# The lower bound is `< 0x20` so that every C0 control character counts;
# the previous `<= 0x19` skipped 0x1A-0x1F.
chrs = sorted({
    c
    for desc in descs_train
    for c in clean(desc)
    if ord(c) < 0x20 or ord(c) >= 0x7f
})
for c in chrs:
    print(repr(c), hex(ord(c)))

'§' 0xa7
'©' 0xa9
'«' 0xab
'®' 0xae
'°' 0xb0
'·' 0xb7
'»' 0xbb
'Í' 0xcd
'Ñ' 0xd1
'Ó' 0xd3
'á' 0xe1
'â' 0xe2
'ã' 0xe3
'ç' 0xe7
'é' 0xe9
'ê' 0xea
'í' 0xed
'ñ' 0xf1
'ó' 0xf3
'ú' 0xfa
'œ' 0x153
'́' 0x301
'̈' 0x308
'–' 0x2013
'—' 0x2014
'‘' 0x2018
'’' 0x2019
'“' 0x201c
'”' 0x201d
'•' 0x2022
'⁄' 0x2044
'€' 0x20ac
'▪' 0x25aa
'►' 0x25ba
'●' 0x25cf
'◦' 0x25e6
'✓' 0x2713
'✔' 0x2714


In [9]:
def has_item(db, path):
    """Return True if `path` is a key in `db`, False otherwise.

    `path` may be a str (it is encoded as UTF-8 first) or an already-encoded
    key. Presence is probed with ``db.Get``; a ``KeyError`` from that call
    means the key is absent.
    """
    key = path.encode('UTF-8') if isinstance(path, str) else path
    try:
        db.Get(key)
    except KeyError:
        return False
    return True

# for i, desc in tqdm(enumerate(descs_train), total=len(descs_train)):
#     desc = clean(desc)
#     path = f'train/{i}/tokenize'
#     if not has_item(db, path):
#         ts = parser.tokenize(desc)
#         db.Put(path.encode('UTF-8'), json.dumps(ts).encode('UTF-8'))

# for i, desc in tqdm(enumerate(descs_test), total=len(descs_test)):
#     desc = clean(desc)
#     path = f'test/{i}/tokenize'
#     if not has_item(db, path):
#         ts = parser.tokenize(desc)
#         db.Put(path.encode('UTF-8'), json.dumps(ts).encode('UTF-8'))

def _cache_lemmas(db, split, descs):
    """Lemmatize every description of one split and cache the result in `db`.

    For description number ``i`` the result is stored as UTF-8 JSON under the
    key ``'<split>/<i>/lemmatize'``. Descriptions whose key already exists are
    skipped, so re-running the cell only processes what is missing.

    Parameters
    ----------
    db : LevelDB-like object providing ``Get`` and ``Put``.
    split : str
        Key prefix, e.g. ``'train'`` or ``'test'``.
    descs : sequence of str
        Raw descriptions, in dataset order.
    """
    for i, desc in tqdm(enumerate(descs), total=len(descs)):
        path = f'{split}/{i}/lemmatize'
        if has_item(db, path):
            continue
        # Clean only when there is work to do; cached entries need no cleaning.
        # parser.lemmatize yields a nested sequence of strings (presumably one
        # list of lemmas per sentence — confirm against parser).
        ls = parser.lemmatize(clean(desc))
        ls = [[x.lower() for x in xs] for xs in ls]
        db.Put(path.encode('UTF-8'), json.dumps(ls).encode('UTF-8'))


_cache_lemmas(db, 'train', descs_train)
_cache_lemmas(db, 'test', descs_test)







In [10]:
# Rebuild one flat token list per training description from the cached lemmas.
# (A disabled variant kept only the words contained in `support`, the
# vocabulary assembled in the feature-selection cell further down.)
documents = []
for i in range(len(descs_train)):
    sents = json.loads(db.Get(f'train/{i}/lemmatize'.encode('UTF-8')))
    documents.append([w for sent in sents for w in sent])

# Term-frequency dictionary per document.
tf_list = [collections.Counter(document) for document in documents]

# Vectorize the counts; sort=False keeps feature order as first encountered.
dict_vectorizer = sklearn.feature_extraction.DictVectorizer(sort=False)
X_train = dict_vectorizer.fit_transform(tf_list)
y_train = pandas.DataFrame(tags_train)

# Persist features and labels for reuse outside this notebook.
sklearn.externals.joblib.dump(X_train, 'X_train.pkl')
sklearn.externals.joblib.dump(y_train, 'y_train.pkl')

['y_train.pkl']

In [40]:
def predict_train(clf, params, X_train, y_train, n_iter=10, cv=5, n_jobs=-1,
                  random_state=None):
    """Tune `clf` per tag and return out-of-fold predictions for every tag.

    For each column of `y_train` a randomized hyper-parameter search
    (scored by F1) is fitted on the full training data, and the best estimator
    found is then passed to ``cross_val_predict`` to obtain one prediction per
    training row.

    NOTE: the hyper-parameters are selected using all rows before the
    cross-validated prediction, so scores computed from the result are
    optimistic.

    Parameters
    ----------
    clf : estimator
        Unfitted scikit-learn estimator used as the search template.
    params : dict
        Parameter distributions / candidate lists for the randomized search.
    X_train : feature matrix accepted by `clf`.
    y_train : pandas.DataFrame
        One column per tag.
    n_iter : int, default 10
        Number of parameter settings sampled per tag.
    cv : int, default 5
        Number of folds, used for both the search and the prediction.
    n_jobs : int, default -1
        Parallelism passed to both scikit-learn calls.
    random_state : int or None, default None
        Seed for the randomized search; pass an int for reproducible sampling.
        None keeps the previous unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        Predictions with the same columns as `y_train`.
    """
    y_pred = pandas.DataFrame(columns=y_train.columns)
    for column in y_train.columns:
        target = y_train[column]
        search = sklearn.model_selection.RandomizedSearchCV(
            clf, params, n_iter=n_iter, scoring='f1', cv=cv, n_jobs=n_jobs,
            random_state=random_state,
        )
        search.fit(X_train, target)
        y_pred[column] = sklearn.model_selection.cross_val_predict(
            search.best_estimator_, X_train, target, cv=cv, n_jobs=n_jobs,
        )
    return y_pred

def f1score(y_true, y_pred):
    """Micro-averaged F1 over all tag columns.

    True positives, false positives and false negatives are summed across
    every column of `y_true`, and F1 is computed from those totals:
    ``2*TP / (2*TP + FP + FN)`` (algebraically equal to ``2*P*R / (P + R)``).

    Parameters
    ----------
    y_true : pandas.DataFrame
        Ground-truth labels, one truthy/falsy column per tag.
    y_pred : pandas.DataFrame
        Predicted labels with the same columns and the same row order
        (rows are compared by position, not by index label).

    Returns
    -------
    float
        The micro F1 score, or 0.0 when there are no true positives, false
        positives or false negatives at all (score undefined).
    """
    tp = fp = fn = 0
    for column in y_true.columns:
        # Use the y_true argument, not a notebook-level global.
        truth = y_true[column].values.astype(bool)
        guess = y_pred[column].values.astype(bool)
        tp += int((truth & guess).sum())
        fp += int((~truth & guess).sum())
        fn += int((truth & ~guess).sum())

    denominator = 2 * tp + fp + fn
    if denominator == 0:
        # Nothing positive in either frame: avoid 0/0.
        return 0.0
    return 2 * tp / denominator

In [41]:
# Disabled baseline, kept for reference: per-tag cross-validated F1 of a
# multinomial naive Bayes classifier.
# clf = sklearn.naive_bayes.MultinomialNB()
# for i in range(len(y_train.columns)):
#     print(i, sklearn.model_selection.cross_val_score(clf, X_train, y_train.iloc[:,i], scoring='f1', cv=5, n_jobs=-1))

clf = sklearn.linear_model.PassiveAggressiveClassifier()
params = {
    'loss': ['hinge', 'squared_hinge'],
    # 1000 candidate values for C, log-spaced from 1e-10 to 1e10.
    # numpy.logspace replaces `10**scipy.linspace(...)`: the scipy.linspace
    # alias of the NumPy function is deprecated.
    'C': numpy.logspace(-10, 10, 1000),
}
y_pred = predict_train(clf, params, X_train, y_train)
# Micro-averaged F1 of the out-of-fold predictions (shown as the cell output).
f1score(y_train, y_pred)







  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)












  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)












  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)












  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)












  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)












  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)












  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)












  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'











  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)












  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)












  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)












  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)








0.43844966279353209

In [17]:
# NOTE(review): 'estms.pkl' is not written by any cell visible in this
# notebook, so this cell depends on an artifact produced elsewhere. It is a
# pickle-based file: only load it if it comes from a trusted source.
estms = sklearn.externals.joblib.load('estms.pkl')

# Rebuild one flat token list per test description from the cached lemmas.
documents = []
for i in range(len(descs_test)):
    sents = json.loads(db.Get(f'test/{i}/lemmatize'.encode('UTF-8')))
    documents.append([w for sent in sents for w in sent])

# Term frequencies, mapped into the feature space fitted on the training set.
tf_list = [collections.Counter(document) for document in documents]
X_test = dict_vectorizer.transform(tf_list)

# One estimator per tag column: refit on the full training data, then predict
# the test descriptions. estms[i] is paired with the i-th column of y_train.
y_test = pandas.DataFrame(columns=y_train.columns)
for i, column in enumerate(y_test.columns):
    estms[i].fit(X_train, y_train[column])
    y_test[column] = estms[i].predict(X_test)

In [18]:
# Write the submission file: a 'tags' header line, then one line per test row
# listing the space-separated names of the tags predicted as truthy.
with open('tags.tsv', 'w') as f:
    f.write('tags' + '\n')

    for _, predictions in y_test.iterrows():
        tags = [
            key
            for key, value in zip(predictions.index, predictions)
            if value
        ]
        f.write(' '.join(tags) + '\n')

In [28]:
# Show the tag names, then list up to 20 features with the largest non-zero
# importance for the estimator at index 4 (column 'bs-degree-needed' in the
# index displayed below).
display(y_train.columns)
dt = estms[4]
# Pair each importance with its feature index and rank from highest to lowest.
imp = sorted(
    zip(dt.feature_importances_, range(len(dt.feature_importances_))),
    reverse=True,
)
for weight, feature_index in imp[:20]:
    if weight == 0:
        # Everything after this point has zero importance as well.
        break
    print((weight, feature_index))
    print(dict_vectorizer.feature_names_[feature_index])

Index(['1-year-experience-needed', '2-4-years-experience-needed',
       '5-plus-years-experience-needed', 'associate-needed',
       'bs-degree-needed', 'full-time-job', 'hourly-wage', 'licence-needed',
       'ms-or-phd-needed', 'part-time-job', 'salary', 'supervising-job'],
      dtype='object')

(0.5051058098290987, 795)
bachelor
(0.12252048392463359, 104)
degree
(0.057115253604052056, 3958)
bs
(0.02659455422308141, 992)
school
(0.023878339278605949, 1467)
associates
(0.021401591840175697, 1297)
diploma
(0.02087872007003988, 2356)
b.s.
(0.015717035334094982, 1792)
ba
(0.015501799431600925, 2540)
ba/bs
(0.014729265593691056, 2606)
bachelors
(0.0099567124256269048, 111)
year
(0.0082353502052149516, 1362)
bs/ba
(0.0069001250213058469, 495)
project
(0.0061584074963758331, 2861)
science
(0.0056561294429603275, 4)
a
(0.0051635485370684761, 1497)
community
(0.0051438953325976975, 4381)
negotiate
(0.0046345308738853395, 6406)
b.a.
(0.0045877835368413828, 3321)
developer
(0.0045480649209406945, 1742)
tuition


In [14]:
# Drop this notebook's reference to the LevelDB handle.
# NOTE(review): presumably this is what closes the store and releases its
# lock — confirm against the leveldb binding.
del db

In [35]:
# Union of the feature indices that SelectFromModel keeps for each of the
# 12 already-fitted per-tag estimators (12 = number of tag columns).
features = []
for i in range(12):
    selector = sklearn.feature_selection.SelectFromModel(estms[i], prefit=True)
    features.extend(selector.get_support(indices=True))
features = sorted(set(features))

# Translate the selected indices back into vocabulary words.
support = {dict_vectorizer.feature_names_[index] for index in features}