In [18]:
# Problem statement: https://www.hackerrank.com/contests/indeed-ml-codesprint-2017/challenges/tagging-raw-job-descriptions
## Tagging raw job description from 12 tags

## NLP Pipeline: sentence->word->stemming->stop words->count_vectorizer->tfidf->XGBBoost

import pandas as pd
import nltk, re
from nltk import word_tokenize
from collections import defaultdict

df_train = pd.read_csv('train.tsv', sep='\t')
# print df_train.head()
print df_train.describe()
df_test = pd.read_csv('test.tsv',sep='\t')
# df_test.head()
df_test.shape

                               tags  \
count                          3504   
unique                          595   
top     2-4-years-experience-needed   
freq                            214   

                                              description  
count                                                4375  
unique                                               4375  
top     JOB SUMMARY  Maintain employee enrollment data...  
freq                                                    1  


(2921, 1)

In [19]:
tags = ["part-time-job", "full-time-job", "hourly-wage", "salary", "associate-needed", "bs-degree-needed", 
        "ms-or-phd-needed", "licence-needed", "1-year-experience-needed", "2-4-years-experience-needed", 
        "5-plus-years-experience-needed", "supervising-job"]

In [20]:
def preprocess(tag, train_df, test_df):
    
    x_train=[]
    y_train=[]
    for index,row in train_df.iterrows():
        x_train.append(row.description.lower())
        if type(row.name)==str and tag in row.name: ## some row names are empty that resolved by first condition
            y_train.append(1)
        else:
            y_train.append(0)
    
    x_test = []
    
    for index, row in test_df.iterrows():
        x_test.append(row.description.lower())
        
    return x_train,y_train,x_test
            

In [21]:
# http://scikit-learn.org/stable/modules/feature_extraction.html
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 

class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self,sentence):
        return [(self.wnl.lemmatize(word)) for word in word_tokenise(sentence)]


In [22]:
# https://github.com/scikit-learn/scikit-learn/issues/1156
# Snowball stemmers could be used as a dependency
from nltk.stem import SnowballStemmer

class build_stemmer(object):
    def __init__(self):
        self.wns = SnowballStemmer('english')
    def __call__(self, doc):
        return [self.wns.stem(t) for t in word_tokenize(doc)]
    

In [23]:
from nltk.stem.porter import *

class build_stemmer2(object):
    def __init__(self):
        self.wns = PorterStemmer()
    def __call__(self, doc):
        return [self.wns.stem(t) for t in word_tokenize(doc)] 

In [24]:
class lemma_stemmer(object):
    def __init__(self):
        self.wns = SnowballStemmer('english')
        self.wnl = WordNetLemmatizer() 
    def __call__(self, doc):
        return [self.wnl.lemmatize(self.wns.stem(t)) for t in word_tokenize(doc)]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

import xgboost as xgb
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn import grid_search
import random
random.seed(1307)

def pred(tag, x_train, y_train, x_test):
    
    count_vect = CountVectorizer(tokenizer=build_stemmer(), ngram_range=(1, 3)) ## snowball stemmer standalone working is much better
    x_tr_count = count_vect.fit_transform(x_train)
    print x_tr_count.shape
    
    x_te_count = count_vect.transform(x_test)
    print x_te_count.shape
    
    tfidf_transformer = TfidfTransformer()
    x_tr_tfidf = tfidf_transformer.fit_transform(x_tr_count)
    print x_tr_tfidf.shape
    
    x_te_tfidf = tfidf_transformer.transform(x_te_count)
    print x_te_tfidf.shape
    
    print len(y_train)
    
    clf = xgb.XGBClassifier()
    
    param_grid = {
#        'scale_pos_weight' : [2, 2.5],
#         'learning_rate': [0.05],
#         'max_depth': [5, 7],
#         'subsample': [0.7, 0.8],
        'n_estimators': [100, 150, 200],
    }
    
    model = grid_search.GridSearchCV(estimator=clf, param_grid=param_grid, n_jobs=5, cv=3, verbose=20, scoring = 'f1_micro')
    
    model.fit(x_tr_tfidf, y_train)
    
    predicted = model.predict(x_te_tfidf)
    
    print predicted.shape
    
    print(sum(predicted))
    return predicted

In [None]:
outputs = [""] * 2921

for i, tag in enumerate(tags):

    x_train, y_train, docs_new  = preprocess(tag, df_train, df_test)

    output = pred(tag, x_train, y_train, docs_new)
    
    for j, item in enumerate(output):
        if item == 1:
            if outputs[j] == "":
                outputs[j] = tag
            else:
                outputs[j] += " " + tag
            
for i, item in enumerate(outputs):
    if item == "":
        outputs[i] = " "

(4375, 1106916)
(2921, 1106916)
(4375, 1106916)
(2921, 1106916)
4375
Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] n_estimators=100 ................................................
[CV] n_estimators=100 ................................................
[CV] n_estimators=100 ................................................
[CV] n_estimators=150 ................................................
[CV] n_estimators=150 ................................................
[CV] ....................... n_estimators=100, score=1.000000 -  48.7s
[CV] n_estimators=150 ................................................
[CV] ....................... n_estimators=100, score=1.000000 -  48.8s
[CV] n_estimators=200 ................................................


[Parallel(n_jobs=5)]: Done   1 tasks      | elapsed:   49.0s
[Parallel(n_jobs=5)]: Done   2 out of   9 | elapsed:   49.1s remaining:  2.9min


[CV] ....................... n_estimators=100, score=1.000000 -  49.3s
[CV] n_estimators=200 ................................................


[Parallel(n_jobs=5)]: Done   3 out of   9 | elapsed:   49.7s remaining:  1.7min


[CV] ....................... n_estimators=150, score=1.000000 - 1.2min
[CV] n_estimators=200 ................................................


[Parallel(n_jobs=5)]: Done   4 out of   9 | elapsed:  1.2min remaining:  1.5min


[CV] ....................... n_estimators=150, score=1.000000 - 1.2min


[Parallel(n_jobs=5)]: Done   5 out of   9 | elapsed:  1.2min remaining:   56.6s


[CV] ....................... n_estimators=150, score=1.000000 - 1.0min


[Parallel(n_jobs=5)]: Done   6 out of   9 | elapsed:  1.8min remaining:   54.7s


[CV] ....................... n_estimators=200, score=1.000000 - 1.3min


[Parallel(n_jobs=5)]: Done   7 out of   9 | elapsed:  2.1min remaining:   35.5s


[CV] ....................... n_estimators=200, score=1.000000 - 1.2min
[CV] ....................... n_estimators=200, score=1.000000 - 1.1min


[Parallel(n_jobs=5)]: Done   9 out of   9 | elapsed:  2.2min remaining:    0.0s
[Parallel(n_jobs=5)]: Done   9 out of   9 | elapsed:  2.2min finished


(2921,)
0
(4375, 1106916)
(2921, 1106916)
(4375, 1106916)
(2921, 1106916)
4375
Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] n_estimators=100 ................................................
[CV] n_estimators=100 ................................................
[CV] n_estimators=100 ................................................
[CV] n_estimators=150 ................................................
[CV] n_estimators=150 ................................................
[CV] ....................... n_estimators=100, score=1.000000 -  56.2s
[CV] ....................... n_estimators=100, score=1.000000 -  56.2s
[CV] n_estimators=200 ................................................
[CV] n_estimators=150 ................................................


[Parallel(n_jobs=5)]: Done   1 tasks      | elapsed:   56.5s
[Parallel(n_jobs=5)]: Done   2 out of   9 | elapsed:   56.5s remaining:  3.3min


[CV] ....................... n_estimators=100, score=1.000000 -  57.3s
[CV] n_estimators=200 ................................................


[Parallel(n_jobs=5)]: Done   3 out of   9 | elapsed:   57.7s remaining:  1.9min


[CV] ....................... n_estimators=150, score=1.000000 - 1.4min
[CV] ....................... n_estimators=150, score=1.000000 - 1.4min
[CV] n_estimators=200 ................................................


[Parallel(n_jobs=5)]: Done   4 out of   9 | elapsed:  1.4min remaining:  1.8min
[Parallel(n_jobs=5)]: Done   5 out of   9 | elapsed:  1.4min remaining:  1.1min


[CV] ....................... n_estimators=150, score=1.000000 - 1.3min


[Parallel(n_jobs=5)]: Done   6 out of   9 | elapsed:  2.2min remaining:  1.1min


[CV] ....................... n_estimators=200, score=1.000000 - 1.5min


[Parallel(n_jobs=5)]: Done   7 out of   9 | elapsed:  2.4min remaining:   41.9s


[CV] ....................... n_estimators=200, score=1.000000 - 1.5min
[CV] ....................... n_estimators=200, score=1.000000 - 1.2min


[Parallel(n_jobs=5)]: Done   9 out of   9 | elapsed:  2.6min remaining:    0.0s
[Parallel(n_jobs=5)]: Done   9 out of   9 | elapsed:  2.6min finished


(2921,)
0
(4375, 1106916)
(2921, 1106916)
(4375, 1106916)
(2921, 1106916)
4375
Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] n_estimators=100 ................................................
[CV] n_estimators=100 ................................................
[CV] n_estimators=100 ................................................
[CV] n_estimators=150 ................................................
[CV] n_estimators=150 ................................................
[CV] ....................... n_estimators=100, score=1.000000 -  53.2s
[CV] n_estimators=150 ................................................


[Parallel(n_jobs=5)]: Done   1 tasks      | elapsed:   53.4s


[CV] ....................... n_estimators=100, score=1.000000 -  54.2s
[CV] n_estimators=200 ................................................


[Parallel(n_jobs=5)]: Done   2 out of   9 | elapsed:   54.5s remaining:  3.2min


[CV] ....................... n_estimators=100, score=1.000000 -  54.7s
[CV] n_estimators=200 ................................................


[Parallel(n_jobs=5)]: Done   3 out of   9 | elapsed:   55.0s remaining:  1.8min


[CV] ....................... n_estimators=150, score=1.000000 - 1.3min
[CV] n_estimators=200 ................................................


[Parallel(n_jobs=5)]: Done   4 out of   9 | elapsed:  1.3min remaining:  1.7min


[CV] ....................... n_estimators=150, score=1.000000 - 1.3min


[Parallel(n_jobs=5)]: Done   5 out of   9 | elapsed:  1.3min remaining:  1.1min


[CV] ....................... n_estimators=150, score=1.000000 - 1.1min


[Parallel(n_jobs=5)]: Done   6 out of   9 | elapsed:  2.0min remaining:  1.0min


[CV] ....................... n_estimators=200, score=1.000000 - 1.3min


[Parallel(n_jobs=5)]: Done   7 out of   9 | elapsed:  2.2min remaining:   38.5s


[CV] ....................... n_estimators=200, score=1.000000 - 1.3min
[CV] ....................... n_estimators=200, score=1.000000 - 1.1min


[Parallel(n_jobs=5)]: Done   9 out of   9 | elapsed:  2.4min remaining:    0.0s
[Parallel(n_jobs=5)]: Done   9 out of   9 | elapsed:  2.4min finished


(2921,)
0
(4375, 1106916)
(2921, 1106916)
(4375, 1106916)
(2921, 1106916)
4375
Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] n_estimators=100 ................................................
[CV] n_estimators=100 ................................................
[CV] n_estimators=100 ................................................
[CV] n_estimators=150 ................................................
[CV] n_estimators=150 ................................................
[CV] ....................... n_estimators=100, score=1.000000 -  56.2s
[CV] n_estimators=150 ................................................


[Parallel(n_jobs=5)]: Done   1 tasks      | elapsed:   56.5s


[CV] ....................... n_estimators=100, score=1.000000 -  56.4s
[CV] n_estimators=200 ................................................
[CV] ....................... n_estimators=100, score=1.000000 -  56.5s


[Parallel(n_jobs=5)]: Done   2 out of   9 | elapsed:   56.7s remaining:  3.3min
[Parallel(n_jobs=5)]: Done   3 out of   9 | elapsed:   56.9s remaining:  1.9min


[CV] n_estimators=200 ................................................
[CV] ....................... n_estimators=150, score=1.000000 - 1.3min
[CV] n_estimators=200 ................................................


[Parallel(n_jobs=5)]: Done   4 out of   9 | elapsed:  1.3min remaining:  1.6min


[CV] ....................... n_estimators=150, score=1.000000 - 1.3min


[Parallel(n_jobs=5)]: Done   5 out of   9 | elapsed:  1.3min remaining:  1.1min


[CV] ....................... n_estimators=150, score=1.000000 -  59.7s


[Parallel(n_jobs=5)]: Done   6 out of   9 | elapsed:  1.9min remaining:   58.1s


[CV] ....................... n_estimators=200, score=1.000000 - 1.2min


[Parallel(n_jobs=5)]: Done   7 out of   9 | elapsed:  2.2min remaining:   37.6s


[CV] ....................... n_estimators=200, score=1.000000 - 1.3min
[CV] ....................... n_estimators=200, score=1.000000 - 1.1min


[Parallel(n_jobs=5)]: Done   9 out of   9 | elapsed:  2.4min remaining:    0.0s
[Parallel(n_jobs=5)]: Done   9 out of   9 | elapsed:  2.4min finished


(2921,)
0
(4375, 1106916)
(2921, 1106916)
(4375, 1106916)
(2921, 1106916)
4375
Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] n_estimators=100 ................................................
[CV] n_estimators=100 ................................................
[CV] n_estimators=100 ................................................
[CV] n_estimators=150 ................................................
[CV] n_estimators=150 ................................................
[CV] ....................... n_estimators=100, score=1.000000 -  46.9s
[CV] n_estimators=150 ................................................


[Parallel(n_jobs=5)]: Done   1 tasks      | elapsed:   47.2s


[CV] ....................... n_estimators=100, score=1.000000 -  47.6s
[CV] n_estimators=200 ................................................


[Parallel(n_jobs=5)]: Done   2 out of   9 | elapsed:   47.9s remaining:  2.8min


[CV] ....................... n_estimators=100, score=1.000000 -  48.0s
[CV] n_estimators=200 ................................................


[Parallel(n_jobs=5)]: Done   3 out of   9 | elapsed:   48.4s remaining:  1.6min


[CV] ....................... n_estimators=150, score=1.000000 - 1.1min
[CV] n_estimators=200 ................................................


[Parallel(n_jobs=5)]: Done   4 out of   9 | elapsed:  1.1min remaining:  1.4min


[CV] ....................... n_estimators=150, score=1.000000 - 1.1min


[Parallel(n_jobs=5)]: Done   5 out of   9 | elapsed:  1.1min remaining:   54.3s


[CV] ....................... n_estimators=150, score=1.000000 -  59.5s


[Parallel(n_jobs=5)]: Done   6 out of   9 | elapsed:  1.8min remaining:   53.4s


[CV] ....................... n_estimators=200, score=1.000000 - 1.2min
[CV] ....................... n_estimators=200, score=1.000000 - 1.2min


[Parallel(n_jobs=5)]: Done   7 out of   9 | elapsed:  2.0min remaining:   34.3s


[CV] ....................... n_estimators=200, score=1.000000 - 1.0min


[Parallel(n_jobs=5)]: Done   9 out of   9 | elapsed:  2.1min remaining:    0.0s
[Parallel(n_jobs=5)]: Done   9 out of   9 | elapsed:  2.1min finished


(2921,)
0
(4375, 1106916)
(2921, 1106916)
(4375, 1106916)
(2921, 1106916)
4375
Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] n_estimators=100 ................................................
[CV] n_estimators=100 ................................................
[CV] n_estimators=100 ................................................
[CV] n_estimators=150 ................................................
[CV] n_estimators=150 ................................................
[CV] ....................... n_estimators=100, score=1.000000 -  46.8s
[CV] n_estimators=150 ................................................


[Parallel(n_jobs=5)]: Done   1 tasks      | elapsed:   47.0s


[CV] ....................... n_estimators=100, score=1.000000 -  46.9s
[CV] n_estimators=200 ................................................
[CV] ....................... n_estimators=100, score=1.000000 -  46.9s
[CV] n_estimators=200 ................................................


[Parallel(n_jobs=5)]: Done   2 out of   9 | elapsed:   47.2s remaining:  2.8min
[Parallel(n_jobs=5)]: Done   3 out of   9 | elapsed:   47.2s remaining:  1.6min


[CV] ....................... n_estimators=150, score=1.000000 - 1.1min
[CV] n_estimators=200 ................................................


[Parallel(n_jobs=5)]: Done   4 out of   9 | elapsed:  1.1min remaining:  1.4min


[CV] ....................... n_estimators=150, score=1.000000 - 1.1min


[Parallel(n_jobs=5)]: Done   5 out of   9 | elapsed:  1.1min remaining:   53.5s


[CV] ....................... n_estimators=150, score=1.000000 -  56.9s


[Parallel(n_jobs=5)]: Done   6 out of   9 | elapsed:  1.7min remaining:   52.0s


[CV] ....................... n_estimators=200, score=1.000000 - 1.2min
[CV] ....................... n_estimators=200, score=1.000000 - 1.2min


[Parallel(n_jobs=5)]: Done   7 out of   9 | elapsed:  2.0min remaining:   33.8s


[CV] ....................... n_estimators=200, score=1.000000 - 1.0min


[Parallel(n_jobs=5)]: Done   9 out of   9 | elapsed:  2.1min remaining:    0.0s
[Parallel(n_jobs=5)]: Done   9 out of   9 | elapsed:  2.1min finished


(2921,)
0
(4375, 1106916)
(2921, 1106916)
(4375, 1106916)
(2921, 1106916)
4375
Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] n_estimators=100 ................................................
[CV] n_estimators=100 ................................................
[CV] n_estimators=100 ................................................
[CV] n_estimators=150 ................................................
[CV] n_estimators=150 ................................................
[CV] ....................... n_estimators=100, score=1.000000 -  45.8s
[CV] n_estimators=150 ................................................


[Parallel(n_jobs=5)]: Done   1 tasks      | elapsed:   46.0s


[CV] ....................... n_estimators=100, score=1.000000 -  46.3s
[CV] n_estimators=200 ................................................


[Parallel(n_jobs=5)]: Done   2 out of   9 | elapsed:   46.6s remaining:  2.7min


[CV] ....................... n_estimators=100, score=1.000000 -  46.5s
[CV] n_estimators=200 ................................................


[Parallel(n_jobs=5)]: Done   3 out of   9 | elapsed:   46.9s remaining:  1.6min


[CV] ....................... n_estimators=150, score=1.000000 - 1.1min
[CV] n_estimators=200 ................................................


[Parallel(n_jobs=5)]: Done   4 out of   9 | elapsed:  1.1min remaining:  1.4min


[CV] ....................... n_estimators=150, score=1.000000 - 1.1min


[Parallel(n_jobs=5)]: Done   5 out of   9 | elapsed:  1.1min remaining:   54.1s


[CV] ....................... n_estimators=150, score=1.000000 - 1.2min


[Parallel(n_jobs=5)]: Done   6 out of   9 | elapsed:  1.9min remaining:   58.3s


[CV] ....................... n_estimators=200, score=1.000000 - 1.4min


[Parallel(n_jobs=5)]: Done   7 out of   9 | elapsed:  2.2min remaining:   37.0s


[CV] ....................... n_estimators=200, score=1.000000 - 1.4min
[CV] ....................... n_estimators=200, score=1.000000 - 1.2min


[Parallel(n_jobs=5)]: Done   9 out of   9 | elapsed:  2.3min remaining:    0.0s
[Parallel(n_jobs=5)]: Done   9 out of   9 | elapsed:  2.3min finished


(2921,)
0
(4375, 1106916)
(2921, 1106916)
(4375, 1106916)
(2921, 1106916)
4375
Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] n_estimators=100 ................................................
[CV] n_estimators=100 ................................................
[CV] n_estimators=100 ................................................
[CV] n_estimators=150 ................................................
[CV] n_estimators=150 ................................................
[CV] ....................... n_estimators=100, score=1.000000 -  46.1s
[CV] n_estimators=150 ................................................
[CV] ....................... n_estimators=100, score=1.000000 -  46.1s
[CV] n_estimators=200 ................................................
[CV] ....................... n_estimators=100, score=1.000000 -  46.2s


[Parallel(n_jobs=5)]: Done   1 tasks      | elapsed:   46.4s
[Parallel(n_jobs=5)]: Done   2 out of   9 | elapsed:   46.4s remaining:  2.7min
[Parallel(n_jobs=5)]: Done   3 out of   9 | elapsed:   46.5s remaining:  1.6min


[CV] n_estimators=200 ................................................
[CV] ....................... n_estimators=150, score=1.000000 - 1.1min
[CV] ....................... n_estimators=150, score=1.000000 - 1.1min
[CV] n_estimators=200 ................................................


[Parallel(n_jobs=5)]: Done   4 out of   9 | elapsed:  1.1min remaining:  1.4min
[Parallel(n_jobs=5)]: Done   5 out of   9 | elapsed:  1.1min remaining:   52.9s


[CV] ....................... n_estimators=150, score=1.000000 -  57.8s


[Parallel(n_jobs=5)]: Done   6 out of   9 | elapsed:  1.7min remaining:   52.1s


[CV] ....................... n_estimators=200, score=1.000000 - 1.2min


[Parallel(n_jobs=5)]: Done   7 out of   9 | elapsed:  1.9min remaining:   33.2s


[CV] ....................... n_estimators=200, score=1.000000 - 1.2min
[CV] ....................... n_estimators=200, score=1.000000 -  59.6s


In [None]:
# Save to file
import csv

with open("tags_xgboost-cv.tsv", 'w') as myfile:
    wr = csv.writer(myfile)
    wr.writerow(["tags"]) 
    for val in outputs:
        wr.writerow([val])

