In [1]:
import numpy as np
import pandas as pd
import csv

import regex as re

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV

import xgboost as xgb

In [2]:
#read train.tsv using csv
#with open('train.tsv') as train_data_file:
#    for line in csv.reader(train_data_file, delimiter = "\t"):
#        print line

In [3]:
#read train.tsv using pandas
# Load the training and test sets. Both are tab-separated files in the
# working directory and are read with identical options.
train_data_file, test_data_file = (
    pd.read_csv(tsv_path, sep = "\t") for tsv_path in ("train.tsv", "test.tsv")
)

In [4]:
# The twelve tags a job description can be labelled with. They are kept as a
# one-column DataFrame (column label 0), one tag per row, in this order.
tag_names = [
    'part-time-job',
    'full-time-job',
    'hourly-wage',
    'salary',
    'associate-needed',
    'bs-degree-needed',
    'ms-or-phd-needed',
    'licence-needed',
    '1-year-experience-needed',
    '2-4-years-experience-needed',
    '5-plus-years-experience-needed',
    'supervising-job',
]
expected_tags = pd.DataFrame(tag_names)

In [5]:
expected_tags

Unnamed: 0,0
0,part-time-job
1,full-time-job
2,hourly-wage
3,salary
4,associate-needed
5,bs-degree-needed
6,ms-or-phd-needed
7,licence-needed
8,1-year-experience-needed
9,2-4-years-experience-needed


# Sample Output

In [6]:
train_data_file

Unnamed: 0,tags,description
0,licence-needed supervising-job 5-plus-years-ex...,THE COMPANY Employer is a midstream service...
1,2-4-years-experience-needed salary full-time-job,ICR Staffing is now accepting resumes for Indu...
2,part-time-job,This is a great position for the right person....
3,licence-needed,A large multi-specialty health center is expan...
4,5-plus-years-experience-needed full-time-job b...,JOB PURPOSE: The Account Director is respon...
5,associate-needed 5-plus-years-experience-needed,"At Cottage Health System, our facilities are s..."
6,full-time-job ms-or-phd-needed 5-plus-years-ex...,*E Learning Instructional Designer with at lea...
7,bs-degree-needed 5-plus-years-experience-needed,Sales and Service Representative SAF-Hollan...
8,bs-degree-needed 2-4-years-experience-needed,"SAF-Holland, Inc. is currently seeking an Inte..."
9,full-time-job hourly-wage,Immediate opening for full time staffing coord...


In [7]:
test_data_file

Unnamed: 0,description
0,Integrity Home Care has an opening on our Lead...
1,We're looking for a precision grinder with at ...
2,In Home Sales Designer Georgia Shutters– Augus...
3,Advanced Radiotherapy Consulting is seeking an...
4,Position Requirements: · Professional ...
5,Hibachi Teppanyaki restaurant is hiring serve...
6,"CASTING Movie Extras, Actors, Models for TV, C..."
7,"Responsibilities: • Provides training, suppor..."
8,Informa plc is the leading international provi...
9,Our emphasis is on Excellence. We have positi...


# To remove rows with NaN values in column "tags"

In [8]:
# Keep only the rows whose "tags" value is present. The explicit .copy()
# makes the result an independent DataFrame rather than a slice of the frame
# that was read from disk, so assigning to its columns in later cells does not
# raise pandas' SettingWithCopyWarning.
train_data_file = train_data_file[train_data_file['tags'].notnull()].copy()

# To calculate number of rows and columns

In [9]:
# DataFrame.shape is (number of rows, number of columns).
train_rows, train_cols = train_data_file.shape

# Only the row count of the test set is needed.
test_rows = test_data_file.shape[0]

In [10]:
train_rows

3504

In [11]:
train_cols

2

# To update the index numbers of the rows

In [12]:
# Renumber the rows 0..n-1 after rows were filtered out. reset_index derives
# the new labels from the frame itself, so this cell no longer depends on
# `train_rows` still matching the current row count, and it returns a new
# frame instead of mutating the existing one in place.
train_data_file = train_data_file.reset_index(drop=True)

# To strip non-ASCII characters from column "description"

In [13]:
def _to_ascii(text):
    """Return `text` with every non-ASCII character dropped.

    Byte strings are first decoded as UTF-8. The result is the interpreter's
    native `str` type, so the same code runs on Python 2 (where the original
    `x.decode("utf8").encode("ascii", "ignore")` worked) and on Python 3
    (where `str` has no `decode` method).
    """
    if isinstance(text, bytes):
        text = text.decode("utf8")
    ascii_bytes = text.encode("ascii", "ignore")
    # On Python 2 `bytes` is `str`, so the encoded value is already native.
    if isinstance(ascii_bytes, str):
        return ascii_bytes
    return ascii_bytes.decode("ascii")


train_data_file['description'] = train_data_file['description'].map(_to_ascii)

test_data_file['description'] = test_data_file['description'].map(_to_ascii)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


# To replace each run of non-word characters in column 'description' with a single space

In [14]:
# Collapse every run of non-word characters (punctuation, whitespace, ...)
# into a single space. The pattern is a raw string so that "\W" is handed to
# the regex engine verbatim instead of being treated as a (invalid) string
# escape sequence.
train_data_file['description'] = train_data_file['description'].map(lambda x: re.sub(r"\W+", ' ', x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [15]:
# Collapse every run of non-word characters (punctuation, whitespace, ...)
# into a single space. The pattern is a raw string so that "\W" is handed to
# the regex engine verbatim instead of being treated as a (invalid) string
# escape sequence.
test_data_file['description'] = test_data_file['description'].map(lambda x: re.sub(r"\W+", ' ', x))

# To remove stop words from the column 'description'

In [16]:
# Build the stop-word collection once. Previously stopwords.words('english')
# was called again for every single token and searched as a list.
english_stopwords = frozenset(stopwords.words('english'))

# NOTE(review): the membership test is case-sensitive and this cell runs
# before the lower-casing cell, so capitalised stop words (e.g. "The") are
# presumably kept if the NLTK list is all lower-case - confirm whether that
# is intended before changing it, and change train and test together.
train_data_file['description'] = train_data_file['description'].map(
    lambda x: ' '.join(word for word in x.split() if word not in english_stopwords))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [17]:
# Build the stop-word collection once. Previously stopwords.words('english')
# was called again for every single token and searched as a list.
english_stopwords = frozenset(stopwords.words('english'))

# NOTE(review): the membership test is case-sensitive and this cell runs
# before the lower-casing cell, so capitalised stop words (e.g. "The") are
# presumably kept if the NLTK list is all lower-case - confirm whether that
# is intended before changing it, and change train and test together.
test_data_file['description'] = test_data_file['description'].map(
    lambda x: ' '.join(word for word in x.split() if word not in english_stopwords))

# To convert the column 'description' to lower case

In [18]:
# Lower-case every description using the vectorised string accessor.
train_data_file['description'] = train_data_file['description'].str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [19]:
# Lower-case every description using the vectorised string accessor.
test_data_file['description'] = test_data_file['description'].str.lower()

# To create mapping of tags to description

In [20]:
def create_mapping(tag_key, train):
    """Build the 0/1 target vector for one tag.

    Parameters
    ----------
    tag_key : str
        The tag to look for.
    train : pandas.DataFrame
        Frame with a 'tags' column holding whitespace-separated tag names.

    Returns
    -------
    list of int
        One entry per row of `train`, in row order: 1 when `tag_key` is one
        of the row's tags, 0 otherwise.

    The tag must match a whole whitespace-separated token; a plain substring
    test on the raw string would also fire on any tag that merely contains
    `tag_key`. Rows whose 'tags' value is missing count as having no tags.
    """
    # fillna('') turns missing values into an empty string, which splits
    # into an empty token list.
    return [int(tag_key in tags.split()) for tags in train['tags'].fillna('')]


# To create stem tokenizer

In [21]:
class ss_tokenizer(object):
    """Callable tokenizer: split text with `word_tokenize`, then stem each token.

    Instances are passed as the `tokenizer` argument of the vectorizers used
    by the classifiers in this notebook.
    """

    def __init__(self):
        # English Snowball stemmer, created once and reused on every call.
        self.wns = SnowballStemmer('english')

    def __call__(self, value):
        """Return the list of stemmed tokens of the string `value`."""
        stem = self.wns.stem
        stemmed_tokens = []
        for token in word_tokenize(value):
            stemmed_tokens.append(stem(token))
        return stemmed_tokens

# Naive Bayes Approach

In [22]:
def nb_classifier(tag_key, train_text, tag_train_mapping, test_text):
    """Fit a multinomial Naive Bayes model for one tag and predict the test set.

    Parameters
    ----------
    tag_key : str
        Name of the tag being modelled; only used in the progress line.
    train_text : iterable of str
        Training documents.
    tag_train_mapping : sequence of int
        0/1 label per training document.
    test_text : iterable of str
        Documents to predict.

    Returns
    -------
    The array returned by `MultinomialNB.predict`, one prediction per
    document in `test_text`.

    Side effect: prints "<number of positive predictions> <tag_key>".
    """
    # TfidfVectorizer is the single-step equivalent of CountVectorizer
    # followed by TfidfTransformer; features are stemmed 1- to 4-grams.
    vectorizer = TfidfVectorizer(tokenizer = ss_tokenizer(), ngram_range = (1,4))
    X_train_tfidf = vectorizer.fit_transform(train_text)

    clf = MultinomialNB(alpha = 0.001).fit(X_train_tfidf, tag_train_mapping)

    # The test set is transformed with the vocabulary and IDF weights learned
    # from the training set.
    X_new_tfidf = vectorizer.transform(test_text)

    predicted = clf.predict(X_new_tfidf)

    # Formatted explicitly so the same line is printed on Python 2 and 3.
    print("%d %s" % (sum(predicted), tag_key))

    return predicted
        

# XGBoost Approach

In [None]:
def xgb_classifier(tag_key, train_text, tag_train_mapping, test_text):
    """Fit a grid-searched XGBoost model for one tag and predict the test set.

    Parameters
    ----------
    tag_key : str
        Name of the tag being modelled; only used in the progress line.
    train_text : iterable of str
        Training documents.
    tag_train_mapping : sequence of int
        0/1 label per training document.
    test_text : iterable of str
        Documents to predict.

    Returns
    -------
    The array returned by `GridSearchCV.predict`, one prediction per
    document in `test_text`.

    Side effect: prints "<number of positive predictions> <tag_key>" (in
    addition to the grid search's own verbose output).
    """
    # TfidfVectorizer is the single-step equivalent of CountVectorizer
    # followed by TfidfTransformer; features are stemmed 1- to 4-grams.
    vectorizer = TfidfVectorizer(tokenizer = ss_tokenizer(), ngram_range = (1,4))
    X_train_tfidf = vectorizer.fit_transform(train_text)

    clf = xgb.XGBClassifier()

    # Only the number of boosting rounds is tuned.
    param_grid = {
        'n_estimators': [100, 150, 200],
    }

    # 3-fold cross-validated search scored by micro-averaged F1, run on 4 workers.
    model = GridSearchCV(estimator=clf, param_grid=param_grid, n_jobs=4, cv=3, verbose=30, scoring = 'f1_micro')

    model.fit(X_train_tfidf, tag_train_mapping)

    # The test set is transformed with the vocabulary and IDF weights learned
    # from the training set.
    X_new_tfidf = vectorizer.transform(test_text)

    predicted = model.predict(X_new_tfidf)

    # Formatted explicitly so the same line is printed on Python 2 and 3.
    print("%d %s" % (sum(predicted), tag_key))

    return predicted

# Driver: runs the given classifier once per tag and collects the predicted tags for each test row

In [24]:
def solution1_1(classifier):
    """Predict the tags of every test row with one binary model per tag.

    `classifier` is called once per entry of `expected_tags` as
    `classifier(tag, train_descriptions, train_labels, test_descriptions)`
    and must return one value per test row; a value equal to 1 means the tag
    applies to that row.

    Returns a list with `test_rows` strings. Each string holds the predicted
    tags of that row separated by single spaces, in `expected_tags` order;
    a row with no predicted tag is the single-space string " ".

    Reads the notebook-level names `test_rows`, `expected_tags`,
    `train_data_file` and `test_data_file`.
    """
    tags_per_row = [[] for _ in range(test_rows)]

    for position in range(len(expected_tags)):
        tag = expected_tags.iloc[position, 0]
        labels = create_mapping(tag, train_data_file)
        predictions = classifier(tag, train_data_file['description'], labels, test_data_file['description'])

        for row, flag in enumerate(predictions):
            if flag == 1:
                tags_per_row[row].append(tag)

    # Rows without any tag keep the single-space placeholder.
    return [" ".join(tags) if tags else " " for tags in tags_per_row]

In [25]:
# Predict the tags of every test row with the Naive Bayes classifier.
# nb_classifier prints the number of positive predictions for each tag as it
# goes.
run1 = solution1_1(nb_classifier)

63 part-time-job
247 full-time-job
136 hourly-wage
128 salary
31 associate-needed
699 bs-degree-needed
14 ms-or-phd-needed
225 licence-needed
50 1-year-experience-needed
320 2-4-years-experience-needed
275 5-plus-years-experience-needed
212 supervising-job


In [None]:
run1

In [None]:
# Predict the tags of every test row with the XGBoost classifier.
# xgb_classifier runs a cross-validated grid search for every tag and prints
# the number of positive predictions for each tag as it goes.
run2 = solution1_1(xgb_classifier)

203 part-time-job


In [None]:
run2