In [None]:
import numpy as np
import pandas as pd
import csv

import regex as re

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV

import xgboost as xgb

In [None]:
# Load the tab-separated training and test splits into DataFrames.
train_data_file, test_data_file = (
    pd.read_csv(tsv_path, sep="\t") for tsv_path in ("train.tsv", "test.tsv")
)

In [None]:
# Every tag the classifiers are trained to predict, as a single-column
# DataFrame (column label 0). Order matters: tags are processed, and appended
# to each prediction, in this order.
expected_tags = pd.DataFrame(
    [
        # employment type
        'part-time-job', 'full-time-job',
        # compensation
        'hourly-wage', 'salary',
        # education / licensing
        'associate-needed', 'bs-degree-needed', 'ms-or-phd-needed', 'licence-needed',
        # experience
        '1-year-experience-needed', '2-4-years-experience-needed', '5-plus-years-experience-needed',
        # seniority
        'supervising-job',
    ]
)

In [None]:
# Display the tag table (one row per tag) to confirm it was built as intended.
expected_tags

# Sample Output

In [None]:
# Preview only the first rows; dumping the whole frame bloats the notebook output.
train_data_file.head()

In [None]:
# Preview only the first rows; dumping the whole frame bloats the notebook output.
test_data_file.head()

# To remove rows with NaN values in column "tags"

In [None]:
# Keep only the rows that carry a label. `.copy()` detaches the result from the
# frame it was sliced from, so the column assignments made on
# `train_data_file` further down operate on an independent DataFrame instead of
# a slice (which makes pandas emit SettingWithCopyWarning).
train_data_file = train_data_file[train_data_file['tags'].notnull()].copy()

# To calculate number of rows and columns

In [None]:
# Dimensions of the training frame, and the number of rows in the test frame.
train_rows, train_cols = train_data_file.shape

test_rows = test_data_file.shape[0]

In [None]:
# Show the number of rows in the training frame.
train_rows

In [None]:
# Show the number of columns in the training frame.
train_cols

# To update the index numbers of the rows

In [None]:
# Renumber the rows 0..n-1 after the filtering above. reset_index derives the
# length from the frame itself, so this cell no longer depends on `train_rows`
# having been computed (and still being current) in another cell.
train_data_file = train_data_file.reset_index(drop=True)

# To strip non-ASCII characters from column "description"

In [None]:
def _to_ascii(text):
    """Return ``text`` with every non-ASCII character removed.

    Accepts either a text string or a UTF-8 encoded byte string. The previous
    ``x.decode("utf8").encode("ascii", "ignore")`` only worked on byte strings
    (Python 2 ``str``) and raises AttributeError on a Python 3 ``str``.
    """
    if isinstance(text, bytes):
        # Byte input (including Python 2 `str`): interpret as UTF-8 first.
        text = text.decode("utf8")
    # Round-trip through ASCII; "ignore" silently drops unencodable characters.
    return text.encode("ascii", "ignore").decode("ascii")


train_data_file['description'] = train_data_file['description'].map(_to_ascii)

test_data_file['description'] = test_data_file['description'].map(_to_ascii)

# To replace every run of non-word characters in column 'description' with a single space

In [None]:
# Collapse each run of non-word characters into one space. The pattern is a raw
# string: in a normal literal "\W" is an invalid escape sequence, which newer
# Python versions warn about.
train_data_file['description'] = train_data_file['description'].map(lambda x: re.sub(r"\W+", ' ', x))

In [None]:
# Collapse each run of non-word characters into one space. The pattern is a raw
# string: in a normal literal "\W" is an invalid escape sequence, which newer
# Python versions warn about.
test_data_file['description'] = test_data_file['description'].map(lambda x: re.sub(r"\W+", ' ', x))

# To remove stop words from the column 'description'

In [None]:
# Build the stop-word set once. The previous version called
# stopwords.words('english') for every single token and then did a linear scan
# of the returned list.
ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))

# Compare case-insensitively: the NLTK list is lowercase and this step runs
# before the text is lowercased, so "The" / "And" used to slip through.
train_data_file['description'] = train_data_file['description'].map(
    lambda text: ' '.join(
        word for word in text.split() if word.lower() not in ENGLISH_STOPWORDS
    )
)

In [None]:
# Build the stop-word set once. The previous version called
# stopwords.words('english') for every single token and then did a linear scan
# of the returned list.
ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))

# Compare case-insensitively: the NLTK list is lowercase and this step runs
# before the text is lowercased, so "The" / "And" used to slip through.
test_data_file['description'] = test_data_file['description'].map(
    lambda text: ' '.join(
        word for word in text.split() if word.lower() not in ENGLISH_STOPWORDS
    )
)

# To convert the column 'description' to lower case

In [None]:
# Lowercase every description using the vectorised string accessor instead of
# a per-element Python lambda.
train_data_file['description'] = train_data_file['description'].str.lower()

In [None]:
# Lowercase every description using the vectorised string accessor instead of
# a per-element Python lambda.
test_data_file['description'] = test_data_file['description'].str.lower()

# To build, for a given tag, a 0/1 label vector marking which descriptions carry that tag

In [None]:
def create_mapping(tag_key, train):
    """Build the binary target vector for one tag.

    Parameters
    ----------
    tag_key : str
        The tag to look for.
    train : pandas.DataFrame
        Frame with a ``tags`` column holding each row's tag string.

    Returns
    -------
    list of int
        One entry per row of ``train``, in row order: 1 when ``tag_key``
        occurs in that row's ``tags`` value, otherwise 0. Rows whose ``tags``
        value is missing yield 0 instead of raising TypeError.
    """
    # Read the column directly; iterrows() materialises a Series for every row
    # just to access one field. Missing values become '' so the `in` test is
    # always string-in-string.
    # NOTE(review): this is a substring test, so a tag whose name is contained
    # in another tag's name would also match. Confirm the delimiter used in
    # `tags` before switching to exact token matching.
    return [int(tag_key in row_tags) for row_tags in train['tags'].fillna('')]


# To create stem tokenizer

In [None]:
class ss_tokenizer(object):
    """Callable tokenizer: splits text with NLTK's word_tokenize and stems each token."""

    def __init__(self):
        # English Snowball stemmer, created once per tokenizer instance.
        self.wns = SnowballStemmer('english')

    def __call__(self, value):
        """Return the list of stemmed tokens for the string ``value``."""
        stem = self.wns.stem
        return list(map(stem, word_tokenize(value)))

# XGBoost Approach

In [None]:
def xgb_classifier(tag_key, train_text, tag_train_mapping, test_text):
    """Train a binary XGBoost model for one tag and predict it on the test text.

    Parameters
    ----------
    tag_key : str
        Name of the tag being modelled; only used in the progress printout.
    train_text : iterable of str
        Training documents.
    tag_train_mapping : sequence of int
        0/1 target for each training document, in the same order.
    test_text : iterable of str
        Documents to predict.

    Returns
    -------
    The value returned by ``GridSearchCV.predict`` for ``test_text`` (one
    prediction per test document).
    """
    # Bag of stemmed 1- to 4-grams, then TF-IDF weighting. Both transformers
    # are fitted on the training text only and re-used for the test text.
    count_vect = CountVectorizer(tokenizer = ss_tokenizer(), ngram_range = (1,4))
    X_train_counts = count_vect.fit_transform(train_text)

    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

    clf = xgb.XGBClassifier()

    # Only the number of boosting rounds is tuned.
    param_grid = {
        'n_estimators': [100,150,200],
    }

    model = GridSearchCV(estimator=clf, param_grid=param_grid, n_jobs=2, verbose=20, scoring = 'f1_micro')

    model.fit(X_train_tfidf, tag_train_mapping)

    X_new_counts = count_vect.transform(test_text)
    X_new_tfidf = tfidf_transformer.transform(X_new_counts)

    predicted = model.predict(X_new_tfidf)

    # Progress line: number of positive predictions, then the tag. Written as
    # a single formatted string so it prints identically on Python 2 and 3
    # (the bare `print a, b` statement is a SyntaxError on Python 3).
    print("%s %s" % (sum(predicted), tag_key))

    return predicted

# Driver: run the classifier once per tag and assemble the predicted tags for each test row

In [None]:
def solution1_1(classifier):
    """Predict every expected tag for every test row.

    ``classifier`` is called once per tag as
    ``classifier(tag, train_descriptions, train_labels, test_descriptions)``
    and must return one prediction per test row. The function reads the
    notebook globals ``expected_tags``, ``train_data_file``,
    ``test_data_file`` and ``test_rows``.

    Returns a list with ``test_rows`` strings: the space-separated tags
    predicted for that row, or a single space when no tag was predicted.
    """
    # A single space marks "no tag assigned yet".
    para_tags = [" "] * test_rows

    for tag in expected_tags.iloc[:, 0]:
        tag_train_mapping = create_mapping(tag, train_data_file)
        output = classifier(tag, train_data_file['description'], tag_train_mapping, test_data_file['description'])

        for position, flag in enumerate(output):
            if flag != 1:
                continue
            if para_tags[position] == " ":
                # First tag for this row replaces the placeholder.
                para_tags[position] = tag
            else:
                para_tags[position] = para_tags[position] + " " + tag

    return para_tags

In [None]:
# Run the per-tag pipeline with the XGBoost classifier; the result holds one
# tag string per test row.
run2 = solution1_1(xgb_classifier)

In [None]:
# Preview the first predictions only; the full list has one entry per test row.
run2[:10]