In [1]:
# https://www.kaggle.com/c/vacancy-classification-sf02

In [25]:
import pandas as pd
import nltk
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score

In [2]:
train = pd.read_csv('train.csv', sep='\t', index_col='id')
test = pd.read_csv('test.csv', sep='\t', index_col='id')
train.head()

Unnamed: 0_level_0,name,description,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Заведующий отделом/секцией в магазин YORK (Уру...,<p><strong>В НОВЫЙ МАГАЗИН YORK (хозтовары) пр...,1
1,Наладчик станков и манипуляторов с ПУ,Обязанности:работа на токарных станках с ЧПУ T...,0
2,Разработчик С++ (Криптограф),<strong>Требования:</strong> <ul> <li>Опыт про...,0
3,Фрезеровщик,<p>Условия:</p> <ul> <li>На работу вахтовым ме...,0
4,Мерчендайзер/продавец-консультант,<p><strong>Компания Палладиум Стандарт - призн...,1


In [9]:
train.loc[:, 'text'] = train['name']+' '+train['description']
test.loc[:, 'text'] = test['name']+' '+test['description']
train['text'][:5]

id
0    Заведующий отделом/секцией в магазин YORK (Уру...
1    Наладчик станков и манипуляторов с ПУ Обязанно...
2    Разработчик С++ (Криптограф) <strong>Требовани...
3    Фрезеровщик <p>Условия:</p> <ul> <li>На работу...
4    Мерчендайзер/продавец-консультант <p><strong>К...
Name: text, dtype: object

In [10]:
train_wo_tags = []
for t in train['text']:
    train_wo_tags.append(
        BeautifulSoup(t, 'lxml').text
    )

test_wo_tags = []
for t in test['text']:
    test_wo_tags.append(
        BeautifulSoup(t, 'lxml').text
    )

In [8]:
X_train, X_test, y_train, y_test = train_test_split(train_wo_tags, train['target'], test_size=0.2, shuffle=True, random_state=42)

stop_words = nltk.corpus.stopwords.words('russian')

model = Pipeline([
    ('vect', TfidfVectorizer(stop_words=stop_words,
                             ngram_range=(1,1),
                             max_df=0.95, min_df=5)),
    ('clf', LogisticRegression(random_state=42)),
])

model.fit(X_train, y_train)

roc_auc_score(y_test, model.predict_proba(X_test)[:,1])

0.9897672024061035

In [6]:
model2 = Pipeline([
    ('vect', CountVectorizer(stop_words=stop_words, 
                             ngram_range=(1,1))),
    ('clf', LogisticRegression(random_state=42)),
])

model2.fit(X_train, y_train)

roc_auc_score(y_test, model2.predict_proba(X_test)[:,1])

0.9846920048449948

In [41]:
# previous best result
submission = pd.DataFrame()
submission.loc[:,'id'] = test.reset_index()['id']
submission.loc[:,'target'] = model.predict_proba(test_wo_tags)[:, 1]
submission.to_csv('submission.csv', sep=',', index=False, encoding='utf-8')

In [44]:
stop_words = nltk.corpus.stopwords.words('russian')
vect = CountVectorizer(stop_words=stop_words,
                             ngram_range=(1,1),
                             max_df=0.95, min_df=5)
X = vect.fit_transform(train_wo_tags)
y = train['target']
X_new = vect.transform(test_wo_tags)

In [45]:
models = [
    RandomForestClassifier(random_state=42),
    LogisticRegression(random_state=42)
    ]

In [46]:
def roc_auc_score_cust(y_true, y_hat):
    return roc_auc_score(y_true, y_hat[:,1])

In [47]:
from vecstack import stacking

S_train, S_valid = stacking(models,
                               X, y, X_new,
                               regression=False,
                               mode='oof_pred_bag', 
                               needs_proba=True,
                               metric=roc_auc_score_cust,
                               n_folds=3,                
                               stratified=True,          
                               shuffle=True,             
                               random_state=42,         
                               verbose=2)

last_model = GradientBoostingClassifier(n_estimators=300, max_depth=3,
                                       learning_rate=0.01,
                                       random_state=42)
last_model.fit(S_train, y)
y_hat = last_model.predict_proba(S_valid)

roc_auc_score(y, last_model.predict_proba(S_train)[:,1])

task:         [classification]
n_classes:    [2]
metric:       [roc_auc_score_cust]
mode:         [oof_pred_bag]
n_models:     [2]

model  0:     [RandomForestClassifier]
    fold  0:  [0.98689589]
    fold  1:  [0.98647909]
    fold  2:  [0.98685573]
    ----
    MEAN:     [0.98674357] + [0.00018773]
    FULL:     [0.98674363]

model  1:     [LogisticRegression]
    fold  0:  [0.98897560]
    fold  1:  [0.98949196]
    fold  2:  [0.98915512]
    ----
    MEAN:     [0.98920756] + [0.00021404]
    FULL:     [0.98920354]



0.9924187194469127

In [48]:
# actual best result
submission = pd.DataFrame()
submission.loc[:,'id'] = test.reset_index()['id']
submission.loc[:,'target'] = y_hat[:, 1]
submission.to_csv('submission.csv', sep=',', index=False, encoding='utf-8')