In [1]:
# https://www.kaggle.com/c/vacancy-classification-sf02

In [2]:
import pandas as pd
import nltk
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score

In [3]:
train = pd.read_csv('train.csv', sep='\t', index_col='id')
test = pd.read_csv('test.csv', sep='\t', index_col='id')
train.head()

Unnamed: 0_level_0,name,description,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Заведующий отделом/секцией в магазин YORK (Уру...,<p><strong>В НОВЫЙ МАГАЗИН YORK (хозтовары) пр...,1
1,Наладчик станков и манипуляторов с ПУ,Обязанности:работа на токарных станках с ЧПУ T...,0
2,Разработчик С++ (Криптограф),<strong>Требования:</strong> <ul> <li>Опыт про...,0
3,Фрезеровщик,<p>Условия:</p> <ul> <li>На работу вахтовым ме...,0
4,Мерчендайзер/продавец-консультант,<p><strong>Компания Палладиум Стандарт - призн...,1


In [4]:
train.loc[:, 'text'] = train['name']+' '+train['description']
test.loc[:, 'text'] = test['name']+' '+test['description']
train['text'][:5]

id
0    Заведующий отделом/секцией в магазин YORK (Уру...
1    Наладчик станков и манипуляторов с ПУ Обязанно...
2    Разработчик С++ (Криптограф) <strong>Требовани...
3    Фрезеровщик <p>Условия:</p> <ul> <li>На работу...
4    Мерчендайзер/продавец-консультант <p><strong>К...
Name: text, dtype: object

In [5]:
train_wo_tags = []
for t in train['text']:
    train_wo_tags.append(
        BeautifulSoup(t, 'lxml').text
    )

test_wo_tags = []
for t in test['text']:
    test_wo_tags.append(
        BeautifulSoup(t, 'lxml').text
    )

In [27]:
stop_words = list()
stop_words.append(nltk.corpus.stopwords.words('english'))
stop_words.append(nltk.corpus.stopwords.words('russian'))

tuple(stop_words)

(['i',
  'me',
  'my',
  'myself',
  'we',
  'our',
  'ours',
  'ourselves',
  'you',
  "you're",
  "you've",
  "you'll",
  "you'd",
  'your',
  'yours',
  'yourself',
  'yourselves',
  'he',
  'him',
  'his',
  'himself',
  'she',
  "she's",
  'her',
  'hers',
  'herself',
  'it',
  "it's",
  'its',
  'itself',
  'they',
  'them',
  'their',
  'theirs',
  'themselves',
  'what',
  'which',
  'who',
  'whom',
  'this',
  'that',
  "that'll",
  'these',
  'those',
  'am',
  'is',
  'are',
  'was',
  'were',
  'be',
  'been',
  'being',
  'have',
  'has',
  'had',
  'having',
  'do',
  'does',
  'did',
  'doing',
  'a',
  'an',
  'the',
  'and',
  'but',
  'if',
  'or',
  'because',
  'as',
  'until',
  'while',
  'of',
  'at',
  'by',
  'for',
  'with',
  'about',
  'against',
  'between',
  'into',
  'through',
  'during',
  'before',
  'after',
  'above',
  'below',
  'to',
  'from',
  'up',
  'down',
  'in',
  'out',
  'on',
  'off',
  'over',
  'under',
  'again',
  'further',
  'th

In [39]:
X_train, X_test, y_train, y_test = train_test_split(train_wo_tags, train['target'], test_size=0.2, shuffle=True, random_state=42)

stop_words = nltk.corpus.stopwords.words('russian')

model = Pipeline([
    ('vect', TfidfVectorizer(stop_words=stop_words,
                             ngram_range=(1,1),
                             max_df=0.95, min_df=5)),
    ('clf', LogisticRegression(random_state=42)),
])

model.fit(X_train, y_train)

roc_auc_score(y_test, model.predict_proba(X_test)[:,1])

0.9914272048856703

In [38]:
model2 = Pipeline([
    ('vect', CountVectorizer(stop_words=stop_words, 
                             ngram_range=(1,1))),
    ('clf', LogisticRegression(random_state=42)),
])

model2.fit(X_train, y_train)

roc_auc_score(y_test, model2.predict_proba(X_test)[:,1])

0.9898140482056054

In [41]:
submission = pd.DataFrame()
submission.loc[:,'id'] = test.reset_index()['id']
submission.loc[:,'target'] = model.predict_proba(test_wo_tags)[:, 1]
submission.to_csv('submission.csv', sep=',', index=False, encoding='utf-8')