### Классификатор, определяющий, является ли текст резюме или нет

In [1]:
import os
import json
import pandas as pd
import numpy as np
from joblib import dump, load

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression

import warnings
warnings.simplefilter('ignore')

np.random.seed(0)

from config import CONFIG
from utils import text_preprocessing

In [2]:
# Load the raw datasets from the folder configured in CONFIG.DATA_FOLDER.
# The three *_cvs frames are combined into the positive (CV) class below;
# df_news and df_vacs are combined into the negative ("other") class.
# NOTE(review): judging by the file names these are news articles, CVs from
# three different sites and hh vacancies -- confirm against the data folder.
# Later cells read the 'text' column of the combined frames.
df_news = pd.read_csv(f'{CONFIG.DATA_FOLDER}/russian_news.csv')
df_remote_cvs = pd.read_csv(f'{CONFIG.DATA_FOLDER}/remote_cvs.csv')
df_hh_cvs = pd.read_csv(f'{CONFIG.DATA_FOLDER}/hh_cvs.csv')
df_rabota_cvs = pd.read_csv(f'{CONFIG.DATA_FOLDER}/rabota_cvs.csv')
df_vacs = pd.read_csv(f'{CONFIG.DATA_FOLDER}/hh_vac.csv')

In [3]:
# Build the positive (CV) dataset: take the same number of rows from every
# CV source, limited by the smallest one.
cv_sources = [df_remote_cvs, df_hh_cvs, df_rabota_cvs]
N = min(map(len, cv_sources))

# DataFrame.append was removed in pandas 2.0; pd.concat builds the same frame
# in a single pass instead of re-copying the accumulated frame per iteration.
df_cvs = pd.concat([df_tmp[:N] for df_tmp in cv_sources], ignore_index=True)

df_cvs.dropna(inplace=True)

# Shuffle the texts reproducibly. Only the 'text' column is shuffled; any
# other columns keep their original order.
np.random.seed(0)
# Work on an explicit copy: the array behind `.values` may be read-only when
# pandas copy-on-write is enabled, which would make the in-place shuffle fail.
texts = df_cvs['text'].to_numpy(copy=True)
np.random.shuffle(texts)
df_cvs['text'] = texts
df_cvs.head()

Unnamed: 0,text
0,Тестировщик / QA Engineer от 45 000 руб. Росси...
1,Мужчина 28 лет Санкт-Петербург Создание сайтов...
2,"Контент менеджер от 35 000 руб. Россия, Москва..."
3,"Младший программист 25 000 руб. Linux, css, ht..."
4,"Программист от 40 000 руб. Россия, Томская обл..."


In [4]:
# Build the negative ("other") dataset from news and vacancies, taking the
# same number of rows from each source (limited by the smaller one).
N = min(len(df_news), len(df_vacs))
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
df_other = pd.concat([df_news[:N], df_vacs[:N]], ignore_index=True)

In [5]:
# Balance the two classes: keep at most N texts from each of them.
N = min(len(df_cvs), len(df_other))
cvs = df_cvs['text'].values
other = df_other['text'].values
# np.random.shuffle works in place and draws from the global NumPy RNG, so the
# order of these two calls determines which texts end up in the first N.
np.random.shuffle(cvs)
np.random.shuffle(other)

# Run every selected text through the project's text_preprocessing helper
# (imported from utils); the results are plain Python lists.
cvs = [text_preprocessing(item) for item in cvs[:N]]
other = [text_preprocessing(item) for item in other[:N]]

### Кросс-валидация

In [6]:
# Full corpus: all CV texts first, followed by all non-CV texts.
corpus = [*cvs, *other]
X = np.asarray(corpus)

# Labels follow the corpus order: 1 marks a CV, 0 marks any other text.
labels = [1] * len(cvs)
labels.extend([0] * len(other))
y = np.asarray(labels)

In [7]:
# Compare several classifiers with 5-fold stratified cross-validation.
classifiers = [GaussianNB(), MultinomialNB(), BernoulliNB(), LogisticRegression()]
metrics = [accuracy_score, precision_score, recall_score, f1_score]
skf = StratifiedKFold(n_splits=5)

# cross_val_results[classifier name][metric name] -> list of per-fold scores.
cross_val_results = {
    classifier.__class__.__name__: {metric.__name__: [] for metric in metrics}
    for classifier in classifiers
}

for train_idx, test_idx in skf.split(X, y):
    # The vectorizer is fitted on the training part of the fold only, so the
    # test part never influences the vocabulary or the IDF weights.
    vectorizer = TfidfVectorizer(max_features=1000)
    # Dense arrays are used because GaussianNB does not accept sparse input.
    X_train = vectorizer.fit_transform(X[train_idx]).toarray()
    X_test = vectorizer.transform(X[test_idx]).toarray()
    y_train = y[train_idx]
    y_test = y[test_idx]
    for classifier in classifiers:
        # fit() retrains the estimator from scratch, so reusing the same
        # instances across folds is safe.
        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)
        fold_scores = cross_val_results[classifier.__class__.__name__]
        for metric in metrics:
            fold_scores[metric.__name__].append(metric(y_test, y_pred))

In [8]:
# Average the per-fold scores: mean_results[classifier][metric] -> mean score.
mean_results = {}
for clf_name, per_metric in cross_val_results.items():
    averaged = {}
    for score_name, fold_values in per_metric.items():
        averaged[score_name] = np.mean(fold_values)
    mean_results[clf_name] = averaged

# One column per classifier, one row per metric.
df_score = pd.DataFrame(mean_results)
df_score.head()

Unnamed: 0,GaussianNB,MultinomialNB,BernoulliNB,LogisticRegression
accuracy_score,0.990239,0.98999,0.987237,0.996622
f1_score,0.990163,0.989906,0.987227,0.99661
precision_score,0.997213,0.997714,0.988492,0.999499
recall_score,0.983231,0.982232,0.985986,0.993744


### Обучение финальной модели

In [9]:
# Final model: refit the TF-IDF vocabulary on the whole corpus ...
vectorizer = TfidfVectorizer(max_features=1000)
X_transformed = vectorizer.fit_transform(X).toarray()

# ... and train logistic regression on all labelled texts.
classifier = LogisticRegression()
classifier.fit(X_transformed, y)

LogisticRegression()

In [10]:
# Create the models folder if it is missing. exist_ok=True avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(CONFIG.MODELS_FOLDER, exist_ok=True)

# Persist the fitted vectorizer and classifier with joblib; file names come
# from CONFIG.
dump(vectorizer, f'{CONFIG.MODELS_FOLDER}/{CONFIG.BINARY_VECTORIZER_NAME}')
dump(classifier, f'{CONFIG.MODELS_FOLDER}/{CONFIG.BINARY_CLASSIFER_NAME}')

['models/binary_classifier.joblib']

### Важность признаков

In [11]:
def get_feature_importance_for_lr_clf(lr_clf, vect):
    """Rank vocabulary terms by their linear-model coefficient.

    Args:
        lr_clf: Fitted linear classifier exposing ``coef_``; only the first
            row (``coef_[0]``) is used.
        vect: Fitted vectorizer exposing ``get_feature_names_out()`` or the
            legacy ``get_feature_names()``.

    Returns:
        A list of ``(term, coefficient)`` tuples sorted by coefficient in
        descending order.
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back to the old method for older versions.
    get_names = getattr(vect, 'get_feature_names_out', None)
    if get_names is None:
        get_names = vect.get_feature_names
    # get_feature_names_out() returns a NumPy array; convert the terms to
    # plain str so the result looks the same regardless of the API used.
    words = [str(word) for word in get_names()]
    zipped = list(zip(words, lr_clf.coef_[0]))
    return sorted(zipped, key=lambda t: t[1], reverse=True)

feature_importance = get_feature_importance_for_lr_clf(classifier, vectorizer)

# Ten terms with the largest coefficients (label 1 is the CV class).
top_terms = feature_importance[:10]
for pair in top_terms:
    print(pair)

print('\n')

# Ten terms with the smallest (most negative) coefficients.
bottom_terms = feature_importance[-10:]
for pair in bottom_terms:
    print(pair)

('руб', 5.5144168342312705)
('россия', 5.383659559388919)
('москва', 4.149944713425877)
('область', 4.0829208916524955)
('образование', 2.6342952608768475)
('русский', 2.48109594214575)
('университет', 2.348064401056707)
('имею', 2.335757070736375)
('adobe', 2.322343379427084)
('высшее', 2.318241016092196)


('указана', -2.7163239859500017)
('зп', -2.760867254154654)
('сообщает', -2.814353678778332)
('полный', -2.83281544601163)
('требования', -2.8846555058153434)
('условия', -2.9114144241243474)
('апреля', -2.986869101992444)
('полная', -3.621723725243273)
('занятость', -3.657194246917162)
('руки', -3.7198412488898867)
