In [None]:
import json
import pandas as pd
import numpy as np
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer 
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import sent_tokenize

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


import warnings
warnings.simplefilter('ignore')

from config import CONFIG

In [2]:
# Translation table that deletes every ASCII punctuation character; built once
# instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Lazily filled cache: language name -> frozenset of NLTK stop words.
_STOP_WORDS_CACHE = {}


def _get_stop_words(language: str) -> frozenset:
    """Return the NLTK stop-word set for `language`, reading the corpus only once."""
    if language not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE[language] = frozenset(stopwords.words(language))
    return _STOP_WORDS_CACHE[language]


def text_preprocessing(text: str) -> str:
    """Normalize a raw text into a space-separated string of cleaned tokens.

    Steps, applied to every token produced by ``word_tokenize``:
      1. strip ASCII punctuation characters (``string.punctuation``),
      2. lowercase,
      3. drop tokens that are not purely alphabetic (``str.isalpha``),
      4. drop Russian stop words from the NLTK ``stopwords`` corpus.

    No stemming or lemmatization is applied.

    Args:
        text: The raw document text.

    Returns:
        The surviving tokens joined by single spaces (an empty string if
        nothing survives the filters).
    """
    # The stop-word list is loop-invariant across documents, so it is fetched
    # from a cache rather than re-read from the corpus on every call.
    stop_words = _get_stop_words('russian')

    tokens = []
    for raw_token in word_tokenize(text):
        word = raw_token.translate(_PUNCTUATION_TABLE).lower()
        # isalpha() also rejects tokens that became empty after stripping
        # punctuation.
        if word.isalpha() and word not in stop_words:
            tokens.append(word)
    return ' '.join(tokens)

In [3]:
# Build the paths of the three CSV files inside the configured data folder.
news_csv_path = f'{CONFIG.DATA_FOLDER}/russian_news.csv'
cvs_csv_path = f'{CONFIG.DATA_FOLDER}/rabota_cvs.csv'
vacs_csv_path = f'{CONFIG.DATA_FOLDER}/hh_vacancies.csv'

# Load each file into its own DataFrame.
df_news = pd.read_csv(news_csv_path)
df_cvs = pd.read_csv(cvs_csv_path)
df_vacs = pd.read_csv(vacs_csv_path)

### Сравнение резюме и новостей

In [4]:
# Use the same number of texts from each source: the size of the smaller frame.
N = min(len(df_cvs), len(df_news))

# Clean the first N texts of the 'text' column of each frame.
cvs = list(map(text_preprocessing, df_cvs['text'][:N]))
news = list(map(text_preprocessing, df_news['text'][:N]))

In [5]:
# CV texts first, then news texts; the label vector below relies on this order.
corpus = cvs + news

# TF-IDF features, vocabulary capped at 5000 terms.
# (To compare against raw counts, use CountVectorizer(max_features=5000) instead.)
vectorizer = TfidfVectorizer(max_features=5000)

# .toarray() turns the vectorizer output into a dense NumPy array.
X = vectorizer.fit_transform(corpus).toarray()
# Labels follow the corpus order: 1 for the N CVs, 0 for the N news texts.
y =  [1] * N + [0] * N

In [6]:
def _print_metric_block(title, metrics, y_true, y_pred):
    """Print `title`, then one '<metric name> : <value rounded to 3 places>' line per metric."""
    print(title)
    for metric in metrics:
        print(f'{metric.__name__} : {round(metric(y_true, y_pred), 3)}')


def pprint_metrics_for_models(X, y, classifiers, metrics, print_train=False, print_test=True,
                              test_size=0.25, random_state=0):
    """Fit each classifier on a stratified split of (X, y) and print its metrics.

    The data is split once with ``train_test_split`` (stratified on ``y``) and
    the same split is reused for every classifier.

    Args:
        X: Feature matrix accepted by ``train_test_split`` and the classifiers.
        y: Labels aligned with the rows of ``X``.
        classifiers: Iterable of estimator instances exposing ``fit`` and
            ``predict``; each one is fitted in place.
        metrics: Iterable of callables ``metric(y_true, y_pred)``; each must
            have a ``__name__`` and return a value accepted by ``round``.
        print_train: If True, print the metrics computed on the training part.
        print_test: If True, print the metrics computed on the held-out part.
        test_size: Passed to ``train_test_split`` (default 0.25).
        random_state: Passed to ``train_test_split`` (default 0).

    Returns:
        None. Results are written to stdout.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )
    for classifier in classifiers:
        classifier.fit(X_train, y_train)
        # Predict only on the parts that will actually be reported; predicting
        # on the training part is wasted work when print_train is off.
        y_train_pred = classifier.predict(X_train) if print_train else None
        y_test_pred = classifier.predict(X_test) if print_test else None

        print(f'{classifier.__class__.__name__} results')
        if print_train:
            _print_metric_block('Train results', metrics, y_train, y_train_pred)
        if print_test:
            _print_metric_block('Test results', metrics, y_test, y_test_pred)
        print('')


In [7]:
# Three Naive Bayes variants, each scored with the same four metrics.
classifiers = [
    GaussianNB(),
    MultinomialNB(),
    BernoulliNB(),
]
metrics = [
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
]
pprint_metrics_for_models(X, y, classifiers, metrics)

GaussianNB results
Test results
accuracy_score : 0.997
precision_score : 1.0
recall_score : 0.994
f1_score : 0.997

MultinomialNB results
Test results
accuracy_score : 1.0
precision_score : 1.0
recall_score : 1.0
f1_score : 1.0

BernoulliNB results
Test results
accuracy_score : 0.999
precision_score : 0.998
recall_score : 1.0
f1_score : 0.999



### Сравнение вакансий и новостей

In [8]:
# Use the same number of texts from each source: the size of the smaller frame.
N = min(len(df_news), len(df_vacs))

# Clean the first N entries of the 'text' column of each frame.
vacs = [text_preprocessing(doc) for doc in df_vacs['text'][:N]]
news = [text_preprocessing(doc) for doc in df_news['text'][:N]]

In [9]:
# TF-IDF features with the vocabulary capped at 5000 terms
# (CountVectorizer(max_features=5000) is the raw-count alternative).
vectorizer = TfidfVectorizer(max_features=5000)

# Vacancy texts first, then news texts.
corpus = vacs + news

# Dense feature array plus labels in corpus order: 1 for vacancies, 0 for news.
X = vectorizer.fit_transform(corpus).toarray()
y = [1] * N + [0] * N

In [10]:
# Fresh, unfitted instances of the three Naive Bayes models.
classifiers = [model_cls() for model_cls in (GaussianNB, MultinomialNB, BernoulliNB)]
metrics = [accuracy_score, precision_score, recall_score, f1_score]

pprint_metrics_for_models(
    X,
    y,
    classifiers,
    metrics,
)

GaussianNB results
Test results
accuracy_score : 0.995
precision_score : 0.99
recall_score : 1.0
f1_score : 0.995

MultinomialNB results
Test results
accuracy_score : 0.999
precision_score : 0.998
recall_score : 1.0
f1_score : 0.999

BernoulliNB results
Test results
accuracy_score : 0.989
precision_score : 1.0
recall_score : 0.978
f1_score : 0.989



### Сравнение резюме и вакансий

In [11]:
# Use the same number of texts from each source: the size of the smaller frame.
N = min(len(df_cvs), len(df_vacs))

# Clean the first N entries of the 'text' column, CVs first and vacancies second.
cvs, vacs = (
    [text_preprocessing(doc) for doc in frame['text'][:N]]
    for frame in (df_cvs, df_vacs)
)

In [12]:
# CV texts first, then vacancy texts.
corpus = cvs + vacs

# TF-IDF features with the vocabulary capped at 5000 terms
# (CountVectorizer(max_features=5000) is the raw-count alternative).
vectorizer = TfidfVectorizer(max_features=5000)
tfidf_matrix = vectorizer.fit_transform(corpus)

# Dense feature array plus labels in corpus order: 1 for CVs, 0 for vacancies.
X = tfidf_matrix.toarray()
y = [1] * N + [0] * N

In [13]:
# Same three Naive Bayes models and four metrics as in the other comparisons.
classifiers = [GaussianNB(), MultinomialNB(), BernoulliNB()]
metrics = [accuracy_score, precision_score, recall_score, f1_score]

pprint_metrics_for_models(X, y, classifiers=classifiers, metrics=metrics)

GaussianNB results
Test results
accuracy_score : 0.979
precision_score : 1.0
recall_score : 0.958
f1_score : 0.979

MultinomialNB results
Test results
accuracy_score : 0.997
precision_score : 0.998
recall_score : 0.996
f1_score : 0.997

BernoulliNB results
Test results
accuracy_score : 0.983
precision_score : 0.969
recall_score : 0.998
f1_score : 0.983

