In [91]:
import pandas as pd
import requests
import numpy as np
from bs4 import BeautifulSoup
import re

# Scrape hh.ru vacancies for a job title and save the extracted text to a file
# Seconds to wait for hh.ru before giving up on a single request
_REQUEST_TIMEOUT = 30


def _request_page(url, params=None):
    """Return the HTTP response for *url*, or None if the request itself fails."""
    try:
        return requests.get(url, params=params, headers={'User-Agent': 'Custom'},
                            timeout=_REQUEST_TIMEOUT)
    except requests.RequestException as error:
        print('Request failed: ', error)
        return None


def _clean_section(section):
    """Split one lower-cased HTML section into a list of cleaned text items."""
    # dropping the section headings themselves
    for heading in ('требования', 'к кандидату', 'к кандидатам', 'условия', 'обязанности'):
        section = section.replace(heading, '')

    items = []
    # every html tag acts as an item boundary
    for piece in re.split(r'<.*?>', section):
        text = re.sub(r'<.*?>', '', piece)
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'[^\w\d\s]+', '', text)
        # skipping items that are empty or whitespace only
        if text.strip():
            items.append(text)
    return items


def get_job_requerments(job: str, outputfile: str, start_page: int, end_page: int) -> None:
    """Scrape hh.ru search results for *job* and append vacancy text to a file.

    For every result page in ``range(start_page, end_page)`` the vacancy links
    are collected, each vacancy page is downloaded and its
    ``div.g-user-content`` block is split on ``<strong>`` headings.  Every
    section that starts with «обязанности» is cleaned of html tags and
    punctuation and turned into a list of items.

    NOTE(review): despite the function name, only the «обязанности»
    (responsibilities) sections are written; confirm this is intended.

    Arguments:
        job: search text, lower-cased before it is sent to hh.ru.
        outputfile: path of the file the items are appended to.
        start_page: first result page to request.
        end_page: page number at which to stop (exclusive).

    Returns:
        None.  Items are appended to ``outputfile`` separated by ``'@'``;
        a separator is also written between vacancies and before the first
        new item when the file already has content, so no two items are
        glued together.  Failed requests and non-200 answers are reported
        with ``print`` and skipped.
    """
    import os  # local import: only needed for the separator check before writing

    # raw content blocks of all successfully downloaded vacancy pages
    jobs_content = []

    for page_number in range(start_page, end_page):
        # letting requests build the query string, so the search text is
        # always encoded correctly
        page = _request_page(
            'https://spb.hh.ru/search/vacancy',
            params={'area': 2, 'st': 'searchVacancy',
                    'text': job.lower(), 'page': page_number})
        if page is None:
            continue
        if page.status_code != 200:
            print('Something wrong with page: ', page.status_code)
            print('GEneral page problem')
            continue

        soup = BeautifulSoup(page.text, 'html.parser')
        vacancies_list = soup.find_all('a', {'class': 'bloko-link HH-LinkModifier'})
        if not vacancies_list:
            print('No items in vacancies_list')
            continue

        for vacancy in vacancies_list:
            # .get() copes with anchors that have no href attribute at all
            link = vacancy.get('href')
            if not link:
                print('No job link')
                continue

            job_page = _request_page(link)
            if job_page is None:
                continue
            if job_page.status_code != 200:
                print("Something wrong with the page: ", job_page.status_code)
                print('vacancy problem')
                continue

            job_soup = BeautifulSoup(job_page.text, 'html.parser')
            jobs_content.append(job_soup.find('div', {'class': 'g-user-content'}))

    # one '@'-joined record per responsibilities section
    records = []
    for content in jobs_content:
        # splitting by <strong> gives one chunk per heading
        for section in re.split('<strong>', str(content).lower()):
            if section.startswith('обязанности'):
                items = _clean_section(section)
                if items:
                    records.append('@'.join(items))

    # the file is opened in append mode, so existing content needs a
    # separator before the first new record
    needs_separator = os.path.exists(outputfile) and os.path.getsize(outputfile) > 0

    with open(outputfile, 'a') as f:
        for record in records:
            if needs_separator:
                f.write('@')
            f.write(record)
            needs_separator = True
            
def read_info(file : str, tag : str):
    """Read an '@'-separated text file and label every entry with *tag*.

    Arguments:
        file: path of the file to read.
        tag: class label repeated once per entry.

    Returns:
        A tuple ``(data, y_data)``: the list of entries obtained by splitting
        the file content on '@', and a list of the same length filled with
        ``tag``.
    """
    with open(file) as source:
        samples = source.read().split('@')

    labels = [tag] * len(samples)
    return samples, labels

# Loading every scraped dataset together with a parallel list of class labels.
# Both web development files get the same label 'web development',
# so they end up in one class.
# driver_data, y_driver_data = read_info('driver_data.txt', 'driver')
meneger_sells_data, y_meneger_sells_data = read_info('meneger_sells', 'meneger_sells')
web_development_data, y_web_development_data = read_info('web development', 'web development')
data_bases_data, y_data_bases_data = read_info('data bases', 'data bases')
old_web_developer_data, y_old_web_developer_data = read_info('web_development_data.txt', 'web development')
# network_admin_data, y_network_data = read_info('Network admin', 'Network admin')
# system_engeneer_data, y_system_engeneer_data = read_info('System engeneer', 'System engeneer')
# information_security, y_information_security = read_info('Information security', 'Information security')


In [85]:
# Scraping result pages 31..49 (end_page is exclusive) for three search
# queries; the output of each run is appended to the named file
get_job_requerments('Базы данных', 'data bases', start_page=31, end_page=50)
get_job_requerments('Менеджер по продажам', 'meneger_sells', start_page=31, end_page=50)
get_job_requerments('веб разработчик', 'web development', start_page=31, end_page=50)

No items in vacancies_list
No items in vacancies_list
No items in vacancies_list
No items in vacancies_list
No items in vacancies_list
No items in vacancies_list
No items in vacancies_list
No items in vacancies_list
No items in vacancies_list


In [22]:
# NOTE(review): these calls pass three positional arguments, which does not
# match the get_job_requerments definition in this file
# (job, outputfile, start_page, end_page) - with that definition they fail
# with a TypeError for the missing end_page. Presumably left over from an
# earlier version of the function; update or remove.
get_job_requerments('Сетевой администратор', 2, 'Network admin')
get_job_requerments('Системный инженер', 4, 'System engeneer')
get_job_requerments('Специалист по информационной безопасности', 6, 'Information security')

In [87]:
import nltk
from nltk.corpus import stopwords
from pymystem3 import Mystem
from string import punctuation

# Shared lemmatizer instance used by preprocess_text
mystem = Mystem()
# Russian stop words from NLTK
# (assumes the 'stopwords' corpus is already downloaded - TODO confirm)
russian_stopwords = stopwords.words('russian')

def preprocess_text(data_list):
    """Tokenize, lemmatize and filter every text in *data_list*.

    Each text is split with NLTK's TreebankWordTokenizer and every token is
    lemmatized with the module-level ``mystem`` instance.  The lemma is
    stripped of the newline that Mystem appends before it is compared with
    ``russian_stopwords`` - without that the comparison never matches and
    stop words stay in the text.  Punctuation tokens and empty tokens are
    dropped as well.

    Arguments:
        data_list: iterable of strings to process.

    Returns:
        A list with one string per input text: the kept lemmas joined by
        single spaces.
    """
    # building the tokenizer and the stop word set once, not per text
    tokenizer = nltk.tokenize.TreebankWordTokenizer()
    stop_words = set(russian_stopwords)

    new_list = []
    for item in data_list:
        kept = []
        for token in tokenizer.tokenize(item):
            # lemmatize() returns a list of pieces ending with '\n'
            lemma = ''.join(mystem.lemmatize(token)).replace('\n', '').strip()

            # `in punctuation` is a substring test on the punctuation string
            if lemma and lemma not in stop_words and lemma not in punctuation:
                kept.append(lemma)

        new_list.append(' '.join(kept))
    return new_list

In [94]:
# Preprocessing every dataset; both web development sources are concatenated
new_web_develoment_data = preprocess_text(web_development_data) + preprocess_text(old_web_developer_data)


new_meneger_sells_data = preprocess_text(meneger_sells_data)

new_data_bases_data = preprocess_text(data_bases_data)

# Number of samples per class
print(len(new_web_develoment_data) ,len(new_meneger_sells_data), len(new_data_bases_data))
#new_network_admin_data = preprocess_text(network_admin_data)

#new_system_engeneer_data = preprocess_text(system_engeneer_data)

#new_information_security = preprocess_text(information_security)

# Full corpus and its labels; the label lists are concatenated in the same
# order as the texts (web, old web, sales manager, data bases)
all_data = new_web_develoment_data +  new_meneger_sells_data + new_data_bases_data

all_y_data = y_web_development_data + y_old_web_developer_data + y_meneger_sells_data + y_data_bases_data

2601 2803 2746


In [9]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split

# Counting unigrams and bigrams, then re-weighting the counts with tf-idf
vectorizer = CountVectorizer(ngram_range=(1, 2))
tfvect = TfidfTransformer()

# NOTE(review): both transformers are fitted on the whole corpus, i.e. before
# the train/test split in the next cell, so the vocabulary and idf weights
# also see the test samples.
all_data_transformed = vectorizer.fit_transform(all_data)

data_transformed = tfvect.fit_transform(all_data_transformed)

In [27]:
# Random split with the scikit-learn defaults; no random_state is passed,
# so the split differs between runs
X_train, X_test, y_train, y_test = train_test_split(data_transformed, all_y_data)

In [14]:
from sklearn.naive_bayes import MultinomialNB

In [29]:
# Baseline classifier: multinomial naive Bayes with default parameters
model = MultinomialNB()

# Training on the tf-idf features of the training part
model.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [30]:
# Predicting labels for the held-out part
predicted = model.predict(X_test)

In [11]:
from sklearn.metrics import accuracy_score
# Share of held-out samples whose predicted label matches the true one
print(accuracy_score(y_test, predicted))


0.7217068645640075


In [96]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

# Pipeline: token counts (terms seen in fewer than 3 documents are dropped,
# stop words removed) -> tf-idf -> support vector classifier
pipe = Pipeline([('vect', CountVectorizer(min_df = 3, stop_words=russian_stopwords)),
                ('tfidf', TfidfTransformer()),
                ('clf', SVC(random_state=1))])


# Search grid: 2 ngram ranges x 2 idf settings x 7 values of C = 28 candidates
parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
             'tfidf__use_idf': (True, False),
             'clf__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
# Splitting the raw texts this time: vectorizing happens inside the pipeline.
# No random_state is passed, so the split differs between runs.
X_train, X_test, y_train, y_test = train_test_split(all_data, all_y_data)

# n_jobs=-1 asks for all available processors
grid_s = GridSearchCV(pipe, parameters, n_jobs=-1, verbose=1)

grid_s_fit = grid_s.fit(X_train, y_train)

# Evaluating the fitted search object on the held-out texts
predicted = grid_s_fit.predict(X_test)

accuracy_score(y_test, predicted)

Fitting 5 folds for each of 28 candidates, totalling 140 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   29.6s
[Parallel(n_jobs=-1)]: Done 140 out of 140 | elapsed:  1.5min finished


0.7894995093228655

In [95]:
# Same pipeline as for the SVC search, with multinomial naive Bayes as the
# final step
new_pipe = Pipeline([('vect', CountVectorizer(min_df = 3, stop_words=russian_stopwords)),
                ('tfidf', TfidfTransformer()),
                ('NB', MultinomialNB())])

# Search grid: 2 ngram ranges x 2 idf settings x 7 smoothing values
new_parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
             'tfidf__use_idf': (True, False),
             'NB__alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

new_grid_s = GridSearchCV(new_pipe, new_parameters, n_jobs=-1, verbose=1)

# X_train / y_train are not created in this cell; they come from whichever
# split was executed last
new_grid_s_fit = new_grid_s.fit(X_train, y_train)

new_predicted = new_grid_s_fit.predict(X_test)

accuracy_score(y_test, new_predicted)

Fitting 5 folds for each of 28 candidates, totalling 140 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done 140 out of 140 | elapsed:    5.1s finished


0.7161410018552876

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=3,
                                 ngram_range=(1, 2), preprocessor=None,
                                 stop_words=['и', 'в', 'во', 'не', 'что', 'он',
                                             'на', 'я', 'с', 'со', 'как', 'а',
                                             'то', 'все', 'она', 'так', 'его',
                                             'но', 'да', 'ты', 'к', 'у', 'же',
                                             'вы', 'за', 'бы', 'по', 'только',
                                             'ее', 'мне', ...],
                                 strip_accents=None,
                            

In [97]:
# Showing the pipeline with the best parameters found by the SVC grid search
grid_s_fit.best_estimator_

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=3,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=['и', 'в', 'во', 'не', 'что', 'он',
                                             'на', 'я', 'с', 'со', 'как', 'а',
                                             'то', 'все', 'она', 'так', 'его',...
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 SVC(C=1, break_ties=False, cache_

In [107]:
# Example requirement phrases used to try out make_prediction
input_requerments = ['умение продавать товары',
                     'общительность при общении с клиентами']

def make_prediction(requerments: list):
    """Predict the job category for a list of requirement phrases.

    The phrases are run through ``preprocess_text``, joined into a single
    space-separated document and passed to the fitted ``new_grid_s_fit``
    search object.

    Arguments:
        requerments: list of requirement strings.

    Returns:
        The result of ``new_grid_s_fit.predict`` for that one document.
    """
    document = ' '.join(preprocess_text(requerments))
    return new_grid_s_fit.predict([document])
    
# Classifying the example phrases
pred = make_prediction(input_requerments)

In [108]:
# Showing the predicted label
pred


array(['meneger_sells'], dtype='<U15')