In [132]:
import re
import sqlite3
import pandas as pd
import pymorphy2
from nltk.corpus import stopwords


pd.set_option('display.max_rows', 500)
pd.set_option('display.width', 1000)


DB_NAME = "descriptions1.db"
con = sqlite3.connect(DB_NAME)

cur = con.cursor()
try:
    cur.execute("DROP TABLE hh_tokens") # reset table
except Exception as e:
    print(e)
cur.execute("CREATE TABLE hh_tokens(hhid, title, tokens, skills, url)")
con.commit()


morph = pymorphy2.MorphAnalyzer()

def tokenize(s):
    tokens = re.findall("[\/\-а-яёa-z]+", s.lower())
    filtered = [morph.parse(i)[0].normal_form for i in tokens if i not in stopwords.words("russian")]  # нормализация - лемматизация
    return ' '.join(filtered)
    
df = pd.read_sql_query("SELECT * FROM hh_descriptions", con)
print(df.size)
df_tokens = df.copy()
df_tokens['description'] = df_tokens.apply(lambda row: tokenize(row['description']), axis=1)   # 9min 15s
print(df_tokens.size)

16170
16170


In [133]:
df_tokens.to_sql('hh_tokens', con, if_exists='replace')

3234

Часть 2: Векторизация

In [152]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import sqlite3


DB_NAME = "descriptions1.db"
con = sqlite3.connect(DB_NAME)

class Vacancies:
    def __init__(self, vectorizer_type='tfidf', n_top=10):
        self.vectorizer_type = 'tfidf'
        self.descriptions_df = pd.read_sql_query("SELECT * FROM hh_descriptions", con)
        self.tokens_df = pd.read_sql_query("SELECT * FROM hh_tokens", con)
        #countvectorizer = CountVectorizer(analyzer='word',stop_words='english')
        #count_wm = countvectorizer.fit_transform(df.description)
        #count_tokens = countvectorizer.get_feature_names_out()
        #df_countvect = pd.DataFrame(data=count_wm.toarray(), columns=count_tokens)
        self.tfidfvectorizer = None
        self.tfidf_wm = None
        self.tfidf_tokens = None
        self.df_tfidfvect = None
        self.n_top = n_top

    def fit(self):
        self.tfidfvectorizer = TfidfVectorizer(analyzer='word', stop_words='english')
        self.tfidf_wm = self.tfidfvectorizer.fit_transform(self.descriptions_df.description)
        self.tfidf_tokens = self.tfidfvectorizer.get_feature_names_out()
        self.df_tfidfvect = pd.DataFrame(data=self.tfidf_wm.toarray(), columns=self.tfidf_tokens)

    def _find_top_closest_ids(self, resume_text, ntop):
        resume_df = pd.DataFrame(self.tfidfvectorizer.transform([tokenize(resume_text)]).toarray(), columns=self.tfidf_tokens)
        df2 = pd.DataFrame(cosine_similarity(self.tfidf_wm, resume_df, dense_output=True), columns=['score'])
        closest_ids = df2.sort_values('score', ascending=False).iloc[:ntop]
        return closest_ids

    def _get_vacs_from_ids(self, ids):
        return ids.join(self.descriptions_df, how='inner')[["title", "description", "url", "score"]]\
            .to_json(orient="records", force_ascii=False)
    
    def get_matching_vacancies(self, resume_text):
        ids = self._find_top_closest_ids(resume_text, self.n_top)
        res = self._get_vacs_from_ids(ids)
        return res

In [153]:
resume_text = 'Data Scientist с опытом работы в проектах по продвинутой аналитике и исследованиях. Успешно реализовал проекты включая промышленные решения \
    для клиентов из различных областей: потребительские товары, ритейл, медицинские приборы и другие. Имеет страсть к автоматизации \
    на основе данных. Европейское гражданство. Разработка, алгоритмы, машинное обучение, проведение интервью\
    Data Science, Leadership, R&D, Machine Learning, Software Developement, Algorithms, Agile, Interviewing \
    Python (numpy, pandas, scipy, sklearn, matplotlib, seaborn, requests), DevOps (Linux, Git, Docker), Cloud (MS Azure, AWS),\
    WEB (Flask, Figma, HTML, CSS), Business Intelligence (Power BI), SQL'

vac_model = Vacancies()
vac_model.fit()
vac_model.get_matching_vacancies(resume_text)


'[{"title":"Data Scientist to Armenia","description":"We are looking for a Data Scientist to join an international fintech team.Your Role and Responsibilities: • Entities extraction: turning unstructured data into enriched ready datasets.• Collaboration with data science and technology teams, developing features and metrics for data processing• Processing Unstructured and Semi-structured Data• Transforming text data into internal infrastructure by using various algorithmic methods• Data enrichment by applying ML and NLP models Requirements:• Excellent expertise in data science• Understanding of data engineering and software development• \u200b\u200b\u200b\u200b\u200b\u200b\u200bStrong knowledge of machine learning methods and algorithms• Understanding of data structures and algorithms• Python or C++ programming skills We offer:• An international professional team• Relocation package• Work-Life Balance• Medical insurance \u200b\u200b\u200b\u200b\u200b\u200b\u200b","url":"https:\\/\\/hh.