# Импорт библиотек и загрузка данных

In [1]:
! pip install spacy
! python -m spacy download ru_core_news_sm
! python -m spacy download en_core_web_sm

Collecting ru-core-news-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/ru_core_news_sm-3.5.0/ru_core_news_sm-3.5.0-py3-none-any.whl (15.3 MB)
     ---------------------------------------- 15.3/15.3 MB 1.9 MB/s eta 0:00:00
[38;5;2m[+] Download and installation successful[0m
You can now load the package via spacy.load('ru_core_news_sm')
Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 12.8/12.8 MB 4.1 MB/s eta 0:00:00
[38;5;2m[+] Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [2]:
pip install langdetect

Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from collections import Counter
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
from string import punctuation
import spacy
from langdetect import detect

In [4]:
file_path = 'post_users_links.csv'

In [5]:
data = pd.read_csv(file_path)

In [6]:
data.drop_duplicates(inplace=True)

# Определение языковых моделей

In [7]:
nlp_eng = spacy.load('en_core_web_sm', disable=['ner', 'parser'])
nlp_rus = spacy.load('ru_core_news_sm', disable=['ner', 'parser'])

stop_words_rus = set(stopwords.words('russian'))
stop_words_eng = set(stopwords.words('english'))
stop_words = stop_words_rus.union(stop_words_eng)

words_to_exclude = set(punctuation)

# Очистка и лемматизация постов

In [8]:
filtered_data = data.copy()

selected_posts = []
all_sentences = []

for index in range(len(filtered_data["posts"])):
    value = filtered_data.loc[index, "posts"]
    try:
        lang = detect(value)

        if lang == 'en':
            doc = nlp_eng(value)
            lemmas = [token.lemma_ for token in doc if token.is_alpha and token.text not in punctuation]
            cleaned_sentence = ' '.join(lemmas)
            all_sentences.append(cleaned_sentence)
            filtered_data.loc[index, 'selected_posts'] = cleaned_sentence
        elif lang == 'ru':
            doc = nlp_rus(value)
            lemmas = [token.lemma_ for token in doc if token.is_alpha and token.text not in punctuation]
            cleaned_sentence = ' '.join(lemmas)
            all_sentences.append(cleaned_sentence)
            filtered_data.loc[index, 'selected_posts'] = cleaned_sentence
        else:
            filtered_data.loc[index, 'selected_posts'] = np.nan
    except Exception as e:
        print(f"Error processing value: {value}")
        print(f"Error message: {str(e)}")
        
filtered_data = filtered_data.dropna(subset=['selected_posts'])

print(len(all_sentences))
print(filtered_data.shape)


Error processing value: https://lnkd.in/g2ryPnc
Error message: No features in text.
Error processing value: https://lnkd.in/gkvaEp44
Error message: No features in text.
Error processing value: https://lnkd.in/eS3FUqh

https://lnkd.in/evtby-E
Error message: No features in text.
Error processing value: +
Error message: No features in text.
Error processing value: 👍🏽👍🏽
Error message: No features in text.
Error processing value: 😞
Error message: No features in text.
Error processing value: ❤️
Error message: No features in text.
Error processing value: https://lnkd.in/fkXPmBu
Error message: No features in text.
Error processing value: 😁
Error message: No features in text.
Error processing value: https://t.co/0NXqsMK256
Error message: No features in text.
Error processing value: nan
Error message: expected string or bytes-like object
Error processing value: nan
Error message: expected string or bytes-like object
Error processing value: nan
Error message: expected string or bytes-like object


# Векторизация текстов и LDA

In [9]:
tfidf = TfidfVectorizer(stop_words=stop_words, min_df=4, max_df=0.95)

In [10]:
x = tfidf.fit_transform(all_sentences)

In [11]:
id2word = {i: token for i, token in enumerate(tfidf.get_feature_names())}



In [12]:
n_topics=15

In [13]:
lda = LatentDirichletAllocation(n_components=n_topics, random_state=12345)

In [14]:
topics = lda.fit_transform(x)

In [15]:
main_topics = pd.DataFrame()

Посмотрим на ключевые слова тем

In [16]:
for i in range(n_topics):
    c = lda.components_[i, :]
    topic_tokens = [id2word.get(token_id) for token_id in np.argsort(c)[-20:]]
    print("Тема",i+1, ":")
    main_topics.loc[i,'Тема'] = i+1
    print(", ".join(topic_tokens))
    main_topics.loc[i,'Ключевые слова'] = ", ".join(topic_tokens)
    print("\n ")

Тема 1 :
institute, contact, build, strategy, view, verified, complete, obtain, achievement, initiative, cost, finance, tableau, certification, want, start, position, new, share, happy

 
Тема 2 :
система, post, облачный, аналитик, online, russian, бесплатный, neuralnetwork, experience, сервис, курс, method, open, первый, английский, время, ai, article, новый, год

 
Тема 3 :
программист, ссылка, programming, machine, language, сайт, python, dataengineere, always, datascience, sql, hr, вакансия, dataanalytic, see, linkedinskillassessment, skill, next, earn, badge

 
Тема 4 :
money, компания, much, one, life, country, поддерживать, nice, сообщать, great, начинать, university, deliver, новый, должность, junior, blockchain, good, удовольствие, developer

 
Тема 5 :
база, данных, software, всем, cloudcompute, веб, управление, компания, linkedin, openforbusiness, услуги, просмотреть, оказывать, радость, сообщать, страница, приложение, услуга, бизнес, разработка

 
Тема 6 :
leader, healthcar

Посмотрим на типичные посты в каждом топике

In [17]:
for i in range(n_topics):
    doc_id = np.argmax(topics[:, i])
    print(doc_id)
    print("Тема ", i+1)
    print(filtered_data.iloc[doc_id]["posts"])
    main_topics.loc[i,'Типичный пост'] = filtered_data.iloc[doc_id]["posts"]
    print("\n")

522
Тема  1
It’s been a while since I’ve completed my program at Bauman Moscow State Technical University, but I wanted to share this update with everyone.


1138
Тема  2
So I published my first article on arXiv https://lnkd.in/gfr6cuy!!! The article is about a new method for choosing the optimal learning neural network. From this method, the creation of a meta-learning algorithm follows! Be free to ask questions. I am open to discussing the article.#datascience #deeplearning #neuralnetwork #ai


574
Тема  3
I just earned a skill badge for MySQL! Who's next? See how you do on a #LinkedInSkillAssessment. #sql #sqlserver #sqlprogramming #dataengineering #dataanalytics #datascience #etl #datapipelines


1020
Тема  4
🚀 We just launched a six-week BNBChain Zero2Hero Bootcamp program for the Russian cohort, and it's off to a great start. Out of 642 applicants, we chose the best 200 individuals to receive expert training and mentorship from industry leaders in blockchain technology.          

In [18]:
topic_assignments = lda.transform(x)
filtered_data['topic'] = np.argmax(topic_assignments[:len(filtered_data)], axis=1) % n_topics + 1

In [19]:
filtered_data['reactions'] = pd.to_numeric(filtered_data['reactions'], errors='coerce')
top_topics = filtered_data.groupby('topic').agg({'reactions': 'sum'}).nlargest(10, 'reactions').index
filtered_data['top10topics'] = np.where(filtered_data['topic'].isin(top_topics), 'top10', '')
filtered_data['topic_reactions'] = filtered_data.groupby('topic')['reactions'].transform('sum')

In [20]:
filtered_data = filtered_data.reset_index()
display(filtered_data.head(15))

Unnamed: 0.1,index,Unnamed: 0,name,works_at,posts,reactions,current_position,ttl_duration,link,selected_posts,topic,top10topics,topic_reactions
0,0,0,ARTEM DOROFEEV,Backend senior software developer at Kaspersky...,#OpenToWork,0.0,Backend senior software developer at Kaspersky...,3561 days,https://www.linkedin.com/in/eocron/,opentowork,12,top10,1247.0
1,1,1,Aida Borlakova,Global IT Talent Scout |Team4You |10 000+,"#humour Переквалифицировался, прошел курсы IT ...",7.0,,,,humour переквалифицироваться пройти курс it пр...,2,top10,2989.0
2,2,2,Aida Borlakova,Global IT Talent Scout |Team4You |10 000+,#google #pandas #вакансия #аналитик #TeamLead ...,4.0,,,,google pandas вакансия аналитик teamlead remot...,7,top10,2243.0
3,3,3,Aida Borlakova,Global IT Talent Scout |Team4You |10 000+,Вакансия #SolutionArchitect Проекты: 1. Интегр...,9.0,,,,вакансия solutionarchitect проект интеграционн...,13,,769.0
4,4,4,Aida Borlakova,Global IT Talent Scout |Team4You |10 000+,Кто накидает мне здесь контакты аутстафф компа...,1.0,,,,кто накидает мне здесь контакт аутстафф компан...,6,,526.0
5,5,5,Aida Borlakova,Global IT Talent Scout |Team4You |10 000+,"🔍 DevOps engineer, грейд middleРазработка Data...",1.0,,,,devops engineer грейд middleразработка dataops...,7,top10,2243.0
6,6,6,Aida Borlakova,Global IT Talent Scout |Team4You |10 000+,Системный аналитик(Data/ETL)💼 Проект: разработ...,2.0,,,,системный etl проект разработка система по кли...,7,top10,2243.0
7,8,8,Aida Borlakova,Global IT Talent Scout |Team4You |10 000+,"#python #django #remote 📣 Друзья, порекомендуй...",8.0,,,,python django remote друг порекомендуйте пожал...,3,top10,808.0
8,9,9,Aida Borlakova,Global IT Talent Scout |Team4You |10 000+,#referal #senior #java #hrЕсть сильный бекенд ...,7.0,,,,referal senior java hrЕсть сильный бекенд прог...,3,top10,808.0
9,10,10,Aida Borlakova,Global IT Talent Scout |Team4You |10 000+,I’m #hiring. Know anyone who might be interested?,2.0,,,,I hire know anyone who might be interested,15,,457.0


# Повторение операций для постов, связанных с менторингом

In [21]:
mentoring_list_lemmas_eng = [
    'mentoring',
    'mentorship',
    'mentor',
    'onboarding',
    'coach',
    'coaching',
    'adaptation',
    'advise',
    'intern',
    'onboarding',
    'adaptation',
    'adviser',
    'coach',
    'mentor',
    'mentorship',
    'teach',
    'tutor'
]

mentoring_list_lemmas_rus = [
    'взять',
    'готовый',
    'готов',
    'менторство',
    'начинать',
    'прокачать',
    'прокачаться',
    'развивайся',
    'развиваться',
    'совет',
    'ученик',
    'научить',
    'научиться',
    'менторство',
    'менторинг',
    'наставничество',
    'ментор',
    'наставник',
    'наставлять'
    'онбординг',
    'адаптация',
    'коучинг',
    'коуч',
    'менторить',
    'наставлять'
]

In [22]:
mentoring_data = filtered_data.copy()

all_sentences_mentoring = []

for index in range(len(mentoring_data["posts"])):
    value = mentoring_data.loc[index, "posts"]
    try:
        lang = detect(value)

        if lang == 'en':
            doc = nlp_eng(value)
            lemmas = [token.lemma_ for token in doc if token.is_alpha]
            if any(lemma in mentoring_list_lemmas_eng for lemma in lemmas):
                cleaned_sentence = ' '.join(lemmas)
                if cleaned_sentence:
                    all_sentences_mentoring.append(cleaned_sentence)
                    mentoring_data.loc[index, 'selected_posts_mentoring'] = cleaned_sentence
        elif lang == 'ru':
            doc = nlp_rus(value)
            lemmas = [token.lemma_ for token in doc if token.is_alpha]
            if any(lemma in mentoring_list_lemmas_rus for lemma in lemmas):
                cleaned_sentence = ' '.join(lemmas)
                if cleaned_sentence:
                    all_sentences_mentoring.append(cleaned_sentence)
                    mentoring_data.loc[index, 'selected_posts_mentoring'] = cleaned_sentence
        else:
            mentoring_data.loc[index, 'selected_posts_mentoring'] = np.nan
    except Exception as e:
        print(f"Error processing value: {value}")
        print(f"Error message: {str(e)}")

mentoring_data = mentoring_data.dropna(subset=['selected_posts_mentoring'])

print(len(all_sentences_mentoring))
print(mentoring_data.shape)

91
(91, 14)


In [23]:
x = tfidf.fit_transform(all_sentences_mentoring)

In [24]:
id2word = {i: token for i, token in enumerate(tfidf.get_feature_names())}



In [25]:
n_topics=10

In [26]:
lda = LatentDirichletAllocation(n_components=n_topics, random_state=12345)

In [27]:
topics_mentoring = lda.fit_transform(x)

In [28]:
mentoring_topics = pd.DataFrame()

In [29]:
for i in range(n_topics):
    c = lda.components_[i, :]
    topic_tokens = [id2word.get(token_id) for token_id in np.argsort(c)[-20:]]
    print("Тема",i+1, ":")
    mentoring_topics.loc[i,'Тема'] = i+1
    print(", ".join(topic_tokens))
    mentoring_topics.loc[i,'Ключевые слова'] = ", ".join(topic_tokens)
    print("\n ")

Тема 1 :
experience, направление, manager, work, share, product, second, use, полный, startup, coach, country, one, training, teach, think, make, new, управление, business

 
Тема 2 :
may, start, coach, job, country, development, etc, provide, good, receive, mentorship, great, специалист, look, engineer, use, experience, data, python, developer

 
Тема 3 :
want, experience, startup, любой, teach, good, learn, run, работать, need, важный, like, work, start, опыт, технический, технология, оборудование, начинать, хороший

 
Тема 4 :
оборудование, time, начинать, условие, основный, управление, javascript, привет, всем, направление, разный, работа, искать, информация, нужный, весь, умение, developer, frontend, человек

 
Тема 5 :
мочь, ux, follow, problem, продукт, готовый, ссылка, уровень, делать, результат, свой, ui, курс, проект, задача, компания, опыт, работа, команда, это

 
Тема 6 :
сообщать, удовольствие, blockchain, весь, мочь, поиск, здравствовать, opentowork, рекомендация, совет, 

In [30]:
for i in range(n_topics):
    doc_id = np.argmax(topics_mentoring[:, i])
    print(doc_id)
    print("Тема ", i+1)
    print(mentoring_data.iloc[doc_id]["posts"])
    mentoring_topics.loc[i,'Типичный пост'] = mentoring_data.iloc[doc_id]["posts"]
    print("\n")

15
Тема  1
Have you heard of the Nail Polish Effect?Originally known as the “Lipstick Effect,” it was coined by Leonard Lauder - heir to The Estée Lauder Companies Inc.The Nail Polish Effect is the observation that affordable luxury products and services have historically been able to withstand a recession.This has been true for every major U.S. recession dating back to the 70s.In 2008, when real estate and banking institutions took a huge hit, the nail business was still growing as consumer demand actually picked up.Within the global beauty and personal care industry, economic data shows that nail polish is the fastest-growing category.Okay, why does this matter?. . . I just wanted to share my experience investing in small businesses with other small business owners.It took me a few years to hone in on the nail salon industry, and I want to share the mistakes I made so others can avoid them.I made the jump almost 2 years ago - it was tough for sure given COVID-19, but business boomed 

In [31]:
topic_assignments = lda.transform(x) 
mentoring_data['topic_mentoring'] = np.argmax(topic_assignments[:len(mentoring_data)], axis=1) % n_topics + 1

In [32]:
mentoring_data.loc[:, 'reactions'] = pd.to_numeric(mentoring_data['reactions'], errors='coerce')
mentoring_data['topic_mentoring_reactions'] = mentoring_data.groupby('topic_mentoring')['reactions'].transform('sum')

In [33]:
mentoring_data=mentoring_data.reset_index()
display(mentoring_data.head(15))

Unnamed: 0.1,level_0,index,Unnamed: 0,name,works_at,posts,reactions,current_position,ttl_duration,link,selected_posts,topic,top10topics,topic_reactions,selected_posts_mentoring,topic_mentoring,topic_mentoring_reactions
0,15,16,16,Aigerim Mautkan,AITAS KZ,"Здравствуйте, я в поиске новой карьерной возмо...",6.0,,,,здравствовать я в поиск новый карьерный возмож...,6,,526.0,здравствовать я в поиск новый карьерный возмож...,6,564.0
1,16,17,17,Aigerim Mautkan,AITAS KZ,ПОЛЕЗНЫЙ СПИСОК ДЛЯ РЕКРУТЕРОВ И ТЕХ КТО В ПОИ...,,,,,полезный список для рекрутеров и тех кто в пои...,3,top10,808.0,полезный список для рекрутеров и тех кто в пои...,7,277.0
2,18,19,19,Aigerim Mautkan,AITAS KZ,ПОЛЕЗНЫЙ СПИСОК ДЛЯ РЕКРУТЕРОВ И ТЕХ КТО В ПОИ...,,,,,полезный список для рекрутеров и тех кто в пои...,3,top10,808.0,полезный список для рекрутеров и тех кто в пои...,7,277.0
3,26,27,27,Aleksandr Drozdov,Senior Back End Software Engineer,"Здравствуйте, я в поиске новой карьерной возмо...",0.0,Senior Back End Software Engineer,10466 days,https://www.linkedin.com/in/drozdosold/,здравствовать я в поиск новый карьерный возмож...,6,,526.0,здравствовать я в поиск новый карьерный возмож...,6,564.0
4,27,28,28,Aleksandr Drozdov,Senior Back End Software Engineer,В с вязи с затруднениями в получении оборудова...,0.0,Senior Back End Software Engineer,10466 days,https://www.linkedin.com/in/drozdosold/,в с вязь с затруднение в получение оборудовани...,1,top10,2034.0,в с вязь с затруднение в получение оборудовани...,3,973.0
5,32,33,33,Aleksandr Drozdov,Senior Back End Software Engineer,"Здравствуйте, я в поиске новой карьерной возмо...",0.0,Senior Back End Software Engineer,10466 days,https://www.linkedin.com/in/drozdosold/,здравствовать я в поиск новый карьерный возмож...,6,,526.0,здравствовать я в поиск новый карьерный возмож...,6,564.0
6,33,34,34,Aleksandr Drozdov,Senior Back End Software Engineer,В с вязи с затруднениями в получении оборудова...,0.0,Senior Back End Software Engineer,10466 days,https://www.linkedin.com/in/drozdosold/,в с вязь с затруднение в получение оборудовани...,1,top10,2034.0,в с вязь с затруднение в получение оборудовани...,3,973.0
7,62,63,63,Aleksandra Belova,BI Analytics | Team Lead | Data analysis | Vis...,Why developing and business do not understand ...,2.0,,,,why develop and business do not understand eac...,14,top10,6726.0,why develop and business do not understand eac...,5,365.0
8,66,68,68,Alex Gr,Agile PM,"Здравствуйте, я в поиске новой карьерной возмо...",5.0,,,,здравствовать я в поиск новый карьерный возмож...,6,,526.0,здравствовать я в поиск новый карьерный возмож...,6,564.0
9,70,72,72,Alex Tarasov,Full Stack Web Developer,"Здравствуйте, я в поиске новой карьерной возмо...",1.0,Full Stack Web Developer,4840 days,https://www.linkedin.com/in/alex-tarasov-work/,здравствовать я в поиск новый карьерный возмож...,6,,526.0,здравствовать я в поиск новый карьерный возмож...,6,564.0


In [34]:
mentoring_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91 entries, 0 to 90
Data columns (total 17 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   level_0                    91 non-null     int64  
 1   index                      91 non-null     int64  
 2   Unnamed: 0                 91 non-null     int64  
 3   name                       91 non-null     object 
 4   works_at                   91 non-null     object 
 5   posts                      91 non-null     object 
 6   reactions                  80 non-null     float64
 7   current_position           23 non-null     object 
 8   ttl_duration               23 non-null     object 
 9   link                       23 non-null     object 
 10  selected_posts             91 non-null     object 
 11  topic                      91 non-null     int64  
 12  top10topics                91 non-null     object 
 13  topic_reactions            91 non-null     float64
 

# Составление финального датасета и запись необходимых документов

Присоединяем к отфильтрованному датасету колонки из менторинга

In [35]:
final_data = filtered_data.merge(mentoring_data[['posts', 'topic_mentoring', 'topic_mentoring_reactions']], on='posts', how='left')
final_data = final_data.drop_duplicates()

In [36]:
final_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1226 entries, 0 to 1793
Data columns (total 15 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   index                      1226 non-null   int64  
 1   Unnamed: 0                 1226 non-null   int64  
 2   name                       1226 non-null   object 
 3   works_at                   1226 non-null   object 
 4   posts                      1226 non-null   object 
 5   reactions                  1166 non-null   float64
 6   current_position           250 non-null    object 
 7   ttl_duration               250 non-null    object 
 8   link                       250 non-null    object 
 9   selected_posts             1226 non-null   object 
 10  topic                      1226 non-null   int64  
 11  top10topics                1226 non-null   object 
 12  topic_reactions            1226 non-null   float64
 13  topic_mentoring            91 non-null     float

Удаляем избыточные колонки

In [37]:
final_data = final_data.drop(columns=['index', 'Unnamed: 0', 'selected_posts'])

In [38]:
display(final_data.head(15))

Unnamed: 0,name,works_at,posts,reactions,current_position,ttl_duration,link,topic,top10topics,topic_reactions,topic_mentoring,topic_mentoring_reactions
0,ARTEM DOROFEEV,Backend senior software developer at Kaspersky...,#OpenToWork,0.0,Backend senior software developer at Kaspersky...,3561 days,https://www.linkedin.com/in/eocron/,12,top10,1247.0,,
1,Aida Borlakova,Global IT Talent Scout |Team4You |10 000+,"#humour Переквалифицировался, прошел курсы IT ...",7.0,,,,2,top10,2989.0,,
2,Aida Borlakova,Global IT Talent Scout |Team4You |10 000+,#google #pandas #вакансия #аналитик #TeamLead ...,4.0,,,,7,top10,2243.0,,
3,Aida Borlakova,Global IT Talent Scout |Team4You |10 000+,Вакансия #SolutionArchitect Проекты: 1. Интегр...,9.0,,,,13,,769.0,,
4,Aida Borlakova,Global IT Talent Scout |Team4You |10 000+,Кто накидает мне здесь контакты аутстафф компа...,1.0,,,,6,,526.0,,
5,Aida Borlakova,Global IT Talent Scout |Team4You |10 000+,"🔍 DevOps engineer, грейд middleРазработка Data...",1.0,,,,7,top10,2243.0,,
6,Aida Borlakova,Global IT Talent Scout |Team4You |10 000+,Системный аналитик(Data/ETL)💼 Проект: разработ...,2.0,,,,7,top10,2243.0,,
7,Aida Borlakova,Global IT Talent Scout |Team4You |10 000+,"#python #django #remote 📣 Друзья, порекомендуй...",8.0,,,,3,top10,808.0,,
8,Aida Borlakova,Global IT Talent Scout |Team4You |10 000+,#referal #senior #java #hrЕсть сильный бекенд ...,7.0,,,,3,top10,808.0,,
9,Aida Borlakova,Global IT Talent Scout |Team4You |10 000+,I’m #hiring. Know anyone who might be interested?,2.0,,,,15,,457.0,,


Сохраняем нужные файлы

In [39]:
main_topics.to_excel('main_topics.xlsx', index=False)
mentoring_topics.to_excel('mentoring_topics.xlsx', index=False)
final_data.to_excel('final_data.xlsx', index=False)
final_data.to_csv('final_data.csv', index=False)