In [1]:
import pandas as pd
import pymorphy2
import re
from tqdm import tqdm

In [2]:
df = pd.read_csv('employements_mult.csv', sep=';')
df.head(2)

Unnamed: 0,id,position,employer,achievements,responsibilities,start_date,finish_date
0,0.0,специалист,"АО ""РТК""",,"<p>Консультирование клиентов и продажа товара,...",2017-06-01,2019-06-01
1,0.0,специалист,"ООО ""Евросеть-Ритейл""",,"<p>Консультирование клиентов и продажа товара,...",2017-05-01,2017-06-01


In [3]:
lemmatizer = pymorphy2.MorphAnalyzer()


def preprocess_text(s, min_length=4):
    without_tags = re.sub('<.*?>', ' ', s).lower()
    tokenized =  re.findall("[\w']+", without_tags)
    filtered_by_length = list(filter(lambda x: len(x) >= min_length, tokenized))
#     norm_tokens = [lemmatizer.parse(x)[0].normal_form for x in filtered_by_length]
    
    return filtered_by_length

---

In [4]:
df['responsibilities_tokens'] = df['responsibilities'].fillna('').apply(preprocess_text)

all_words = []
for x in df['responsibilities_tokens'].values:
    all_words.extend(x)
    
lemmatized_tokens = dict()
unique_words = set(all_words)
for word in tqdm(unique_words):
    lemmatized_tokens[word] = lemmatizer.parse(word)[0].normal_form
    
    
def lemmatize_list(s, rules=lemmatized_tokens):
    result = ' '.join([rules.get(x, ' ') for x in s])
    return result


df['responsibilities'] = df['responsibilities_tokens'].apply(lemmatize_list)
df = df.drop(columns=['responsibilities_tokens'])

In [9]:
df.head()

Unnamed: 0,id,position,employer,achievements,responsibilities,start_date,finish_date
0,0.0,специалист,"АО ""РТК""",,консультирование клиент продажа товар работа д...,2017-06-01,2019-06-01
1,0.0,специалист,"ООО ""Евросеть-Ритейл""",,консультирование клиент продажа товар работа д...,2017-05-01,2017-06-01
2,0.0,,"АО ""МегаФон Ритейл""",,консультирование клиент продажа товар работа д...,2019-06-01,2019-11-01
3,0.0,,"ПАО ""МегаФон""",,консультирование клиент телефон продажа продук...,2019-06-01,2020-04-01
4,1.0,Копирайтер/рерайтер,WorkHardOnline,,фриланс nbsp копирайтинг рерайтинг стать разли...,2019-10-01,


---

In [10]:
df['achievements_tokens'] = df['achievements']\
    .fillna('')\
    .apply(lambda x: preprocess_text(x, 3))


all_words = []
for x in df['achievements_tokens'].values:
    all_words.extend(x)

lemmatized_tokens = dict()
unique_words = set(all_words)
for word in tqdm(unique_words):
    lemmatized_tokens[word] = lemmatizer.parse(word)[0].normal_form
    
df['achievements'] = df['achievements_tokens'].apply(lemmatize_list)
df = df.drop(columns=['achievements_tokens'])

100%|██████████| 135781/135781 [00:37<00:00, 3614.74it/s]


In [11]:
df['position_tokens'] = df['position']\
    .fillna('')\
    .apply(lambda x: preprocess_text(x, 3))


all_words = []
for x in df['position_tokens'].values:
    all_words.extend(x)

lemmatized_tokens = dict()
unique_words = set(all_words)
for word in tqdm(unique_words):
    lemmatized_tokens[word] = lemmatizer.parse(word)[0].normal_form
    
df['position_clean'] = df['position_tokens'].apply(lemmatize_list)
df = df.drop(columns=['position_tokens'])

100%|██████████| 46807/46807 [00:16<00:00, 2795.83it/s]


In [12]:
df['employer_tokens'] = df['employer']\
    .fillna('')\
    .apply(lambda x: preprocess_text(x, 2))


all_words = []
for x in df['employer_tokens'].values:
    all_words.extend(x)

lemmatized_tokens = dict()
unique_words = set(all_words)
for word in tqdm(unique_words):
    lemmatized_tokens[word] = word  #lemmatizer.parse(word)[0].normal_form
    
df['employer_clean'] = df['employer_tokens'].apply(lemmatize_list)
df = df.drop(columns=['employer_tokens'])

100%|██████████| 198585/198585 [00:00<00:00, 1185914.50it/s]


In [13]:
df.to_csv('employements_mult_new.csv', sep=';', index=False)