In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from pathlib import Path
from preprocess import preprocess_train_test

In [None]:
# from pandas_profiling import ProfileReport

# profile = ProfileReport(train_test, title="Pandas Profiling Report")
# profile.to_widgets()

In [2]:
# Location of the raw competition files; reused by the cell that loads the test split.
raw_data_path = Path('../data/raw')

# The raw files are semicolon-delimited.
train_csv = raw_data_path / 'train.csv'
train_df = pd.read_csv(train_csv, sep=';')

# Row count first, then a preview of the first rows as the cell output.
print(train_df.shape[0])
train_df.head()

306270


Unnamed: 0,id,position,region,industry,locality,locality_name,education_type,drive_licences,citizenship,schedule,...,relocation_ready,travel_ready,retraining_ready,is_worldskills_participant,has_qualifications,completeness_rate,creation_date,modification_date,publish_date,salary
0,0,Специалист,Тульская область,"Государственная служба, некоммерческие организ...",7100000100000,Тула,Незаконченное высшее,[B],Российская Федерация,Полный рабочий день,...,False,False,True,,,64.0,2020-05-07,2020-05-08,2020-05-07,37500
1,1,Лаборант,Алтайский край,"Государственная служба, некоммерческие организ...",2200000100000,Барнаул,Высшее,[B],Российская Федерация,Полный рабочий день,...,False,True,True,,,88.0,2020-10-21,2020-10-27,2020-10-27,14000
2,3,"Специалист, администратор, бухгалтер, экономист",Ульяновская область,"Административная работа, секретариат, АХО",7300000200000,Димитровград,Высшее,[B],Российская Федерация,Полный рабочий день,...,False,,True,,,78.0,2019-04-25,2020-02-04,2020-02-04,24600
3,4,Специалист,Свердловская область,"Государственная служба, некоммерческие организ...",6600002300000,Нижний Тагил,Высшее,[B],Российская Федерация,Полный рабочий день,...,False,False,False,,,91.0,2020-09-19,2020-09-20,2020-09-19,16075
4,5,Инструктор по плаванию,Иркутская область,"Здравоохранение, спорт, красота, социальное об...",3800000400000,Ангарск,,,Российская Федерация,Полный рабочий день,...,False,,,,,58.0,2020-04-20,2020-09-09,2020-04-21,22080


In [3]:
# Same semicolon-delimited layout as the training file.
test_csv = raw_data_path / 'test.csv'
test_df = pd.read_csv(test_csv, sep=';')

# Row count first, then a preview of the first rows as the cell output.
print(test_df.shape[0])
test_df.head()

131259


Unnamed: 0,id,position,region,industry,locality,locality_name,education_type,drive_licences,citizenship,schedule,...,salary_desired,relocation_ready,travel_ready,retraining_ready,is_worldskills_participant,has_qualifications,completeness_rate,creation_date,modification_date,publish_date
0,2,инженер,Алтайский край,"Строительство, ремонт, стройматериалы, недвижи...",2200000100000,Барнаул,Высшее,[B],Российская Федерация,Полный рабочий день,...,10288,False,True,True,,,86.0,2017-08-26,2019-05-17,2016-05-11
1,6,монтажник,Свердловская область,"Строительство, ремонт, стройматериалы, недвижи...",6600000000000,Свердловская,,,Российская Федерация,Полный рабочий день,...,56649,,True,True,,,47.0,2017-12-01,2019-07-10,2017-12-01
2,7,Инженер,Ульяновская область,"Транспорт, автобизнес, логистика, склад, ВЭД",7300300002000,Измайлово,Высшее,"[A, B, C]",Российская Федерация,Полный рабочий день,...,50000,False,False,True,,,66.0,2020-04-12,2020-04-14,2020-04-14
3,11,Юрист,Краснодарский край,Юриспруденция,2300000000000,Краснодарский,Высшее,,Российская Федерация,Полный рабочий день,...,20760,False,,True,,,69.0,2019-10-10,2020-08-25,2019-10-11
4,16,Персональный менеджер,Московская область,"Финансы, кредит, страхование, пенсионное обесп...",5001300100000,Красногорск,Высшее,,Российская Федерация,Полный рабочий день,...,70000,False,False,True,,,65.0,2020-04-20,2020-05-21,2020-05-21


In [4]:
# Remember which ids belong to which split so the combined frame can be
# separated again once all features have been attached.
train_ids, test_ids = (frame['id'].tolist() for frame in (train_df, test_df))

In [5]:
# Stack both splits so every feature step below is applied to them uniformly;
# the original row labels are discarded in favour of a fresh 0..n-1 index.
train_test = pd.concat([train_df, test_df], axis=0, ignore_index=True)

In [6]:
# Run the project's preprocessing helper (imported from `preprocess`) over the
# combined frame; it prints a progress line per processed column group.
train_test = train_test.pipe(preprocess_train_test)

Position preprocessed
Region preprocessed
Industry preprocessed
Locality name preprocessed
Education preprocessed
Drive licenses preprocessed
Citizenship preprocessed
Schedule preprocessed
Employement preprocessed
Age preprocessed
Gender preprocessed
Experience preprocessed
Salary desired preprocessed
Relocation ready preprocessed
Travel ready preprocessed
Retraining ready preprocessed
Is worldskills participant preprocessed
Has qualifications preprocessed
Creation date preprocessed
Modification date preprocessed
Publish date preprocessed
Days between preprocessed


In [7]:
# Calendar year of publication; used further down as the join key for the
# yearly economic indicators.
train_test = train_test.assign(publish_year=train_test['publish_date'].dt.year)

In [8]:
# Load the external price history, indexed by its `date` column.
resourse_prices = pd.read_csv('../data/external/mineral_resources_price_history.csv', parse_dates=['date'], index_col='date')

# Expand to one row per calendar day between the first and last date in the file.
price_days = pd.date_range(resourse_prices.index.min(), resourse_prices.index.max(), freq='1D', name='date')

# Days absent from the file inherit the most recent earlier value.
# `.ffill()` replaces the deprecated `fillna(method='ffill')` spelling and
# yields the same result.
resourse_prices = resourse_prices.reindex(price_days).ffill().reset_index()

In [9]:
# Attach the price columns that correspond to each row's publish date.
# Left join: rows whose publish_date has no match in the price table keep NaN.
train_test = pd.merge(
    train_test,
    resourse_prices,
    how='left',
    left_on='publish_date',
    right_on='date',
).drop(columns='date')

In [10]:
# Daily COVID figures, indexed by date.
covid_daily = pd.read_csv('../data/external/covid_by_date.csv', parse_dates=['date'], index_col='date')

# Cover every day from the earliest publish date up to the last day in the file.
# NOTE(review): publish dates later than the file's last day are not covered
# here and would come out as NaN after the left merge - confirm none exist.
covid_days = pd.date_range(
    start=train_test['publish_date'].min(),
    end=covid_daily.index.max(),
    freq='1D',
    name='date',
)

# Days without a record in the file are counted as zero.
covid_daily = covid_daily.reindex(covid_days).fillna(0).astype('int').reset_index()

In [11]:
# Attach the daily COVID columns that correspond to each row's publish date.
train_test = pd.merge(
    train_test,
    covid_daily,
    how='left',
    left_on='publish_date',
    right_on='date',
).drop(columns='date')

In [12]:
# Per-region daily COVID figures. Region names are lower-cased so they can be
# joined against the lower-cased `region` column of the main frame.
covid_regions = pd.read_csv(
    '../data/external/covid_by_date_and_region.csv',
    parse_dates=['date'],
).assign(region=lambda frame: frame['region'].str.lower())

In [13]:
# Join on (publish date, region). Columns of `covid_regions` whose names clash
# with columns already in the frame receive the '_region' suffix.
with_region_covid = pd.merge(
    train_test,
    covid_regions,
    how='left',
    left_on=['publish_date', 'region'],
    right_on=['date', 'region'],
    suffixes=('', '_region'),
)
# The right-hand date key duplicates publish_date and is no longer needed.
train_test = with_region_covid.drop(columns='date')

In [14]:
# Columns that received the '_region' suffix in the regional merge.
cov_region_cols = train_test.columns[train_test.columns.str.endswith('_region')].tolist()

# Rows without a regional match hold NaN; treat them as zero and store as integers.
region_counts = train_test[cov_region_cols].fillna(0).astype('int')
train_test[cov_region_cols] = region_counts

In [15]:
# Store `region` with the categorical dtype.
train_test = train_test.astype({'region': 'category'})

In [16]:
# Economic indicators table; it carries a `year` column used as the join key below.
economic_indicators = pd.read_csv(Path('../data/external') / 'economic_indicators.csv')

In [17]:
# Attach the indicators of the publication year; the right-hand `year` key is
# redundant with `publish_year` and is dropped.
train_test = pd.merge(
    train_test,
    economic_indicators,
    how='left',
    left_on='publish_year',
    right_on='year',
).drop(columns='year')

In [18]:
# Aggregated employment history produced by an earlier pipeline step (interim data).
employements = pd.read_pickle(Path('../data/interim') / 'employements_aggregated.pkl')

In [19]:
# Keep only the join key and the work-duration statistics.
duration_columns = [
    'id',
    'mean_work_duration',
    'max_work_duration',
    'min_work_duration',
    'median_work_duration',
]
employements = employements.loc[:, duration_columns]

In [20]:
# Attach the work-duration statistics by id; ids without an entry keep NaN for now.
train_test = pd.merge(train_test, employements, how='left', on='id')

In [21]:
# Ids with no employment aggregate get a duration of zero for every statistic.
for stat in ('mean', 'max', 'min', 'median'):
    duration_col = f'{stat}_work_duration'
    train_test[duration_col] = train_test[duration_col].fillna(0)

In [22]:
# Raw education records; semicolon-delimited like the other raw files.
education = pd.read_csv(Path('../data/raw') / 'education.csv', sep=';')

In [23]:
# Normalise institution names: lower-case, then strip double quotes, the numero
# sign and parentheses. `regex=False` makes each replacement an explicit literal
# substitution: '(' and ')' are regex metacharacters, and the default of the
# `regex` argument differs between pandas versions.
institution = education['institution'].str.lower()
for literal in ('"', '№', '(', ')'):
    institution = institution.str.replace(literal, '', regex=False)
education['institution'] = institution.fillna('NA_category')

# Missing graduation years are encoded as 0.
education['graduation_year'] = education['graduation_year'].fillna(0).astype('int')
# Years elapsed relative to 2020.
# NOTE(review): a missing graduation year (encoded as 0 above) yields 2020 here - confirm intended.
education['years_from_graduation'] = 2020 - education['graduation_year']

education = education.drop(columns='description')

# Keyword flags derived from the cleaned institution name
# (no NaN remains at this point, so the results are plain booleans).
education['is_school'] = education['institution'].str.contains('школа') | education['institution'].str.contains('сош')
education['is_university'] = education['institution'].str.contains('университет')
education['is_uchilishe'] = education['institution'].str.contains('училище')
In [24]:
# Attach the education features by id.
# NOTE(review): a left merge repeats a row once per matching education record,
# so this relies on `education` holding at most one row per id - confirm.
train_test = pd.merge(train_test, education, how='left', on='id')

In [25]:
# Ids without an education record come out of the merge as NaN. Give each
# column its placeholder value, then cast to the final dtype.
education_defaults = {
    'graduation_year': (0, 'int'),
    'institution': ('NA_category', 'category'),
    'years_from_graduation': (0, 'int'),
    'is_school': (False, 'bool'),
    'is_university': (False, 'bool'),
    'is_uchilishe': (False, 'bool'),
}
for column, (placeholder, dtype) in education_defaults.items():
    train_test[column] = train_test[column].fillna(placeholder).astype(dtype)

In [26]:
# Display the frame after the education merge; pandas elides the middle rows
# and columns in the rendered output.
train_test

Unnamed: 0,id,region,industry,locality_name,education_type,citizenship,employement_type,age,gender,experience,...,mean_work_duration,max_work_duration,min_work_duration,median_work_duration,graduation_year,institution,years_from_graduation,is_school,is_university,is_uchilishe
0,0,тульская область,"государственная служба, некоммерческие организ...",тула,незаконченное высшее,российская федерация,полная занятость,21,мужской,3,...,304.750000,730.0,31.0,229.0,2022,тульский государственный университет,-2,False,True,False
1,1,алтайский край,"государственная служба, некоммерческие организ...",барнаул,высшее,российская федерация,полная занятость,26,женский,4,...,0.000000,0.0,0.0,0.0,2016,алтайский государственный аграрный университет,4,False,True,False
2,3,ульяновская область,"административная работа, секретариат, ахо",димитровград,высшее,российская федерация,полная занятость,36,женский,12,...,994.666667,1645.0,638.0,701.0,2003,гоу спо димитровградский технический колледж,17,False,False,False
3,4,свердловская область,"государственная служба, некоммерческие организ...",нижний тагил,высшее,российская федерация,полная занятость,34,женский,9,...,1086.333333,1889.0,550.0,820.0,2005,нижнетагильский торгово-экономический техникум,15,False,False,False
4,5,иркутская область,"здравоохранение, спорт, красота, социальное об...",ангарск,NA_category,российская федерация,полная занятость,25,мужской,1,...,228.000000,305.0,151.0,228.0,0,NA_category,0,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
437524,437515,новосибирская область,"безопасность, службы охраны",лянино,среднее,российская федерация,полная занятость,40,мужской,2,...,670.000000,670.0,670.0,670.0,1996,лянинская средняя общеобразовательная школа,24,True,False,False
437525,437518,республика дагестан,"строительство, ремонт, стройматериалы, недвижи...",NA_category,среднее,российская федерация,полная занятость,42,мужской,0,...,0.000000,0.0,0.0,0.0,1994,сош 4 г буйнакска,26,True,False,False
437526,437526,г. москва,начал трудовую деятельность,москва,высшее,российская федерация,полная занятость,27,женский,0,...,0.000000,0.0,0.0,0.0,2015,сгэу,5,False,False,False
437527,437527,свердловская область,рабочие специальности,нижний тагил,среднее,российская федерация,полная занятость,24,мужской,1,...,184.000000,184.0,184.0,184.0,2014,мбоу сош 64,6,True,False,False


In [27]:
# Load the interim embeddings table.
embeddings = pd.read_pickle('../data/interim/employements_mult_new_ft_1.pkl')

# Drop rows that have no id and cast the id to integer in one chained
# expression. Assigning a column on the boolean-filtered slice (as a separate
# statement) triggers pandas' SettingWithCopyWarning; `astype` on the filtered
# result returns a fresh frame and avoids it.
embeddings = embeddings[embeddings['id'].notna()].astype({'id': 'int'})

In [28]:
# Collapse to one row per id by averaging every embedding column;
# `id` stays a regular column of the result.
mean_embeddings = embeddings.groupby('id', as_index=False).mean()

In [29]:
# Display the per-id averaged embeddings (rendering is truncated by pandas).
mean_embeddings

Unnamed: 0,id,employer_0,employer_1,employer_2,employer_3,employer_4,employer_5,employer_6,employer_7,employer_8,...,position_clean_90,position_clean_91,position_clean_92,position_clean_93,position_clean_94,position_clean_95,position_clean_96,position_clean_97,position_clean_98,position_clean_99
0,0,-1.017649,-0.447355,-1.514316,0.469207,-0.475916,-0.219186,-0.025919,-0.284017,0.213968,...,0.096677,0.002415,-0.072459,0.234490,-0.031559,0.010704,0.224756,-0.269307,0.012200,0.060977
1,1,-0.465040,0.132992,-0.815174,-0.046692,-0.829418,-0.405717,-0.056264,-0.344829,0.387615,...,0.439040,-0.134374,0.252825,0.568209,0.062679,-0.015571,0.246802,-0.486952,-0.189767,-0.224585
2,2,-0.992502,-0.152183,-0.535515,0.536152,0.228275,0.364199,-0.302803,-0.190838,0.206788,...,-0.002466,-0.391971,0.727872,0.324384,-0.107194,0.399409,0.256191,0.033960,0.298979,-0.091263
3,3,-1.185013,-0.385107,-0.751889,0.312636,-0.224279,-0.002439,-0.121650,0.048960,0.366831,...,0.396558,-0.133001,0.204240,0.192432,0.111202,-0.139169,0.213888,-0.097405,0.324649,-0.121138
4,4,-0.879738,-0.233525,-1.089134,0.361406,-0.200694,0.042866,-0.060856,-0.311024,0.200314,...,0.299617,0.093095,0.103868,0.271718,-0.017082,-0.110036,0.154896,-0.185698,0.103765,0.051490
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
343350,437521,-0.543067,-0.101195,-0.682919,-0.847683,-1.193004,-0.180839,0.007399,0.981701,0.306855,...,0.246862,-0.211687,0.145969,0.353881,-0.005096,-0.121159,0.087376,-0.048388,-0.097330,-0.110976
343351,437522,-1.165832,0.001414,-1.006713,0.513116,-0.042087,0.022525,-0.396497,-0.187312,-0.149837,...,0.337600,0.051778,0.278678,0.263065,-0.176482,-0.130657,0.127289,0.058634,0.216327,0.120358
343352,437524,-0.815103,0.064995,-0.662187,0.390056,-0.769040,-0.307043,-0.368575,-0.290121,-0.145781,...,0.140298,0.131908,0.124283,0.416084,0.125908,-0.068154,0.118247,-0.269563,0.098107,0.209056
343353,437525,-0.895925,0.187234,-0.374537,-0.122463,-0.627492,0.159512,-0.127963,-0.899920,0.176098,...,0.484242,-0.059477,0.218753,-0.158238,0.333149,0.021649,0.609135,-0.222898,-0.264985,0.041753


In [33]:
# Attach the averaged embedding columns by id; ids without embeddings keep NaN.
train_test = pd.merge(train_test, mean_embeddings, how='left', on='id')

In [34]:
# Summarise row count, column count, dtypes and memory usage after all merges.
train_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 437529 entries, 0 to 437528
Columns: 470 entries, id to position_clean_99
dtypes: bool(17), category(14), datetime64[ns](1), float32(400), float64(15), int64(23)
memory usage: 827.0 MB


In [35]:
# Separate the combined frame back into the original splits using the ids
# remembered before concatenation.
in_train = train_test['id'].isin(train_ids)
in_test = train_test['id'].isin(test_ids)

train = train_test.loc[in_train].reset_index(drop=True)
# The target column is removed from the test split.
test = train_test.loc[in_test].drop(columns='salary').reset_index(drop=True)

In [36]:
# Persist both splits. The output folder is created first so that a checkout
# without ../data/preprocessed does not fail with FileNotFoundError.
preprocessed_dir = Path('../data/preprocessed')
preprocessed_dir.mkdir(parents=True, exist_ok=True)

train.to_pickle(preprocessed_dir / 'train_final.pkl')
test.to_pickle(preprocessed_dir / 'test_final.pkl')