In [92]:
import math
import copy
import pymorphy2
import pandas_profiling
import warnings
import matplotlib.pyplot as plt
import regex as re
import numpy as np
import pandas as pd

from tempfile import mkdtemp
from shutil import rmtree
from time import time
from sklearn import preprocessing
from scipy import optimize
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, ShuffleSplit
from sklearn.naive_bayes import BernoulliNB, ComplementNB, MultinomialNB
from catboost import CatBoostClassifier, cv, Pool, CatBoostRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline
from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords
from nltk.stem.snowball import RussianStemmer

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [93]:
%load_ext ipython_anybar
%ipython_anybar_connect 192.168.0.102 1738

The ipython_anybar extension is already loaded. To reload it, use:
  %reload_ext ipython_anybar
Connect to 192.168.0.102:1738


In [94]:
def remove_emoji(string):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           u"\U0001f926-\U0001f937"
                           u"\u200d"
                           u"\u2640-\u2642" 
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)

In [95]:
def load_data(posts_file_name='user_posts.csv', profiles_file_name='user_profiles.csv', samples=1.):
    """
    param samples: can be absolute int number or float for partition.
    
    returns posts, profiles
    """
    profiles = pd.read_csv(profiles_file_name)
    posts = pd.read_csv(posts_file_name)
    
    profiles = profiles.sample(n=samples if type(samples) is int else None,
                              frac=samples if type(samples) is float else None)
    posts = posts[posts.id.isin(profiles.id)]
    
    return posts, profiles

In [96]:
%%time
posts, profiles = load_data(samples=1.)

CPU times: user 9.43 s, sys: 2.48 s, total: 11.9 s
Wall time: 12.6 s


In [97]:
# Lets look reports for our data.
pandas_profiling.ProfileReport(profiles)

0,1
Number of variables,28
Number of observations,13543
Total Missing (%),51.4%
Total size in memory,2.6 MiB
Average record size in memory,203.0 B

0,1
Numeric,3
Categorical,19
Boolean,2
Date,0
Text (Unique),1
Rejected,3
Unsupported,0

0,1
Distinct count,1125
Unique (%),8.3%
Missing (%),91.6%
Missing (n),12412

0,1
Человек,3
"В ответ на новую политику ""ВКонтакте"" я настоящим объявляю, что все мои персональные данные, фотографии, рисунки, переписка и так далее являются объектами моего авторского права (согласно Бернской Конвенции). Для коммерческого использования всех вышеупомянутых объектов авторского права в каждом конкретном случае необходимо мое письменное разрешение.",2
,2
Other values (1121),1124
(Missing),12412

Value,Count,Frequency (%),Unnamed: 3
Человек,3,0.0%,
"В ответ на новую политику ""ВКонтакте"" я настоящим объявляю, что все мои персональные данные, фотографии, рисунки, переписка и так далее являются объектами моего авторского права (согласно Бернской Конвенции). Для коммерческого использования всех вышеупомянутых объектов авторского права в каждом конкретном случае необходимо мое письменное разрешение.",2,0.0%,
,2,0.0%,
"Да я такая странная... Кому-то долгожданная. Я умная и смелая, я без сомненья верная. Я в ком-то растворенная. С другими недотрога я. Юла я заведенная, вообще-то, очень скромная. С небес звезда сошедшая. Тайфун - я сумасшедшая. Красивая, фигурная и солнечно-я-лунная. Я тихая и милая. Конечно, я любимая. Кому-то ядовитая, но все же не забытая. Порою непутевая. И каждый день я новая. Еще принципиальная. Смешная и забавная. Для дружбы я открытая. От подлости-разбитая. Порой бываю сложная, занудно-невозможная. Как красный перец жгучая. Тропа в лесу дремучем я. Такая вся клубничная, еще я необычная. Простая и занятная, порою непонятная. Бываю я застенчива, как ветер я изменчива. Эх, очень я мечтательна. В работе я старательна. Такая вот я разная и самая прекрасная.",2,0.0%,
-,2,0.0%,
),2,0.0%,
Сайт - http://smirnovalexander.com Группа - http://vk.com/public35537183 Отзывы - https://vk.com/topic-35537183_29416686,1,0.0%,
Девочка с глазами из самого синего льда...,1,0.0%,
:),1,0.0%,
Здоровое чувство юмора и золотые руки,1,0.0%,

0,1
Distinct count,956
Unique (%),7.1%
Missing (%),92.6%
Missing (n),12540

0,1
фотограф,10
Фотограф,10
Работаю,4
Other values (952),979
(Missing),12540

Value,Count,Frequency (%),Unnamed: 3
фотограф,10,0.1%,
Фотограф,10,0.1%,
Работаю,4,0.0%,
Фотография,3,0.0%,
работа,3,0.0%,
.,3,0.0%,
работаю,3,0.0%,
Мама,2,0.0%,
Бездельник,2,0.0%,
инженер,2,0.0%,

0,1
Distinct count,3304
Unique (%),24.4%
Missing (%),61.0%
Missing (n),8258

0,1
18.6.1987,7
15.10.1985,7
20.12.1987,7
Other values (3300),5264
(Missing),8258

Value,Count,Frequency (%),Unnamed: 3
18.6.1987,7,0.1%,
15.10.1985,7,0.1%,
20.12.1987,7,0.1%,
16.3.1987,6,0.0%,
9.7.1988,6,0.0%,
23.5.1988,6,0.0%,
18.5.1989,6,0.0%,
9.9.1988,6,0.0%,
21.3.1988,6,0.0%,
24.1.1989,5,0.0%,

0,1
Distinct count,940
Unique (%),6.9%
Missing (%),92.9%
Missing (n),12577

0,1
Мастер и Маргарита,7
Библия,6
много,4
Other values (936),949
(Missing),12577

Value,Count,Frequency (%),Unnamed: 3
Мастер и Маргарита,7,0.1%,
Библия,6,0.0%,
много,4,0.0%,
нет,3,0.0%,
не читаю,2,0.0%,
Стивен Кинг,2,0.0%,
Техническая литература,2,0.0%,
люблю читать,2,0.0%,
Детективы,2,0.0%,
бумажные,2,0.0%,

0,1
Constant value,

0,1
Constant value,

0,1
Distinct count,1554
Unique (%),11.5%
Missing (%),0.0%
Missing (n),0

0,1
Александр,416
Ольга,387
Елена,377
Other values (1551),12363

Value,Count,Frequency (%),Unnamed: 3
Александр,416,3.1%,
Ольга,387,2.9%,
Елена,377,2.8%,
Екатерина,354,2.6%,
Анна,348,2.6%,
Мария,316,2.3%,
Алексей,315,2.3%,
Юлия,310,2.3%,
Дмитрий,308,2.3%,
Сергей,306,2.3%,

0,1
Distinct count,1818
Unique (%),13.4%
Missing (%),13.5%
Missing (n),1830
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,521.85
Minimum,0
Maximum,223120
Zeros (%),0.5%

0,1
Minimum,0.0
5-th percentile,19.6
Q1,88.0
Median,196.0
Q3,435.0
95-th percentile,1592.0
Maximum,223120.0
Range,223120.0
Interquartile range,347.0

0,1
Standard deviation,2826.8
Coef of variation,5.4169
Kurtosis,3578.8
Mean,521.85
MAD,549.02
Skewness,50.951
Sum,6112400
Variance,7990800
Memory size,105.9 KiB

Value,Count,Frequency (%),Unnamed: 3
0.0,73,0.5%,
82.0,49,0.4%,
39.0,47,0.3%,
89.0,47,0.3%,
99.0,46,0.3%,
61.0,45,0.3%,
24.0,45,0.3%,
29.0,45,0.3%,
81.0,44,0.3%,
48.0,42,0.3%,

Value,Count,Frequency (%),Unnamed: 3
0.0,73,0.5%,
1.0,22,0.2%,
2.0,27,0.2%,
3.0,30,0.2%,
4.0,20,0.1%,

Value,Count,Frequency (%),Unnamed: 3
43182.0,1,0.0%,
52719.0,1,0.0%,
55126.0,1,0.0%,
117791.0,1,0.0%,
223122.0,1,0.0%,

0,1
Distinct count,723
Unique (%),5.3%
Missing (%),94.3%
Missing (n),12770

0,1
жизнь,7
Жизнь,6
Футбол,5
Other values (719),755
(Missing),12770

Value,Count,Frequency (%),Unnamed: 3
жизнь,7,0.1%,
Жизнь,6,0.0%,
Футбол,5,0.0%,
Прятки,5,0.0%,
футбол,4,0.0%,
-,4,0.0%,
нет,4,0.0%,
Настольные,3,0.0%,
Преферанс,3,0.0%,
подвижные,3,0.0%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.9%
Missing (n),119

0,1
True,13424
(Missing),119

Value,Count,Frequency (%),Unnamed: 3
True,13424,99.1%,
(Missing),119,0.9%,

0,1
Constant value,False

0,1
Distinct count,845
Unique (%),6.2%
Missing (%),83.8%
Missing (n),11343

0,1
Санкт-Петербург,403
Москва,292
Ленинград,114
Other values (841),1391
(Missing),11343

Value,Count,Frequency (%),Unnamed: 3
Санкт-Петербург,403,3.0%,
Москва,292,2.2%,
Ленинград,114,0.8%,
Питер,46,0.3%,
СПб,27,0.2%,
Новосибирск,19,0.1%,
Петербург,15,0.1%,
Екатеринбург,14,0.1%,
Красноярск,13,0.1%,
Мурманск,13,0.1%,

0,1
Distinct count,13543
Unique (%),100.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,1858400
Minimum,24
Maximum,27113067
Zeros (%),0.0%

0,1
Minimum,24
5-th percentile,51253
Q1,304950
Median,671680
Q3,1600500
95-th percentile,9044100
Maximum,27113067
Range,27113043
Interquartile range,1295500

0,1
Standard deviation,3981300
Coef of variation,2.1423
Kurtosis,18.017
Mean,1858400
MAD,1963500
Skewness,4.1747
Sum,25168886556
Variance,15851000000000
Memory size,105.9 KiB

Value,Count,Frequency (%),Unnamed: 3
464893,1,0.0%,
1385233,1,0.0%,
541501,1,0.0%,
31546,1,0.0%,
123705,1,0.0%,
841924,1,0.0%,
897843,1,0.0%,
434993,1,0.0%,
207664,1,0.0%,
580398,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
24,1,0.0%,
27,1,0.0%,
131,1,0.0%,
259,1,0.0%,
315,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
26982337,1,0.0%,
27038335,1,0.0%,
27042003,1,0.0%,
27108042,1,0.0%,
27113067,1,0.0%,

0,1
Distinct count,13543
Unique (%),100.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,6771
Minimum,0
Maximum,13542
Zeros (%),0.0%

0,1
Minimum,0.0
5-th percentile,677.1
Q1,3385.5
Median,6771.0
Q3,10156.0
95-th percentile,12865.0
Maximum,13542.0
Range,13542.0
Interquartile range,6771.0

0,1
Standard deviation,3909.7
Coef of variation,0.57741
Kurtosis,-1.2
Mean,6771
MAD,3385.7
Skewness,0
Sum,91699653
Variance,15286000
Memory size,105.9 KiB

Value,Count,Frequency (%),Unnamed: 3
2047,1,0.0%,
2644,1,0.0%,
4743,1,0.0%,
6790,1,0.0%,
645,1,0.0%,
2692,1,0.0%,
12931,1,0.0%,
8833,1,0.0%,
10880,1,0.0%,
4727,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
0,1,0.0%,
1,1,0.0%,
2,1,0.0%,
3,1,0.0%,
4,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
13538,1,0.0%,
13539,1,0.0%,
13540,1,0.0%,
13541,1,0.0%,
13542,1,0.0%,

0,1
Distinct count,1102
Unique (%),8.1%
Missing (%),91.6%
Missing (n),12407

0,1
фотография,5
разнообразные,4
жизнь,4
Other values (1098),1123
(Missing),12407

Value,Count,Frequency (%),Unnamed: 3
фотография,5,0.0%,
разнообразные,4,0.0%,
жизнь,4,0.0%,
Разносторонние,3,0.0%,
много,3,0.0%,
спорт,3,0.0%,
.,3,0.0%,
Фотография,3,0.0%,
Жизнь,3,0.0%,
интересные,2,0.0%,

0,1
Distinct count,9124
Unique (%),67.4%
Missing (%),0.0%
Missing (n),0

0,1
Иванова,69
Иванов,57
Смирнова,50
Other values (9121),13367

Value,Count,Frequency (%),Unnamed: 3
Иванова,69,0.5%,
Иванов,57,0.4%,
Смирнова,50,0.4%,
Васильева,38,0.3%,
Смирнов,35,0.3%,
Петрова,33,0.2%,
Кузнецова,33,0.2%,
Кузнецов,31,0.2%,
Попова,30,0.2%,
Лебедева,28,0.2%,

0,1
Distinct count,1324
Unique (%),9.8%
Missing (%),87.6%
Missing (n),11863

0,1
Смирнова,22
Иванова,20
Андреева,10
Other values (1320),1628
(Missing),11863

Value,Count,Frequency (%),Unnamed: 3
Смирнова,22,0.2%,
Иванова,20,0.1%,
Андреева,10,0.1%,
Дмитриева,10,0.1%,
Алексеева,9,0.1%,
Волкова,9,0.1%,
Попова,9,0.1%,
Петрова,9,0.1%,
Михайлова,8,0.1%,
Кузнецова,7,0.1%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.20985

0,1
True,2842
(Missing),10701

Value,Count,Frequency (%),Unnamed: 3
True,2842,21.0%,
(Missing),10701,79.0%,

0,1
Distinct count,1010
Unique (%),7.5%
Missing (%),92.4%
Missing (n),12509

0,1
много,7
Любовь и голуби,3
нет,3
Other values (1006),1021
(Missing),12509

Value,Count,Frequency (%),Unnamed: 3
много,7,0.1%,
Любовь и голуби,3,0.0%,
нет,3,0.0%,
Фантастика,2,0.0%,
Эффект бабочки,2,0.0%,
Их много,2,0.0%,
Остров,2,0.0%,
Служебный роман,2,0.0%,
интересные,2,0.0%,
Советские,2,0.0%,

0,1
Distinct count,987
Unique (%),7.3%
Missing (%),92.4%
Missing (n),12515

0,1
Меломан,8
разная,6
Разная,5
Other values (983),1009
(Missing),12515

Value,Count,Frequency (%),Unnamed: 3
Меломан,8,0.1%,
разная,6,0.0%,
Разная,5,0.0%,
меломан,4,0.0%,
по настроению,4,0.0%,
Классическая,3,0.0%,
Под настроение,3,0.0%,
любая,3,0.0%,
Рок,3,0.0%,
Клубная,3,0.0%,

0,1
Distinct count,1997
Unique (%),14.7%
Missing (%),82.4%
Missing (n),11164

0,1
♥,32
Сергеевич,25
Владимирович,23
Other values (1993),2299
(Missing),11164

Value,Count,Frequency (%),Unnamed: 3
♥,32,0.2%,
Сергеевич,25,0.2%,
Владимирович,23,0.2%,
Александрович,19,0.1%,
Андреевич,14,0.1%,
Сергеевна,14,0.1%,
Александровна,14,0.1%,
★,12,0.1%,
Владимировна,12,0.1%,
♥♥♥,11,0.1%,

0,1
Distinct count,4
Unique (%),0.0%
Missing (%),19.8%
Missing (n),2687

0,1
university,6972
work,3878
school,6
(Missing),2687

Value,Count,Frequency (%),Unnamed: 3
university,6972,51.5%,
work,3878,28.6%,
school,6,0.0%,
(Missing),2687,19.8%,

0,1
Distinct count,1343
Unique (%),9.9%
Missing (%),90.0%
Missing (n),12193

0,1
,3
нет,3
В конце все обязательно должно быть хорошо. Если все плохо - значит еще не конец...,3
Other values (1339),1341
(Missing),12193

Value,Count,Frequency (%),Unnamed: 3
,3,0.0%,
нет,3,0.0%,
В конце все обязательно должно быть хорошо. Если все плохо - значит еще не конец...,3,0.0%,
Все будет хорошо!!!,2,0.0%,
Никогда не сдавайся!,2,0.0%,
"Мне память честно сердцу служит, всегда шепча, что повезло, что всё ещё намного хуже, ещё херовей быть могло. (И.Губерман)",1,0.0%,
зенит чемпион!,1,0.0%,
"Встреча двух людей – это как встреча двух химических элементов. Реакция может и не произойти, но если она происходит – изменяются оба.",1,0.0%,
"Друг - это тот, кто знает о тебе все и тем не менее любит тебя.",1,0.0%,
"Достоинство человека заключается в его способности любить. Я люблю не то что красиво, а считаю красивым то, что люблю. Не стоит жалеть об ошибках, которые еще не раз хочется повторить. Судьба дает нам желаемое только тогда, когда мы уже научились обходиться без него. Не удерживай то, что уходит, не отталкивай то, что пришло. Свободен лишь тот, кто потерял всё, ради чего стоит жить. Думать о себе - не эгоизм. Тот, кто не думает о себе, вообще не способен мыслить. Зеркало, которому женщины верят больше всего, - это глаза мужчины. Если ты идешь мне навстречу - значит нам не по пути.",1,0.0%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.082552

0,1
True,1118
(Missing),12425

Value,Count,Frequency (%),Unnamed: 3
True,1118,8.3%,
(Missing),12425,91.7%,

First 3 values
id570753
konkova
id3944961

Last 3 values
id3266971
masha.egorova
id725023

Value,Count,Frequency (%),Unnamed: 3
0x5555,1,0.0%,
1avhadieva1,1,0.0%,
1katusha,1,0.0%,
1nightwolf,1,0.0%,
22anya_anya,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
zvonec,1,0.0%,
zvyagina_anna,1,0.0%,
zyamik,1,0.0%,
zzakk,1,0.0%,
zzzzya,1,0.0%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
f,7895
m,5648

Value,Count,Frequency (%),Unnamed: 3
f,7895,58.3%,
m,5648,41.7%,

0,1
Distinct count,5755
Unique (%),42.5%
Missing (%),56.3%
Missing (n),7625

0,1
...,14
❤,12
.,10
Other values (5751),5882
(Missing),7625

Value,Count,Frequency (%),Unnamed: 3
...,14,0.1%,
❤,12,0.1%,
.,10,0.1%,
¯\_(ツ)_/¯,10,0.1%,
:),6,0.0%,
=),6,0.0%,
Carpe diem,5,0.0%,
изменить статус,5,0.0%,
💕,4,0.0%,
✌,4,0.0%,

0,1
Distinct count,592
Unique (%),4.4%
Missing (%),95.1%
Missing (n),12874

0,1
нет,17
не смотрю,9
КВН,8
Other values (588),635
(Missing),12874

Value,Count,Frequency (%),Unnamed: 3
нет,17,0.1%,
не смотрю,9,0.1%,
КВН,8,0.1%,
Что? Где? Когда?,6,0.0%,
-,6,0.0%,
Top Gear,6,0.0%,
Не смотрю,4,0.0%,
Разрушители легенд,4,0.0%,
Не смотрю телевизор,3,0.0%,
смешные,3,0.0%,

Unnamed: 0,id,first_name,last_name,screen_name,maiden_name,nickname,sex,bdate,relation,followers_count,has_photo,status,about,quotes,activities,interests,music,movies,tv,books,games,city,country,home_town,high_education,occupation,military
13162,121514,Mike,Nimoff,nimoff,,,m,7.3.1985,False,1359.0,True,,,,,,,,,,,,,,False,work,False
6889,884718,Юлия,Лагунова,yulia_a_yulia,,,f,,False,2027.0,True,https://www.instagram.com/yulia_a_yulia/,,,,,,,,,,,,,False,work,False
2675,52990,Валерий,Кутейников,valeriy_kuteynikov,,,m,13.11.1978,False,815.0,True,,,,,,,,,,,,,,False,work,False
9207,629076,Мариша,Виноградова,id629076,,,f,,False,61.0,True,,,,,,,,,,,,,,False,,False
7026,1806708,Светлана,Томашевич,id1806708,Голикова,,f,31.12.1976,False,239.0,True,"""Кто сказал тебе, что нет на свете настоящей, ...",,,,,,,,,,,,,False,work,False


In [98]:
pandas_profiling.ProfileReport(posts)

0,1
Number of variables,2
Number of observations,810929
Total Missing (%),0.0%
Total size in memory,18.6 MiB
Average record size in memory,24.0 B

0,1
Numeric,1
Categorical,1
Boolean,0
Date,0
Text (Unique),0
Rejected,0
Unsupported,0

0,1
Distinct count,11375
Unique (%),1.4%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,1821300
Minimum,24
Maximum,27113067
Zeros (%),0.0%

0,1
Minimum,24
5-th percentile,52366
Q1,307270
Median,682290
Q3,1649000
95-th percentile,8294100
Maximum,27113067
Range,27113043
Interquartile range,1341700

0,1
Standard deviation,3861000
Coef of variation,2.12
Kurtosis,19.093
Mean,1821300
MAD,1888200
Skewness,4.2779
Sum,1476910327240
Variance,14907000000000
Memory size,12.4 MiB

Value,Count,Frequency (%),Unnamed: 3
314465,447,0.1%,
349283,356,0.0%,
695243,342,0.0%,
1229809,299,0.0%,
738083,299,0.0%,
1370427,299,0.0%,
1897975,285,0.0%,
698722,281,0.0%,
10956223,277,0.0%,
1852934,275,0.0%,

Value,Count,Frequency (%),Unnamed: 3
24,79,0.0%,
27,13,0.0%,
259,88,0.0%,
315,89,0.0%,
351,76,0.0%,

Value,Count,Frequency (%),Unnamed: 3
26921792,88,0.0%,
26982337,18,0.0%,
27038335,74,0.0%,
27042003,11,0.0%,
27113067,25,0.0%,

0,1
Distinct count,742532
Unique (%),91.6%
Missing (%),0.0%
Missing (n),3

0,1
дома.,375
Подпишите петицию,314
Со мной всё в порядке.,280
Other values (742528),809957

Value,Count,Frequency (%),Unnamed: 3
дома.,375,0.0%,
Подпишите петицию,314,0.0%,
Со мной всё в порядке.,280,0.0%,
❤️,257,0.0%,
#copyright@bestad,252,0.0%,
на работе.,239,0.0%,
🎶,198,0.0%,
❤,192,0.0%,
😂😂😂,172,0.0%,
Настроение:,169,0.0%,

Unnamed: 0,id,text
0,581861,#паркДали #Вьетнам #Фукуок
1,581861,"Люди — не роботы, они ошибаются, внезапно рожа..."
2,581861,"Сможете определить где визуализация, а где реа..."
3,581861,Теплый свет родных домиков✨ Родные также и для...
4,581861,"Ура! Видео с [club162382455|""Архитектурных кан..."


In [99]:
def combine_data(posts, profiles, profile_columns):
    # This method concatenates posts for each user and adds fields from profiles.
    # Result df index is user id.
    def concatenator(texts):
        result = ' '.join(texts.astype(str))
        return result
    grouped_posts = posts.groupby(by='id', as_index=False).agg({'text': concatenator})
    grouped_posts = grouped_posts.set_index('id')
    profiles_fields = profiles.set_index('id').loc[:,profile_columns]
    
    result = pd.merge(profiles_fields, grouped_posts, on='id')
    return result

In [100]:
target_column_name = 'sex'
combined_data = combine_data(posts, profiles, [target_column_name])
combined_data

Unnamed: 0_level_0,sex,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
121514,m,Принес вам на ночь историю про одного неудачни...
884718,f,"Фотографии наших учеников, с нашего последнего..."
52990,m,Империя Спасибо Украинскй зомбиленд: взгляд из...
629076,f,ВНИМАНИЕ ВСЕМ ПОСТАМ!<br>Как и обещали - трети...
1806708,f,Внимание! Максимальный репост! <br>Вчера 18 но...
7628805,m,Саксофон от которого бегут мурашки по коже.. Э...
3450353,f,Stream of silens.... События - ужасные трусы: ...
1142304,m,"Смотри, что нашлось на AliExpress Какой ужас....."
1217673,f,"""Ted Toyler"" , 1920-е годы Это кто тут у нас т..."
274752,f,"""Eсли ты любишь человека таким, какой он есть,..."


In [101]:
# Clean data.

# Remove html tags.
combined_data.text.replace('<[^<]+?>', '', regex=True, inplace=True)

# Remove tags started from '#'.
combined_data.text.replace('#[\S]+', '', regex=True, inplace=True)

# Remove contacts links: '[id38042050|Анастасия Темрязанская]'.
combined_data.text.replace('\[[^<]+?\]', '', regex=True, inplace=True)

# Remove emoji.
combined_data.text = combined_data.text.apply(remove_emoji)

combined_data

Unnamed: 0_level_0,sex,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
121514,m,Принес вам на ночь историю про одного неудачни...
884718,f,"Фотографии наших учеников, с нашего последнего..."
52990,m,Империя Спасибо Украинскй зомбиленд: взгляд из...
629076,f,ВНИМАНИЕ ВСЕМ ПОСТАМ!Как и обещали - третий тр...
1806708,f,Внимание! Максимальный репост! Вчера 18 ноября...
7628805,m,Саксофон от которого бегут мурашки по коже.. Э...
3450353,f,Stream of silens.... События - ужасные трусы: ...
1142304,m,"Смотри, что нашлось на AliExpress Какой ужас....."
1217673,f,"""Ted Toyler"" , 1920-е годы Это кто тут у нас т..."
274752,f,"""Eсли ты любишь человека таким, какой он есть,..."


In [102]:
# Normalization.
# Here we will write 2 methods for tokenization->remove stop words/punctuations/emoji->stem/lem->join to string.
# Using these methods we make 2 new columns for further processing.

def tokenize(text):
    words = word_tokenize(text)
    return words

def remove_stopwords(words):
    stop_words = stopwords.words('russian')
    cleared_words = [word for word in words if word not in stop_words and len(word) > 2]
    return cleared_words

russian_stemmer = RussianStemmer()
def word_stem(word):
    stem = russian_stemmer.stem(word)
    return stem

pymorph = pymorphy2.MorphAnalyzer()
def word_lemma(word):
    lemma = pymorph.parse(word)[0]
    return lemma.normal_form

def normalize(text, norm_func):
    words = tokenize(text)
    without_stopwords = remove_stopwords(words)
    stems = [norm_func(word) for word in without_stopwords]
    joined = ' '.join(stems)
    return joined

def normalize_by_stems(text):
    result = normalize(text, word_stem)
    return result
    
def normalize_by_lemms(text):
    result = normalize(text, word_lemma)
    return result

In [103]:
combined_data['text_stems'] = combined_data.text.apply(normalize_by_stems)
combined_data['text_lemms'] = combined_data.text.apply(normalize_by_lemms)

In [104]:
combined_data

Unnamed: 0_level_0,sex,text,text_stems,text_lemms
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
121514,m,Принес вам на ночь историю про одного неудачни...,принес ноч истор одн неудачник телефон виде от...,принести ночь история один неудачник телефон в...
884718,f,"Фотографии наших учеников, с нашего последнего...",фотограф наш ученик наш последн занят базов мо...,фотография наш ученик наш последний занятие ба...
52990,m,Империя Спасибо Украинскй зомбиленд: взгляд из...,импер спасиб украинск зомбиленд взгляд изнутр ...,империя спасибо украинскть зомбиленд взгляд из...
629076,f,ВНИМАНИЕ ВСЕМ ПОСТАМ!Как и обещали - третий тр...,вниман всем пост как обеща трет трек грядущ ал...,внимание весь пост как обещать три трек грядущ...
1806708,f,Внимание! Максимальный репост! Вчера 18 ноября...,вниман максимальн репост вчер ноябр стадион пе...,внимание максимальный репост вчера ноябрь стад...
7628805,m,Саксофон от которого бегут мурашки по коже.. Э...,саксофон котор бегут мурашк коже.. эт музык ле...,саксофон который бежать мурашка коже.. это муз...
3450353,f,Stream of silens.... События - ужасные трусы: ...,Stre silens ... событ ужасн трус случа одн сби...,stream silens ... событие ужасный трус случать...
1142304,m,"Смотри, что нашлось на AliExpress Какой ужас.....",смотр нашл AliExpress как ужас ... оригина htt...,смотреть найтись aliexpress какой ужас ... ори...
1217673,f,"""Ted Toyler"" , 1920-е годы Это кто тут у нас т...",Ted Toyler 1920-е год эт жалк лисенок STEIFF п...,ted toyler 1920-е год это жалкий лисёнок steif...
274752,f,"""Eсли ты любишь человека таким, какой он есть,...",Eсл люб человек так люб Eсл пыта кардинальн ме...,eслить любить человек такой любить eслить пыта...


In [105]:
"""
Next we will compare 5 models:
1. stems + CountVectorizer + MultinomialNB
2. stems + TfidfVectorizer + MultinomialNB
3. lemmas + CountVectorizer + MultinomialNB
4. lemmas + TfidfVectorizer + MultinomialNB

All models we will implement using sklearn. 
We will search best model params on partial dataset and then make cross validation score on full dataset.
"""

'\nNext we will compare 5 models:\n1. stems + CountVectorizer + MultinomialNB\n2. stems + TfidfVectorizer + MultinomialNB\n3. lemmas + CountVectorizer + MultinomialNB\n4. lemmas + TfidfVectorizer + MultinomialNB\n\nAll models we will implement using sklearn. \nWe will search best model params on partial dataset and then make cross validation score on full dataset.\n'

In [106]:
# Prepare datasets for params searching and testing.
df_train, df_test = train_test_split(combined_data, test_size=0.5)
_, df_train_for_params_searching = train_test_split(df_train, test_size=0.01)

In [107]:
def find_best_model(df, column_name, vectorizer):
    cachedir = mkdtemp()
    pipeline = Pipeline([
        vectorizer,
        ('cls', MultinomialNB())
    ], memory=cachedir)

    parameters = {
        'vect__min_df': [1, 5, 0.1],
        'vect__max_df': [0.5, 0.75, 1.],
        'vect__ngram_range': [(1, 1), (1, 2)],
    }
    
    grid_search = GridSearchCV(pipeline, parameters, cv=5, n_jobs=-1)
    t0 = time()    
    grid_search.fit(df.loc[:, column_name], df['sex'])
    print("done in %0.3fs" % (time() - t0))
    rmtree(cachedir)
    
    return grid_search.best_estimator_

In [108]:
# Search best params for models.
column_names = ['text_stems', 'text_lemms']
vectorizers = [('CountVectorizer', ('vect', CountVectorizer())), ('TfidfVectorizer', ('vect', TfidfVectorizer()))]
best_models = {}
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    
    for column_name in column_names:
        for vectorizer_title, vectorizer in vectorizers:
            best_model = find_best_model(df_train_for_params_searching, column_name, vectorizer)
            best_models[(column_name, vectorizer_title)] = best_model

done in 164.079s
done in 154.279s
done in 150.999s
done in 157.552s


In [109]:
# Compare models.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    
    t0 = time()    
    for ((column_name, vectorizer_title), model) in best_models.items():
        model.fit(df_train[column_name], df_train['sex'])
        y_pred = model.predict(df_test[column_name])
        y_test = df_test['sex']
        score = accuracy_score(y_pred, y_test)
        print(f'{column_name} {vectorizer_title}: {score}')
    
    print("done in %0.3fs" % (time() - t0))

Pipeline(memory='/var/folders/9_/w6j66k193r3d0n6vsctvbjj00000gn/T/tmp6624eu34',
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.5, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('cls', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

text_stems CountVectorizer: 0.7790084388185654


Pipeline(memory='/var/folders/9_/w6j66k193r3d0n6vsctvbjj00000gn/T/tmpew190b_7',
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.5, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
...rue,
        vocabulary=None)), ('cls', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

text_stems TfidfVectorizer: 0.5896624472573839


Pipeline(memory='/var/folders/9_/w6j66k193r3d0n6vsctvbjj00000gn/T/tmpvxrjhy1h',
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.75, max_features=None, min_df=5,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('cls', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

text_lemms CountVectorizer: 0.7533403656821378


Pipeline(memory='/var/folders/9_/w6j66k193r3d0n6vsctvbjj00000gn/T/tmpctq4evz6',
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.5, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
...rue,
        vocabulary=None)), ('cls', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

text_lemms TfidfVectorizer: 0.5896624472573839
done in 434.946s


In [None]:
# From above scores best model for user sex prediction is applying CountVectorizer on stems.