# Кодирование распределения дней пользователей на курсе по соотношению значений таргета

Возможные варианты активности за первые 48 часов на курсе, которые могут затрагивать до трёх календарных дней: только первый день; первый и второй дни; первый и третий дни; первый, второй и третий дни. Итого 4 возможных варианта.

Импортируем требуемые библиотеки и делаем некоторые настройки

In [1]:
import pandas as pd
import numpy as np

In [2]:
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides pandas warnings (e.g. about chained
# assignment), so problems in later cells will not be reported.
warnings.filterwarnings('ignore')

In [3]:
from collections import Counter

Кодирование для данных EVENTS

In [4]:
# Load the per-user day features built from the EVENTS data.
# A forward slash is used as the separator: it is accepted on Windows and
# POSIX alike, whereas 'DATA\\...' only resolves to a sub-directory on Windows.
feature_events = pd.read_csv('DATA/days_feature_events_all.csv')

In [5]:
# Load the table that is merged on user_id to supply the target.
# A forward slash is used as the separator: it is accepted on Windows and
# POSIX alike, whereas 'DATA\\...' only resolves to a sub-directory on Windows.
target_feature = pd.read_csv('DATA/target_feature.csv')

In [6]:
# Join the target table onto the EVENTS day features by user_id.
# NOTE(review): this is an outer join, while the SUBMISSIONS merge further
# down uses how='left' -- confirm the difference is intentional.
feature_events = pd.merge(
    feature_events,
    target_feature,
    how='outer',
    on='user_id',
)

In [7]:
# Keep only the rows whose target (is_gone) equals 1.
is_gone_true = feature_events.loc[feature_events['is_gone'] == 1]

In [8]:
# Keep only the rows whose target (is_gone) equals 0.
is_gone_false = feature_events.loc[feature_events['is_gone'] == 0]

In [9]:
# Mapping: day pattern (day_chem) -> number of rows with target 1.
q_true = Counter(is_gone_true['day_chem'])
q_true

Counter({2.0: 574, 3.0: 99, 1.0: 713, 4.0: 538})

In [10]:
# Mapping: day pattern (day_chem) -> number of rows with target 0.
q_false = Counter(is_gone_false['day_chem'])
q_false

Counter({1.0: 13512, 2.0: 2345, 4.0: 1051, 3.0: 402})

In [11]:
# Relative frequency of target 1 for each day pattern:
#   rate = count(target 1) / (count(target 0) + count(target 1)).
# The counters are indexed rather than read with .get(): a Counter returns 0
# for a pattern that never occurs, whereas .get() returns None and the
# arithmetic would fail with a TypeError.
q_1, q_2, q_3, q_4 = (
    q_true[day] / (q_false[day] + q_true[day])
    for day in (1, 2, 3, 4)
)

In [12]:
# Display the four computed rates.
q_1, q_2, q_3, q_4

(0.05012302284710018,
 0.19664268585131894,
 0.19760479041916168,
 0.33857772183763374)

In [13]:
# Write the target rate of each day pattern into the loc_day column.
# A single .loc[row_mask, column] assignment is used for every pattern.
# Chained indexing (frame.loc_day[mask] = value) assigns through an
# intermediate Series; pandas does not guarantee that such a write reaches
# the frame, and with Copy-on-Write enabled it never does.
feature_events.loc[feature_events['day_chem'] == 1, 'loc_day'] = q_1
feature_events.loc[feature_events['day_chem'] == 2, 'loc_day'] = q_2
feature_events.loc[feature_events['day_chem'] == 3, 'loc_day'] = q_3
feature_events.loc[feature_events['day_chem'] == 4, 'loc_day'] = q_4

In [14]:
# Drop the helper columns and give the remaining ones EVENTS-specific names.
feature_events = (
    feature_events
    .drop(columns=['day_chem', 'is_gone'])
    .rename(columns={'count_day': 'events_day', 'loc_day': 'Eloc_day'})
)

Кодирование для данных SUBMISSIONS (аналогично)

In [15]:
# Load the per-user day features built from the SUBMISSIONS data.
# A forward slash is used as the separator: it is accepted on Windows and
# POSIX alike, whereas 'DATA\\...' only resolves to a sub-directory on Windows.
feature_submissions = pd.read_csv('DATA/days_feature_submissions_all.csv')

In [16]:
# Join the target table onto the SUBMISSIONS day features by user_id,
# keeping exactly the rows already present in feature_submissions.
feature_submissions = pd.merge(
    feature_submissions,
    target_feature,
    how='left',
    on='user_id',
)

In [17]:
# Keep only the SUBMISSIONS rows whose target (is_gone) equals 1.
is_gone_true = feature_submissions.loc[feature_submissions['is_gone'] == 1]

In [18]:
# Keep only the SUBMISSIONS rows whose target (is_gone) equals 0.
is_gone_false = feature_submissions.loc[feature_submissions['is_gone'] == 0]

In [19]:
# Mapping: day pattern (day_chem) -> number of SUBMISSIONS rows with target 1.
w_true = Counter(is_gone_true['day_chem'])
w_true

Counter({2.0: 627, 1.0: 821, 4.0: 370, 3.0: 106})

In [20]:
# Mapping: day pattern (day_chem) -> number of SUBMISSIONS rows with target 0.
w_false = Counter(is_gone_false['day_chem'])
w_false

Counter({1.0: 6076, 2.0: 1336, 3.0: 215, 4.0: 389})

In [21]:
# Relative frequency of target 1 for each day pattern in SUBMISSIONS:
#   rate = count(target 1) / (count(target 0) + count(target 1)).
# The counters are indexed rather than read with .get(): a Counter returns 0
# for a pattern that never occurs, whereas .get() returns None and the
# arithmetic would fail with a TypeError.
w_1, w_2, w_3, w_4 = (
    w_true[day] / (w_false[day] + w_true[day])
    for day in (1, 2, 3, 4)
)

In [22]:
# Display the four computed rates.
w_1, w_2, w_3, w_4

(0.11903726257793243,
 0.31940906775343864,
 0.3302180685358255,
 0.4874835309617918)

In [23]:
# Write the target rate of each day pattern into the loc_day column.
# A single .loc[row_mask, column] assignment is used for every pattern.
# Chained indexing (frame.loc_day[mask] = value) assigns through an
# intermediate Series; pandas does not guarantee that such a write reaches
# the frame, and with Copy-on-Write enabled it never does.
feature_submissions.loc[feature_submissions['day_chem'] == 1, 'loc_day'] = w_1
feature_submissions.loc[feature_submissions['day_chem'] == 2, 'loc_day'] = w_2
feature_submissions.loc[feature_submissions['day_chem'] == 3, 'loc_day'] = w_3
feature_submissions.loc[feature_submissions['day_chem'] == 4, 'loc_day'] = w_4

In [24]:
# Drop the helper columns and give the remaining ones SUBMISSIONS-specific names.
feature_submissions = (
    feature_submissions
    .drop(columns=['day_chem', 'is_gone'])
    .rename(columns={'count_day': 'submissions_day', 'loc_day': 'Sloc_day'})
)

Соединим EVENTS и SUBMISSIONS

In [25]:
# Attach the SUBMISSIONS columns to the EVENTS table by user_id; users with
# no SUBMISSIONS row keep their EVENTS values and get NaN in the new columns.
feature_events = pd.merge(
    feature_events,
    feature_submissions,
    how='left',
    on='user_id',
)

In [26]:
# Replace every remaining NaN with 0 (e.g. users that have no SUBMISSIONS
# row after the left join).
feature_events = feature_events.fillna(value=0)

Схлопнем 2 столбца

In [27]:
# Collapse the EVENTS and SUBMISSIONS encodings into one feature by summing them.
feature_events['loc_day'] = feature_events['Eloc_day'].add(feature_events['Sloc_day'])

In [30]:
# The two per-source encodings are no longer needed once they are summed.
feature_events = feature_events.drop(columns=['Eloc_day', 'Sloc_day'])

Сохраним в файл

In [31]:
# Save the final feature table without the index column.
# A forward slash is used as the separator: it is accepted on Windows and
# POSIX alike, whereas 'DATA\\...' only resolves to a sub-directory on Windows.
feature_events.to_csv('DATA/days_feature_target.csv', index=False)

In [32]:
# Display the final feature table.
feature_events

Unnamed: 0,user_id,events_day,submissions_day,loc_day
0,1,1.0,0.0,0.050123
1,2,1.0,1.0,0.169160
2,3,1.0,1.0,0.169160
3,4,1.0,0.0,0.050123
4,5,1.0,1.0,0.169160
...,...,...,...,...
25413,26796,1.0,1.0,0.169160
25414,26797,1.0,1.0,0.169160
25415,26798,1.0,1.0,0.169160
25416,26799,2.0,1.0,0.315680


In [33]:
# Print the column dtypes and non-null counts of the final table.
feature_events.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25418 entries, 0 to 25417
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   user_id          25418 non-null  int64  
 1   events_day       25418 non-null  float64
 2   submissions_day  25418 non-null  float64
 3   loc_day          25418 non-null  float64
dtypes: float64(3), int64(1)
memory usage: 992.9 KB
