# По действиям пользователя в первые два дня, проведенных на курсе, необходимо предсказать, сможет ли пользователь успешно закончить этот курс.

In [1]:
import pandas as pd
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, roc_auc_score

#### Импортируем и обрабатываем данные

In [2]:
submissions_data_train = pd.read_csv('data/submissions_data_train.csv')
event_data_train = pd.read_csv('data/event_data_train.csv')

#### Отберем информацию об активности для каждого юзера в первые два дня (event_data_train)

In [3]:
event_data_train.head()

Unnamed: 0,step_id,timestamp,action,user_id
0,32815,1434340848,viewed,17632
1,32815,1434340848,passed,17632
2,32815,1434340848,discovered,17632
3,32811,1434340895,discovered,17632
4,32811,1434340895,viewed,17632


In [4]:
event_user_min_timestamp = event_data_train.groupby('user_id', as_index=False).agg({'timestamp': 'min'}) \
    .rename(columns={'timestamp': 'min_timestamp'})

time_threshold = 2 * 24 * 60 * 60

event_user_min_timestamp['time_gap'] = event_user_min_timestamp.min_timestamp + time_threshold
event_data_train = event_data_train.merge(event_user_min_timestamp[['user_id', 'time_gap']])

In [5]:
first_event_data = event_data_train[event_data_train.timestamp <= event_data_train.time_gap]

In [6]:
# считаем количество тех или иных действий, совершенных каждым юзером в первые 2 дня

first_event_data = first_event_data.pivot_table(values='step_id',
                             index='user_id',
                             columns='action',
                             aggfunc='count').fillna(0).reset_index()

#### Определим юзеров, которые успешно закончили курс (passed > 40)

In [7]:
event_data_train.head()

Unnamed: 0,step_id,timestamp,action,user_id,time_gap
0,32815,1434340848,viewed,17632,1434513648
1,32815,1434340848,passed,17632,1434513648
2,32815,1434340848,discovered,17632,1434513648
3,32811,1434340895,discovered,17632,1434513648
4,32811,1434340895,viewed,17632,1434513648


In [8]:
event_actions_data_train = event_data_train.pivot_table(values='step_id',
                             index='user_id',
                             columns='action',
                             aggfunc='count').fillna(0).reset_index()

In [9]:
event_actions_data_train.head()

action,user_id,discovered,passed,started_attempt,viewed
0,1,1.0,0.0,0.0,1.0
1,2,9.0,9.0,2.0,10.0
2,3,91.0,87.0,30.0,192.0
3,5,11.0,11.0,4.0,12.0
4,7,1.0,1.0,0.0,1.0


In [10]:
event_actions_data_train['finished'] = event_actions_data_train.passed > 40
event_actions_data_train.finished = event_actions_data_train.finished.map(int)

In [11]:
event_actions_data_train

action,user_id,discovered,passed,started_attempt,viewed,finished
0,1,1.0,0.0,0.0,1.0,0
1,2,9.0,9.0,2.0,10.0,0
2,3,91.0,87.0,30.0,192.0,1
3,5,11.0,11.0,4.0,12.0,0
4,7,1.0,1.0,0.0,1.0,0
...,...,...,...,...,...,...
19229,26790,8.0,8.0,1.0,9.0,0
19230,26793,1.0,0.0,1.0,1.0,0
19231,26794,69.0,69.0,34.0,180.0,1
19232,26797,10.0,10.0,2.0,13.0,0


In [12]:
train_data = first_event_data.merge(event_actions_data_train[['user_id', 'finished']])

#### Отбираем сабмишены каждого юезра за первые два дня (submissions_data_train)

In [13]:
submissions_data_train.head()

Unnamed: 0,step_id,timestamp,submission_status,user_id
0,31971,1434349275,correct,15853
1,31972,1434348300,correct,15853
2,31972,1478852149,wrong,15853
3,31972,1478852164,correct,15853
4,31976,1434348123,wrong,15853


In [14]:
submission_first_data = submissions_data_train.groupby('user_id', as_index=False) \
    .agg({'timestamp': 'min'}).fillna(0) \
    .rename(columns={'timestamp': 'min_timestamp'})

In [15]:
submission_first_data['time_gap'] = submission_first_data.min_timestamp + time_threshold
submissions_data_train = submissions_data_train.merge(submission_first_data[['user_id', 'time_gap']])

In [16]:
first_submissions_data_train = submissions_data_train.pivot_table(values='step_id',
                                   index='user_id',
                                   columns='submission_status', 
                                   aggfunc='count').reset_index().fillna(0)

In [17]:
first_submissions_data_train

submission_status,user_id,correct,wrong
0,2,2.0,0.0
1,3,29.0,23.0
2,5,2.0,2.0
3,8,9.0,21.0
4,14,0.0,1.0
...,...,...,...
9935,26787,3.0,0.0
9936,26790,1.0,0.0
9937,26794,33.0,9.0
9938,26797,2.0,0.0


In [18]:
# Полная таблица со всеми данными для обучения

train_data = train_data.merge(first_submissions_data_train[['user_id', 'correct', 'wrong']], how='outer') \
    .fillna(0)

In [19]:
train_data

Unnamed: 0,user_id,discovered,passed,started_attempt,viewed,finished,correct,wrong
0,1,1.0,0.0,0.0,1.0,0,0.0,0.0
1,2,9.0,9.0,2.0,9.0,0,2.0,0.0
2,3,15.0,15.0,4.0,20.0,1,29.0,23.0
3,5,1.0,1.0,0.0,1.0,0,2.0,2.0
4,7,1.0,1.0,0.0,1.0,0,0.0,0.0
...,...,...,...,...,...,...,...,...
19229,26790,2.0,2.0,0.0,2.0,0,1.0,0.0
19230,26793,1.0,0.0,1.0,1.0,0,0.0,0.0
19231,26794,50.0,50.0,24.0,90.0,1,33.0,9.0
19232,26797,10.0,10.0,2.0,10.0,0,2.0,0.0


#### Начинаем обучение модели

In [20]:
train_data

Unnamed: 0,user_id,discovered,passed,started_attempt,viewed,finished,correct,wrong
0,1,1.0,0.0,0.0,1.0,0,0.0,0.0
1,2,9.0,9.0,2.0,9.0,0,2.0,0.0
2,3,15.0,15.0,4.0,20.0,1,29.0,23.0
3,5,1.0,1.0,0.0,1.0,0,2.0,2.0
4,7,1.0,1.0,0.0,1.0,0,0.0,0.0
...,...,...,...,...,...,...,...,...
19229,26790,2.0,2.0,0.0,2.0,0,1.0,0.0
19230,26793,1.0,0.0,1.0,1.0,0,0.0,0.0
19231,26794,50.0,50.0,24.0,90.0,1,33.0,9.0
19232,26797,10.0,10.0,2.0,10.0,0,2.0,0.0


In [21]:
X = train_data.set_index('user_id')
X = X.drop('finished', axis=1)
y = train_data.finished

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [23]:
rf_clf = RandomForestClassifier()
parameters = {
    'n_estimators': range(2, 50),
    'max_depth': range(2, 50),
    'min_samples_split': range(1, 100, 5),
    'min_samples_leaf': range(1, 100, 5)
}

In [24]:
rand_search_cv = RandomizedSearchCV(rf_clf, parameters, cv=10)

In [25]:
rand_search_cv.fit(X_train, y_train)

RandomizedSearchCV(cv=10, estimator=RandomForestClassifier(),
                   param_distributions={'max_depth': range(2, 50),
                                        'min_samples_leaf': range(1, 100, 5),
                                        'min_samples_split': range(1, 100, 5),
                                        'n_estimators': range(2, 50)})

In [26]:
best_clf = rand_search_cv.best_estimator_

In [27]:
y_pred = best_clf.predict(X_test)

In [28]:
roc_auc_score(y_test, y_pred)

0.9532977413732159

#### Проверяем на тестовых данных

In [29]:
events_data_test = pd.read_csv('https://stepik.org/media/attachments/course/4852/events_data_test.csv')
submission_data_test = pd.read_csv('https://stepik.org/media/attachments/course/4852/submission_data_test.csv')

In [30]:
events_data_test = events_data_test.pivot_table(values='step_id',
                             index='user_id',
                             columns='action',
                             aggfunc='count').fillna(0).reset_index()

In [31]:
submission_data_test = submission_data_test.pivot_table(values='step_id',
                                 index='user_id',
                                 columns='submission_status',
                                 aggfunc='count').fillna(0).reset_index()

In [32]:
submission_data_test

submission_status,user_id,correct,wrong
0,12,1.0,0.0
1,13,29.0,36.0
2,15,10.0,30.0
3,21,24.0,103.0
4,35,7.0,35.0
...,...,...,...
2798,26775,46.0,160.0
2799,26780,16.0,7.0
2800,26785,3.0,1.0
2801,26796,2.0,3.0


In [33]:
test_data = events_data_test.merge(submission_data_test[['user_id', 'correct', 'wrong']], how='outer').fillna(0)

In [34]:
test_data = test_data.set_index('user_id')

In [35]:
test_data

Unnamed: 0_level_0,discovered,passed,started_attempt,viewed,correct,wrong
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
4,1.0,1.0,0.0,1.0,0.0,0.0
6,1.0,1.0,0.0,1.0,0.0,0.0
10,2.0,2.0,0.0,6.0,0.0,0.0
12,11.0,9.0,4.0,14.0,1.0,0.0
13,70.0,70.0,35.0,105.0,29.0,36.0
...,...,...,...,...,...,...
26791,1.0,1.0,0.0,1.0,0.0,0.0
26795,1.0,1.0,0.0,1.0,0.0,0.0
26796,6.0,4.0,2.0,12.0,2.0,3.0
26799,6.0,6.0,2.0,6.0,2.0,0.0


In [36]:
X

Unnamed: 0_level_0,discovered,passed,started_attempt,viewed,correct,wrong
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1.0,0.0,0.0,1.0,0.0,0.0
2,9.0,9.0,2.0,9.0,2.0,0.0
3,15.0,15.0,4.0,20.0,29.0,23.0
5,1.0,1.0,0.0,1.0,2.0,2.0
7,1.0,1.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...
26790,2.0,2.0,0.0,2.0,1.0,0.0
26793,1.0,0.0,1.0,1.0,0.0,0.0
26794,50.0,50.0,24.0,90.0,33.0,9.0
26797,10.0,10.0,2.0,10.0,2.0,0.0


In [37]:
y_predictions = pd.DataFrame(best_clf.predict_proba(test_data))

In [38]:
y_predictions[1]

0       0.000873
1       0.000873
2       0.000000
3       0.011006
4       1.000000
          ...   
6179    0.000873
6180    0.000873
6181    0.065064
6182    0.000668
6183    0.000000
Name: 1, Length: 6184, dtype: float64

In [39]:
users_id = test_data.reset_index().user_id

In [40]:
users_id

0           4
1           6
2          10
3          12
4          13
        ...  
6179    26791
6180    26795
6181    26796
6182    26799
6183    26800
Name: user_id, Length: 6184, dtype: int64

In [41]:
predictions = pd.concat([users_id, y_predictions[1]], axis=1).rename(columns={1: 'is_gone'})

In [42]:
predictions = predictions.set_index('user_id')

In [43]:
predictions.to_csv('my_predictions.csv')

Результат roc auc на провервке на тестовых данных (осуществлялась на сервере, данные о об успешности окончания курса не предоставлялись) – 0.85. 