In [46]:
import pandas as pd
import numpy as np

from datetime import datetime

import plotly.express as px

# Data preprocessing

In [47]:
# Read the two raw CSV files from the working directory: the per-step event
# log and the log of submission attempts.
events_data, submissions_data = (
    pd.read_csv(csv_path)
    for csv_path in ("event_data_train.csv", "submissions_data_train.csv")
)

In [48]:
# Preview the first five rows of the raw event log.
events_data.head(n=5)

Unnamed: 0,step_id,timestamp,action,user_id
0,32815,1434340848,viewed,17632
1,32815,1434340848,passed,17632
2,32815,1434340848,discovered,17632
3,32811,1434340895,discovered,17632
4,32811,1434340895,viewed,17632


In [49]:
# Preview the first five rows of the raw submission log.
submissions_data.head(n=5)

Unnamed: 0,step_id,timestamp,submission_status,user_id
0,31971,1434349275,correct,15853
1,31972,1434348300,correct,15853
2,31972,1478852149,wrong,15853
3,31972,1478852164,correct,15853
4,31976,1434348123,wrong,15853


In [50]:
# Date
# `timestamp` is interpreted as Unix time in seconds. `date` holds the full
# datetime and `day` only its calendar-date part, for both raw frames.
events_data["date"] = pd.to_datetime(events_data["timestamp"], unit="s")
submissions_data["date"] = pd.to_datetime(submissions_data["timestamp"], unit="s")

events_data["day"] = events_data["date"].dt.date
submissions_data["day"] = submissions_data["date"].dt.date

# Correct and wrong
# Per-user count of submissions by status: one row per user_id and one column
# per distinct `submission_status` value. `fill_value=0` puts 0 (instead of
# NaN) where a user has no submission with a given status.
users_scores = (
    submissions_data
    .pivot_table(index="user_id",
                 columns="submission_status",
                 values="step_id",
                 aggfunc="count",
                 fill_value=0)
    .reset_index()
    # pivot_table leaves "submission_status" behind as the *name* of the column
    # axis. `rename(columns=...)` cannot remove it because it maps column
    # labels, not the axis name, so the axis name is cleared explicitly.
    .rename_axis(None, axis="columns")
)


# Last timestamp
# One row per user_id holding the largest (i.e. latest) event timestamp seen
# for that user.
users_data = (
    events_data
    .groupby("user_id")["timestamp"]
    .max()
    .rename("last_timestamp")
    .reset_index()
)


# Is gone user
# NOTE(review): `now` is a hard-coded Unix timestamp used as the reference
# moment; presumably the time the data set was exported. Confirm.
now = 1526772811
drop_out_threshold = 30 * 24 * 60 * 60  # 30 days expressed in seconds

# A user is flagged as gone when strictly more than `drop_out_threshold`
# seconds separate `now` from their last recorded event.
users_data["is_gone_user"] = (now - users_data["last_timestamp"]).gt(drop_out_threshold)


# users_data + users_scores
# The outer join keeps every user_id present in either frame. A user_id found
# on only one side gets NaN in the other side's columns, and `fillna(0)` then
# replaces every NaN in the merged frame with 0.
# TODO (original author's note): study how the `how` argument works.
users_data = users_data.merge(users_scores, on="user_id", how="outer").fillna(0)

# actions: passed, viewed...
# Per-user count of events by action: one row per user_id, one column per
# distinct `action` value. Combinations that never occurred come out of the
# pivot as NaN and are replaced by 0.
users_events_data = (
    events_data
    .pivot_table(index="user_id",
                 columns="action",
                 values="step_id",
                 aggfunc="count")
    .fillna(0)
    .reset_index()
)

# users_data + users_events_data
users_data = users_data.merge(users_events_data, how="outer", on="user_id")

# The number of unique days.
# `users_days` is indexed by user_id and has a single column `day` holding the
# number of distinct calendar days on which the user produced events. The
# column is renamed to `days` once it has been merged into `users_data`.
users_days = events_data.groupby("user_id")[["day"]].nunique()
users_data = (
    users_data
    .merge(users_days, how="outer", on="user_id")
    .rename(columns={"day": "days"})
)

# Whether the user has passed the course
# True when the user's `passed` count is strictly greater than 175.
# NOTE(review): 175 is a hard-coded cut-off; confirm it against the course's
# actual completion requirement.
users_data["is_passed_course"] = users_data["passed"].gt(175)


# Result
# Preview the first five rows of the assembled per-user table.
users_data.head(n=5)


Unnamed: 0,user_id,last_timestamp,is_gone_user,correct,wrong,discovered,passed,started_attempt,viewed,days,is_passed_course
0,1,1472827464,True,0.0,0.0,1.0,0.0,0.0,1.0,1,False
1,2,1519226966,True,2.0,0.0,9.0,9.0,2.0,10.0,2,False
2,3,1444581588,True,29.0,23.0,91.0,87.0,30.0,192.0,7,False
3,5,1499859939,True,2.0,2.0,11.0,11.0,4.0,12.0,2,False
4,7,1521634660,True,0.0,0.0,1.0,1.0,0.0,1.0,1,False


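Check that no users were lost during the merges: the number of unique users in the final table must equal the number in the raw event log.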

In [51]:
# Compares the number of distinct user_ids in the assembled table with the
# number in the raw event log; True means the two counts are equal.
users_data["user_id"].nunique() == events_data["user_id"].nunique()

True

## Data preprocessing for ML

In [52]:

# Timestamp of each user's earliest recorded event, one row per user_id.
users_min_time = (
    events_data
    .groupby("user_id", as_index=False)
    .agg({"timestamp": "min"})
    .rename(columns={"timestamp": "min_timestamp"})
)

# Attach `min_timestamp` to every event row. `users_min_time` is derived from
# `events_data` itself, so every event finds its user on the right-hand side
# and a left join returns exactly the rows an outer join would. Unlike
# `how="outer"`, which is documented to sort the join keys, `how="left"` keeps
# the events in their original order on every pandas version.
events_data_train = events_data.merge(users_min_time, on="user_id", how="left")

# Preview only: show events that fall within 3 days (3 * 24 * 60 * 60 seconds)
# of the user's first event. `events_data_train` itself is not filtered here.
events_data_train.query("min_timestamp <= timestamp <= (min_timestamp + 3 * 24 * 60 * 60)").head()

Unnamed: 0,step_id,timestamp,action,user_id,date,day,min_timestamp
0,32815,1434340848,viewed,17632,2015-06-15 04:00:48,2015-06-15,1434340848
1,32815,1434340848,passed,17632,2015-06-15 04:00:48,2015-06-15,1434340848
2,32815,1434340848,discovered,17632,2015-06-15 04:00:48,2015-06-15,1434340848
3,32811,1434340895,discovered,17632,2015-06-15 04:01:35,2015-06-15,1434340848
4,32811,1434340895,viewed,17632,2015-06-15 04:01:35,2015-06-15,1434340848
