In [72]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [200]:
train = pd.read_csv('data/train.csv')
train_labels = pd.read_csv('data/train_labels.csv')
test = pd.read_csv('data/test.csv')
specs = pd.read_csv('data/specs.csv')
sample = pd.read_csv('data/sample_submission.csv')

Source: https://www.kaggle.com/robikscube/2019-data-science-bowl-an-introduction
# train.csv / test.csv
The data provided in these files are as follows:
- `event_id` - Randomly generated unique identifier for the event type. Maps to event_id column in specs table.
- `game_session` - Randomly generated unique identifier grouping events within a single game or video play session.
- `timestamp` - Client-generated datetime
- `event_data` - Semi-structured JSON formatted string containing the events parameters. Default fields are: event_count, event_code, and game_time; otherwise - fields are determined by the event type.
- `installation_id` - Randomly generated unique identifier grouping game sessions within a single installed application instance.
- `event_count` - Incremental counter of events within a game session (offset at 1). Extracted from event_data.
- `event_code` - Identifier of the event 'class'. Unique per game, but may be duplicated across games. E.g. event code '2000' always identifies the 'Start Game' event for all games. Extracted from event_data.
- `game_time` - Time in milliseconds since the start of the game session. Extracted from event_data.
- `title` - Title of the game or video.
- `type` - Media type of the game or video. Possible values are: 'Game', 'Assessment', 'Activity', 'Clip'.
- `world` - The section of the application the game or video belongs to. Helpful to identify the educational curriculum goals of the media. Possible values are: 'NONE' (at the app's start screen), TREETOPCITY' (Length/Height), 'MAGMAPEAK' (Capacity/Displacement), 'CRYSTALCAVES' (Weight).

In [26]:
train.head()

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world
0,27253bdc,45bb1e1b6b50c07b,2019-09-06T17:53:46.937Z,"{""event_code"": 2000, ""event_count"": 1}",0001e90f,1,2000,0,Welcome to Lost Lagoon!,Clip,NONE
1,27253bdc,17eeb7f223665f53,2019-09-06T17:54:17.519Z,"{""event_code"": 2000, ""event_count"": 1}",0001e90f,1,2000,0,Magma Peak - Level 1,Clip,MAGMAPEAK
2,77261ab5,0848ef14a8dc6892,2019-09-06T17:54:56.302Z,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",0001e90f,1,2000,0,Sandcastle Builder (Activity),Activity,MAGMAPEAK
3,b2dba42b,0848ef14a8dc6892,2019-09-06T17:54:56.387Z,"{""description"":""Let's build a sandcastle! Firs...",0001e90f,2,3010,53,Sandcastle Builder (Activity),Activity,MAGMAPEAK
4,1bb5fbdb,0848ef14a8dc6892,2019-09-06T17:55:03.253Z,"{""description"":""Let's build a sandcastle! Firs...",0001e90f,3,3110,6972,Sandcastle Builder (Activity),Activity,MAGMAPEAK


In [27]:
train.columns

Index(['event_id', 'game_session', 'timestamp', 'event_data',
       'installation_id', 'event_count', 'event_code', 'game_time', 'title',
       'type', 'world'],
      dtype='object')

In [28]:
train.shape

(11341042, 11)

Source: https://www.kaggle.com/erikbruin/data-science-bowl-2019-eda-and-baseline

So we have 11 million rows and just 11 columns. However, Kaggle provided the following note: Note that the training set contains many installation_ids which never took assessments, whereas every installation_id in the test set made an attempt on at least one assessment.

As there is no point in keeping training data that cannot be used for training anyway, I am getting rid of the installation_ids that never took an assessment

In [36]:
# filtering by ids that took assessments
ids_w_assessments = train[train['type'] == 'Assessment']['installation_id'].drop_duplicates()
train = train[train['installation_id'].isin(ids_w_assessments)]

In [277]:
# convert timestamp to correct datetime type
train['timestamp'] = pd.to_datetime(train['timestamp'], infer_datetime_format=True)

In [75]:
train.shape

(8294138, 11)

In [201]:
orig_train = train.copy()

# brad test

In [225]:
train = orig_train.copy()

In [226]:
train.columns

Index(['event_id', 'game_session', 'timestamp', 'event_data',
       'installation_id', 'event_count', 'event_code', 'game_time', 'title',
       'type', 'world'],
      dtype='object')

In [227]:
train.head()

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world
0,27253bdc,45bb1e1b6b50c07b,2019-09-06T17:53:46.937Z,"{""event_code"": 2000, ""event_count"": 1}",0001e90f,1,2000,0,Welcome to Lost Lagoon!,Clip,NONE
1,27253bdc,17eeb7f223665f53,2019-09-06T17:54:17.519Z,"{""event_code"": 2000, ""event_count"": 1}",0001e90f,1,2000,0,Magma Peak - Level 1,Clip,MAGMAPEAK
2,77261ab5,0848ef14a8dc6892,2019-09-06T17:54:56.302Z,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",0001e90f,1,2000,0,Sandcastle Builder (Activity),Activity,MAGMAPEAK
3,b2dba42b,0848ef14a8dc6892,2019-09-06T17:54:56.387Z,"{""description"":""Let's build a sandcastle! Firs...",0001e90f,2,3010,53,Sandcastle Builder (Activity),Activity,MAGMAPEAK
4,1bb5fbdb,0848ef14a8dc6892,2019-09-06T17:55:03.253Z,"{""description"":""Let's build a sandcastle! Firs...",0001e90f,3,3110,6972,Sandcastle Builder (Activity),Activity,MAGMAPEAK


In [278]:
t1 = train[train['installation_id'] == '0006a69f']

In [279]:
t2 = t1[t1['title'] == 'Bird Measurer (Assessment)']

In [280]:
t2['world'].unique()

array(['TREETOPCITY'], dtype=object)

In [281]:
t3 = t2[t2['game_session'] == 'a9ef3ecb3d1acc6a']

In [282]:
t4 = t1[t1['game_session'] == 'a9ef3ecb3d1acc6a']

In [283]:
t3.shape

(32, 11)

In [284]:
t4.shape

(32, 11)

In [285]:
t5 = t1[t1['world'] == 'TREETOPCITY']

In [286]:
t1['world'].nunique()

3

In [287]:
t5.groupby(['game_session'])

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000020EF17A8C08>

In [382]:
condensed = pd.DataFrame(columns=t5.columns)

for key, group in t5.groupby(['game_session']):
    temp = pd.DataFrame(index=range(1,2), columns=t5.columns)
    
    unique_cols = ['event_id', 'game_session', 'installation_id', 'title', 'type', 'world']
    for col in unique_cols:
        temp[col] = group[col].unique()[0]
        
    last_val_cols = ['timestamp', 'event_count', 'game_time']
    for col in last_val_cols:
        temp[col] = group[col].iloc[-1]
    
    condensed = condensed.append(temp, ignore_index=True)

In [383]:
condensed.sort_values(by='timestamp', inplace=True)
condensed.reset_index(drop=True, inplace=True)

In [385]:
condensed['assessment_group'] = np.nan
list_assessement_groups = np.arange(len(condensed.loc[condensed['type'] == 'Assessment', 'assessment_group']))
condensed.loc[condensed['type'] == 'Assessment', 'assessment_group'] = list_assessement_groups

In [389]:
condensed['assessment_group'].fillna(method='backfill', inplace=True)

note: one of the values didn't merge

In [391]:
condensed[condensed['type'] == 'Assessment'].shape

(6, 12)

In [392]:
condensed.merge(train_labels, on=['game_session', 'installation_id', 'title']).shape

(5, 16)

In [393]:
cm = condensed.merge(train_labels, on=['game_session', 'installation_id', 'title'], how='left')

In [464]:
types = train['type'].unique() + '_counts'
totals = ['total_game_time', 'total_event_count']
flattened = pd.DataFrame(columns=list(cm.columns) + list(types) + totals)

for key, group in cm.groupby(['assessment_group']):
    temp = pd.DataFrame(index=range(1,2), columns=list(cm.columns) + list(types) + totals)
    #temp = group
    temp_counts = group['type'].value_counts()
    
    for col in group.columns:
        temp[col] = group[col].iloc[-1]
    
    for key, value in temp_counts.items():
        temp[key + '_counts'] = value
        
    temp['total_game_time'] = sum(group['game_time'])
    temp['total_event_count'] = sum(group['event_count'])
        
    flattened = flattened.append(temp, ignore_index=True)

### TODO: figure out what to do the with nan values in the counts and accuracy group columns

In [465]:
flattened

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world,assessment_group,num_correct,num_incorrect,accuracy,accuracy_group,Clip_counts,Activity_counts,Game_counts,Assessment_counts,total_game_time,total_event_count
0,3bfd1a65,901acc108f55a5a1,2019-08-06 05:22:41.147000+00:00,,0006a69f,48,,39803,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,0.0,1.0,0.0,1.0,3.0,7.0,2.0,3.0,1,460761,458
1,f56e0afc,77b8ee947eb84b4e,2019-08-06 05:36:51.915000+00:00,,0006a69f,87,,92799,Bird Measurer (Assessment),Assessment,TREETOPCITY,1.0,0.0,11.0,0.0,0.0,3.0,1.0,2.0,1,469592,535
2,3bfd1a65,6bdf9623adc94d89,2019-08-06 05:38:16.835000+00:00,,0006a69f,35,,26827,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,2.0,1.0,0.0,1.0,3.0,,,,1,26827,35
3,3bfd1a65,e7e7db2a241eadcc,2019-08-06 20:34:44.115000+00:00,,0006a69f,17,,8789,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,3.0,,,,,6.0,2.0,1.0,1,1685943,400
4,3bfd1a65,9501794defd84e4d,2019-08-06 20:35:25.648000+00:00,,0006a69f,42,,31843,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,4.0,1.0,1.0,0.5,2.0,,,,1,31843,42
5,f56e0afc,a9ef3ecb3d1acc6a,2019-08-06 20:50:35.426000+00:00,,0006a69f,32,,36368,Bird Measurer (Assessment),Assessment,TREETOPCITY,5.0,1.0,0.0,1.0,3.0,4.0,1.0,3.0,1,493389,417


Try some ML!

In [466]:
data = flattened.copy()

In [468]:
data.drop(['event_id', 'game_session', 'timestamp', 'event_data', 'installation_id', 'event_code', 'type',
           'assessment_group'], axis=1, inplace=True)

In [469]:
data

Unnamed: 0,event_count,game_time,title,world,num_correct,num_incorrect,accuracy,accuracy_group,Clip_counts,Activity_counts,Game_counts,Assessment_counts,total_game_time,total_event_count
0,48,39803,Mushroom Sorter (Assessment),TREETOPCITY,1.0,0.0,1.0,3.0,7.0,2.0,3.0,1,460761,458
1,87,92799,Bird Measurer (Assessment),TREETOPCITY,0.0,11.0,0.0,0.0,3.0,1.0,2.0,1,469592,535
2,35,26827,Mushroom Sorter (Assessment),TREETOPCITY,1.0,0.0,1.0,3.0,,,,1,26827,35
3,17,8789,Mushroom Sorter (Assessment),TREETOPCITY,,,,,6.0,2.0,1.0,1,1685943,400
4,42,31843,Mushroom Sorter (Assessment),TREETOPCITY,1.0,1.0,0.5,2.0,,,,1,31843,42
5,32,36368,Bird Measurer (Assessment),TREETOPCITY,1.0,0.0,1.0,3.0,4.0,1.0,3.0,1,493389,417


should we just look at assessments?

should we expand and look at times and stack based on time

In [517]:
t5[t5['event_code'] == 2000]

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world
1775,27253bdc,d3ffed2d2d77597d,2019-08-06 05:07:33.207000+00:00,"{""event_code"": 2000, ""event_count"": 1}",0006a69f,1,2000,0,Tree Top City - Level 1,Clip,TREETOPCITY
1776,27253bdc,e882eb6760063bb1,2019-08-06 05:07:58.484000+00:00,"{""event_code"": 2000, ""event_count"": 1}",0006a69f,1,2000,0,Ordering Spheres,Clip,TREETOPCITY
1777,b7dc8128,75e40170dea0bd21,2019-08-06 05:09:10.506000+00:00,"{""version"":""1.0"",""round"":0,""event_count"":1,""ga...",0006a69f,1,2000,0,All Star Sorting,Game,TREETOPCITY
1805,b7dc8128,55d1292018d7f56d,2019-08-06 05:09:46.612000+00:00,"{""version"":""1.0"",""round"":0,""event_count"":1,""ga...",0006a69f,1,2000,0,All Star Sorting,Game,TREETOPCITY
1840,b7dc8128,3422611c17545edd,2019-08-06 05:10:26.871000+00:00,"{""version"":""1.0"",""round"":0,""event_count"":1,""ga...",0006a69f,1,2000,0,All Star Sorting,Game,TREETOPCITY
1930,27253bdc,83f2f92ca8c618a7,2019-08-06 05:12:32.292000+00:00,"{""event_code"": 2000, ""event_count"": 1}",0006a69f,1,2000,0,Costume Box,Clip,TREETOPCITY
1931,27253bdc,1dd51df3dd28e7e0,2019-08-06 05:13:40.144000+00:00,"{""event_code"": 2000, ""event_count"": 1}",0006a69f,1,2000,0,Costume Box,Clip,TREETOPCITY
1932,4901243f,05921a6eb858eeba,2019-08-06 05:14:51.391000+00:00,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",0006a69f,1,2000,0,Fireworks (Activity),Activity,TREETOPCITY
2016,27253bdc,6806a11d2985d2f8,2019-08-06 05:16:43.526000+00:00,"{""event_code"": 2000, ""event_count"": 1}",0006a69f,1,2000,0,12 Monkeys,Clip,TREETOPCITY
2017,27253bdc,7fb287ace174adee,2019-08-06 05:17:35.568000+00:00,"{""event_code"": 2000, ""event_count"": 1}",0006a69f,1,2000,0,Tree Top City - Level 2,Clip,TREETOPCITY


## Train Labels

You can get a unique set of training labels by combining `game_session` and `installation_id`.

Labels are only on assessments.

In [255]:
train_labels.shape

(17690, 7)

In [254]:
# Do I need to create a mapping? I don't think so since the merge worked

tls = list(map(lambda x, y: str(x) + '_' + str(y), train_labels['game_session'], train_labels['installation_id']))
len(set(tls))

17690

In [228]:
specs = pd.read_csv('data/specs.csv')

In [229]:
specs

Unnamed: 0,event_id,info,args
0,2b9272f4,The end of system-initiated feedback (Correct)...,"[{""name"":""game_time"",""type"":""int"",""info"":""mill..."
1,df4fe8b6,The end of system-initiated feedback (Incorrec...,"[{""name"":""game_time"",""type"":""int"",""info"":""mill..."
2,3babcb9b,The end of system-initiated instruction event ...,"[{""name"":""game_time"",""type"":""int"",""info"":""mill..."
3,7f0836bf,The end of system-initiated instruction event ...,"[{""name"":""game_time"",""type"":""int"",""info"":""mill..."
4,ab3136ba,The end of system-initiated instruction event ...,"[{""name"":""game_time"",""type"":""int"",""info"":""mill..."
...,...,...,...
381,29f54413,The start round event is triggered at the star...,"[{""name"":""game_time"",""type"":""int"",""info"":""mill..."
382,06372577,The start tutorial event is triggered at the s...,"[{""name"":""game_time"",""type"":""int"",""info"":""mill..."
383,2a444e03,This event occurs when the player picks up a w...,"[{""name"":""game_time"",""type"":""int"",""info"":""mill..."
384,9e6b7fb5,This event occurs when the player clicks on th...,"[{""name"":""game_time"",""type"":""int"",""info"":""mill..."


In [217]:
# https://www.kaggle.com/manyregression/fastai-2019-data-science-bowl
# do we need this?
'''
specs['hashed_info']=specs['info'].transform(hash)
unique_specs=pd.DataFrame(specs[['hashed_info']].drop_duplicates())
unique_specs["id"] = np.arange(len(unique_specs))
specs = pd.merge(specs,unique_specs,on='hashed_info',how='left')
event_id_mapping = dict(zip(specs.event_id,specs.id))
train["event_id"] = train["event_id"].map(event_id_mapping)'''
#raw_test["event_id"] = raw_test["event_id"].map(event_id_mapping)

# end brad test

In [None]:
from typing import Any
import re

def add_datepart(df: pd.DataFrame, field_name: str,
                 prefix: str = None, drop: bool = True, time: bool = True, date: bool = True):
    """
    Helper function that adds columns relevant to a date in the column `field_name` of `df`.
    from fastai: https://github.com/fastai/fastai/blob/master/fastai/tabular/transform.py#L55
    """
    field = df[field_name]
    prefix = ifnone(prefix, re.sub('[Dd]ate$', '', field_name))
    attr = ['Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Is_month_end', 'Is_month_start']
    if date:
        attr.append('Date')
    if time:
        attr = attr + ['Hour', 'Minute']
    for n in attr:
        df[prefix + n] = getattr(field.dt, n.lower())
    if drop:
        df.drop(field_name, axis=1, inplace=True)
    return df

def ifnone(a: Any, b: Any) -> Any:
    """`a` if `a` is not None, otherwise `b`.
    from fastai: https://github.com/fastai/fastai/blob/master/fastai/core.py#L92"""
    return b if a is None else a

# test area for looking at events by day

In [135]:
train = orig_train.copy()

In [136]:
train['timestamp'] = pd.to_datetime(train['timestamp'], infer_datetime_format=True)

In [138]:
train = add_datepart(train, "timestamp", prefix="timestamp_", time=True)

In [139]:
train.columns

Index(['event_id', 'game_session', 'event_data', 'installation_id',
       'event_count', 'event_code', 'game_time', 'title', 'type', 'world',
       'timestamp_Year', 'timestamp_Month', 'timestamp_Week', 'timestamp_Day',
       'timestamp_Dayofweek', 'timestamp_Is_month_end',
       'timestamp_Is_month_start', 'timestamp_Date', 'timestamp_Hour',
       'timestamp_Minute'],
      dtype='object')

In [140]:
train.head()

Unnamed: 0,event_id,game_session,event_data,installation_id,event_count,event_code,game_time,title,type,world,timestamp_Year,timestamp_Month,timestamp_Week,timestamp_Day,timestamp_Dayofweek,timestamp_Is_month_end,timestamp_Is_month_start,timestamp_Date,timestamp_Hour,timestamp_Minute
1538,27253bdc,34ba1a28d02ba8ba,"{""event_code"": 2000, ""event_count"": 1}",0006a69f,1,2000,0,Welcome to Lost Lagoon!,Clip,NONE,2019,8,32,6,1,False,False,2019-08-06,4,57
1539,27253bdc,4b57c9a59474a1b9,"{""event_code"": 2000, ""event_count"": 1}",0006a69f,1,2000,0,Magma Peak - Level 1,Clip,MAGMAPEAK,2019,8,32,6,1,False,False,2019-08-06,4,57
1540,77261ab5,2b9d5af79bcdb79f,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",0006a69f,1,2000,0,Sandcastle Builder (Activity),Activity,MAGMAPEAK,2019,8,32,6,1,False,False,2019-08-06,4,58
1541,b2dba42b,2b9d5af79bcdb79f,"{""description"":""Let's build a sandcastle! Firs...",0006a69f,2,3010,29,Sandcastle Builder (Activity),Activity,MAGMAPEAK,2019,8,32,6,1,False,False,2019-08-06,4,58
1542,1325467d,2b9d5af79bcdb79f,"{""coordinates"":{""x"":273,""y"":650,""stage_width"":...",0006a69f,3,4070,2137,Sandcastle Builder (Activity),Activity,MAGMAPEAK,2019,8,32,6,1,False,False,2019-08-06,4,58


In [163]:
train[train['type'] == 'Assessment'].head()

Unnamed: 0,event_id,game_session,event_data,installation_id,event_count,event_code,game_time,title,type,world,timestamp_Year,timestamp_Month,timestamp_Week,timestamp_Day,timestamp_Dayofweek,timestamp_Is_month_end,timestamp_Is_month_start,timestamp_Date,timestamp_Hour,timestamp_Minute
2185,3bfd1a65,901acc108f55a5a1,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",0006a69f,1,2000,0,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,2019,8,32,6,1,False,False,2019-08-06,5,22
2186,db02c830,901acc108f55a5a1,"{""event_count"":2,""game_time"":37,""event_code"":2...",0006a69f,2,2025,37,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,2019,8,32,6,1,False,False,2019-08-06,5,22
2187,a1e4395d,901acc108f55a5a1,"{""description"":""Pull three mushrooms out of th...",0006a69f,3,3010,37,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,2019,8,32,6,1,False,False,2019-08-06,5,22
2188,a52b92d5,901acc108f55a5a1,"{""description"":""Pull three mushrooms out of th...",0006a69f,4,3110,3901,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,2019,8,32,6,1,False,False,2019-08-06,5,22
2189,a1e4395d,901acc108f55a5a1,"{""description"":""To pick a mushroom, pull it ou...",0006a69f,5,3010,3901,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,2019,8,32,6,1,False,False,2019-08-06,5,22


In [171]:
#t1 = train[train['event_id'] == '3bfd1a65']
#t1 = t1[t1['installation_id'] == '0006a69f']
t1 = train[train['installation_id'] == '0006a69f']

In [172]:
t1.shape

(3801, 20)

In [173]:
t1

Unnamed: 0,event_id,game_session,event_data,installation_id,event_count,event_code,game_time,title,type,world,timestamp_Year,timestamp_Month,timestamp_Week,timestamp_Day,timestamp_Dayofweek,timestamp_Is_month_end,timestamp_Is_month_start,timestamp_Date,timestamp_Hour,timestamp_Minute
1538,27253bdc,34ba1a28d02ba8ba,"{""event_code"": 2000, ""event_count"": 1}",0006a69f,1,2000,0,Welcome to Lost Lagoon!,Clip,NONE,2019,8,32,6,1,False,False,2019-08-06,4,57
1539,27253bdc,4b57c9a59474a1b9,"{""event_code"": 2000, ""event_count"": 1}",0006a69f,1,2000,0,Magma Peak - Level 1,Clip,MAGMAPEAK,2019,8,32,6,1,False,False,2019-08-06,4,57
1540,77261ab5,2b9d5af79bcdb79f,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",0006a69f,1,2000,0,Sandcastle Builder (Activity),Activity,MAGMAPEAK,2019,8,32,6,1,False,False,2019-08-06,4,58
1541,b2dba42b,2b9d5af79bcdb79f,"{""description"":""Let's build a sandcastle! Firs...",0006a69f,2,3010,29,Sandcastle Builder (Activity),Activity,MAGMAPEAK,2019,8,32,6,1,False,False,2019-08-06,4,58
1542,1325467d,2b9d5af79bcdb79f,"{""coordinates"":{""x"":273,""y"":650,""stage_width"":...",0006a69f,3,4070,2137,Sandcastle Builder (Activity),Activity,MAGMAPEAK,2019,8,32,6,1,False,False,2019-08-06,4,58
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5334,832735e1,0721802df0531701,"{""description"":""Let's warm up. Tap a dinosaur ...",0006a69f,11,3010,23145,Dino Dive,Game,MAGMAPEAK,2019,8,35,29,3,False,False,2019-08-29,16,33
5335,ab3136ba,0721802df0531701,"{""description"":""Let's warm up. Tap a dinosaur ...",0006a69f,12,3110,26804,Dino Dive,Game,MAGMAPEAK,2019,8,35,29,3,False,False,2019-08-29,16,33
5336,832735e1,0721802df0531701,"{""description"":""Let's warm up. Tap a dinosaur ...",0006a69f,13,3010,41805,Dino Dive,Game,MAGMAPEAK,2019,8,35,29,3,False,False,2019-08-29,16,34
5337,ab3136ba,0721802df0531701,"{""description"":""Let's warm up. Tap a dinosaur ...",0006a69f,14,3110,45405,Dino Dive,Game,MAGMAPEAK,2019,8,35,29,3,False,False,2019-08-29,16,34


In [184]:
for key, group in t1.groupby(['timestamp_Month', 'timestamp_Day', 'timestamp_Year']):
#for key, group in t1.groupby(['title']):
    print(key)
    print(group.shape)

(8, 6, 2019)
(2719, 20)
(8, 9, 2019)
(442, 20)
(8, 29, 2019)
(640, 20)


In [185]:
key

(8, 29, 2019)

In [186]:
t2 = group.copy()

In [187]:
t2

Unnamed: 0,event_id,game_session,event_data,installation_id,event_count,event_code,game_time,title,type,world,timestamp_Year,timestamp_Month,timestamp_Week,timestamp_Day,timestamp_Dayofweek,timestamp_Is_month_end,timestamp_Is_month_start,timestamp_Date,timestamp_Hour,timestamp_Minute
4699,77261ab5,d37c0b021d22c3ac,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",0006a69f,1,2000,0,Sandcastle Builder (Activity),Activity,MAGMAPEAK,2019,8,35,29,3,False,False,2019-08-29,16,14
4700,b2dba42b,d37c0b021d22c3ac,"{""description"":""Let's build a sandcastle! Firs...",0006a69f,2,3010,34,Sandcastle Builder (Activity),Activity,MAGMAPEAK,2019,8,35,29,3,False,False,2019-08-29,16,14
4701,1325467d,d37c0b021d22c3ac,"{""coordinates"":{""x"":244,""y"":581,""stage_width"":...",0006a69f,3,4070,2298,Sandcastle Builder (Activity),Activity,MAGMAPEAK,2019,8,35,29,3,False,False,2019-08-29,16,14
4702,1325467d,d37c0b021d22c3ac,"{""coordinates"":{""x"":843,""y"":241,""stage_width"":...",0006a69f,4,4070,3482,Sandcastle Builder (Activity),Activity,MAGMAPEAK,2019,8,35,29,3,False,False,2019-08-29,16,14
4703,1325467d,d37c0b021d22c3ac,"{""coordinates"":{""x"":806,""y"":616,""stage_width"":...",0006a69f,5,4070,4115,Sandcastle Builder (Activity),Activity,MAGMAPEAK,2019,8,35,29,3,False,False,2019-08-29,16,14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5334,832735e1,0721802df0531701,"{""description"":""Let's warm up. Tap a dinosaur ...",0006a69f,11,3010,23145,Dino Dive,Game,MAGMAPEAK,2019,8,35,29,3,False,False,2019-08-29,16,33
5335,ab3136ba,0721802df0531701,"{""description"":""Let's warm up. Tap a dinosaur ...",0006a69f,12,3110,26804,Dino Dive,Game,MAGMAPEAK,2019,8,35,29,3,False,False,2019-08-29,16,33
5336,832735e1,0721802df0531701,"{""description"":""Let's warm up. Tap a dinosaur ...",0006a69f,13,3010,41805,Dino Dive,Game,MAGMAPEAK,2019,8,35,29,3,False,False,2019-08-29,16,34
5337,ab3136ba,0721802df0531701,"{""description"":""Let's warm up. Tap a dinosaur ...",0006a69f,14,3110,45405,Dino Dive,Game,MAGMAPEAK,2019,8,35,29,3,False,False,2019-08-29,16,34


In [179]:
t2[t2['type'] == 'Assessment']

Unnamed: 0,event_id,game_session,event_data,installation_id,event_count,event_code,game_time,title,type,world,timestamp_Year,timestamp_Month,timestamp_Week,timestamp_Day,timestamp_Dayofweek,timestamp_Is_month_end,timestamp_Is_month_start,timestamp_Date,timestamp_Hour,timestamp_Minute


In [160]:
t2.loc[5080, 'event_data']

'{"description":"To play again, tap here!","identifier":"Mom_ToPlayAgainTouch","media_type":"audio","total_duration":1913,"round":2,"event_count":122,"game_time":144671,"event_code":3010}'

# end test area

In [None]:
# https://www.kaggle.com/robikscube/2019-data-science-bowl-an-introduction

train['cleared'] = True
train.loc[train['event_data'].str.contains('false') & train['event_code'].isin([4100, 4110]), 'cleared'] = False

test['cleared'] = True
test.loc[test['event_data'].str.contains('false') & test['event_code'].isin([4100, 4110]), 'cleared'] = False

Some paraphrased notes from: https://www.kaggle.com/manyregression/fastai-2019-data-science-bowl

The intent of the competition is to use the gameplay data to forecast how many attempts a child will take to pass a given assessment.

For each installation_id represented in the test set, you must predict the accuracy_group **of the last assessment** for that installation_id

* Assessment attempts are captured in event_code 4100 for all assessments except for Bird Measurer, which uses event_code 4110.
* Each application install is represented by an installation_id. This will typically correspond to one child, but you should expect noise from issues such as shared devices.
* In the training set, you are provided **the full history of gameplay data.**
* In the test set, **we have truncated the history after the start event of a single assessment, chosen randomly, for which you must predict the number of attempts.**

In [76]:
specs['hashed_info'] = specs['info'].transform(hash)
unique_specs = specs[['hashed_info']].drop_duplicates()
unique_specs["id"] = np.arange(len(unique_specs))
specs = pd.merge(specs,unique_specs, on='hashed_info', how='left')
event_id_mapping = dict(zip(specs.event_id, specs.id))

In [96]:
def get_accuracy(correct_data):
    # Rounding correct > 1 to 1 lowers the score. Why?
    correct = len(correct_data.loc[correct_data])
    wrong = len(correct_data.loc[~correct_data])
    accuracy = correct/(correct + wrong) if correct + wrong else 0
    return accuracy, correct, wrong

def get_group(accuracy):
    if not accuracy:
        return 0
    elif accuracy == 1:
        return 3
    elif accuracy >= 0.5:
        return 2
    return 1

# begin lyons test

In [193]:
train = orig_train.copy()

In [194]:
### encode title lyons v2

train['title_event_code'] = list(map(lambda x, y: str(x) + '_' + str(y), train['title'], train['event_code']))
test['title_event_code'] = list(map(lambda x, y: str(x) + '_' + str(y), test['title'], test['event_code']))
all_title_event_code = list(set(train["title_event_code"].unique()).union(test["title_event_code"].unique()))
# make a list with all the unique 'titles' from the train and test set
list_of_user_activities = list(set(train['title'].unique()).union(set(test['title'].unique())))
# make a list with all the unique 'event_code' from the train and test set
list_of_event_code = list(set(train['event_code'].unique()).union(set(test['event_code'].unique())))
list_of_event_id = list(set(train['event_id'].unique()).union(set(test['event_id'].unique())))
# make a list with all the unique worlds from the train and test set
list_of_worlds = list(set(train['world'].unique()).union(set(test['world'].unique())))
# create a dictionary numerating the titles
activities_map = dict(zip(list_of_user_activities, np.arange(len(list_of_user_activities))))
activities_labels = dict(zip(np.arange(len(list_of_user_activities)), list_of_user_activities))
activities_world = dict(zip(list_of_worlds, np.arange(len(list_of_worlds))))
assess_titles = list(set(train[train['type'] == 'Assessment']['title'].value_counts().index).union(set(test[test['type'] == 'Assessment']['title'].value_counts().index)))
# replace the text titles with the number titles from the dict
train['title'] = train['title'].map(activities_map)
test['title'] = test['title'].map(activities_map)
train['world'] = train['world'].map(activities_world)
test['world'] = test['world'].map(activities_world)
train_labels['title'] = train_labels['title'].map(activities_map)
win_code = dict(zip(activities_map.values(), (4100*np.ones(len(activities_map))).astype('int')))
# then, it set one element, the 'Bird Measurer (Assessment)' as 4110, 10 more than the rest
win_code[activities_map['Bird Measurer (Assessment)']] = 4110
# convert text into datetime
train['timestamp'] = pd.to_datetime(train['timestamp'])
test['timestamp'] = pd.to_datetime(test['timestamp'])

In [None]:
def truncate_training(train):
    
    compiled_data = []
    
    # Loop through installation ids
    for i, (ins_id, user_sample) in tqdm(enumerate(train.groupby('installation_id', sort=False)), total=train.installation_id.nunique(), desc='Installation_id', position=0):
        
        user_df = user_sample.reset_index(drop=True)
        
        # Get an array of the sessions with assessments
        assessment_sessions = user_df[user_df['type'] == 'Assessment']['game_session'].unique()
        
        # Pick a random session
        assessment_to_truncate = random.choice(assessment_sessions)
        
        # Grab the row location for that session's event code 2000
        assessment_to_truncate_start_index = user_df.index[(user_df['game_session'] == assessment_to_truncate) & 
                                                    (user_df['event_code'] == 2000)][0]
        
        # Now get a slice of the user_sample from the beginning to the truncated assessment start
        truncated_user_sample = user_df.iloc[:assessment_to_truncate_start_index + 1]
        
        compiled_data.append(truncated_user_sample)
        
    return pd.concat(compiled_data)

In [None]:
# truncate train to match test
# train_truncated = truncate_training(train)

In [195]:
train

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world,title_event_code
1538,27253bdc,34ba1a28d02ba8ba,2019-08-06 04:57:18.904000+00:00,"{""event_code"": 2000, ""event_count"": 1}",0006a69f,1,2000,0,72,Clip,5,Welcome to Lost Lagoon!_2000
1539,27253bdc,4b57c9a59474a1b9,2019-08-06 04:57:45.301000+00:00,"{""event_code"": 2000, ""event_count"": 1}",0006a69f,1,2000,0,78,Clip,6,Magma Peak - Level 1_2000
1540,77261ab5,2b9d5af79bcdb79f,2019-08-06 04:58:14.538000+00:00,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",0006a69f,1,2000,0,9,Activity,6,Sandcastle Builder (Activity)_2000
1541,b2dba42b,2b9d5af79bcdb79f,2019-08-06 04:58:14.615000+00:00,"{""description"":""Let's build a sandcastle! Firs...",0006a69f,2,3010,29,9,Activity,6,Sandcastle Builder (Activity)_3010
1542,1325467d,2b9d5af79bcdb79f,2019-08-06 04:58:16.680000+00:00,"{""coordinates"":{""x"":273,""y"":650,""stage_width"":...",0006a69f,3,4070,2137,9,Activity,6,Sandcastle Builder (Activity)_4070
...,...,...,...,...,...,...,...,...,...,...,...,...
11337821,28520915,5448d652309a6324,2019-09-22 02:07:27.562000+00:00,"{""misses"":1,""prompt"":""holds least"",""mode"":""sel...",ffeb0b1b,58,2030,67094,64,Assessment,6,Cauldron Filler (Assessment)_2030
11337822,91561152,5448d652309a6324,2019-09-22 02:07:27.562000+00:00,"{""bucket"":1,""buckets_placed"":[3,1,2],""target_b...",ffeb0b1b,57,4025,67094,64,Assessment,6,Cauldron Filler (Assessment)_4025
11337823,d3268efa,5448d652309a6324,2019-09-22 02:07:27.566000+00:00,"{""description"":""Awesome."",""identifier"":""Dot_Aw...",ffeb0b1b,59,3021,67094,64,Assessment,6,Cauldron Filler (Assessment)_3021
11337824,b5053438,5448d652309a6324,2019-09-22 02:07:28.311000+00:00,"{""description"":""Awesome."",""identifier"":""Dot_Aw...",ffeb0b1b,60,3121,67847,64,Assessment,6,Cauldron Filler (Assessment)_3121


In [197]:
def get_train_and_test(train, test):
    compiled_train = []
    compiled_test = []
    for i, (ins_id, user_sample) in tqdm(enumerate(train.groupby('installation_id', sort = False)), total = 17000):
        #compiled_train += get_data(user_sample, test_set = False)
        train_data = get_data(user_sample, test_set = True)
        compiled_train.append(train_data)
    for ins_id, user_sample in tqdm(test.groupby('installation_id', sort = False), total = 1000):
        test_data = get_data(user_sample, test_set = True)
        compiled_test.append(test_data)
    reduce_train = pd.DataFrame(compiled_train)
    reduce_test = pd.DataFrame(compiled_test)
    categoricals = ['session_title']
    return reduce_train, reduce_test, categoricals

In [198]:
reduce_train, reduce_test, categoricals = get_train_and_test(train, test)

  0%|                                                                                        | 0/17000 [00:01<?, ?it/s]


NameError: name 'get_data' is not defined

# end lyons test

In [486]:
tr1 = train[train['installation_id'] == '0001e90f']

In [510]:
tr2 = tr1[tr1['event_code'] == 2000]

In [511]:
tr2

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world
0,27253bdc,45bb1e1b6b50c07b,2019-09-06 17:53:46.937000+00:00,"{""event_code"": 2000, ""event_count"": 1}",0001e90f,1,2000,0,Welcome to Lost Lagoon!,Clip,NONE
1,27253bdc,17eeb7f223665f53,2019-09-06 17:54:17.519000+00:00,"{""event_code"": 2000, ""event_count"": 1}",0001e90f,1,2000,0,Magma Peak - Level 1,Clip,MAGMAPEAK
2,77261ab5,0848ef14a8dc6892,2019-09-06 17:54:56.302000+00:00,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",0001e90f,1,2000,0,Sandcastle Builder (Activity),Activity,MAGMAPEAK
269,77261ab5,a1ec58f109218255,2019-09-06 17:58:28.042000+00:00,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",0001e90f,1,2000,0,Sandcastle Builder (Activity),Activity,MAGMAPEAK
362,6d90d394,f11eb823348bfa23,2019-09-06 18:00:51.915000+00:00,"{""version"":""1.0"",""level"":0,""round"":0,""event_co...",0001e90f,1,2000,0,Scrub-A-Dub,Game,MAGMAPEAK
1103,27253bdc,091587ed87b9637c,2019-09-06 18:10:51.183000+00:00,"{""event_code"": 2000, ""event_count"": 1}",0001e90f,1,2000,0,Magma Peak - Level 2,Clip,MAGMAPEAK
1104,51311d7a,07bacda7f9437b38,2019-09-06 18:11:28.381000+00:00,"{""version"":""1.0"",""round"":0,""event_count"":1,""ga...",0001e90f,1,2000,0,Dino Drink,Game,MAGMAPEAK
1246,77261ab5,ca8b415f34d12873,2019-09-06 18:16:25.132000+00:00,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",0001e90f,1,2000,0,Sandcastle Builder (Activity),Activity,MAGMAPEAK
1355,27253bdc,3fce4f09769ff0b7,2019-09-06 18:18:11.250000+00:00,"{""event_code"": 2000, ""event_count"": 1}",0001e90f,1,2000,0,Tree Top City - Level 1,Clip,TREETOPCITY
1356,27253bdc,04168b61c0650977,2019-09-06 18:18:42.392000+00:00,"{""event_code"": 2000, ""event_count"": 1}",0001e90f,1,2000,0,Ordering Spheres,Clip,TREETOPCITY


In [491]:
st = specs[specs['event_id'] == '27253bdc']['info']

In [496]:
st.loc[343]

'This event is triggered when a video starts playing'

In [471]:
sample.shape

(1000, 2)

In [477]:
train[train['installation_id'].isin(sample['installation_id'])].shape

(0, 11)

In [476]:
test[test['installation_id'].isin(sample['installation_id'])].shape

(1156414, 11)

In [479]:
test['installation_id'].nunique()

1000

In [506]:
test['installation_id'].unique()

array(['00abaee7', '01242218', '017c5718', '01a44906', '01bc6cb6',
       '02256298', '0267757a', '027e7ce5', '02a29f99', '0300c576',
       '03885368', '03ac279b', '03e33699', '048e7427', '04a7bc3f',
       '04d31500', '0500e23b', '0512bf0e', '0525589b', '05488e26',
       '05771bba', '05b82cf5', '05e17e19', '0617500d', '068ae11f',
       '0754f13b', '07749e99', '08611cc8', '08671ec7', '0889b0ae',
       '090fe325', '0937340d', '09aaaf83', '09aefe80', '0a126293',
       '0a2a77b2', '0a4c0f78', '0af94ba5', '0b24b6ac', '0b607c82',
       '0d5735f2', '0d735146', '0d7752d3', '0dd670e9', '0de6863d',
       '0e514571', '0e718764', '0ea27b66', '0f584054', '0f7116a6',
       '101999d8', '101d16f5', '108044a0', '109ad724', '10acf963',
       '1121f331', '1181ce7c', '11fa34d0', '125a3d09', '12771ee9',
       '1294d68e', '12bcbbce', '13629687', '138a2ecc', '13a0754c',
       '13bcaf23', '13cf3fc0', '13d608cb', '140087ce', '140ea7a3',
       '1423dc8f', '14cdc97f', '153f087c', '1594c19e', '15d869

In [520]:
te1 = test[test['installation_id'] == '017c5718']

In [521]:
te1[te1['type'] == 'Assessment'].shape

(1, 11)

In [522]:
te1

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world
3587,27253bdc,d1706431c69d0f17,2019-08-02T23:24:03.145Z,"{""event_code"": 2000, ""event_count"": 1}",017c5718,1,2000,0,Welcome to Lost Lagoon!,Clip,NONE
3588,27253bdc,7f8e671b050cfc16,2019-09-21T11:23:14.319Z,"{""event_code"": 2000, ""event_count"": 1}",017c5718,1,2000,0,Welcome to Lost Lagoon!,Clip,NONE
3589,27253bdc,9cbc7871cb68348e,2019-09-21T11:23:49.822Z,"{""event_code"": 2000, ""event_count"": 1}",017c5718,1,2000,0,Welcome to Lost Lagoon!,Clip,NONE
3590,27253bdc,dbe0b9903177b7ab,2019-09-21T11:24:14.904Z,"{""event_code"": 2000, ""event_count"": 1}",017c5718,1,2000,0,Welcome to Lost Lagoon!,Clip,NONE
3591,27253bdc,bada8e54f3bb8b3e,2019-09-21T11:24:34.545Z,"{""event_code"": 2000, ""event_count"": 1}",017c5718,1,2000,0,Tree Top City - Level 1,Clip,TREETOPCITY
...,...,...,...,...,...,...,...,...,...,...,...
3732,fcfdffb6,bb342a8258a1cd81,2019-09-21T11:28:02.171Z,"{""flower"":4,""flowers"":[0,0,1,0,5],""coordinates...",017c5718,68,4022,51573,Flower Waterer (Activity),Activity,TREETOPCITY
3733,5d042115,bb342a8258a1cd81,2019-09-21T11:28:02.230Z,"{""flowers"":[0,0,0,3,5],""flower"":5,""coordinates...",017c5718,69,4030,51623,Flower Waterer (Activity),Activity,TREETOPCITY
3734,fcfdffb6,bb342a8258a1cd81,2019-09-21T11:28:02.407Z,"{""flower"":5,""flowers"":[0,0,0,1,0],""coordinates...",017c5718,70,4022,51807,Flower Waterer (Activity),Activity,TREETOPCITY
3735,a44b10dc,bb342a8258a1cd81,2019-09-21T11:28:02.516Z,"{""coordinates"":{""x"":804,""y"":358,""stage_width"":...",017c5718,71,4070,51907,Flower Waterer (Activity),Activity,TREETOPCITY


In [523]:
te1[te1['event_code'] == 2000].shape

(9, 11)

In [505]:
sample[sample['installation_id'] == 'ecaab346']

Unnamed: 0,installation_id,accuracy_group


In [504]:
specs[specs['event_id'] == 'b74258a0']['info'].iloc[0]

'The beat round event is triggered when the player finishes a round by successfully balancing the carts and submitting the solution. This event is used for calculating time spent in a round (for speed and accuracy), attempts at solving a round, and the number of rounds the player has completed (completion).'