In [1]:
import pandas as pd
import numpy as np
import os
import time
import copy
from tqdm import tqdm_notebook as tqdm
from collections import Counter
from scipy import stats
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import KFold, StratifiedKFold,GroupKFold
import gc
import json
import lightgbm as lgb
import seaborn as sns
from functools import partial
import xgboost as xgb
import scipy as sp
from numba import jit
pd.set_option('display.max_columns', 1000)
import matplotlib.pyplot as plt
import random
kaggle=True
if kaggle:
    dirs="/kaggle/input/data-science-bowl-2019/"
else:
    dirs="./"

In [2]:
import random
import os
def SeedEverything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    return
SeedEverything(1993)

In [3]:
DEBUG = False

In [4]:
def read_data():
    if DEBUG:
        print('Reading train.csv file....')
        train = pd.read_csv(dirs+'train.csv', nrows=100000)
        print('Training.csv file have {} rows and {} columns'.format(train.shape[0], train.shape[1]))

        print('Reading test.csv file....')
        test = pd.read_csv(dirs+'test.csv', nrows=100000)
        print('Test.csv file have {} rows and {} columns'.format(test.shape[0], test.shape[1]))
    else:
        print('Reading train.csv file....')
        train = pd.read_csv(dirs+'train.csv')
        print('Training.csv file have {} rows and {} columns'.format(train.shape[0], train.shape[1]))

        print('Reading test.csv file....')
        test = pd.read_csv(dirs+'test.csv')
        print('Test.csv file have {} rows and {} columns'.format(test.shape[0], test.shape[1]))

    print('Reading train_labels.csv file....')
    train_labels = pd.read_csv(dirs+'train_labels.csv')
    print('Train_labels.csv file have {} rows and {} columns'.format(train_labels.shape[0], train_labels.shape[1]))

    print('Reading specs.csv file....')
    specs = pd.read_csv(dirs+'specs.csv')
    print('Specs.csv file have {} rows and {} columns'.format(specs.shape[0], specs.shape[1]))

    print('Reading sample_submission.csv file....')
    sample_submission = pd.read_csv(dirs+'sample_submission.csv')
    print('Sample_submission.csv file have {} rows and {} columns'.format(sample_submission.shape[0], sample_submission.shape[1]))
    
    result=[]
    for event_id,df in train.groupby('event_id'):
        if event_id=='27253bdc':
            result.append({'event_id':event_id,'event_code':df['event_code'].iloc[0],'title':"ALL Clips",'type':"Clip",'world':"ALL worlds"})
        else:
            result.append({'event_id':event_id,'event_code':df['event_code'].iloc[0],'title':df['title'].iloc[0],'type':df['type'].iloc[0],'world':df['world'].iloc[0]})
    title_code=pd.DataFrame(result)
    specs=pd.merge(specs,title_code,how='outer',right_on='event_id',left_on='event_id')   
    return train, test, train_labels, specs, sample_submission

train, test, train_labels, specs, sample_submission = read_data()

Reading train.csv file....
Training.csv file have 11341042 rows and 11 columns
Reading test.csv file....
Test.csv file have 1156414 rows and 11 columns
Reading train_labels.csv file....
Train_labels.csv file have 17690 rows and 7 columns
Reading specs.csv file....
Specs.csv file have 386 rows and 3 columns
Reading sample_submission.csv file....
Sample_submission.csv file have 1000 rows and 2 columns


In [5]:
import pandas as pd
media_sequence = pd.read_csv("../input/dsb2019-external-data/media_sequence.csv")

In [6]:
len(media_sequence)

44

In [7]:
train = train[train.installation_id.isin(train_labels.installation_id.unique())].reset_index(drop=True)
train.shape
specs2 = pd.read_csv(dirs+'specs.csv')

list_of_event_args = list(set(specs2['args'].unique()))
event_args_map = dict(zip(list_of_event_args, np.arange(len(list_of_event_args))))
specs2["args"]=specs2["args"].map(event_args_map)

list_of_event_info = list(set(specs2['info'].unique()))
event_info_map = dict(zip(list_of_event_info, np.arange(len(list_of_event_info))))
specs2["info"]=specs2["info"].map(event_info_map)

args_list=specs2["args"].value_counts().add_prefix('args_').index.tolist()
args_label=dict(zip(np.arange(len(args_list)), args_list))
info_list=specs2["info"].value_counts().add_prefix('info_').index.tolist()
info_label=dict(zip(np.arange(len(info_list)), info_list))

train=pd.merge(train,specs2,on=["event_id"],how="left")
test=pd.merge(test,specs2,on=["event_id"],how="left")

In [8]:
def encode_title(train, test):
    # make a list with all the unique 'titles' from the train and test set
    list_of_user_activities = list(set(train['title']))
    list_of_event_code = list(set(train['event_code']))
    list_of_event_id = list(set(train['event_id']))
    list_of_worlds = list(set(train['world']))
    
    list_of_user_activities=sorted(list_of_user_activities)
    list_of_event_code=sorted(list_of_event_code)
    list_of_event_id=sorted(list_of_event_id)
    list_of_worlds=sorted(list_of_worlds)
    # create a dictionary numerating the titles
    title_enc= dict(zip(list_of_user_activities, np.arange(len(list_of_user_activities))))
    world_enc = dict(zip(list_of_worlds, np.arange(len(list_of_worlds))))
    
    assess_titles = list(set(train[train['type'] == 'Assessment']['title'].value_counts().index).union(set(test[test['type'] == 'Assessment']['title'].value_counts().index)))
    assess_titles = sorted(assess_titles)
    
    win_code = dict(zip(list_of_user_activities, (4100*np.ones(len(list_of_user_activities))).astype('int')))
    win_code['Bird Measurer (Assessment)'] = 4110
    # convert text into datetime
    train['timestamp'] = pd.to_datetime(train['timestamp'])
    test['timestamp'] = pd.to_datetime(test['timestamp'])
    return train, test,win_code, list_of_user_activities, list_of_event_code, list_of_event_id,list_of_worlds,title_enc,world_enc,assess_titles

train, test,win_code, list_of_user_activities, list_of_event_code, list_of_event_id,list_of_worlds,title_enc,world_enc,assess_titles=encode_title(train,test)

| code | argument | method_in_session | method_between_session | 备注 | 含义 |  
| --- | ---| --- | ---| --- | --- |
|2030|misses|ema|ema|minmaxscale|beatround
|2030|round|max|ema|minmaxscale|
|2030|duration|ema|ema|minmaxscale|
|4020/4025|correct|count| ema| norm|一次操作
|4020/4025|round|max|ema|norm|
|4100/4110|correct|count|
|4080|duration|ema|ema|norm|
|4040|duration|ema|ema|norm|一次拖动

dict:  event_id:[argument1,argument2,....]

In [9]:
code=[2030,2030,2030,(4020,4025),(4020,4025),(4100,4110),(4040)]
argument=['"misses"','"round"','"duration"','"correct"','"round"','"correct"','"duration"','"duration"']
method_in_session=['ema','ema','ema','count_true','count']

interesting_args=[
    [2030,'"misses"',"ema","ema"],
    [2030,'"round"',"max","ema"],
    [2030,'"duration"',"ema","ema"],
    [(4020,4025),'"correct"',"count_true","ema"],
    [(4020,4025),'"correct"',"count_false","ema"],
    [(4020,4025),'"round"',"max","ema"],
    [(4100,4110),'"correct"',"count_true","ema"],
    [(4100,4110),'"correct"',"count_false","ema"],
    [4040,'"duration"',"ema","ema"]
]




useful_codes={2030,4020,4025,4040,4100,4110}
def get_event_data_dict(specs,interesting_args,useful_codes):
    event_id_to_args={}
    key_set=set()
    for i in range(len(specs)):
        args_event_id=[]
        row=specs.iloc[i]
        event_code=row['event_code']
        event_id=row['event_id']
        title=row['title']
        if event_code not in useful_codes:
            continue
        collect_args=[]
        for arg in interesting_args:
            code_match=event_code in arg[0] if type(arg[0])==tuple else event_code==arg[0]
            if code_match and arg[1] in row['args']:
                key=title+'_'.join([str(c) for c in arg[0]]) if type(arg[0])==tuple else title+'_'+str(arg[0])
                key+='_'+arg[1][1:-1]+'_'+arg[2]+'_'+arg[3]
                collect_args.append([arg[1][1:-1],key,arg[2],arg[3]])
                #print("ID {} code {} add {} to key {}, methods {} {}".format(event_id,event_code,arg[1][1:-1],key,arg[2],arg[3]))
                key_set.add(key)
            else:
                pass
                #print("ID {} code {} does not have {}".format(event_id,event_code,arg[1][1:-1]))
        if collect_args:
            event_id_to_args[event_id]=collect_args
    return event_id_to_args,key_set
event_id_to_args,key_set=get_event_data_dict(specs,interesting_args,useful_codes)
print(len(key_set))
key_set

106


{'Air Show4020_4025_correct_count_false_ema',
 'Air Show4020_4025_correct_count_true_ema',
 'Air Show4020_4025_round_max_ema',
 'Air Show4100_4110_correct_count_false_ema',
 'Air Show4100_4110_correct_count_true_ema',
 'Air Show_2030_duration_ema_ema',
 'Air Show_2030_misses_ema_ema',
 'Air Show_2030_round_max_ema',
 'All Star Sorting4020_4025_correct_count_false_ema',
 'All Star Sorting4020_4025_correct_count_true_ema',
 'All Star Sorting4020_4025_round_max_ema',
 'All Star Sorting_2030_duration_ema_ema',
 'All Star Sorting_2030_misses_ema_ema',
 'All Star Sorting_2030_round_max_ema',
 'Bird Measurer (Assessment)4020_4025_correct_count_false_ema',
 'Bird Measurer (Assessment)4020_4025_correct_count_true_ema',
 'Bird Measurer (Assessment)4100_4110_correct_count_false_ema',
 'Bird Measurer (Assessment)4100_4110_correct_count_true_ema',
 'Bird Measurer (Assessment)_2030_duration_ema_ema',
 'Bird Measurer (Assessment)_2030_misses_ema_ema',
 'Bird Measurer (Assessment)_4040_duration_ema_em

In [10]:
ema_momentum_in_session=0.75
ema_momentum_between_session=0.75
class data_logger(object):
    def __init__(self,event_id_to_args):
        self.event_id_to_args=event_id_to_args
        self.installation_status={}
    
    def log_data(self,session):
        session_status={}
        update_method={}
        do_count_ids=set()
        for event_id,data_str in zip(session['event_id'],session['event_data']): 
            args=self.event_id_to_args.get(event_id)
            if args is None:
                continue
            event_data=json.loads(data_str)
            for arg_pair in args:
                if arg_pair[2]=='count_true' or arg_pair[2]=='count_false':
                    do_count_ids.add(event_id)
                    continue
                new_val=event_data[arg_pair[0]]
                if arg_pair[1] not in session_status:
                    if arg_pair[2]=='mean':
                        session_status[arg_pair[1]]=[]
                    else:
                        session_status[arg_pair[1]]=new_val
                else:
                    if arg_pair[2]=="ema":
                        session_status[arg_pair[1]]=ema_momentum_in_session*session_status[arg_pair[1]]+(1-ema_momentum_in_session)*new_val
                    elif arg_pair[2]=='max':
                        session_status[arg_pair[1]]=max(session_status[arg_pair[1]],new_val)
                    elif arg_pair[2]=='min':
                        session_status[arg_pair[1]]=min(session_status[arg_pair[1]],new_val)
                    elif arg_pair[2]=='sum':
                        session_status[arg_pair[1]]=session_status[arg_pair[1]]+new_val
                    elif arg_pair[2]=='mean':
                        session_status[arg_pair[1]].append(new_val)
                    else:
                        raise NotImplementedError
                        
                update_method[arg_pair[1]]=arg_pair[3]  #inter session method
        #count true false
        for count_id in do_count_ids:
            target_df=session[session.event_id==count_id]
            args=self.event_id_to_args.get(count_id)
            num_true=0
            num_false=0
            save_key=""
            for arg_pair in args:
                if arg_pair[2]=='count_true':
                    num_true=target_df['event_data'].str.contains('true').sum()
                    session_status[arg_pair[1]]=num_true
                    update_method[arg_pair[1]]=arg_pair[3]
                    save_key=arg_pair[1]
                elif arg_pair[2]=='count_false':
                    num_false=target_df['event_data'].str.contains('false').sum()    
                    session_status[arg_pair[1]]=num_false
                    update_method[arg_pair[1]]=arg_pair[3]
            session_status['_'.join(save_key.split("_")[:-2]+['accuracy'])]=num_true/(num_false+num_true)
            update_method['_'.join(save_key.split("_")[:-2]+['accuracy'])]="ema"
            
        #update installation_status
        for key in session_status.keys():
            if type(session_status[key]) is list:
                session_status[key]=np.mean(session_status[key])
            if key not in self.installation_status:
                if update_method[key]=='mean':
                    self.installation_status[key]=[]
                else:
                    self.installation_status[key]=session_status[key]
            else:    
                if update_method[key]=='ema':
                    self.installation_status[key]=ema_momentum_between_session*self.installation_status[key]+\
                                                    (1-ema_momentum_between_session)*session_status[key]
                elif update_method[key]=='max':
                    self.installation_status[key]=max(self.installation_status[key],session_status[key])
                elif update_method[key]=='min':
                    self.installation_status[key]=min(self.installation_status[key],session_status[key])
                elif update_method[key]=='sum':
                    self.installation_status[key]=self.installation_status[key]+session_status[key]
                elif update_method[key]=='mean':
                    self.installation_status[key].append(session_status[key])
                else:
                    raise NotImplementedError
             
    def get_data(self):
        
        return {key: self.installation_status[key] if type(self.installation_status[key]) is not list else np.mean(self.installation_status[key]) \
                for key in self.installation_status}

In [11]:
tree_top_city = {
    "Tree Top City - Level 1": 1,
    "Ordering Spheres": 2,
    "All Star Sorting": 3,
    "Costume Box": 4,
    "Fireworks (Activity)": 5,
    "12 Monkeys": 6,
    "Tree Top City - Level 2": 7,
    "Flower Waterer (Activity)": 8,
    "Pirate's Tale": 9,
    "Mushroom Sorter (Assessment)": 10,
    "Air Show": 11,
    "Treasure Map": 12,
    "Tree Top City - Level 3": 13,
    "Crystals Rule": 14,
    "Rulers": 15,
    "Bug Measurer (Activity)": 16,
    "Bird Measurer (Assessment)": 17,
}
magma_peak = {
    "Magma Peak - Level 1": 1,
    "Sandcastle Builder (Activity)": 2,
    "Slop Problem": 3,
    "Scrub-A-Dub": 4,
    "Watering Hole (Activity)": 5,
    "Magma Peak - Level 2": 6,
    "Dino Drink": 7,
    "Bubble Bath": 8,
    "Bottle Filler (Activity)": 9,
    "Dino Dive": 10,
    "Cauldron Filler (Assessment)": 11,
}
crystal_caves = {
    "Crystal Caves - Level 1": 1,
    "Chow Time": 2,
    "Balancing Act": 3,
    "Chicken Balancer (Activity)": 4,
    "Lifting Heavy Things": 5,
    "Crystal Caves - Level 2": 6,
    "Honey Cake": 7,
    "Happy Camel": 8,
    "Cart Balancer (Assessment)": 9,
    "Leaf Leader": 10,
    "Crystal Caves - Level 3": 11,
    "Heavy, Heavier, Heaviest": 12,
    "Pan Balance": 13,
    "Egg Dropper (Activity)": 14,
    "Chest Sorter (Assessment)": 15,
}

In [12]:
clip_time = {'Welcome to Lost Lagoon!':19,'Tree Top City - Level 1':17,'Ordering Spheres':61, 'Costume Box':61,
        '12 Monkeys':109,'Tree Top City - Level 2':25, 'Pirate\'s Tale':80, 'Treasure Map':156,'Tree Top City - Level 3':26,
        'Rulers':126, 'Magma Peak - Level 1':20, 'Slop Problem':60, 'Magma Peak - Level 2':22, 'Crystal Caves - Level 1':18,
        'Balancing Act':72, 'Lifting Heavy Things':118,'Crystal Caves - Level 2':24, 'Honey Cake':142, 'Crystal Caves - Level 3':19,
        'Heavy, Heavier, Heaviest':61}

In [13]:
def get_data(user_sample, test_set=False):
    user_activities_count = {'Clip':0, 'Activity': 0, 'Assessment': 0, 'Game':0}
    accuracy_groups = {"acc_group_0":0, "acc_group_1":0, "acc_group_2":0, "acc_group_3":0}
    game_time_dict = {'Clip_gametime':0, 'Game_gametime':0, 'Activity_gametime':0, 'Assessment_gametime':0}
    accumulated_accuracy_group = 0
    accumulated_accuracy = 0
    accumulated_correct_attempts = 0 
    accumulated_uncorrect_attempts = 0
    accumulated_actions = 0
    
    last_world='NONE'
    last_activity_type='Clip'
    time_last_activity=None
    
    give_up={"give_up_"+assess:0 for assess in assess_titles}
    
    durations ={'ema_duration_'+eve:0 for eve in list_of_user_activities}
    
    last_accuracy_title = {'acc_' + title: -1 for title in assess_titles}
    
    world_count={world:0 for world in list_of_worlds}
    event_code_count = {ev: 0 for ev in list_of_event_code}
    event_id_count = {eve: 0 for eve in list_of_event_id}
    title_count = {eve: 0 for eve in list_of_user_activities} 
    counter = 0
    assess_durations=[]
    tree_top_city_list = []
    magma_peak_list = []
    crystal_caves_list = []
    clip_durations = []

    installation_logger=data_logger(event_id_to_args)
    
    all_assessments = []
    for session_id, session in user_sample.groupby('game_session', sort=False):
        session_type = session['type'].iloc[0]
        session_title = session['title'].iloc[0]
        session_world = session['world'].iloc[0]
        
        if session_type == 'Clip':
            clip_durations.append((clip_time[session_title]))
        
        if (session_type == 'Assessment') & (test_set or len(session)>1):
            features={}
            features['installation_id'] = session['installation_id'].iloc[-1]
            features['session_id'] = session_id
            features['session_title'] = session_title
            features['session_world'] = session_world

            all_attempts = session.query(f'event_code == {win_code[session_title]}')
            true_attempts = all_attempts['event_data'].str.contains('true').sum()
            false_attempts = all_attempts['event_data'].str.contains('false').sum()
            accuracy = true_attempts/(true_attempts+false_attempts) if (true_attempts+false_attempts) != 0 else 0
            if accuracy == 0:
                features['accuracy_group'] = 0
            elif accuracy == 1:
                features['accuracy_group'] = 3
            elif accuracy == 0.5:
                features['accuracy_group'] = 2
            else:
                features['accuracy_group'] = 1
            features.update(accuracy_groups.copy())
            accuracy_groups["acc_group_"+str(features['accuracy_group'])] += 1   
            features['accumulated_correct_attempts'] = accumulated_correct_attempts
            features['accumulated_uncorrect_attempts'] = accumulated_uncorrect_attempts
            features['ratio']=accumulated_correct_attempts/(accumulated_correct_attempts+accumulated_uncorrect_attempts) if (accumulated_correct_attempts+accumulated_uncorrect_attempts)!=0 else 0
            accumulated_correct_attempts += true_attempts 
            accumulated_uncorrect_attempts += false_attempts
    
            features['accumulated_accuracy'] = accumulated_accuracy/counter if counter > 0 else 0
            accumulated_accuracy += accuracy
            features['accumulated_accuracy_group'] = accumulated_accuracy_group/counter if counter > 0 else 0
            accumulated_accuracy_group += features['accuracy_group']
            
            features.update(last_accuracy_title.copy())
            last_accuracy_title['acc_' + session_title] = accuracy
        
            features['accumulated_actions'] = accumulated_actions
            features['last_world']=last_world
            features['last_activity_type']=last_activity_type
            features['time_to_last_activity']=(session.iloc[0, 2]-time_last_activity).seconds if time_last_activity is not None else 0
            
            features.update(event_code_count.copy())

            features.update(installation_logger.get_data())
  
            features.update(title_count.copy())
            features.update(durations.copy())
            features.update(user_activities_count.copy())
            features.update(event_id_count.copy())
            features.update(give_up.copy())
            features.update(world_count.copy())
            features.update(game_time_dict.copy())
            
            if assess_durations == []:
                features['assess_duration_mean'] = 0
            else:
                features['assess_duration_mean'] = np.mean(assess_durations)
            assess_durations.append((session.iloc[-1, 2] - session.iloc[0, 2] ).seconds)
            
            
            if true_attempts+false_attempts==0:
                give_up["give_up_"+session_title]+=1
                
            if tree_top_city_list == []:
                features['tree_top_city_max'] = 0
                features['tree_top_city_cnt'] = 0
                features['tree_top_city_cover'] = 0
            else:
                features['tree_top_city_max'] = np.max(tree_top_city_list)
                features['tree_top_city_cnt'] = len(tree_top_city_list)
                features['tree_top_city_cover'] = float(len(set(tree_top_city_list))) / len(tree_top_city)
            if magma_peak_list == []:
                features['magma_peak_max'] = 0
                features['magma_peak_cnt'] = 0
                features['magma_peak_cover'] = 0
            else:
                features['magma_peak_max'] = np.max(magma_peak_list)
                features['magma_peak_cnt'] = len(magma_peak_list)
                features['magma_peak_cover'] = float(len(set(magma_peak_list))) / len(magma_peak)
            if crystal_caves_list == []:
                features['crystal_caves_max'] = 0
                features['crystal_caves_cnt'] = 0
                features['crystal_caves_cover'] = 0
            else:
                features['crystal_caves_max'] = np.max(crystal_caves_list)
                features['crystal_caves_cnt'] = len(crystal_caves_list)
                features['crystal_caves_cover'] = float(len(set(crystal_caves_list))) / len(crystal_caves)
            if session_title in tree_top_city:
                last_game = tree_top_city[session_title] - 1
                if last_game in tree_top_city_list:
                    features['played_last_game'] = 1
                else:
                    features['played_last_game'] = 0
            elif session_title in magma_peak:
                last_game = magma_peak[session_title] - 1
                if last_game in magma_peak_list:
                    features['played_last_game'] = 1
                else:
                    features['played_last_game'] = 0
            elif session_title in crystal_caves:
                last_game = crystal_caves[session_title] - 1
                if last_game in crystal_caves_list:
                    features['played_last_game'] = 1
                else:
                    features['played_last_game'] = 0
            else:
                features['played_last_game'] = 0
            
            if clip_durations == []:
                features['Clip_duration_mean'] = 0
                features['Clip_duration_std'] = 0
            else:
                features['Clip_duration_mean'] = np.mean(clip_durations)
                features['Clip_duration_std'] = np.std(clip_durations)
            
            if test_set:
                all_assessments.append(features)
            elif true_attempts+false_attempts > 0:
                all_assessments.append(features)
            counter += 1
        
        #log event data

        installation_logger.log_data(session)
        #update counters
        def update_counters(counter: dict, col: str):
            num_of_session_count = Counter(session[col])
            for k in num_of_session_count.keys():
                if counter.get(k) is not None:
                    counter[k] += num_of_session_count[k]
            return counter
        event_code_count = update_counters(event_code_count, "event_code")
        event_id_count = update_counters(event_id_count, "event_id")
        
        title_count[session_title]+=1
        #title_event_code_count = update_counters(title_event_code_count, 'title_event_code')
        
        session_duration=(session.iloc[-1, 2] - session.iloc[0, 2]).seconds
        durations['ema_duration_'+session_title]= session_duration if durations['ema_duration_'+session_title]==0 else (0.2*session_duration+0.8*durations['ema_duration_'+session_title])
        accumulated_actions += len(session)

        user_activities_count[session_type] += 1
        world_count[session_world]+=1
        last_world = session_world 
        last_activity_type = session_type
        time_last_activity= session.iloc[-1, 2]  
        
        game_time_dict[session_type+'_gametime'] = (game_time_dict[session_type+'_gametime'] + (session['game_time'].iloc[-1]/1000.0))/2.0
        
        # sequence features update
        if session_title in tree_top_city:
            tree_top_city_list.append(tree_top_city[session_title])
        if session_title in magma_peak:
            magma_peak_list.append(magma_peak[session_title])
        if session_title in crystal_caves:
            crystal_caves_list.append(crystal_caves[session_title])
        
    # if it't the test_set, only the last assessment must be predicted, the previous are scraped
    if test_set:
        return all_assessments[-1]
    # in the train_set, all assessments goes to the dataset
    return all_assessments

def get_train_and_test(train, test):
    compiled_train = []
    compiled_test = []
    for i, (ins_id, user_sample) in enumerate(tqdm(train.groupby('installation_id', sort = False))):
        compiled_train += get_data(user_sample)
    for ins_id, user_sample in tqdm(test.groupby('installation_id', sort = False)):
        test_data = get_data(user_sample, test_set = True)
        compiled_test.append(test_data)
    reduce_train = pd.DataFrame(compiled_train)
    reduce_test = pd.DataFrame(compiled_test)
    return reduce_train, reduce_test          

reduce_train, reduce_test= get_train_and_test(train, test)
reduce_train.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in reduce_train.columns]
reduce_test.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in reduce_test.columns]
assess_titles=["".join (c if c.isalnum() else "_" for c in str(x)) for x in assess_titles]


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(IntProgress(value=0, max=3614), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




In [14]:
type_enc={'Clip':0, 'Activity': 1, 'Assessment': 2, 'Game':3}
reduce_train['session_title']=reduce_train['session_title'].replace(title_enc).astype(int)
reduce_train['session_world']=reduce_train['session_world'].replace(world_enc).astype(int)
reduce_train['last_world']=reduce_train['last_world'].replace(world_enc).astype(int)
reduce_train['last_activity_type']=reduce_train['last_activity_type'].replace(type_enc).astype(int)

reduce_test['session_title']=reduce_test['session_title'].replace(title_enc).astype(int)
reduce_test['session_world']=reduce_test['session_world'].replace(world_enc).astype(int)
reduce_test['last_world']=reduce_test['last_world'].replace(world_enc).astype(int)
reduce_test['last_activity_type']=reduce_test['last_activity_type'].replace(type_enc).astype(int)

In [15]:
freq=reduce_train['session_title'].value_counts()
frequency_enc=dict(zip(freq.index,freq))
reduce_train['session_title']=reduce_train['session_title'].replace(frequency_enc).astype(int)
reduce_test['session_title']=reduce_test['session_title'].replace(frequency_enc).astype(int)
reduce_train_true=reduce_train.copy()
reduce_test_true=reduce_test.copy()

Get score

In [16]:
del train,test
gc.collect()

6242

In [17]:
def read_data():
    if DEBUG:
        print('Reading train.csv file....')
        train = pd.read_csv(dirs+'train.csv', nrows=100000)
        print('Training.csv file have {} rows and {} columns'.format(train.shape[0], train.shape[1]))

        print('Reading test.csv file....')
        test = pd.read_csv(dirs+'test.csv', nrows=100000)
        print('Test.csv file have {} rows and {} columns'.format(test.shape[0], test.shape[1]))
    else:
        print('Reading train.csv file....')
        train = pd.read_csv(dirs+'train.csv')
        print('Training.csv file have {} rows and {} columns'.format(train.shape[0], train.shape[1]))

        print('Reading test.csv file....')
        test = pd.read_csv(dirs+'test.csv')
        print('Test.csv file have {} rows and {} columns'.format(test.shape[0], test.shape[1]))

    print('Reading train_labels.csv file....')
    train_labels = pd.read_csv(dirs+'train_labels.csv')
    print('Train_labels.csv file have {} rows and {} columns'.format(train_labels.shape[0], train_labels.shape[1]))

    print('Reading specs.csv file....')
    specs = pd.read_csv(dirs+'specs.csv')
    print('Specs.csv file have {} rows and {} columns'.format(specs.shape[0], specs.shape[1]))

    print('Reading sample_submission.csv file....')
    sample_submission = pd.read_csv(dirs+'sample_submission.csv')
    print('Sample_submission.csv file have {} rows and {} columns'.format(sample_submission.shape[0], sample_submission.shape[1]))
    return train, test, train_labels, specs, sample_submission
train, test, train_labels, specs, sample_submission = read_data()
train = train[train.installation_id.isin(train_labels.installation_id.unique())].reset_index(drop=True)
list_of_event_args = list(set(specs['args'].unique()))
event_args_map = dict(zip(list_of_event_args, np.arange(len(list_of_event_args))))
specs["args"]=specs["args"].map(event_args_map)

list_of_event_info = list(set(specs['info'].unique()))
event_info_map = dict(zip(list_of_event_info, np.arange(len(list_of_event_info))))
specs["info"]=specs["info"].map(event_info_map)

args_list=specs["args"].value_counts().add_prefix('args_').index.tolist()
args_label=dict(zip(np.arange(len(args_list)), args_list))
info_list=specs["info"].value_counts().add_prefix('info_').index.tolist()
info_label=dict(zip(np.arange(len(info_list)), info_list))

train=pd.merge(train,specs,on=["event_id"],how="left")
test=pd.merge(test,specs,on=["event_id"],how="left")
print(train.shape)

Reading train.csv file....
Training.csv file have 11341042 rows and 11 columns
Reading test.csv file....
Test.csv file have 1156414 rows and 11 columns
Reading train_labels.csv file....
Train_labels.csv file have 17690 rows and 7 columns
Reading specs.csv file....
Specs.csv file have 386 rows and 3 columns
Reading sample_submission.csv file....
Sample_submission.csv file have 1000 rows and 2 columns
(7734558, 13)


In [18]:
#Credits go to Andrew Lukyanenko

def encode_title(train, test, train_labels):
    # encode title

    # make a list with all the unique 'titles' from the train and test set
    list_of_user_activities = sorted(list(set(train['title'].unique()).union(set(test['title'].unique()))))
    # make a list with all the unique 'event_code' from the train and test set
    list_of_event_code = sorted(list(set(train['event_code'].unique()).union(set(test['event_code'].unique()))))
    list_of_event_id = sorted(list(set(train['event_id'].unique()).union(set(test['event_id'].unique()))))
    # make a list with all the unique worlds from the train and test set
    list_of_worlds = sorted(list(set(train['world'].unique()).union(set(test['world'].unique()))))
    # create a dictionary numerating the titles
    activities_map = dict(zip(list_of_user_activities, np.arange(len(list_of_user_activities))))
    activities_labels = dict(zip(np.arange(len(list_of_user_activities)), list_of_user_activities))
    activities_world = dict(zip(list_of_worlds, np.arange(len(list_of_worlds))))
    assess_titles = sorted(list(set(train[train['type'] == 'Assessment']['title'].value_counts().index).union(set(test[test['type'] == 'Assessment']['title'].value_counts().index))))
    # replace the text titles with the number titles from the dict
    train['title'] = train['title'].map(activities_map)
    test['title'] = test['title'].map(activities_map)
    train['world'] = train['world'].map(activities_world)
    test['world'] = test['world'].map(activities_world)
    train_labels['title'] = train_labels['title'].map(activities_map)
    win_code = dict(zip(activities_map.values(), (4100*np.ones(len(activities_map))).astype('int')))
    # then, it set one element, the 'Bird Measurer (Assessment)' as 4110, 10 more than the rest
    win_code[activities_map['Bird Measurer (Assessment)']] = 4110
    
    train['title_event_code'] = list(map(lambda x, y: str(x) + '_' + str(y), train['title'], train['event_code']))
    test['title_event_code'] = list(map(lambda x, y: str(x) + '_' + str(y), test['title'], test['event_code']))
    all_title_event_code = sorted(list(set(train["title_event_code"].unique()).union(test["title_event_code"].unique())))
    # convert text into datetime
    train['timestamp'] = pd.to_datetime(train['timestamp'])
    test['timestamp'] = pd.to_datetime(test['timestamp'])
    return train, test, train_labels, win_code, list_of_user_activities, list_of_event_code, activities_labels, assess_titles, list_of_event_id, all_title_event_code

# get usefull dict with maping encode
train, test, train_labels, win_code, list_of_user_activities, list_of_event_code, activities_labels, assess_titles, list_of_event_id, all_title_event_code = encode_title(train, test, train_labels)

categoricals = ['session_title']

In [19]:
import json
def cnt_miss(df):
    cnt = 0
    for e in range(len(df)):
        x = df['event_data'].iloc[e]
        y = json.loads(x)['misses']
        cnt += y
    return cnt

def update_counters(counter: dict, col: str,session):
    num_of_session_count = Counter(session[col])
    for k in num_of_session_count.keys():
        x = k
        counter[x] += num_of_session_count[k]
    return counter

def update_counters_event(counter: dict, col: str,session):
    num_of_session_count = session[col].value_counts().add_prefix(col+'_').to_dict()
    for k in num_of_session_count.keys():
        counter[k] += num_of_session_count[k]
    return counter

def get_data123(user_sample,lab, test_set=False):
    all_assessments=[]
    compiled_data=[]
    for i, session in user_sample.groupby('game_session', sort=False):
        # i = game_session_id
        # session is a DataFrame that contain only one game_session
        
        session_type = session['type'].iloc[0]
        session_title = session['title'].iloc[0]
        session_title_text = activities_labels[session_title]
        Assessment_time=session['timestamp'].iloc[0]
        event_code_count: Dict[str, int] = {ev: 0 for ev in list_of_event_code}
        event_id_count: Dict[str, int] = {eve: 0 for eve in list_of_event_id}
        event_info_count: Dict[str, int] = {eve: 0 for eve in info_list}
        
        if session_type==lab:
            features = {}
            features["game_title"]=session_title
            features["game_event_count"]=session['event_count'].iloc[-1]
            features["game_game_time"]=session['game_time'].iloc[-1]
            event_code_count = update_counters(event_code_count, "event_code",session)
            #event_id_count = update_counters(event_id_count, "event_id",session)
            event_info_count = update_counters_event(event_info_count, "info",session)

            
            features.update(event_code_count.copy())
            features.update(event_info_count.copy())
            compiled_data.append(features)
           
        if (session_type == 'Assessment') & (test_set or len(session)>1):
            # search for event_code 4100, that represents the assessments trial
            all_attempts = session.query(f'event_code == {win_code[session_title]}')
            # then, check the numbers of wins and the number of losses
            true_attempts = all_attempts['event_data'].str.contains('true').sum()
            false_attempts = all_attempts['event_data'].str.contains('false').sum()
         
             
          
            accuracy = true_attempts/(true_attempts+false_attempts) if (true_attempts+false_attempts) != 0 else 0
            if accuracy == 0:
                accuracy_group = 0
            elif accuracy == 1:
                accuracy_group = 3
            elif accuracy == 0.5:
                accuracy_group = 2
            else:
                accuracy_group = 1
            if test_set:
                if true_attempts+false_attempts > 0:
                    reduce_game = pd.DataFrame(compiled_data)
                    reduce_game['installation_id'] = session['installation_id'].iloc[-1]
                    reduce_game["game_session"]=session['game_session'].iloc[-1]
                    reduce_game["session_title"]=session_title
                    reduce_game["true_attempts"]=true_attempts
                    reduce_game["false_attempts"]=false_attempts
                    reduce_game["accuracy"]=accuracy
                    reduce_game["accuracy_group"]=accuracy_group
                    all_assessments.append(reduce_game)
                elif len(session)==1:
                    reduce_game = pd.DataFrame(compiled_data)
                    reduce_game['installation_id'] = session['installation_id'].iloc[-1]
                    reduce_game["game_session"]=session['game_session'].iloc[-1]
                    reduce_game["session_title"]=session_title
                    reduce_game["true_attempts"]=666
                    reduce_game["false_attempts"]=666
                    reduce_game["accuracy"]=666
                    reduce_game["accuracy_group"]=666
                    all_assessments.append(reduce_game)
                    
            
            elif true_attempts+false_attempts > 0:
                reduce_game = pd.DataFrame(compiled_data)
                reduce_game['installation_id'] = session['installation_id'].iloc[-1]
                reduce_game["game_session"]=session['game_session'].iloc[-1]
                reduce_game["session_title"]=session_title
                reduce_game["true_attempts"]=true_attempts
                reduce_game["false_attempts"]=false_attempts
                reduce_game["accuracy"]=accuracy
                reduce_game["accuracy_group"]=accuracy_group
                all_assessments.append(reduce_game)
    
    return all_assessments

In [20]:
compiled_data = []
for i, (ins_id, user_sample) in tqdm(enumerate(train.groupby('installation_id', sort=False)), total=train.installation_id.nunique(), desc='Installation_id', position=0):
    compiled_data += get_data123(user_sample,"Game")
reduce_train = pd.concat(compiled_data)
reduce_train.shape

compiled_data = []
for ins_id, user_sample in tqdm(test.groupby('installation_id', sort=False), total=test.installation_id.nunique(), desc='Installation_id', position=0):
    compiled_data += get_data123(user_sample,"Game", test_set=True)
reduce_test_all = pd.concat(compiled_data)
reduce_test_all.shape

reduce_test=reduce_test_all[reduce_test_all["accuracy"]==666].copy().reset_index(drop=True)
reduce_train=pd.concat([reduce_train,reduce_test_all[reduce_test_all["accuracy"]!=666]]).reset_index(drop=True)
reduce_train.shape

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(IntProgress(value=0, description='Installation_id', max=3614, style=ProgressStyle(description_w…




of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  after removing the cwd from sys.path.
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(IntProgress(value=0, description='Installation_id', max=1000, style=ProgressStyle(description_w…




of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  # Remove the CWD from sys.path while we load stuff.


(238023, 220)

In [21]:
reduce_train.head()

Unnamed: 0,game_event_count,game_game_time,game_title,2000,2010,2020,2025,2030,2035,2040,2050,2060,2070,2075,2080,2081,2083,3010,3020,3021,3110,3120,3121,4010,4020,4021,4022,4025,4030,4031,4035,4040,4045,4050,4070,4080,4090,4095,4100,4110,4220,4230,4235,5000,5010,info_139,info_40,info_57,info_79,info_12,info_44,info_28,info_140,info_15,info_67,info_166,info_113,info_85,info_144,info_60,info_62,info_91,info_30,info_143,info_80,info_102,info_66,info_23,info_94,info_58,info_42,info_2,info_61,info_18,info_51,info_27,info_95,info_53,info_38,info_48,info_104,info_64,info_142,info_17,info_11,info_10,info_9,info_69,info_78,info_164,info_110,info_120,info_52,info_167,info_54,info_55,info_56,info_59,info_50,info_65,info_68,info_70,info_71,info_63,info_39,info_49,info_24,info_1,info_3,info_4,info_5,info_6,info_7,info_8,info_13,info_14,info_16,info_19,info_20,info_21,info_22,info_25,info_47,info_26,info_29,info_31,info_32,info_33,info_34,info_35,info_36,info_37,info_73,info_41,info_43,info_45,info_46,info_72,info_83,info_74,info_147,info_145,info_141,info_138,info_137,info_136,info_135,info_134,info_133,info_132,info_131,info_130,info_129,info_128,info_127,info_126,info_146,info_148,info_75,info_149,info_165,info_163,info_162,info_161,info_160,info_159,info_158,info_157,info_156,info_155,info_154,info_153,info_152,info_151,info_150,info_125,info_124,info_123,info_122,info_98,info_97,info_96,info_93,info_92,info_90,info_89,info_88,info_87,info_86,info_84,info_82,info_81,info_77,info_76,info_99,info_100,info_101,info_114,info_121,info_119,info_118,info_117,info_116,info_115,info_112,info_103,info_111,info_109,info_108,info_107,info_106,info_105,info_0,installation_id,game_session,session_title,true_attempts,false_attempts,accuracy,accuracy_group
0,131.0,115792.0,36.0,1.0,0.0,15.0,0.0,15.0,0.0,6.0,6.0,0.0,0.0,0.0,4.0,1.0,2.0,15.0,3.0,6.0,15.0,3.0,6.0,1.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,15.0,15.0,0.0,1.0,0.0,3.0,6.0,3.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.0,14.0,15.0,0.0,0.0,4.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0006a69f,901acc108f55a5a1,30,1,0,1.0,3
1,28.0,30128.0,2.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,3.0,0.0,0.0,1.0,2.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,13.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,13.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0006a69f,901acc108f55a5a1,30,1,0,1.0,3
2,35.0,34700.0,2.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,1.0,0.0,4.0,1.0,0.0,1.0,3.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,14.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,14.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0006a69f,901acc108f55a5a1,30,1,0,1.0,3
3,90.0,99949.0,2.0,1.0,0.0,3.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,3.0,3.0,4.0,3.0,3.0,1.0,21.0,0.0,0.0,0.0,22.0,0.0,1.0,0.0,0.0,0.0,15.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,22.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,21.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,3.0,0.0,3.0,0.0,0006a69f,901acc108f55a5a1,30,1,0,1.0,3
4,131.0,115792.0,36.0,1.0,0.0,15.0,0.0,15.0,0.0,6.0,6.0,0.0,0.0,0.0,4.0,1.0,2.0,15.0,3.0,6.0,15.0,3.0,6.0,1.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,15.0,15.0,0.0,1.0,0.0,3.0,6.0,3.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.0,14.0,15.0,0.0,0.0,4.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0006a69f,77b8ee947eb84b4e,4,0,11,0.0,0


In [22]:
reduce_test.head()

Unnamed: 0,game_event_count,game_game_time,game_title,2000,2010,2020,2025,2030,2035,2040,2050,2060,2070,2075,2080,2081,2083,3010,3020,3021,3110,3120,3121,4010,4020,4021,4022,4025,4030,4031,4035,4040,4045,4050,4070,4080,4090,4095,4100,4110,4220,4230,4235,5000,5010,info_139,info_40,info_57,info_79,info_12,info_44,info_28,info_140,info_15,info_67,info_166,info_113,info_85,info_144,info_60,info_62,info_91,info_30,info_143,info_80,info_102,info_66,info_23,info_94,info_58,info_42,info_2,info_61,info_18,info_51,info_27,info_95,info_53,info_38,info_48,info_104,info_64,info_142,info_17,info_11,info_10,info_9,info_69,info_78,info_164,info_110,info_120,info_52,info_167,info_54,info_55,info_56,info_59,info_50,info_65,info_68,info_70,info_71,info_63,info_39,info_49,info_24,info_1,info_3,info_4,info_5,info_6,info_7,info_8,info_13,info_14,info_16,info_19,info_20,info_21,info_22,info_25,info_47,info_26,info_29,info_31,info_32,info_33,info_34,info_35,info_36,info_37,info_73,info_41,info_43,info_45,info_46,info_72,info_83,info_74,info_147,info_145,info_141,info_138,info_137,info_136,info_135,info_134,info_133,info_132,info_131,info_130,info_129,info_128,info_127,info_126,info_146,info_148,info_75,info_149,info_165,info_163,info_162,info_161,info_160,info_159,info_158,info_157,info_156,info_155,info_154,info_153,info_152,info_151,info_150,info_125,info_124,info_123,info_122,info_98,info_97,info_96,info_93,info_92,info_90,info_89,info_88,info_87,info_86,info_84,info_82,info_81,info_77,info_76,info_99,info_100,info_101,info_114,info_121,info_119,info_118,info_117,info_116,info_115,info_112,info_103,info_111,info_109,info_108,info_107,info_106,info_105,info_0,installation_id,game_session,session_title,true_attempts,false_attempts,accuracy,accuracy_group
0,159.0,135794.0,12.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,28.0,0.0,2.0,28.0,0.0,1.0,17.0,0.0,0.0,0.0,38.0,0.0,21.0,0.0,0.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,28.0,28.0,1.0,0.0,0.0,2.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,38.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,00abaee7,348d7f09f96af313,9,666,666,666.0,666
1,79.0,1960630.0,2.0,1.0,0.0,3.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,1.0,2.0,8.0,1.0,2.0,1.0,13.0,0.0,0.0,0.0,17.0,0.0,4.0,0.0,0.0,0.0,15.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,17.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,13.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,2.0,0.0,00abaee7,348d7f09f96af313,9,666,666,666.0,666
2,135.0,188805.0,18.0,1.0,0.0,5.0,0.0,4.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,18.0,4.0,6.0,17.0,4.0,6.0,1.0,13.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,54.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,17.0,6.0,4.0,4.0,1.0,0.0,6.0,18.0,54.0,4.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,00abaee7,348d7f09f96af313,9,666,666,666.0,666
3,126.0,162082.0,36.0,1.0,0.0,15.0,0.0,15.0,0.0,6.0,6.0,0.0,0.0,0.0,5.0,2.0,3.0,17.0,0.0,6.0,17.0,0.0,6.0,1.0,15.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,17.0,17.0,0.0,1.0,0.0,0.0,6.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.0,11.0,15.0,0.0,0.0,5.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,01242218,1fef5d54cb4b775a,8,666,666,666.0,666
4,111.0,159754.0,19.0,1.0,0.0,3.0,0.0,3.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,7.0,0.0,11.0,7.0,0.0,11.0,1.0,10.0,0.0,0.0,0.0,11.0,11.0,0.0,0.0,0.0,0.0,33.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,7.0,11.0,0.0,0.0,1.0,0.0,11.0,7.0,33.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,01242218,1fef5d54cb4b775a,8,666,666,666.0,666


In [23]:
feat=[f for f in reduce_train.columns if f not in ['installation_id', 'game_session','true_attempts', 'false_attempts', 'accuracy', 'accuracy_group']]
print(feat)

['game_event_count', 'game_game_time', 'game_title', 2000, 2010, 2020, 2025, 2030, 2035, 2040, 2050, 2060, 2070, 2075, 2080, 2081, 2083, 3010, 3020, 3021, 3110, 3120, 3121, 4010, 4020, 4021, 4022, 4025, 4030, 4031, 4035, 4040, 4045, 4050, 4070, 4080, 4090, 4095, 4100, 4110, 4220, 4230, 4235, 5000, 5010, 'info_139', 'info_40', 'info_57', 'info_79', 'info_12', 'info_44', 'info_28', 'info_140', 'info_15', 'info_67', 'info_166', 'info_113', 'info_85', 'info_144', 'info_60', 'info_62', 'info_91', 'info_30', 'info_143', 'info_80', 'info_102', 'info_66', 'info_23', 'info_94', 'info_58', 'info_42', 'info_2', 'info_61', 'info_18', 'info_51', 'info_27', 'info_95', 'info_53', 'info_38', 'info_48', 'info_104', 'info_64', 'info_142', 'info_17', 'info_11', 'info_10', 'info_9', 'info_69', 'info_78', 'info_164', 'info_110', 'info_120', 'info_52', 'info_167', 'info_54', 'info_55', 'info_56', 'info_59', 'info_50', 'info_65', 'info_68', 'info_70', 'info_71', 'info_63', 'info_39', 'info_49', 'info_24', 'i

In [24]:
from numba import jit
@jit
def qwk(a1, a2):
    """
    Source: https://www.kaggle.com/c/data-science-bowl-2019/discussion/114133#latest-660168

    :param a1:
    :param a2:
    :param max_rat:
    :return:
    """
    max_rat = 3
    a1 = np.asarray(a1, dtype=int)
    a2 = np.asarray(a2, dtype=int)

    hist1 = np.zeros((max_rat + 1, ))
    hist2 = np.zeros((max_rat + 1, ))

    o = 0
    for k in range(a1.shape[0]):
        i, j = a1[k], a2[k]
        hist1[i] += 1
        hist2[j] += 1
        o +=  (i - j) * (i - j)

    e = 0
    for i in range(max_rat + 1):
        for j in range(max_rat + 1):
            e += hist1[i] * hist2[j] * (i - j) * (i - j)

    e = e / a1.shape[0]

    return 1 - o / e

def eval_qwk(y_true, y_pred):
    
#     y_pred[y_pred <= 1.12232214] = 0
#     y_pred[np.where(np.logical_and(y_pred > 1.12232214, y_pred <= 1.73925866))] = 1
#     y_pred[np.where(np.logical_and(y_pred > 1.73925866, y_pred <= 2.22506454))] = 2
#     y_pred[y_pred > 2.22506454] = 3
    #coeff=[1.23795619,1.74348425,2.23639873]
    coeff=[1.12934881,1.69659649,2.204893]
    y_pred[y_pred <= coeff[0]] = 0
    y_pred[np.where(np.logical_and(y_pred > coeff[0], y_pred <= coeff[1]))] = 1
    y_pred[np.where(np.logical_and(y_pred > coeff[1], y_pred <= coeff[2]))] = 2
    y_pred[y_pred > coeff[2]] = 3
    
#     y_pred[y_pred <= 0.94892782] = 0
#     y_pred[np.where(np.logical_and(y_pred > 0.94892782, y_pred <= 1.69))] = 1
#     y_pred[np.where(np.logical_and(y_pred > 1.69, y_pred <= 2.16))] = 2
#     y_pred[y_pred >2.16] = 3
    
    return qwk(y_true, y_pred)
import lightgbm as lgb
def lgb_model(reduce_train,reduce_test,feature,random_state):
    
    params = {
    'boosting_type': 'gbdt',
    'metric': 'rmse',
    'objective': 'regression',
    'eval_metric': 'cappa',
    #'metric': 'None',
    'num_threads':-1,
    'seed': random_state,
    'learning_rate':0.05,
    'max_depth': 11,
    'lambda_l1': 1,
    'lambda_l2': 1,
    'bagging_fraction': 0.75,
    'bagging_freq': 5,
    'colsample_bytree':0.6,
    'verbose': 100
    }
    # Additional parameters:
    early_stop = 50
    verbose_eval = 100
    num_rounds = 10000
    n_splits = 5

    from sklearn.model_selection import KFold,GroupKFold,GroupShuffleSplit,StratifiedKFold
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
    models = []
    scores=[]
    y_trian = reduce_train['accuracy_group']
    oof_train = np.zeros((reduce_train.shape[0]))
    oof_test = np.zeros((reduce_test.shape[0]))
    for train_index,valid_index in kf.split(reduce_train, y_trian):
    
        train_features = reduce_train.loc[train_index]
        train_target = y_trian.loc[train_index]
    
        val_features = reduce_train.loc[valid_index]
        val_target = y_trian.loc[valid_index]
    
        X_train = train_features[feature].values
        X_val = val_features[feature].values
    
        d_train = lgb.Dataset(X_train, label=train_target)
        d_valid = lgb.Dataset(X_val, label=val_target)
        watchlist = [d_train, d_valid]
    
        print('training LGB:')
        model = lgb.train(params,
                      train_set=d_train,
                      num_boost_round=num_rounds,
                      valid_sets=watchlist,
                      verbose_eval=verbose_eval,
                      early_stopping_rounds=early_stop)
    
        val_pred = model.predict(X_val, num_iteration=model.best_iteration)
        test_pred = model.predict(reduce_test[feature].values, num_iteration=model.best_iteration)
        scores.append(eval_qwk(val_target,copy.deepcopy(val_pred)))
        print(scores)
        oof_train[valid_index] = val_pred
        oof_test += test_pred/n_splits
    print(np.mean(scores))
    return oof_train,oof_test
print(reduce_train.shape,reduce_test.shape)
oof_train_one,oof_test_one=lgb_model(reduce_train.reset_index(drop=True),reduce_test.copy().reset_index(drop=True),feat,50)

(238023, 220) (5119, 220)
training LGB:
Training until validation scores don't improve for 50 rounds
[100]	training's rmse: 1.06255	valid_1's rmse: 1.06967
[200]	training's rmse: 1.04684	valid_1's rmse: 1.05819
[300]	training's rmse: 1.03562	valid_1's rmse: 1.05047
[400]	training's rmse: 1.02717	valid_1's rmse: 1.04534
[500]	training's rmse: 1.01993	valid_1's rmse: 1.04075
[600]	training's rmse: 1.01328	valid_1's rmse: 1.03694
[700]	training's rmse: 1.00779	valid_1's rmse: 1.03374
[800]	training's rmse: 1.00194	valid_1's rmse: 1.03048
[900]	training's rmse: 0.996388	valid_1's rmse: 1.02726
[1000]	training's rmse: 0.99148	valid_1's rmse: 1.02457
[1100]	training's rmse: 0.987138	valid_1's rmse: 1.02253
[1200]	training's rmse: 0.982818	valid_1's rmse: 1.02026
[1300]	training's rmse: 0.978677	valid_1's rmse: 1.01819
[1400]	training's rmse: 0.975177	valid_1's rmse: 1.01652
[1500]	training's rmse: 0.970909	valid_1's rmse: 1.01428
[1600]	training's rmse: 0.967525	valid_1's rmse: 1.01293
[1700

Compilation is falling back to object mode WITH looplifting enabled because Function "qwk" failed type inference due to: [1m[1mnon-precise type pyobject[0m
[0m[1m[1] During: typing of argument at <ipython-input-24-22f4045d445e> (12)[0m
[1m
File "<ipython-input-24-22f4045d445e>", line 12:[0m
[1mdef qwk(a1, a2):
    <source elided>
    """
[1m    max_rat = 3
[0m    [1m^[0m[0m
[0m
  @jit
Compilation is falling back to object mode WITHOUT looplifting enabled because Function "qwk" failed type inference due to: [1m[1mcannot determine Numba type of <class 'numba.dispatcher.LiftedLoop'>[0m
[1m
File "<ipython-input-24-22f4045d445e>", line 20:[0m
[1mdef qwk(a1, a2):
    <source elided>
    o = 0
[1m    for k in range(a1.shape[0]):
[0m    [1m^[0m[0m
[0m[0m
  @jit
[1m
File "<ipython-input-24-22f4045d445e>", line 3:[0m
[1m@jit
[1mdef qwk(a1, a2):
[0m[1m^[0m[0m
[0m
  state.func_ir.loc))
Fall-back from the nopython compilation path to the object mode compilation 

[0.6276375712392132]
training LGB:
Training until validation scores don't improve for 50 rounds
[100]	training's rmse: 1.06404	valid_1's rmse: 1.0632
[200]	training's rmse: 1.04832	valid_1's rmse: 1.05083
[300]	training's rmse: 1.03839	valid_1's rmse: 1.04431
[400]	training's rmse: 1.02962	valid_1's rmse: 1.03825
[500]	training's rmse: 1.02222	valid_1's rmse: 1.03349
[600]	training's rmse: 1.01519	valid_1's rmse: 1.02916
[700]	training's rmse: 1.00893	valid_1's rmse: 1.02551
[800]	training's rmse: 1.00365	valid_1's rmse: 1.02271
[900]	training's rmse: 0.998802	valid_1's rmse: 1.0199
[1000]	training's rmse: 0.994346	valid_1's rmse: 1.01742
[1100]	training's rmse: 0.989639	valid_1's rmse: 1.01482
[1200]	training's rmse: 0.985418	valid_1's rmse: 1.01244
[1300]	training's rmse: 0.981465	valid_1's rmse: 1.01057
[1400]	training's rmse: 0.977667	valid_1's rmse: 1.00872
[1500]	training's rmse: 0.973776	valid_1's rmse: 1.00684
[1600]	training's rmse: 0.970159	valid_1's rmse: 1.00517
[1700]	trai

In [25]:
train_feature_score = reduce_train[['installation_id', 'game_session']].copy()
train_feature_score['score'] = oof_train_one
test_feature_score = reduce_test[['installation_id', 'game_session']].copy()
test_feature_score['score'] = oof_test_one
feature_score = pd.concat([train_feature_score, test_feature_score])
feature_agg=feature_score.groupby(["game_session","installation_id"]).agg({'score': ['count','mean', 'sum', 'max','min','var']}).reset_index()
feature_agg.columns=["game_session","installation_id",'score_count','score_mean', 'score_sum', 'score_max', 'score_min', 'score_var']
print(feature_agg.shape)
feature_agg.head()

(18767, 8)


Unnamed: 0,game_session,installation_id,score_count,score_mean,score_sum,score_max,score_min,score_var
0,00097cda27afb726,01bdd720,8,2.384952,19.079615,3.044551,1.401693,0.255159
1,000f68cff32664ef,3f0dca37,11,0.612313,6.735447,0.922446,0.395871,0.038716
2,0014403daadf67aa,29d1aaee,25,1.597737,39.943433,2.143871,0.714214,0.114874
3,0014daa1d3e26eb2,55fdf49f,4,2.488505,9.954021,2.725008,2.263492,0.035995
4,001a139acd7fce92,285b65c8,3,1.422593,4.267778,2.078457,1.028137,0.327044


In [26]:
del train_feature_score,test_feature_score,feature_score
del reduce_train,reduce_test,reduce_test_all,compiled_data
gc.collect()

48961

In [27]:
compiled_data = []
for i, (ins_id, user_sample) in tqdm(enumerate(train.groupby('installation_id', sort=False)), total=train.installation_id.nunique(), desc='Installation_id', position=0):
    compiled_data += get_data123(user_sample,"Activity")
reduce_train = pd.concat(compiled_data)

compiled_data = []
for ins_id, user_sample in tqdm(test.groupby('installation_id', sort=False), total=test.installation_id.nunique(), desc='Installation_id', position=0):
    compiled_data += get_data123(user_sample,"Activity", test_set=True)
reduce_test_all = pd.concat(compiled_data)

reduce_test=reduce_test_all[reduce_test_all["accuracy"]==666].copy().reset_index(drop=True)
reduce_train=pd.concat([reduce_train,reduce_test_all[reduce_test_all["accuracy"]!=666]]).reset_index(drop=True)
reduce_train.shape

feat=[f for f in reduce_train.columns if f not in ['installation_id', 'game_session','true_attempts', 'false_attempts', 'accuracy', 'accuracy_group']]
print(feat)

print(reduce_train.shape,reduce_test.shape)
oof_train_one,oof_test_one=lgb_model(reduce_train.reset_index(drop=True),reduce_test.copy().reset_index(drop=True),feat,50)

train_feature_score = reduce_train[['installation_id', 'game_session']].copy()
train_feature_score['score'] = oof_train_one
test_feature_score = reduce_test[['installation_id', 'game_session']].copy()
test_feature_score['score'] = oof_test_one
feature_score = pd.concat([train_feature_score, test_feature_score])

feature_agg_Activity=feature_score.groupby(["game_session","installation_id"]).agg({'score': ['count','mean', 'sum', 'max','min','var']}).reset_index()
feature_agg_Activity.columns=["game_session","installation_id",'score_countAct','score_meanAct', 'score_sumAct', 'score_maxAct', 'score_minAct', 'score_varAct']
print(feature_agg_Activity.shape)
feature_agg_Activity.head()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(IntProgress(value=0, description='Installation_id', max=3614, style=ProgressStyle(description_w…




of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  after removing the cwd from sys.path.
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  import sys


HBox(children=(IntProgress(value=0, description='Installation_id', max=1000, style=ProgressStyle(description_w…




of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  if __name__ == '__main__':


['game_event_count', 'game_game_time', 'game_title', 2000, 2010, 2020, 2025, 2030, 2035, 2040, 2050, 2060, 2070, 2075, 2080, 2081, 2083, 3010, 3020, 3021, 3110, 3120, 3121, 4010, 4020, 4021, 4022, 4025, 4030, 4031, 4035, 4040, 4045, 4050, 4070, 4080, 4090, 4095, 4100, 4110, 4220, 4230, 4235, 5000, 5010, 'info_139', 'info_40', 'info_57', 'info_79', 'info_12', 'info_44', 'info_28', 'info_140', 'info_15', 'info_67', 'info_166', 'info_113', 'info_85', 'info_144', 'info_60', 'info_62', 'info_91', 'info_30', 'info_143', 'info_80', 'info_102', 'info_66', 'info_23', 'info_94', 'info_58', 'info_42', 'info_2', 'info_61', 'info_18', 'info_51', 'info_27', 'info_95', 'info_53', 'info_38', 'info_48', 'info_104', 'info_64', 'info_142', 'info_17', 'info_11', 'info_10', 'info_9', 'info_69', 'info_78', 'info_164', 'info_110', 'info_120', 'info_52', 'info_167', 'info_54', 'info_55', 'info_56', 'info_59', 'info_50', 'info_65', 'info_68', 'info_70', 'info_71', 'info_63', 'info_39', 'info_49', 'info_24', 'i

Unnamed: 0,game_session,installation_id,score_countAct,score_meanAct,score_sumAct,score_maxAct,score_minAct,score_varAct
0,00097cda27afb726,01bdd720,6,2.442337,14.654023,2.709503,2.063106,0.080122
1,000f68cff32664ef,3f0dca37,3,0.618903,1.856708,0.840518,0.508065,0.036835
2,0014403daadf67aa,29d1aaee,22,1.684734,37.064154,2.136437,1.160214,0.080963
3,0014daa1d3e26eb2,55fdf49f,4,2.386142,9.544566,2.601206,2.136138,0.050622
4,001a139acd7fce92,285b65c8,3,1.199295,3.597884,2.087022,0.146938,0.961309


In [28]:
del train_feature_score,test_feature_score,feature_score
del reduce_train,reduce_test,reduce_test_all,compiled_data
gc.collect()

70

In [29]:
reduce_train_true.head()

Unnamed: 0,installation_id,session_id,session_title,session_world,accuracy_group,acc_group_0,acc_group_1,acc_group_2,acc_group_3,accumulated_correct_attempts,accumulated_uncorrect_attempts,ratio,accumulated_accuracy,accumulated_accuracy_group,acc_Bird_Measurer__Assessment_,acc_Cart_Balancer__Assessment_,acc_Cauldron_Filler__Assessment_,acc_Chest_Sorter__Assessment_,acc_Mushroom_Sorter__Assessment_,accumulated_actions,last_world,last_activity_type,time_to_last_activity,2000,2010,2020,2025,2030,2035,2040,2050,2060,2070,2075,2080,2081,2083,3010,3020,3021,3110,3120,3121,4010,4020,4021,4022,4025,4030,4031,4035,4040,4045,4050,4070,4080,4090,4095,4100,4110,4220,4230,4235,5000,5010,Scrub_A_Dub4020_4025_round_max_ema,Scrub_A_Dub_2030_misses_ema_ema,Scrub_A_Dub_2030_round_max_ema,Scrub_A_Dub_2030_duration_ema_ema,Scrub_A_Dub4020_4025_correct_count_true_ema,Scrub_A_Dub4020_4025_correct_count_false_ema,Scrub_A_Dub4020_4025_correct_count_accuracy,All_Star_Sorting4020_4025_round_max_ema,All_Star_Sorting4020_4025_correct_count_true_ema,All_Star_Sorting4020_4025_correct_count_false_ema,All_Star_Sorting4020_4025_correct_count_accuracy,All_Star_Sorting_2030_misses_ema_ema,All_Star_Sorting_2030_round_max_ema,All_Star_Sorting_2030_duration_ema_ema,12_Monkeys,Air_Show,All_Star_Sorting,Balancing_Act,Bird_Measurer__Assessment_,Bottle_Filler__Activity_,Bubble_Bath,Bug_Measurer__Activity_,Cart_Balancer__Assessment_,Cauldron_Filler__Assessment_,Chest_Sorter__Assessment_,Chicken_Balancer__Activity_,Chow_Time,Costume_Box,Crystal_Caves___Level_1,Crystal_Caves___Level_2,Crystal_Caves___Level_3,Crystals_Rule,Dino_Dive,Dino_Drink,Egg_Dropper__Activity_,Fireworks__Activity_,Flower_Waterer__Activity_,Happy_Camel,Heavy__Heavier__Heaviest,Honey_Cake,Leaf_Leader,Lifting_Heavy_Things,Magma_Peak___Level_1,Magma_Peak___Level_2,Mushroom_Sorter__Assessment_,Ordering_Spheres,Pan_Balance,Pirate_s_Tale,Rulers,Sandcastle_Builder__Activity_,Scrub_A_Dub,Slop_Problem,Treasure_Map,Tree_Top_City___Level_1,Tree_Top_City___Level_2,Tree_Top_City___Level_3,Watering_Hole__Activity_,Welcome_to_Lost_Lagoon_,ema_duration_12_Monkeys,ema_duration_Air_Show,ema_duration_All_Star_Sorting,ema_duration_Balancing_Act,ema_duration_Bird_Measurer__Assessment_,ema_duration_Bottle_Filler__Activity_,ema_duration_Bubble_Bath,ema_duration_Bug_Measurer__Activity_,ema_duration_Cart_Balancer__Assessment_,ema_duration_Cauldron_Filler__Assessment_,ema_duration_Chest_Sorter__Assessment_,ema_duration_Chicken_Balancer__Activity_,ema_duration_Chow_Time,ema_duration_Costume_Box,ema_duration_Crystal_Caves___Level_1,ema_duration_Crystal_Caves___Level_2,ema_duration_Crystal_Caves___Level_3,ema_duration_Crystals_Rule,ema_duration_Dino_Dive,ema_duration_Dino_Drink,ema_duration_Egg_Dropper__Activity_,ema_duration_Fireworks__Activity_,ema_duration_Flower_Waterer__Activity_,ema_duration_Happy_Camel,ema_duration_Heavy__Heavier__Heaviest,ema_duration_Honey_Cake,ema_duration_Leaf_Leader,ema_duration_Lifting_Heavy_Things,ema_duration_Magma_Peak___Level_1,ema_duration_Magma_Peak___Level_2,ema_duration_Mushroom_Sorter__Assessment_,ema_duration_Ordering_Spheres,ema_duration_Pan_Balance,ema_duration_Pirate_s_Tale,ema_duration_Rulers,ema_duration_Sandcastle_Builder__Activity_,ema_duration_Scrub_A_Dub,ema_duration_Slop_Problem,ema_duration_Treasure_Map,ema_duration_Tree_Top_City___Level_1,ema_duration_Tree_Top_City___Level_2,ema_duration_Tree_Top_City___Level_3,ema_duration_Watering_Hole__Activity_,ema_duration_Welcome_to_Lost_Lagoon_,Clip,Activity,Assessment,Game,0086365d,00c73085,01ca3a3c,022b4259,02a42007,0330ab6a,0413e89d,04df9b66,05ad839b,06372577,070a5291,08fd73f3,08ff79ad,0a08139c,0ce40006,0d18d96c,0d1da71f,0db6d71d,119b5b02,1325467d,1340b8d7,1375ccb7,13f56524,14de4c5d,155f62a4,1575e76c,15a43e5b,15ba1109,15eb4a7d,15f99afc,160654fd,16667cc5,16dffff1,17113b36,19967db1,1996c610,1af8be29,1b54d27f,1bb5fbdb,1beb320a,1c178d24,1cc7cfca,1cf54632,1f19558b,222660ff,2230fab4,250513af,25fa8af4,262136f4,26a5a3dd,26fd2d99,27253bdc,28520915,28a4eb9a,28ed704e,28f975ea,29a42aea,29bdd9ba,29f54413,2a444e03,2a512369,2b058fe3,2b9272f4,2c4e6db0,2dc29e21,2dcad279,2ec694de,2fb91ec1,30614231,30df3273,31973d56,3323d7e9,33505eae,3393b68b,363c86c9,363d3849,36fa3ebe,37937459,37c53127,37db1c2f,37ee8496,38074c54,392e14df,3a4be871,3afb49e6,3afde5dd,3b2048ee,3babcb9b,3bb91ced,3bb91dda,3bf1cf26,3bfd1a65,3ccd3f02,3d0b9317,3d63345e,3d8c61b0,3dcdda7f,3ddc79c3,3dfd4aa4,3edf6747,3ee399c3,44cb4907,45d01abe,461eace6,46b50ba8,46cd75b4,47026d5f,47efca07,47f43a44,48349b14,4901243f,499edb7c,49ed92e9,4a09ace1,4a4c3d21,4b5efe37,4bb2f698,4c2ec19f,4d6737eb,4d911100,4e5fc6f5,4ef8cdd3,51102b85,51311d7a,5154fc30,5290eab1,532a2afb,5348fd84,53c6e11a,55115cbd,562cec5f,565a3990,56817e2b,56bcd38d,56cd3b43,5859dfb6,587b5989,58a0de5c,598f4598,5a848010,5b49460a,5be391b5,5c2f29ca,5c3d2b2f,5d042115,5de79a6a,5e109ec3,5e3ea25a,5e812b27,5f0eb72c,5f5b2617,6043a2b4,6077cc36,6088b756,611485c5,63f13dd7,65a38bf7,65abac75,67439901,67aa2ada,69fdac0a,6aeafed4,6bf9e3e1,6c517a88,6c930e6e,6cf7d25c,6d90d394,6f445b57,6f4adc4b,6f4bd64e,6f8106d9,7040c096,709b1251,71e712d8,71fe8f75,731c0cbe,736f9581,7372e1a5,73757a5e,7423acbc,74e5f8a7,7525289a,756e5507,763fc34e,76babcde,77261ab5,77c76bc5,77ead60d,792530f8,795e4a37,7961e599,7ab78247,7ad3efc6,7cf1bc53,7d093bf9,7d5c30a2,7da34a02,7dfe6d8a,7ec0c298,7f0836bf,7fd1ac25,804ee27f,828e68f9,832735e1,83c6c409,84538528,84b0e0c8,857f21c0,85d1b0de,85de926c,86ba578b,86c924c4,87d743c1,884228c8,88d4a5be,895865f3,89aace00,8ac7cce4,8af75982,8b757ab8,8d748b58,8d7e386c,8d84fa81,8f094001,8fee50e2,907a054b,90d848e0,90ea0bac,90efca10,91561152,923afab1,92687c59,93b353f2,93edfe2e,9554a50b,99abe2bb,99ea62f3,9b01374f,9b23e8ee,9b4001e4,9c5ef70c,9ce586dd,9d29771f,9d4e7b25,9de5e594,9e34ea74,9e4c8c7b,9e6b7fb5,9ed8f6da,9ee1c98c,a0faea5d,a1192f43,a16a373e,a1bbe385,a1e4395d,a29c5338,a2df0760,a44b10dc,a52b92d5,a592d54e,a5be6304,a5e9da97,a6d66e51,a76029ee,a7640a16,a8876db3,a8a78786,a8efe47b,ab3136ba,ab4ec3a4,abc5811c,ac92046e,acf5c23f,ad148f58,ad2fc29c,b012cd7f,b120f2ac,b1d5101d,b2dba42b,b2e5b0f1,b5053438,b74258a0,b7530680,b7dc8128,b80e5e84,b88f38da,bb3e370b,bbfe0445,bc8f2793,bcceccc6,bd612267,bd701df8,bdf49a58,beb0a7b9,bfc77bd6,c0415e5c,c189aaf2,c1cac9a2,c277e121,c2baf0bd,c51d8688,c54cf6c5,c58186bf,c6971acf,c7128948,c74f40cd,c7f7f0e1,c7fe2a55,c952eb01,ca11f653,cb1178ad,cb6010f8,cc5087a3,cdd22e43,cf7638f3,cf82af56,cfbd47c8,d02b7a8e,d06f75b5,d122731b,d185d3ea,d2278a3b,d2659ab4,d2e9262e,d3268efa,d3640339,d38c2fd7,d3f1e122,d45ed6a1,d51b1749,d88ca108,d88e8f25,d9c005dd,daac11b0,db02c830,dcaede90,dcb1663e,dcb55a27,de26c3a6,df4940d3,df4fe8b6,e04fb33d,e080a381,e37a2b78,e3ff61fb,e4d32835,e4f1efe6,e5734469,e57dd7af,e5c9df6f,e64e2cfd,e694a35b,e720d930,e7561dd2,e79f3763,e7e44842,e9c52111,ea296733,ea321fb1,eb2c19cd,ec138c1c,ecaab346,ecc36b7f,ecc6157f,f28c589a,f32856e4,f3cd5473,f50fc6c1,f54238ee,f56e0afc,f5b8c21a,f6947f54,f71c4741,f7e47413,f806dc10,f93fc684,fbaf3456,fcfdffb6,fd20ea40,give_up_Bird_Measurer__Assessment_,give_up_Cart_Balancer__Assessment_,give_up_Cauldron_Filler__Assessment_,give_up_Chest_Sorter__Assessment_,give_up_Mushroom_Sorter__Assessment_,CRYSTALCAVES,MAGMAPEAK,NONE,TREETOPCITY,Clip_gametime,Game_gametime,Activity_gametime,Assessment_gametime,assess_duration_mean,tree_top_city_max,tree_top_city_cnt,tree_top_city_cover,magma_peak_max,magma_peak_cnt,magma_peak_cover,crystal_caves_max,crystal_caves_cnt,crystal_caves_cover,played_last_game,Clip_duration_mean,Clip_duration_std,Mushroom_Sorter__Assessment__4040_duration_ema_ema,Mushroom_Sorter__Assessment__2030_misses_ema_ema,Mushroom_Sorter__Assessment__2030_duration_ema_ema,Mushroom_Sorter__Assessment_4100_4110_correct_count_true_ema,Mushroom_Sorter__Assessment_4100_4110_correct_count_false_ema,Mushroom_Sorter__Assessment_4100_4110_correct_count_accuracy,Mushroom_Sorter__Assessment_4020_4025_correct_count_true_ema,Mushroom_Sorter__Assessment_4020_4025_correct_count_false_ema,Mushroom_Sorter__Assessment_4020_4025_correct_count_accuracy,Air_Show4020_4025_round_max_ema,Air_Show_2030_misses_ema_ema,Air_Show_2030_round_max_ema,Air_Show_2030_duration_ema_ema,Air_Show4100_4110_correct_count_true_ema,Air_Show4100_4110_correct_count_false_ema,Air_Show4100_4110_correct_count_accuracy,Air_Show4020_4025_correct_count_true_ema,Air_Show4020_4025_correct_count_false_ema,Air_Show4020_4025_correct_count_accuracy,Crystals_Rule4020_4025_round_max_ema,Crystals_Rule_2030_misses_ema_ema,Crystals_Rule_2030_round_max_ema,Crystals_Rule_2030_duration_ema_ema,Crystals_Rule4020_4025_correct_count_true_ema,Crystals_Rule4020_4025_correct_count_false_ema,Crystals_Rule4020_4025_correct_count_accuracy,Bird_Measurer__Assessment_4020_4025_correct_count_true_ema,Bird_Measurer__Assessment_4020_4025_correct_count_false_ema,Bird_Measurer__Assessment_4020_4025_correct_count_accuracy,Bird_Measurer__Assessment_4100_4110_correct_count_true_ema,Bird_Measurer__Assessment_4100_4110_correct_count_false_ema,Bird_Measurer__Assessment_4100_4110_correct_count_accuracy,Dino_Drink4020_4025_round_max_ema,Dino_Drink_2030_misses_ema_ema,Dino_Drink_2030_round_max_ema,Dino_Drink_2030_duration_ema_ema,Dino_Drink4020_4025_correct_count_true_ema,Dino_Drink4020_4025_correct_count_false_ema,Dino_Drink4020_4025_correct_count_accuracy,Bubble_Bath4020_4025_round_max_ema,Bubble_Bath_2030_misses_ema_ema,Bubble_Bath_2030_round_max_ema,Bubble_Bath_2030_duration_ema_ema,Bubble_Bath4020_4025_correct_count_true_ema,Bubble_Bath4020_4025_correct_count_false_ema,Bubble_Bath4020_4025_correct_count_accuracy,Bottle_Filler__Activity_4020_4025_round_max_ema,Bottle_Filler__Activity__2030_round_max_ema,Bottle_Filler__Activity__2030_duration_ema_ema,Chow_Time4020_4025_round_max_ema,Chow_Time_2030_misses_ema_ema,Chow_Time_2030_round_max_ema,Chow_Time_2030_duration_ema_ema,Chow_Time4020_4025_correct_count_true_ema,Chow_Time4020_4025_correct_count_false_ema,Chow_Time4020_4025_correct_count_accuracy,Dino_Dive4020_4025_round_max_ema,Dino_Dive_2030_misses_ema_ema,Dino_Dive_2030_round_max_ema,Dino_Dive_2030_duration_ema_ema,Dino_Dive4020_4025_correct_count_true_ema,Dino_Dive4020_4025_correct_count_false_ema,Dino_Dive4020_4025_correct_count_accuracy,Cauldron_Filler__Assessment__2030_misses_ema_ema,Cauldron_Filler__Assessment__2030_duration_ema_ema,Cauldron_Filler__Assessment_4100_4110_correct_count_true_ema,Cauldron_Filler__Assessment_4100_4110_correct_count_false_ema,Cauldron_Filler__Assessment_4100_4110_correct_count_accuracy,Cauldron_Filler__Assessment_4020_4025_correct_count_true_ema,Cauldron_Filler__Assessment_4020_4025_correct_count_false_ema,Cauldron_Filler__Assessment_4020_4025_correct_count_accuracy,Happy_Camel4020_4025_round_max_ema,Happy_Camel_2030_misses_ema_ema,Happy_Camel_2030_round_max_ema,Happy_Camel_2030_duration_ema_ema,Happy_Camel4020_4025_correct_count_true_ema,Happy_Camel4020_4025_correct_count_false_ema,Happy_Camel4020_4025_correct_count_accuracy,Cart_Balancer__Assessment__2030_misses_ema_ema,Cart_Balancer__Assessment__2030_duration_ema_ema,Cart_Balancer__Assessment_4100_4110_correct_count_true_ema,Cart_Balancer__Assessment_4100_4110_correct_count_false_ema,Cart_Balancer__Assessment_4100_4110_correct_count_accuracy,Pan_Balance4020_4025_round_max_ema,Pan_Balance4020_4025_correct_count_true_ema,Pan_Balance4020_4025_correct_count_false_ema,Pan_Balance4020_4025_correct_count_accuracy,Pan_Balance4100_4110_correct_count_true_ema,Pan_Balance4100_4110_correct_count_false_ema,Pan_Balance4100_4110_correct_count_accuracy,Leaf_Leader4020_4025_round_max_ema,Leaf_Leader_2030_misses_ema_ema,Leaf_Leader_2030_round_max_ema,Leaf_Leader_2030_duration_ema_ema,Leaf_Leader4020_4025_correct_count_true_ema,Leaf_Leader4020_4025_correct_count_false_ema,Leaf_Leader4020_4025_correct_count_accuracy,Pan_Balance_2030_misses_ema_ema,Pan_Balance_2030_round_max_ema,Pan_Balance_2030_duration_ema_ema,Chest_Sorter__Assessment__4040_duration_ema_ema,Chest_Sorter__Assessment_4020_4025_correct_count_true_ema,Chest_Sorter__Assessment_4020_4025_correct_count_false_ema,Chest_Sorter__Assessment_4020_4025_correct_count_accuracy,Chest_Sorter__Assessment_4100_4110_correct_count_true_ema,Chest_Sorter__Assessment_4100_4110_correct_count_false_ema,Chest_Sorter__Assessment_4100_4110_correct_count_accuracy,Bird_Measurer__Assessment__2030_misses_ema_ema,Bird_Measurer__Assessment__2030_duration_ema_ema,Cart_Balancer__Assessment__4040_duration_ema_ema,Bird_Measurer__Assessment__4040_duration_ema_ema,Cauldron_Filler__Assessment__4040_duration_ema_ema,Chest_Sorter__Assessment__2030_misses_ema_ema,Chest_Sorter__Assessment__2030_duration_ema_ema
0,0006a69f,901acc108f55a5a1,3757,3,3,0,0,0,0,0,0,0.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,647,3,0,57,18,0,20,4,18,0,6,6,0,0,0,4,1,2,79,7,9,77,7,9,4,92,14,31,19,121,0,1,0,0,0,94,0,4,0,0,0,0,0,0,0,0,15.0,0.092273,15.0,4742.656557,15.0,3.0,0.833333,1.5,3.1875,3.75,0.205357,2.75,3.0,25870.125,1,0,3,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,1,0,1,0,1,1,1,0,1,1,0,0,2,0,0.0,44.44,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0.0,0.0,0.0,91.0,164.0,0.0,0,0,0.0,0,0,0,0.0,0,0.0,0,0,89.0,115.0,0,0,0,0,0,0.0,0,11,3,0,4,0,0,0,0,23,0,0,0,0,0,0,15,0,0,0,0,0,0,0,18,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,19,0,0,27,0,3,0,0,0,0,0,0,15,11,0,0,0,0,0,0,0,0,0,0,6,5,26,0,0,0,0,0,0,0,0,0,0,12,0,2,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,2,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,17,0,42,0,19,4,0,0,0,18,50,0,0,0,21,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,6,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,11,0,0,14,0,0,0,0,0,0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,4,0,20,0,0,0,0,3,0,15,0,17,0,0,0,0,0,15,0,0,0,1,4,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,14,0,1,0,0,0,0,0,0,0,0,0,0,4,0,0,3,0,3,0,6,0,0,18,0,0,0,0,0,0,0,0,0,0,0,0,23,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,15,15,0,0,0,31,0,0,0,0,0,0,0,4,2,12,0.0,69.6525,116.434375,0.0,0.0,9,12,0.529412,4,4,0.363636,0,0,0.0,1,48.363636,29.15561,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,0006a69f,77b8ee947eb84b4e,2746,3,0,0,0,0,1,1,0,1.0,1.0,3.0,-1.0,-1.0,-1.0,-1.0,1.0,1143,3,1,14,25,1,26,5,22,1,6,6,1,1,0,4,1,2,226,11,16,223,11,16,6,127,14,31,37,149,0,6,2,0,0,156,0,4,0,5,2,0,0,0,0,0,15.0,0.092273,15.0,4742.656557,15.0,3.0,0.833333,1.5,3.1875,3.75,0.205357,2.75,3.0,25870.125,1,1,3,0,0,0,0,1,0,0,0,0,0,2,0,0,0,1,0,0,0,1,1,0,0,0,0,0,1,0,1,1,0,1,1,1,1,1,1,1,1,1,0,2,0,193.0,44.44,0,0.0,0.0,0.0,104.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,78.0,0.0,0.0,0.0,91.0,164.0,0.0,0,0,0.0,0,0,0,39.0,0,0.0,0,0,89.0,115.0,0,0,0,0,0,0.0,0,14,4,1,6,0,0,0,15,23,0,0,0,0,1,0,15,0,77,0,0,0,0,0,18,0,0,0,4,0,2,0,1,0,0,0,0,0,0,0,0,0,0,19,0,0,27,0,3,0,0,0,1,0,0,15,14,0,0,3,26,0,0,0,0,0,0,6,5,26,0,0,0,0,0,0,2,0,0,5,12,0,2,6,0,0,0,0,0,0,0,0,14,0,0,0,1,0,0,0,0,0,2,1,0,0,1,0,0,0,0,0,0,0,1,1,0,0,2,0,3,0,0,0,0,0,0,0,0,14,0,0,0,0,0,0,9,0,0,17,0,42,1,19,4,0,0,0,18,50,0,0,17,21,6,0,2,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,77,0,0,0,6,3,0,0,0,0,0,1,0,0,0,0,0,0,0,3,0,0,6,0,0,0,0,0,0,0,1,7,0,0,0,0,0,3,0,7,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,4,0,0,0,11,0,0,14,0,0,0,51,4,0,0,13,4,0,1,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,4,0,20,0,0,0,0,3,0,15,0,17,0,30,0,0,0,15,0,0,0,1,4,0,0,0,0,0,2,4,1,0,0,3,0,0,1,0,0,14,0,1,0,0,0,0,0,0,0,0,0,0,4,0,1,3,0,3,1,6,0,2,18,0,0,3,0,0,0,0,0,0,0,0,0,23,2,0,20,0,0,0,0,0,0,0,0,0,52,0,0,0,0,0,1,0,15,15,0,0,8,31,0,0,0,0,0,0,0,4,2,19,0.0,105.118125,110.625188,19.9015,39.0,16,19,0.941176,4,4,0.363636,0,0,0.0,1,60.0,42.735064,318.0,0.0,25592.0,1.0,0.0,1.0,5.0,1.0,0.833333,2.0,0.0,1.0,27160.0,1.0,3.0,0.25,5.0,21.0,0.192308,2.0,0.25,2.0,13656.75,2.0,1.0,0.666667,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,0006a69f,6bdf9623adc94d89,3757,3,3,1,0,0,1,1,11,0.083333,0.5,1.5,0.0,-1.0,-1.0,-1.0,1.0,1230,3,2,58,26,1,27,5,22,1,6,6,1,1,0,4,1,2,228,22,16,225,22,16,6,127,14,31,59,171,0,6,2,0,0,160,0,4,0,5,13,0,0,0,0,0,15.0,0.092273,15.0,4742.656557,15.0,3.0,0.833333,1.5,3.1875,3.75,0.205357,2.75,3.0,25870.125,1,1,3,0,1,0,0,1,0,0,0,0,0,2,0,0,0,1,0,0,0,1,1,0,0,0,0,0,1,0,1,1,0,1,1,1,1,1,1,1,1,1,0,2,0,193.0,44.44,0,92.0,0.0,0.0,104.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,78.0,0.0,0.0,0.0,91.0,164.0,0.0,0,0,0.0,0,0,0,39.0,0,0.0,0,0,89.0,115.0,0,0,0,0,0,0.0,0,14,4,2,6,0,0,0,15,23,0,0,0,0,1,0,15,0,77,0,0,0,0,0,18,0,2,0,4,0,2,0,1,0,0,0,0,0,11,0,0,0,0,19,0,0,27,0,3,0,0,0,1,0,0,15,14,0,0,3,26,0,0,0,0,0,0,6,5,26,0,0,0,0,0,0,2,0,0,5,12,0,2,6,0,0,0,0,0,0,0,0,14,0,0,0,1,0,0,0,0,0,2,1,0,0,1,0,0,0,0,0,0,0,1,1,0,0,2,22,3,0,0,0,0,0,0,22,0,14,0,0,0,0,0,0,9,0,0,17,0,42,1,19,4,0,0,0,18,50,0,0,17,21,6,0,2,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,77,0,0,0,6,3,0,0,0,0,0,1,0,0,0,0,0,0,0,3,0,0,6,0,0,0,0,0,0,0,1,7,0,0,0,0,0,3,0,7,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,4,0,0,0,11,0,0,14,0,0,4,51,4,0,0,13,4,0,1,0,0,0,0,0,0,0,0,0,0,3,0,0,11,0,4,0,20,0,0,0,0,3,0,15,0,17,0,30,0,0,2,15,0,0,0,1,4,0,0,0,0,0,2,4,1,0,0,3,0,0,1,0,0,14,0,1,0,0,0,0,0,0,0,0,0,0,4,0,1,3,0,3,1,6,0,2,18,0,0,3,0,11,0,0,0,0,0,0,0,23,2,0,20,0,0,0,0,0,1,0,0,0,52,0,0,0,0,1,1,0,15,15,0,0,8,31,0,0,0,0,0,0,0,4,2,20,0.0,105.118125,110.625188,56.35025,65.5,17,20,1.0,4,4,0.363636,0,0,0.0,1,60.0,42.735064,318.0,0.0,25592.0,1.0,0.0,1.0,5.0,1.0,0.833333,2.0,0.0,1.0,27160.0,1.0,3.0,0.25,5.0,21.0,0.192308,2.0,0.25,2.0,13656.75,2.0,1.0,0.666667,3.0,19.0,0.136364,0.0,11.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,0006a69f,9501794defd84e4d,3757,3,2,2,0,0,2,2,11,0.153846,0.5,1.5,0.0,-1.0,-1.0,-1.0,0.0,2159,3,2,9,47,2,52,9,43,5,10,9,2,2,0,8,2,5,341,25,40,336,25,40,10,243,29,45,93,314,6,14,9,2,0,348,0,4,1,6,13,9,0,0,5,5,12.75,0.069205,12.75,5304.365953,12.75,2.25,0.875,1.875,5.390625,2.8125,0.404018,2.0625,3.0,23329.390625,2,1,4,0,1,1,1,1,0,0,0,0,0,3,0,0,0,1,0,1,0,2,2,0,0,0,0,0,2,1,3,2,0,2,1,2,2,2,1,2,2,1,1,3,0,193.0,48.752,0,92.0,165.0,132.0,104.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,78.0,0.0,110.0,0.0,376.8,149.2,0.0,0,0,0.0,0,0,0,30.72,0,0.0,0,0,78.6,108.0,0,0,0,0,0,80.0,0,24,9,4,10,0,0,0,15,70,0,14,0,0,1,0,21,0,77,0,0,0,0,0,24,9,2,0,4,0,2,8,1,14,0,0,0,2,11,0,6,0,0,33,2,0,39,1,6,0,0,0,2,0,0,22,24,0,0,9,26,0,0,0,0,0,0,9,8,38,0,0,9,0,0,0,2,0,0,5,16,0,2,9,0,0,0,0,0,0,0,0,14,0,2,0,3,0,0,0,0,0,2,3,0,0,1,0,0,0,0,0,0,0,1,2,0,5,4,22,4,0,0,1,5,0,0,22,1,14,0,0,0,0,6,0,9,0,0,26,2,46,1,38,7,0,1,0,24,83,0,0,17,33,9,0,2,0,0,0,0,0,1,15,0,0,0,0,37,2,0,2,0,6,0,0,2,0,5,77,0,0,0,9,3,6,0,0,0,0,2,0,5,8,0,0,0,0,3,0,0,11,0,0,6,0,0,0,0,3,11,0,7,0,0,0,3,0,76,0,2,1,0,0,1,0,0,1,2,0,0,0,2,21,0,0,0,0,0,0,1,1,2,0,1,0,0,11,0,0,0,15,0,0,21,38,0,4,51,9,7,0,21,9,0,2,0,5,0,0,0,0,0,0,0,0,3,0,0,11,0,4,0,34,0,0,0,8,4,0,47,28,26,0,30,0,5,2,47,0,0,0,2,4,0,0,2,1,1,2,11,1,0,4,6,0,0,1,0,0,23,0,1,2,0,0,1,0,9,0,0,0,7,4,0,1,3,0,6,3,10,0,2,37,15,0,3,0,11,0,0,0,1,0,0,1,57,2,0,20,0,7,0,0,0,1,0,1,0,52,0,0,8,0,1,1,0,23,22,3,0,11,45,0,0,0,0,0,1,0,13,3,31,0.0,91.939695,455.521975,25.188813,41.25,17,31,1.0,9,13,0.818182,0,0,0.0,1,54.75,38.46454,318.0,0.0,23225.5,1.0,0.0,1.0,4.5,0.75,0.875,2.0,0.0,1.0,27160.0,1.0,3.0,0.25,5.0,21.0,0.192308,2.0,0.25,2.0,13656.75,2.0,1.0,0.666667,3.0,19.0,0.136364,0.0,11.0,0.0,2.0,0.75,2.0,36114.25,5.0,4.0,0.555556,2.0,1.0,2.0,43665.0,0.0,2.0,0.0,7.0,7.0,22985.830566,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,0006a69f,a9ef3ecb3d1acc6a,2746,3,3,2,0,1,2,3,12,0.2,0.5,1.6,0.0,-1.0,-1.0,-1.0,0.5,2586,3,1,11,56,3,64,10,53,6,10,9,3,2,1,8,2,5,463,30,53,457,30,53,12,277,29,45,105,331,6,15,10,2,0,387,0,4,1,12,13,9,0,0,5,5,12.75,0.069205,12.75,5304.365953,12.75,2.25,0.875,1.875,5.390625,2.8125,0.404018,2.0625,3.0,23329.390625,2,2,4,0,1,1,1,2,0,0,0,0,0,3,0,0,0,3,0,1,0,2,2,0,0,0,0,0,2,1,4,2,0,2,2,2,2,2,3,2,2,2,1,3,0,183.0,48.752,0,92.0,165.0,132.0,99.2,0.0,0.0,0.0,0.0,0.0,0,0,0,0,96.32,0.0,110.0,0.0,376.8,149.2,0.0,0,0,0.0,0,0,0,30.776,0,0.0,0,0,78.6,108.0,0,0,0,0,0,80.0,0,28,10,5,13,0,0,0,24,70,0,14,0,0,2,0,21,0,121,0,0,0,0,0,24,9,2,0,8,0,5,8,2,14,0,1,0,2,11,0,6,0,0,33,2,0,39,1,6,0,0,0,4,0,0,22,28,0,0,12,45,0,0,0,0,0,0,9,8,38,0,0,9,0,0,0,9,0,0,6,16,0,2,9,0,0,0,0,0,0,0,0,60,0,2,0,4,0,0,0,0,0,9,4,0,0,3,0,0,0,0,0,0,0,3,2,0,5,4,22,4,0,0,1,5,0,0,22,1,60,0,0,0,0,6,0,15,0,0,26,2,46,3,38,7,0,1,0,24,83,0,0,31,33,15,0,2,0,0,0,0,0,2,15,0,0,0,0,37,3,0,2,0,6,0,0,2,0,5,121,0,0,0,9,5,6,0,0,0,0,2,0,5,8,0,0,0,0,11,0,0,12,0,0,6,0,0,0,0,4,11,0,7,0,0,0,12,0,76,1,2,1,0,0,3,0,0,1,2,0,0,0,2,21,0,0,0,0,0,0,1,1,2,0,3,0,0,15,0,0,0,15,0,0,21,38,0,4,79,12,7,0,21,12,0,3,0,5,0,0,0,0,0,0,0,0,3,0,0,11,0,4,0,34,0,0,0,8,4,0,47,28,26,0,48,0,5,2,47,0,0,0,2,4,0,0,2,1,1,3,15,2,0,4,6,0,0,2,0,0,23,0,1,2,0,0,1,1,9,0,0,0,7,4,0,1,3,0,6,4,10,0,2,37,15,0,5,0,11,0,0,0,1,0,0,1,57,9,0,30,0,7,0,0,0,1,0,1,0,81,0,0,8,0,1,3,0,23,22,3,0,18,45,0,0,0,0,0,1,0,13,3,40,0.0,145.601087,268.210487,28.515906,39.2,17,40,1.0,9,13,0.818182,0,0,0.0,1,63.5,46.202582,315.75,0.25,23021.375,1.0,0.25,0.875,4.375,1.0625,0.822917,2.25,0.0,1.25,29062.0625,1.25,2.75,0.3125,7.0,17.25,0.315283,3.25,0.285156,3.25,16587.296692,3.25,1.25,0.694444,3.0,19.0,0.136364,0.0,11.0,0.0,2.0,0.75,2.0,36114.25,5.0,4.0,0.555556,2.0,1.0,2.0,43665.0,0.0,2.0,0.0,7.0,7.0,22985.830566,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [30]:
reduce_train=pd.merge(reduce_train_true,feature_agg,left_on=["session_id","installation_id"],right_on=["game_session","installation_id"],how="left")
reduce_train=pd.merge(reduce_train,feature_agg_Activity,left_on=["session_id","installation_id"],right_on=["game_session","installation_id"],how="left")
reduce_test=pd.merge(reduce_test_true,feature_agg,left_on=["session_id","installation_id"],right_on=["game_session","installation_id"],how="left")
reduce_test=pd.merge(reduce_test,feature_agg_Activity,left_on=["session_id","installation_id"],right_on=["game_session","installation_id"],how="left")
reduce_test=reduce_test.drop(['game_session_y','game_session_x'],axis=1)

reduce_train.to_csv('reduce_train.csv', index=False)
reduce_test.to_csv('reduce_test.csv', index=False)

reduce_train=reduce_train[reduce_test.columns]
ajusted_test=reduce_test.copy()
to_exclude=[]
for feature in ajusted_test.columns:
    if feature not in ['installation_id','session_id','accuracy_group','session_title','session_world','last_world','last_activity_type','same_world_with_last']+\
                    ["give_up_"+assess for assess in assess_titles]+['acc_' + title for title in assess_titles]:
        data = reduce_train[feature]
        train_mean = data.mean()
        data = ajusted_test[feature] 
        test_mean = data.mean()
        try:
            ajust_factor = train_mean / test_mean
            if ajust_factor > 5 or ajust_factor < 0.2:
                to_exclude.append(feature)
                print(feature, train_mean, test_mean)
            else:
                ajusted_test[feature] *= ajust_factor
        except:
            to_exclude.append(feature)
            print(feature, train_mean, test_mean)


acc_Cart_Balancer__Assessment_ -0.04020325710970116 -0.47065833333333346
acc_Cauldron_Filler__Assessment_ -0.08582722503299132 -0.48076626984127
ema_duration_12_Monkeys 0.0 0.0
ema_duration_Balancing_Act 0.0 0.0
ema_duration_Chest_Sorter__Assessment_ 168.1069584672345 22.883051567104
ema_duration_Costume_Box 0.0 0.0
ema_duration_Crystal_Caves___Level_1 0.0 0.0
ema_duration_Crystal_Caves___Level_2 0.0 0.0
ema_duration_Crystal_Caves___Level_3 0.0 0.0
ema_duration_Heavy__Heavier__Heaviest 0.0 0.0
ema_duration_Honey_Cake 0.0 0.0
ema_duration_Lifting_Heavy_Things 0.0 0.0
ema_duration_Magma_Peak___Level_1 0.0 0.0
ema_duration_Magma_Peak___Level_2 0.0 0.0
ema_duration_Ordering_Spheres 0.0 0.0
ema_duration_Pirate_s_Tale 0.0 0.0
ema_duration_Rulers 0.0 0.0
ema_duration_Slop_Problem 0.0 0.0
ema_duration_Treasure_Map 0.0 0.0
ema_duration_Tree_Top_City___Level_1 0.0 0.0
ema_duration_Tree_Top_City___Level_2 0.0 0.0
ema_duration_Tree_Top_City___Level_3 0.0 0.0
ema_duration_Welcome_to_Lost_Lagoon_ 0.

In [31]:
@jit
def qwk(a1, a2):
    """
    Source: https://www.kaggle.com/c/data-science-bowl-2019/discussion/114133#latest-660168

    :param a1:
    :param a2:
    :return:
    """
    max_rat = 3
    a1 = np.asarray(a1, dtype=int)
    a2 = np.asarray(a2, dtype=int)

    hist1 = np.zeros((max_rat + 1, ))
    hist2 = np.zeros((max_rat + 1, ))

    o = 0
    for k in range(a1.shape[0]):
        i, j = a1[k], a2[k]
        hist1[i] += 1
        hist2[j] += 1
        o +=  (i - j) * (i - j)

    e = 0
    for i in range(max_rat + 1):
        for j in range(max_rat + 1):
            e += hist1[i] * hist2[j] * (i - j) * (i - j)

    e = e / a1.shape[0]

    return 1 - o / e

def eval_qwk_lgb_regr(y_true, y_pred):
    """
    Fast cappa eval function for lgb.
    """
    y_pred[y_pred <= regression_thresholds[0]] = 0
    y_pred[np.where(np.logical_and(y_pred >regression_thresholds[0], y_pred <= regression_thresholds[1]))] = 1
    y_pred[np.where(np.logical_and(y_pred > regression_thresholds[1], y_pred <= regression_thresholds[2]))] = 2
    y_pred[y_pred > regression_thresholds[2]] = 3

    # y_pred = y_pred.reshape(len(np.unique(y_true)), -1).argmax(axis=0)
    return 'cappa', qwk(y_true, y_pred), True

class LGBWrapper_regr(object):
    """
    A wrapper for lightgbm model so that we will have a single api for various models.
    """

    def __init__(self):
        self.model = lgb.LGBMRegressor()

    def fit(self, X_train, y_train, X_valid=None, y_valid=None, X_holdout=None, y_holdout=None, params=None):
        if params['objective'] == 'regression':
            eval_metric = eval_qwk_lgb_regr
        else:
            eval_metric = 'auc'

        eval_set = [(X_train, y_train)]
        eval_names = ['train']
        self.model = self.model.set_params(**params)

        if X_valid is not None:
            eval_set.append((X_valid, y_valid))
            eval_names.append('valid')

        if X_holdout is not None:
            eval_set.append((X_holdout, y_holdout))
            eval_names.append('holdout')

        if 'cat_cols' in params.keys():
            cat_cols = [col for col in params['cat_cols'] if col in X_train.columns]
            if len(cat_cols) > 0:
                categorical_columns = params['cat_cols']
            else:
                categorical_columns = 'auto'
        else:
            categorical_columns = 'auto'
        #print(categorical_columns)
        self.model.fit(X=X_train, y=y_train,
                       eval_set=eval_set, eval_names=eval_names, eval_metric=eval_metric,
                       verbose=params['verbose'], early_stopping_rounds=params['early_stopping_rounds'],
                       categorical_feature=categorical_columns)

        self.best_score_ = self.model.best_score_
        self.feature_importances_ = self.model.feature_importances_

    def predict(self, X_test):
        return self.model.predict(X_test, num_iteration=self.model.best_iteration_)
    
    
class RegressorModel(object):
    """
    A wrapper class for classification models.
    It can be used for training and prediction.
    Can plot feature importance and training progress (if relevant for model).
    """

    def __init__(self, columns: list = None, model_wrapper=None,truncate_valid=False,seed=66):
        """
        :param original_columns:
        :param model_wrapper:
        """
        self.columns = columns
        self.model_wrapper = model_wrapper
        self.result_dict = {}
        self.train_one_fold = False
        self.preprocesser = None
        self.truncate_valid=truncate_valid
        self.truncate_seed=seed

    def fit(self, X: pd.DataFrame, y,
            X_holdout: pd.DataFrame = None, y_holdout=None,
            folds=None,
            params: dict = None,
            eval_metric='rmse',
            cols_to_drop: list = None,
            preprocesser=None,
            transformers: dict = None,
            adversarial: bool = False,
            plot: bool = True):
        """
        Training the model.

        :param X: training data
        :param y: training target
        :param X_holdout: holdout data
        :param y_holdout: holdout target
        :param folds: folds to split the data. If not defined, then model will be trained on the whole X
        :param params: training parameters
        :param eval_metric: metric for validataion
        :param cols_to_drop: list of columns to drop (for example ID)
        :param preprocesser: preprocesser class
        :param transformers: transformer to use on folds
        :param adversarial
        :return:
        """

        if folds is None:
            folds = KFold(n_splits=3, random_state=42)
            self.train_one_fold = True

        self.columns = X.columns if self.columns is None else self.columns
        self.feature_importances = pd.DataFrame(columns=['feature', 'importance'])
        self.trained_transformers = {k: [] for k in transformers}
        self.transformers = transformers
        self.models = []
        self.folds_dict = {}
        self.eval_metric = eval_metric
        n_target = 1
        self.oof = []
        self.n_target = n_target
        random.seed(self.truncate_seed)
        X = X[self.columns]
        if X_holdout is not None:
            X_holdout = X_holdout[self.columns]

        if preprocesser is not None:
            self.preprocesser = preprocesser
            self.preprocesser.fit(X, y)
            X = self.preprocesser.transform(X, y)
            self.columns = X.columns.tolist()
            if X_holdout is not None:
                X_holdout = self.preprocesser.transform(X_holdout)

        for fold_n, (train_index, valid_index) in enumerate(folds.split(X, y, X['installation_id'])):

            if X_holdout is not None:
                X_hold = X_holdout.copy()
            else:
                X_hold = None
            self.folds_dict[fold_n] = {}
            if params['verbose']:
                print(f'Fold {fold_n + 1} started at {time.ctime()}')
            self.folds_dict[fold_n] = {}

            X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
            y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
            
            if self.truncate_valid:
                
                truncated_index=[]
                for iid in sorted(list(set(X_valid['installation_id']))):
                    list_ = list(X_valid.loc[X_valid['installation_id'] == iid].index)
                    cur = random.choices(list_, k=1)[0]
                    truncated_index.append(cur)
                X_valid=X_valid.loc[truncated_index]
                y_valid=y_valid.loc[truncated_index]  
            
            if self.train_one_fold:
                X_train = X[self.original_columns]
                y_train = y
                X_valid = None
                y_valid = None

            datasets = {'X_train': X_train, 'X_valid': X_valid, 'X_holdout': X_hold, 'y_train': y_train}
            X_train, X_valid, X_hold = self.transform_(datasets, cols_to_drop)

            self.folds_dict[fold_n]['columns'] = X_train.columns.tolist()

            model = copy.deepcopy(self.model_wrapper)

            if adversarial:
                X_new1 = X_train.copy()
                if X_valid is not None:
                    X_new2 = X_valid.copy()
                elif X_holdout is not None:
                    X_new2 = X_holdout.copy()
                X_new = pd.concat([X_new1, X_new2], axis=0)
                y_new = np.hstack((np.zeros((X_new1.shape[0])), np.ones((X_new2.shape[0]))))
                X_train, X_valid, y_train, y_valid = train_test_split(X_new, y_new)

            model.fit(X_train, y_train, X_valid, y_valid, X_hold, y_holdout, params=params)

            self.folds_dict[fold_n]['scores'] = model.best_score_

            if not adversarial:
                self.oof.append([model.predict(X_valid).reshape(-1),y_valid.values])

            fold_importance = pd.DataFrame(list(zip(X_train.columns, model.feature_importances_)),
                                           columns=['feature', 'importance'])
            self.feature_importances = self.feature_importances.append(fold_importance)
            self.models.append(model)

        self.feature_importances['importance'] = self.feature_importances['importance'].astype(int)

        # if params['verbose']:
        self.calc_scores_()

        if plot:
            # print(classification_report(y, self.oof.argmax(1)))
            fig, ax = plt.subplots(figsize=(16, 12))
            plt.subplot(2, 2, 1)
            self.plot_feature_importance(top_n=20)
            plt.subplot(2, 2, 2)
            self.plot_metric()
            if not self.truncate_valid:
                plt.subplot(2, 2, 3)
                plt.hist(y.values.reshape(-1, 1) - self.oof)
                plt.title('Distribution of errors')
                plt.subplot(2, 2, 4)
                plt.hist(self.oof)
                plt.title('Distribution of oof predictions')

    def transform_(self, datasets, cols_to_drop):
        for name, transformer in self.transformers.items():
            transformer.fit(datasets['X_train'], datasets['y_train'])
            datasets['X_train'] = transformer.transform(datasets['X_train'])
            if datasets['X_valid'] is not None:
                datasets['X_valid'] = transformer.transform(datasets['X_valid'])
            if datasets['X_holdout'] is not None:
                datasets['X_holdout'] = transformer.transform(datasets['X_holdout'])
            self.trained_transformers[name].append(transformer)
        if cols_to_drop is not None:
            cols_to_drop = [col for col in cols_to_drop if col in datasets['X_train'].columns]

            datasets['X_train'] = datasets['X_train'].drop(cols_to_drop, axis=1)
            if datasets['X_valid'] is not None:
                datasets['X_valid'] = datasets['X_valid'].drop(cols_to_drop, axis=1)
            if datasets['X_holdout'] is not None:
                datasets['X_holdout'] = datasets['X_holdout'].drop(cols_to_drop, axis=1)
        self.cols_to_drop = cols_to_drop
        print("Dropping",len(set(cols_to_drop)) ,"columns First 10:",cols_to_drop[:10])
        return datasets['X_train'], datasets['X_valid'], datasets['X_holdout']

    def calc_scores_(self):
        print()
        datasets = [k for k, v in [v['scores'] for k, v in self.folds_dict.items()][0].items() if len(v) > 0]
        self.scores = {}
        for d in datasets:
            scores = [v['scores'][d][self.eval_metric] for k, v in self.folds_dict.items()]
            print(f"CV mean score on {d}: {np.mean(scores):.4f} +/- {np.std(scores):.4f} std.")
            self.scores[d] = np.mean(scores)

    def predict(self, X_test, averaging: str = 'usual'):
        """
        Make prediction

        :param X_test:
        :param averaging: method of averaging
        :return:
        """
        full_prediction = np.zeros((X_test.shape[0],1))
        if self.preprocesser is not None:
            X_test = self.preprocesser.transform(X_test)
        for i in range(len(self.models)):
            X_t = X_test.copy()
            for name, transformers in self.trained_transformers.items():
                X_t = transformers[i].transform(X_t)

            if self.cols_to_drop is not None:
                cols_to_drop = [col for col in self.cols_to_drop if col in X_t.columns]
                X_t = X_t.drop(cols_to_drop, axis=1)
            y_pred = self.models[i].predict(X_t[self.folds_dict[i]['columns']]).reshape(-1, full_prediction.shape[1])

            # if case transformation changes the number of the rows
            if full_prediction.shape[0] != len(y_pred):
                full_prediction = np.zeros((y_pred.shape[0], 1))

            if averaging == 'usual':
                full_prediction += y_pred
            elif averaging == 'rank':
                full_prediction += pd.Series(y_pred).rank().values

        return full_prediction / len(self.models)

    def plot_feature_importance(self, drop_null_importance: bool = True, top_n: int = 10):
        """
        Plot default feature importance.

        :param drop_null_importance: drop columns with null feature importance
        :param top_n: show top n columns
        :return:
        """

        top_feats = self.get_top_features(drop_null_importance, top_n)
        feature_importances = self.feature_importances.loc[self.feature_importances['feature'].isin(top_feats)]
        feature_importances['feature'] = feature_importances['feature'].astype(str)
        top_feats = [str(i) for i in top_feats]
        sns.barplot(data=feature_importances, x='importance', y='feature', orient='h', order=top_feats)
        plt.title('Feature importances')

    def get_top_features(self, drop_null_importance: bool = True, top_n: int = 10):
        """
        Get top features by importance.

        :param drop_null_importance:
        :param top_n:
        :return:
        """
        grouped_feats = self.feature_importances.groupby(['feature'])['importance'].mean()
        if drop_null_importance:
            grouped_feats = grouped_feats[grouped_feats != 0]
        return list(grouped_feats.sort_values(ascending=False).index)[:top_n]

    def plot_metric(self):
        """
        Plot training progress.
        Inspired by `plot_metric` from https://lightgbm.readthedocs.io/en/latest/_modules/lightgbm/plotting.html

        :return:
        """
        full_evals_results = pd.DataFrame()
        for model in self.models:
            evals_result = pd.DataFrame()
            for k in model.model.evals_result_.keys():
                evals_result[k] = model.model.evals_result_[k][self.eval_metric]
            evals_result = evals_result.reset_index().rename(columns={'index': 'iteration'})
            full_evals_results = full_evals_results.append(evals_result)

        full_evals_results = full_evals_results.melt(id_vars=['iteration']).rename(columns={'value': self.eval_metric,
                                                                                            'variable': 'dataset'})
        sns.lineplot(data=full_evals_results, x='iteration', y=self.eval_metric, hue='dataset')
        plt.title('Training progress')


In [32]:
class OptimizedRounder(object):
    """
    An optimizer for rounding thresholds
    to maximize Quadratic Weighted Kappa (QWK) score
    # https://www.kaggle.com/naveenasaithambi/optimizedrounder-improved
    """
    def __init__(self):
        self.coef_ = 0

    def _kappa_loss(self, coef, X, y):
        """
        Get loss according to
        using current coefficients
        
        :param coef: A list of coefficients that will be used for rounding
        :param X: The raw predictions
        :param y: The ground truth labels
        """
        X_p = pd.cut(X, [-np.inf] + list(np.sort(coef)) + [np.inf], labels = [0, 1, 2, 3])

        return -qwk(y, X_p)

    def fit(self, X, y,initial_coef):
        """
        Optimize rounding thresholds
        
        :param X: The raw predictions
        :param y: The ground truth labels
        """
        loss_partial = partial(self._kappa_loss, X=X, y=y)
        self.coef_ = sp.optimize.minimize(loss_partial, initial_coef, method='nelder-mead')

    def predict(self, X, coef):
        """
        Make predictions with specified thresholds
        
        :param X: The raw predictions
        :param coef: A list of coefficients that will be used for rounding
        """
        return pd.cut(X, [-np.inf] + list(np.sort(coef)) + [np.inf], labels = [0, 1, 2, 3])


    def coefficients(self):
        """
        Return the optimized coefficients
        """
        return self.coef_['x']

In [33]:
del_cols = []
for col in reduce_train.columns.values:
    if len(reduce_train[col].value_counts())==0:
        del_cols.append(col)
        continue
    counts = reduce_train[col].value_counts().iloc[0]
    if (counts / reduce_train.shape[0]) >= 0.9:
        del_cols.append(col)
print(str(len(del_cols)) + " features removed!")
del_cols

70 features removed!


['4050',
 '4080',
 'ema_duration_12_Monkeys',
 'ema_duration_Balancing_Act',
 'ema_duration_Costume_Box',
 'ema_duration_Crystal_Caves___Level_1',
 'ema_duration_Crystal_Caves___Level_2',
 'ema_duration_Crystal_Caves___Level_3',
 'ema_duration_Heavy__Heavier__Heaviest',
 'ema_duration_Honey_Cake',
 'ema_duration_Lifting_Heavy_Things',
 'ema_duration_Magma_Peak___Level_1',
 'ema_duration_Magma_Peak___Level_2',
 'ema_duration_Ordering_Spheres',
 'ema_duration_Pirate_s_Tale',
 'ema_duration_Rulers',
 'ema_duration_Slop_Problem',
 'ema_duration_Treasure_Map',
 'ema_duration_Tree_Top_City___Level_1',
 'ema_duration_Tree_Top_City___Level_2',
 'ema_duration_Tree_Top_City___Level_3',
 'ema_duration_Welcome_to_Lost_Lagoon_',
 '01ca3a3c',
 '05ad839b',
 '08ff79ad',
 '0ce40006',
 '0d18d96c',
 '119b5b02',
 '13f56524',
 '16667cc5',
 '19967db1',
 '1b54d27f',
 '26a5a3dd',
 '29a42aea',
 '2ec694de',
 '30df3273',
 '3a4be871',
 '3b2048ee',
 '47efca07',
 '47f43a44',
 '4e5fc6f5',
 '5f5b2617',
 '6043a2b4',
 

In [34]:
counter = 0
to_remove = []
neglect_feat=['installation_id','session_id','accuracy_group']

features=[x for x in reduce_train.columns if x not in neglect_feat]
for feat_a in features:
    for feat_b in features:
        if feat_a != feat_b and feat_a not in to_remove and feat_b not in to_remove:
            c = np.corrcoef(reduce_train[feat_a], reduce_train[feat_b])[0][1]
            if c > 0.995:
                counter += 1
                to_remove.append(feat_b)
                print('{}: FEAT_A: {} FEAT_B: {} - Correlation: {}'.format(counter, feat_a, feat_b, c))

  c /= stddev[:, None]
  c /= stddev[None, :]


1: FEAT_A: 2020 FEAT_B: 2030 - Correlation: 0.9959933262816534
2: FEAT_A: 2040 FEAT_B: 2050 - Correlation: 0.9965259434878118
3: FEAT_A: 2040 FEAT_B: 2b9272f4 - Correlation: 0.9964451607954699
4: FEAT_A: 2040 FEAT_B: 37c53127 - Correlation: 0.9965259434878118
5: FEAT_A: 2040 FEAT_B: 5a848010 - Correlation: 0.9976195852057889
6: FEAT_A: 2040 FEAT_B: 73757a5e - Correlation: 0.9966452709971663
7: FEAT_A: 2040 FEAT_B: dcaede90 - Correlation: 0.9999999999999998
8: FEAT_A: 3010 FEAT_B: 3110 - Correlation: 0.9999293402893734
9: FEAT_A: 3020 FEAT_B: 3120 - Correlation: 0.9998761417908971
10: FEAT_A: 3021 FEAT_B: 3121 - Correlation: 0.9999098200487934
11: FEAT_A: 4031 FEAT_B: 1996c610 - Correlation: 1.0
12: FEAT_A: 4050 FEAT_B: a1192f43 - Correlation: 0.9999999999999999
13: FEAT_A: 4220 FEAT_B: 1340b8d7 - Correlation: 1.0
14: FEAT_A: 4230 FEAT_B: 4235 - Correlation: 0.9999995197498746
15: FEAT_A: 4230 FEAT_B: 85de926c - Correlation: 0.9999995197498746
16: FEAT_A: 4230 FEAT_B: ad148f58 - Correla

In [35]:
list_of_event_code = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in list_of_event_code]
list_of_event_id = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in list_of_event_id]

cols_to_drop = ['session_id', 'installation_id','accuracy_group',
                'installation_session_count',
                'installation_duration_mean',
                'installation_title_nunique',
                'installation_event_code_count_mean',
                "4070",

               ]
cols_to_drop+=del_cols
cols_to_drop+=to_exclude
cols_to_drop+=to_remove
#for cols in same_features.values():
#    cols_to_drop+=cols
print(len(set(cols_to_drop)))
cols_to_drop

217


['session_id',
 'installation_id',
 'accuracy_group',
 'installation_session_count',
 'installation_duration_mean',
 'installation_title_nunique',
 'installation_event_code_count_mean',
 '4070',
 '4050',
 '4080',
 'ema_duration_12_Monkeys',
 'ema_duration_Balancing_Act',
 'ema_duration_Costume_Box',
 'ema_duration_Crystal_Caves___Level_1',
 'ema_duration_Crystal_Caves___Level_2',
 'ema_duration_Crystal_Caves___Level_3',
 'ema_duration_Heavy__Heavier__Heaviest',
 'ema_duration_Honey_Cake',
 'ema_duration_Lifting_Heavy_Things',
 'ema_duration_Magma_Peak___Level_1',
 'ema_duration_Magma_Peak___Level_2',
 'ema_duration_Ordering_Spheres',
 'ema_duration_Pirate_s_Tale',
 'ema_duration_Rulers',
 'ema_duration_Slop_Problem',
 'ema_duration_Treasure_Map',
 'ema_duration_Tree_Top_City___Level_1',
 'ema_duration_Tree_Top_City___Level_2',
 'ema_duration_Tree_Top_City___Level_3',
 'ema_duration_Welcome_to_Lost_Lagoon_',
 '01ca3a3c',
 '05ad839b',
 '08ff79ad',
 '0ce40006',
 '0d18d96c',
 '119b5b02',
 

In [36]:
categoricals=['session_title','session_world','last_world','last_activity_type']

params = {'n_estimators':2000,
          'boosting_type': 'gbdt',
          'objective': 'regression',
          'metric': 'rmse',
          'subsample': 0.85,
          'subsample_freq': 1,
          'learning_rate': 0.01,
          'feature_fraction': 0.75,
          'max_depth': 10,
          'num_leaves':31,
          'min_data_in_leaf':50,
          'cat_cols':categoricals,
          'lambda_l1': 2,
          'lambda_l2':9,
          'verbose': 100,
          'early_stopping_rounds': 200,
          'eval_metric': 'cappa',
          'seed':888,
          'n_jobs':8
         }
n_fold=5
folds = GroupKFold(n_splits=n_fold)

In [37]:
#regression_thresholds=np.array([1.18400496,1.65723726,2.13351805])
#regression_thresholds=np.array([1.1,1.7,2.2])
regression_thresholds=np.array([0.5,1.5,2.5])
y = reduce_train['accuracy_group']
regressor_model1 = RegressorModel(model_wrapper=LGBWrapper_regr(),truncate_valid=False,)
regressor_model1.fit(X=reduce_train, y=y, folds=folds, params=params, preprocesser=None, transformers={},
                    eval_metric='cappa', cols_to_drop=cols_to_drop,plot=False)

Fold 1 started at Mon Jan 20 12:21:45 2020
Dropping 213 columns First 10: ['session_id', 'installation_id', 'accuracy_group', '4070', '4050', '4080', 'ema_duration_12_Monkeys', 'ema_duration_Balancing_Act', 'ema_duration_Costume_Box', 'ema_duration_Crystal_Caves___Level_1']


New categorical_feature is ['last_activity_type', 'last_world', 'session_title', 'session_world']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))
Compilation is falling back to object mode WITH looplifting enabled because Function "qwk" failed type inference due to: [1m[1m[1mInvalid use of Function(<function asarray at 0x7fd04076f158>) with argument(s) of type(s): (array(float32, 1d, C), dtype=Function(<class 'int'>))
 * parameterized
[1mIn definition 0:[0m
[1m    AttributeError: 'Function' object has no attribute 'dtype'[0m
    raised from /opt/conda/lib/python3.6/site-packages/numba/targets/arraymath.py:3845
[1mIn definition 1:[0m
[1m    AttributeError: 'Function' object has no attribute 'dtype'[0m
    raised from /opt/conda/lib/python3.6/site-packages/numba/targets/arraymath.py:3845
[1mThis error is usually caused by passing an argument of a type that is unsupported by the named function.[0m[0m
[0m[1m[1] During: resolving callee type: Fun

Training until validation scores don't improve for 200 rounds
[100]	train's rmse: 1.00218	train's cappa: 0.343637	valid's rmse: 1.01504	valid's cappa: 0.334677
[200]	train's rmse: 0.935485	train's cappa: 0.536739	valid's rmse: 0.967183	valid's cappa: 0.515583
[300]	train's rmse: 0.906672	train's cappa: 0.60613	valid's rmse: 0.956353	valid's cappa: 0.566363
[400]	train's rmse: 0.887015	train's cappa: 0.633889	valid's rmse: 0.953261	valid's cappa: 0.582789
[500]	train's rmse: 0.870949	train's cappa: 0.651764	valid's rmse: 0.952401	valid's cappa: 0.586107
[600]	train's rmse: 0.856714	train's cappa: 0.664389	valid's rmse: 0.951582	valid's cappa: 0.589545
[700]	train's rmse: 0.843417	train's cappa: 0.675294	valid's rmse: 0.951091	valid's cappa: 0.5895
[800]	train's rmse: 0.830591	train's cappa: 0.6868	valid's rmse: 0.950998	valid's cappa: 0.588407
Early stopping, best iteration is:
[619]	train's rmse: 0.854096	train's cappa: 0.666723	valid's rmse: 0.951569	valid's cappa: 0.590505
Fold 2 sta

New categorical_feature is ['last_activity_type', 'last_world', 'session_title', 'session_world']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 200 rounds
[100]	train's rmse: 1.00154	train's cappa: 0.339866	valid's rmse: 1.02163	valid's cappa: 0.32676
[200]	train's rmse: 0.935795	train's cappa: 0.537229	valid's rmse: 0.973178	valid's cappa: 0.497858
[300]	train's rmse: 0.907264	train's cappa: 0.609098	valid's rmse: 0.960893	valid's cappa: 0.560754
[400]	train's rmse: 0.887775	train's cappa: 0.633339	valid's rmse: 0.956662	valid's cappa: 0.575918
[500]	train's rmse: 0.872192	train's cappa: 0.650614	valid's rmse: 0.955233	valid's cappa: 0.578741
[600]	train's rmse: 0.858219	train's cappa: 0.664409	valid's rmse: 0.954382	valid's cappa: 0.57885
[700]	train's rmse: 0.845161	train's cappa: 0.676586	valid's rmse: 0.954002	valid's cappa: 0.57944
[800]	train's rmse: 0.832514	train's cappa: 0.685485	valid's rmse: 0.953625	valid's cappa: 0.58407
[900]	train's rmse: 0.820344	train's cappa: 0.695518	valid's rmse: 0.953047	valid's cappa: 0.585855
[1000]	train's rmse: 0.808869	train's cappa:

In [38]:
oof_predicts=np.concatenate([x[0] for x in regressor_model1.oof],axis=0)
oof_y=np.concatenate([x[1] for x in regressor_model1.oof],axis=0)

#oof_predicts=regressor_model1.oof

coefficients=[0.5,1.5,2.5]

for i in range(8):
    optR = OptimizedRounder()
    optR.fit(oof_predicts.reshape(-1,), oof_y,initial_coef=coefficients)
    coefficients = optR.coefficients()
    oof_rounded=optR.predict(oof_predicts.reshape(-1,),coefficients)
    qwk_score=qwk(oof_y, oof_rounded)
    print("Round",i+1,"    Rounding Coefficients:",coefficients,"QWK score:",qwk_score)

coef1=coefficients
qwkscore1=qwk_score

oof_predicts=np.concatenate([x[0] for x in regressor_model1.oof],axis=0)
oof_y=np.concatenate([x[1] for x in regressor_model1.oof],axis=0)
coefficients=[1.1,1.7,2.2]

for i in range(8):
    optR = OptimizedRounder()
    optR.fit(oof_predicts.reshape(-1,), oof_y,initial_coef=coefficients)
    coefficients = optR.coefficients()
    oof_rounded=optR.predict(oof_predicts.reshape(-1,),coefficients)
    qwk_score=qwk(oof_y, oof_rounded)
    print("Round",i+1,"    Rounding Coefficients:",coefficients,"QWK score:",qwk_score)
coef2=coefficients
qwkscore2=qwk_score
if qwkscore2>qwkscore1:
    print("use coefficient2",qwkscore2)
    coefficients=coef2
else:
    print("use coefficient1",qwkscore1)
    coefficients=coef1

Compilation is falling back to object mode WITH looplifting enabled because Function "qwk" failed type inference due to: [1m[1mnon-precise type pyobject[0m
[0m[1m[1] During: typing of argument at <ipython-input-31-105aa9586080> (10)[0m
[1m
File "<ipython-input-31-105aa9586080>", line 10:[0m
[1mdef qwk(a1, a2):
    <source elided>
    """
[1m    max_rat = 3
[0m    [1m^[0m[0m
[0m
  @jit
Compilation is falling back to object mode WITHOUT looplifting enabled because Function "qwk" failed type inference due to: [1m[1mcannot determine Numba type of <class 'numba.dispatcher.LiftedLoop'>[0m
[1m
File "<ipython-input-31-105aa9586080>", line 18:[0m
[1mdef qwk(a1, a2):
    <source elided>
    o = 0
[1m    for k in range(a1.shape[0]):
[0m    [1m^[0m[0m
[0m[0m
  @jit
[1m
File "<ipython-input-31-105aa9586080>", line 2:[0m
[1m@jit
[1mdef qwk(a1, a2):
[0m[1m^[0m[0m
[0m
  state.func_ir.loc))
Fall-back from the nopython compilation path to the object mode compilation 

Round 1     Rounding Coefficients: [1.14475624 1.6797524  2.28789235] QWK score: 0.6164926520455429
Round 2     Rounding Coefficients: [1.14715623 1.69521804 2.29265994] QWK score: 0.6167539559394224
Round 3     Rounding Coefficients: [1.1446724  1.69499764 2.2908399 ] QWK score: 0.6169679653618942
Round 4     Rounding Coefficients: [1.1446724  1.69499764 2.2908399 ] QWK score: 0.6169679653618942
Round 5     Rounding Coefficients: [1.1446724  1.69499764 2.2908399 ] QWK score: 0.6169679653618942
Round 6     Rounding Coefficients: [1.1446724  1.69499764 2.2908399 ] QWK score: 0.6169679653618942
Round 7     Rounding Coefficients: [1.1446724  1.69499764 2.2908399 ] QWK score: 0.6169679653618942
Round 8     Rounding Coefficients: [1.1446724  1.69499764 2.2908399 ] QWK score: 0.6169679653618942
Round 1     Rounding Coefficients: [1.09890966 1.69535702 2.29220875] QWK score: 0.6166647474236073
Round 2     Rounding Coefficients: [1.12844537 1.6953839  2.29220501] QWK score: 0.6168654197568993


In [39]:
class Base_Model(object):
    
    def __init__(self, train_df, test_df, features, categoricals=[], n_splits=5, verbose=True):
        self.train_df = train_df
        self.test_df = test_df
        self.features = features
        self.n_splits = n_splits
        self.categoricals = categoricals
        self.target = 'accuracy_group'
        self.cv = self.get_cv()
        self.verbose = verbose
        self.params = self.get_params()
        self.y_pred, self.score, self.model = self.fit()
        
    def train_model(self, train_set, val_set):
        raise NotImplementedError
        
    def get_cv(self):
        cv = StratifiedKFold(n_splits=self.n_splits, shuffle=True, random_state=42)
        return cv.split(self.train_df, self.train_df[self.target])
    
    def get_params(self):
        raise NotImplementedError
        
    def convert_dataset(self, x_train, y_train, x_val, y_val):
        raise NotImplementedError
        
    def convert_x(self, x):
        return x
    
    def fit(self):
        oof_pred = np.zeros((len(reduce_train), ))
        y_pred = np.zeros((len(reduce_test), ))
        for fold, (train_idx, val_idx) in enumerate(self.cv):
            x_train, x_val = self.train_df[self.features].iloc[train_idx], self.train_df[self.features].iloc[val_idx]
            y_train, y_val = self.train_df[self.target][train_idx], self.train_df[self.target][val_idx]
            train_set, val_set = self.convert_dataset(x_train, y_train, x_val, y_val)
            model = self.train_model(train_set, val_set)
            conv_x_val = self.convert_x(x_val)
            oof_pred[val_idx] = model.predict(conv_x_val).reshape(oof_pred[val_idx].shape)
            x_test = self.convert_x(self.test_df[self.features])
            y_pred += model.predict(x_test).reshape(y_pred.shape) / self.n_splits
            print('Partial score of fold {} is: {}'.format(fold, eval_qwk_lgb_regr(y_val, oof_pred[val_idx])[1]))
        _, loss_score, _ = eval_qwk_lgb_regr(self.train_df[self.target], oof_pred)
        if self.verbose:
            print('Our oof cohen kappa score is: ', loss_score)
        return y_pred, loss_score, model

In [40]:
class Xgb_Model(Base_Model):
    
    def train_model(self, train_set, val_set):
        verbosity = 100 if self.verbose else 0
        return xgb.train(self.params, train_set, 
                         num_boost_round=5000, evals=[(train_set, 'train'), (val_set, 'val')], 
                         verbose_eval=verbosity, early_stopping_rounds=100)
        
    def convert_dataset(self, x_train, y_train, x_val, y_val):
        train_set = xgb.DMatrix(x_train, y_train)
        val_set = xgb.DMatrix(x_val, y_val)
        return train_set, val_set
    
    def convert_x(self, x):
        return xgb.DMatrix(x)
        
    def get_params(self):
        params = {'colsample_bytree': 0.8,                 
            'learning_rate': 0.01,
            'max_depth': 10,
            'subsample': 1,
            'objective':'reg:squarederror',
            #'eval_metric':'rmse',
            'min_child_weight':3,
            'gamma':0.25,
            'n_estimators':5000}

        return params

In [41]:
xgb_feature=[x for x in reduce_train.columns if x not in cols_to_drop]
xgb_model = Xgb_Model(reduce_train, ajusted_test, features, categoricals=categoricals)

  if getattr(data, 'base', None) is not None and \


[0]	train-rmse:1.8561	val-rmse:1.8567
Multiple eval metrics have been passed: 'val-rmse' will be used for early stopping.

Will train until val-rmse hasn't improved in 100 rounds.
[100]	train-rmse:0.98907	val-rmse:1.12102
[200]	train-rmse:0.715303	val-rmse:0.975667
[300]	train-rmse:0.615843	val-rmse:0.95328
[400]	train-rmse:0.563177	val-rmse:0.951304
Stopping. Best iteration:
[387]	train-rmse:0.569472	val-rmse:0.951253



Compilation is falling back to object mode WITH looplifting enabled because Function "qwk" failed type inference due to: [1m[1mnon-precise type pyobject[0m
[0m[1m[1] During: typing of argument at <ipython-input-31-105aa9586080> (10)[0m
[1m
File "<ipython-input-31-105aa9586080>", line 10:[0m
[1mdef qwk(a1, a2):
    <source elided>
    """
[1m    max_rat = 3
[0m    [1m^[0m[0m
[0m
  @jit
Compilation is falling back to object mode WITHOUT looplifting enabled because Function "qwk" failed type inference due to: [1m[1mcannot determine Numba type of <class 'numba.dispatcher.LiftedLoop'>[0m
[1m
File "<ipython-input-31-105aa9586080>", line 18:[0m
[1mdef qwk(a1, a2):
    <source elided>
    o = 0
[1m    for k in range(a1.shape[0]):
[0m    [1m^[0m[0m
[0m[0m
  @jit
[1m
File "<ipython-input-31-105aa9586080>", line 2:[0m
[1m@jit
[1mdef qwk(a1, a2):
[0m[1m^[0m[0m
[0m
  state.func_ir.loc))
Fall-back from the nopython compilation path to the object mode compilation 

Partial score of fold 0 is: 0.5902233773194203


  if getattr(data, 'base', None) is not None and \


[0]	train-rmse:1.8559	val-rmse:1.85725
Multiple eval metrics have been passed: 'val-rmse' will be used for early stopping.

Will train until val-rmse hasn't improved in 100 rounds.
[100]	train-rmse:0.989744	val-rmse:1.14047
[200]	train-rmse:0.719862	val-rmse:0.992962
[300]	train-rmse:0.616853	val-rmse:0.967628
[400]	train-rmse:0.559282	val-rmse:0.963674
[500]	train-rmse:0.527373	val-rmse:0.963327
[600]	train-rmse:0.509667	val-rmse:0.963804
Stopping. Best iteration:
[507]	train-rmse:0.526309	val-rmse:0.963256

Partial score of fold 1 is: 0.5813005434210168
[0]	train-rmse:1.85603	val-rmse:1.85708
Multiple eval metrics have been passed: 'val-rmse' will be used for early stopping.

Will train until val-rmse hasn't improved in 100 rounds.
[100]	train-rmse:0.989283	val-rmse:1.12669
[200]	train-rmse:0.7091	val-rmse:0.98226
[300]	train-rmse:0.610089	val-rmse:0.960126
[400]	train-rmse:0.560859	val-rmse:0.957702
Stopping. Best iteration:
[399]	train-rmse:0.561137	val-rmse:0.957638

Partial score

In [42]:
from catboost import CatBoostRegressor
class Catb_Model(Base_Model):
    
    def train_model(self, train_set, val_set):
        verbosity = 100 if self.verbose else 0
        clf = CatBoostRegressor(**self.params)
        clf.fit(train_set['X'], 
                train_set['y'], 
                eval_set=(val_set['X'], val_set['y']),
                verbose=verbosity, 
                cat_features=self.categoricals
               )
        return clf
        
    def convert_dataset(self, x_train, y_train, x_val, y_val):
        train_set = {'X': x_train, 'y': y_train}
        val_set = {'X': x_val, 'y': y_val}
        return train_set, val_set
        
    def get_params(self):
        params = {'loss_function': 'RMSE',
                   'task_type': "CPU",
                   'iterations': 5000,
                   'od_type': "Iter",
                    'depth': 10,
                  'colsample_bylevel': 0.5, 
                   'early_stopping_rounds': 100,
                    'l2_leaf_reg': 18,
                   'random_seed': 42,
                    'use_best_model': True
                    }
        return params

In [43]:
ctb_feature=[x for x in reduce_train.columns if x not in cols_to_drop]
ctb_model = Catb_Model(reduce_train, ajusted_test, features, categoricals=categoricals)

0:	learn: 1.2451448	test: 1.2452686	best: 1.2452686 (0)	total: 502ms	remaining: 41m 50s
100:	learn: 0.9523365	test: 0.9760368	best: 0.9760368 (100)	total: 32.7s	remaining: 26m 25s
200:	learn: 0.9118511	test: 0.9640240	best: 0.9640240 (200)	total: 1m 4s	remaining: 25m 40s
300:	learn: 0.8848401	test: 0.9619214	best: 0.9618960 (292)	total: 1m 36s	remaining: 25m 13s
400:	learn: 0.8656456	test: 0.9611888	best: 0.9611888 (400)	total: 2m 9s	remaining: 24m 49s
500:	learn: 0.8489486	test: 0.9609421	best: 0.9607260 (465)	total: 2m 41s	remaining: 24m 13s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.9607259953
bestIteration = 465

Shrink model to first 466 iterations.
Partial score of fold 0 is: 0.5615312692699004
0:	learn: 1.2459186	test: 1.2462498	best: 1.2462498 (0)	total: 331ms	remaining: 27m 35s
100:	learn: 0.9503959	test: 0.9794087	best: 0.9794087 (100)	total: 32.6s	remaining: 26m 19s
200:	learn: 0.9110014	test: 0.9717703	best: 0.9717636 (199)	total: 1m 3s	remaining: 

In [44]:
final_pred = 0.7*regressor_model1.predict(ajusted_test)+0.2*xgb_model.y_pred.reshape(-1,1)+0.1*ctb_model.y_pred.reshape(-1,1)
dist = Counter(reduce_train['accuracy_group'])
#dist = Counter(oof_y)
for k in dist:
    dist[k] /= len(reduce_train['accuracy_group'])
acum = 0
bound = np.zeros(3).astype(np.float)
for i in range(3):
    acum += dist[i]
    bound[i] = np.percentile(final_pred, acum * 100)
    
print(bound)

def classify(x):
    if x <= bound[0]:
        return 0
    elif x <= bound[1]:
        return 1
    elif x <= bound[2]:
        return 2
    else:
        return 3

final_pred = np.array(list(map(classify, final_pred)))
sample_submission['accuracy_group'] = final_pred.astype(int)
sample_submission.to_csv('./submission.csv', index=False)
sample_submission['accuracy_group'].value_counts(normalize=True)

[1.29346653 1.68431089 1.9756197 ]


3    0.500
0    0.239
1    0.136
2    0.125
Name: accuracy_group, dtype: float64

In [45]:
oof_predicts=np.concatenate([x[0] for x in regressor_model1.oof],axis=0)
oof_y=np.concatenate([x[1] for x in regressor_model1.oof],axis=0)
oof_final_pred=np.array(list(map(classify, oof_predicts)))
qwk(oof_final_pred,oof_y)

Compilation is falling back to object mode WITH looplifting enabled because Function "qwk" failed type inference due to: [1m[1m[1mInvalid use of Function(<function asarray at 0x7fd04076f158>) with argument(s) of type(s): (array(int64, 1d, C), dtype=Function(<class 'int'>))
 * parameterized
[1mIn definition 0:[0m
[1m    AttributeError: 'Function' object has no attribute 'dtype'[0m
    raised from /opt/conda/lib/python3.6/site-packages/numba/targets/arraymath.py:3845
[1mIn definition 1:[0m
[1m    AttributeError: 'Function' object has no attribute 'dtype'[0m
    raised from /opt/conda/lib/python3.6/site-packages/numba/targets/arraymath.py:3845
[1mThis error is usually caused by passing an argument of a type that is unsupported by the named function.[0m[0m
[0m[1m[1] During: resolving callee type: Function(<function asarray at 0x7fd04076f158>)[0m
[0m[1m[2] During: typing of call at <ipython-input-31-105aa9586080> (11)
[0m
[1m
File "<ipython-input-31-105aa9586080>", line

0.6021425917763491