# 01 - TSFresh and Data Analysis

#### Imports

In [1]:
import numpy as np
import pandas as pd
import json 
from functools import reduce
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.externals import joblib
import pickle
import os.path
from sklearn.model_selection import KFold
from sklearn.decomposition import PCA, IncrementalPCA, SparsePCA, TruncatedSVD
import scipy.sparse
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler
import catboost
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

sns.set(style="white")

#### Constants

In [2]:
# Row limit handed to pd.read_csv(nrows=...) via LoadRawDataSet -> LoadData; None reads the whole file.
nrows = None
# GetNonTrashKeys keeps only the keys whose counter is strictly greater than this value.
rare_key_threshold = 10
# Number of groups np.array_split cuts the distinct train ids / test ids into.
nfolds = 100

In [3]:
# Directory prefixes. They are glued to file names by plain string concatenation,
# so the trailing slash is part of the value.
data_folder = 'data/'
train_folds_folder = "folds_train/"
test_folds_folder = "folds_test/"
model_folder = "models/"

# Names of the two helper columns: the answer column used to pick train rows
# and the row-number column that LoadTest adds and that is used to pick test rows.
target_name = 'target'
test_index_name = 'tst_index'

In [4]:
# Where the per-id target array is saved.
target_fn = model_folder +'target.pkl'

# Where the fitted LabelEncoder, the class-count ratio and the list of
# per-JSON-column key metadata are saved.
label_encoder_fn = model_folder + "label_encoder.pkl"
weight_multiplier_fn = model_folder + "weight_multiplier.pkl"
json_metas_fn = model_folder + "json_metas.pkl"

In [5]:
# Where the arrays of ids that define each train fold / test fold are saved.
train_indexes_fn = model_folder + "train_indexes.pkl"
test_indexes_fn = model_folder + "test_indexes.pkl"

In [6]:
from datetime import datetime
# Timestamp taken when this cell runs.
# NOTE(review): `today` is not referenced anywhere else in the visible part of this notebook.
today = datetime.today()

#### Functions

In [7]:
def stayOnlyKeys(keys, dictionary):
    """Strip from `dictionary`, in place, every entry whose key is not in `keys`.

    The very same dictionary object is returned after the removal.
    """
    # Collect first, delete afterwards: a dict must not change size while it is iterated.
    unwanted = [name for name in dictionary if name not in keys]
    for name in unwanted:
        del dictionary[name]
    return dictionary

In [8]:
def LoadTarget():
    """Read the train answers file and return it indexed by `cuid`.

    `mlboot_train_answers.tsv` is read from `data_folder` as tab-separated
    UTF-8 text whose first line is a header. The columns are then renamed
    to `cuid` and `target`, and `cuid` becomes the index.
    """
    answers_path = data_folder + 'mlboot_train_answers.tsv'
    answers = pd.read_csv(answers_path, delimiter='\t', encoding='utf-8', header=0)
    answers.columns = ['cuid', 'target']
    return answers.set_index('cuid')

In [9]:
def LoadTest():
    """Read the test file and return it indexed by `cuid`, with a row-number column added.

    `mlboot_test.tsv` is read from `data_folder` as tab-separated UTF-8 text
    whose first line is a header. A column named by `test_index_name` is
    added before `cuid` becomes the index.
    """
    test_path = data_folder + 'mlboot_test.tsv'
    test_rows = pd.read_csv(test_path, delimiter='\t', encoding='utf-8', header=0)
    # 0-based row position shifted by 2 -- presumably the 1-based line number
    # in the file with the header on line 1; TODO confirm.
    test_rows[test_index_name] = test_rows.index + 2
    return test_rows.set_index('cuid')

In [10]:
def LoadData(nrows = None):
    """Read the headerless event file `mlboot_data.tsv` from `data_folder`.

    The file is parsed as tab-separated UTF-8 text. Its six columns are named
    cuid, cat_feature, json1, json2, json3 and dt_diff, and `cuid` is used as
    the index. `nrows` limits how many rows are read (None reads all of them).
    """
    column_names = ['cuid', 'cat_feature', 'json1', 'json2', 'json3', 'dt_diff']
    return pd.read_csv(
        data_folder + 'mlboot_data.tsv',
        delimiter='\t',
        encoding='utf-8',
        header=None,
        names=column_names,
        index_col='cuid',
        nrows=nrows,
    )

In [11]:
def LoadRawDataSet(nrows):
    df = LoadData(nrows)
    dftst = LoadTest()
    dft = LoadTarget()
    
    df = df.join(dft)
    df = df.join(dftst)
    
    return df

In [12]:
def GetTrainDataSet(df):
    """Return the rows of `df` whose `target_name` column is not null.

    The row counts of the input and of the result are printed.
    """
    labelled = df[df[target_name].notnull()]
    print('Rows in input:',len(df.index))
    print('Rows in output:',len(labelled.index))

    return labelled

In [13]:
def GetTestDataSet(df):
    """Return the rows of `df` whose `test_index_name` column is not null.

    The row counts of the input and of the result are printed.
    """
    test_rows = df[df[test_index_name].notnull()]
    print('Rows in input:',len(df.index))
    print('Rows in output:',len(test_rows.index))

    return test_rows

In [14]:
def Save(obj, filename):
    """Serialise `obj` to `filename` with joblib.

    The parent directory of `filename` is created first when it does not
    exist, so the notebook also runs in a checkout where the output folders
    have not been created by hand. Returns None.
    """
    parent = os.path.dirname(filename)
    # An empty dirname means "current directory": nothing to create.
    if parent:
        os.makedirs(parent, exist_ok=True)
    joblib.dump(obj, filename)

In [15]:
def Load(filename):
    """Return the object stored at `filename` by joblib, or None when no such file exists."""
    if not os.path.isfile(filename):
        return None
    return joblib.load(filename)

# Loading data

## From tsv files

In [16]:
# Event rows with the train answers and the test row numbers joined on (all keyed by cuid).
df = LoadRawDataSet(nrows)

In [17]:
from sklearn import preprocessing

# Fit a LabelEncoder on the distinct index values (cuids) of df.
encoder = preprocessing.LabelEncoder()
encoder.fit(df.index.unique())

# Persist the fitted encoder.
Save(encoder, label_encoder_fn )

In [18]:
# Store the encoded cuid as the "id" column, then make that same column the index.
# After this cell "id" is both a column label and the index name.
# NOTE(review): not re-runnable as is -- on a second run df.index already holds the
# encoded values, and those would be passed to encoder.transform instead of cuids.
df["id"] = encoder.transform(df.index)
df.index = df["id"]

In [19]:
%%time
from tsfresh.feature_extraction import extract_features, EfficientFCParameters, ComprehensiveFCParameters, MinimalFCParameters
# Only MinimalFCParameters is used; the other imported parameter sets are unused in this cell.
settings = MinimalFCParameters()

# Features are computed from the "cat_feature" values, grouped by "id".
# Ordering by dt_diff is left disabled (the commented-out column_sort argument).
extracted_features = extract_features(df,
                 column_id="id",
#                  column_sort="dt_diff",
                 column_kind=None,
                 column_value="cat_feature",
                 default_fc_parameters=settings)

  from pandas.core import datetools
Feature Extraction: 100%|██████████| 20/20 [00:49<00:00,  2.49s/it]


CPU times: user 1min 35s, sys: 6.48 s, total: 1min 41s
Wall time: 1min 46s


In [20]:
# Attach the extracted feature columns to df with an index join.
# NOTE(review): df is overwritten, so re-running this cell would try to join
# the same feature columns onto a frame that already has them.
df = df.join(extracted_features)

In [21]:
# Train part: rows with a non-null target, without the test row-number column.
dft = GetTrainDataSet(df)
dft = dft.drop([test_index_name], axis=1)

Rows in input: 19528597
Rows in output: 12874345


In [22]:
# Test part: rows with a non-null test row number, without the target column.
dftst = GetTestDataSet(df)
dftst = dftst.drop([target_name], axis=1)

Rows in input: 19528597
Rows in output: 6654252


In [23]:
# One target value per id: the maximum of the target column over that id's rows.
# "id" is both the index name and a column of dft, so groupby("id") is ambiguous
# (pandas warns about it and newer versions raise). The index holds the same
# values as the column, so group by the index level instead.
target_df = dft.groupby(level=0).agg({
         'target':['max']
    })
# 2-D array with one row per id, in sorted id order.
target = target_df.values
print("Target size:",len(target))

Save(target, target_fn)

Defaulting to column, but this will raise an ambiguity error in a future version
  """Entry point for launching an IPython kernel.


Target size: 427994


In [24]:
# np.unique returns the distinct target values in sorted order together with their counts,
# so this is (count of the smallest value) / (count of the next one).
# NOTE(review): assumes at least two distinct target values are present -- confirm.
classes,counts = np.unique(target, return_counts=True)
weight_multiplier = counts[0]/counts[1]
print('Weight multiplier for imbalanced dataset:',weight_multiplier)

Save(weight_multiplier, weight_multiplier_fn)

Weight multiplier for imbalanced dataset: 18.951239977624464


## Json counters research

Нужно, чтобы ключи счётчиков в трейне и тесте пересекались.
Не имеет смысла тренировать модель на тех счётчиках, которые в тестовых данных не встречаются.
Аналогично с тестовыми данными: из них нужно убрать те счётчики, которые не встречаются в трейне, так как они всё равно не помогут модели.

In [25]:
def analyze_keys(text, meta, dataset_type="train"):
    """Parse one JSON object and fold its keys into the running statistics in `meta`.

    text         -- JSON text of an object (key -> value).
    meta         -- dict that is updated in place:
                    meta[dataset_type] (a set) receives every key;
                    meta["key_counter"][key] is incremented for every key;
                    meta["valued_key_counter"][key] is incremented only when the
                    key's value is not null and greater than 0.
    dataset_type -- which key set of `meta` to update ("train" by default).

    Nothing is returned.
    """
    counters = json.loads(text)
    meta[dataset_type].update(counters.keys())

    seen = meta["key_counter"]
    valued = meta["valued_key_counter"]
    for name, amount in counters.items():
        seen[name] = seen.get(name, 0) + 1
        if amount is not None and amount > 0:
            valued[name] = valued.get(name, 0) + 1
            
            
def GetNonTrashKeys(dic):
    """Return the set of keys of `dic` whose value is strictly greater than `rare_key_threshold`.

    `rare_key_threshold` is read from the module-level constant.
    """
    return {key for key, count in dic.items() if count > rare_key_threshold}

In [26]:
def jobs_manager():
    """Create an IPython BackgroundJobManager and register a `%job` line magic for it.

    The magic passes its argument text, together with the shell's
    `user_global_ns`, to the manager's `new` method. The manager is returned
    so that the started jobs can be inspected by the caller.
    """
    from IPython import get_ipython
    from IPython.core.magic import register_line_magic
    from IPython.lib.backgroundjobs import BackgroundJobManager

    manager = BackgroundJobManager()

    @register_line_magic
    def job(line):
        manager.new(line, get_ipython().user_global_ns)

    return manager

def get_chunks(sequence, count):
    """Deal the items of `sequence` round-robin into at most `count` lists.

    The number of lists is min(count, len(sequence)); item k goes to list
    k % that number. The list of lists is returned.
    """
    bucket_total = min(count, len(sequence))
    buckets = [[] for _ in range(bucket_total)]
    for position, element in enumerate(sequence):
        target_bucket = buckets[position % bucket_total]
        target_bucket.append(element)
    return buckets

def log_progress(sequence, every=None, size=None):
    """Iterate over `sequence` while showing an ipywidgets progress bar and label.

    Parameters
    ----------
    sequence : iterable whose items are yielded unchanged.
    every : the widgets are refreshed on the first item and then on every
        `every`-th item. When omitted it is 1 for up to 200 items and
        `size // 200` (about every 0.5%) otherwise. It must be given when
        the length of `sequence` cannot be determined.
    size : total number of items; taken from len(sequence) when omitted.

    Raises
    ------
    AssertionError when the length is unknown and `every` is not given.
    Any exception raised while iterating is re-raised after the bar has been
    switched to the 'danger' style.
    """
    from ipywidgets import IntProgress, HTML, VBox
    from IPython.display import display

    is_iterator = False
    if size is None:
        try:
            size = len(sequence)
        except TypeError:
            is_iterator = True
    if size is not None:
        if every is None:
            if size <= 200:
                every = 1
            else:
                # Integer division: with "/" this is a float in Python 3 and
                # `index % every == 0` below would almost never be true.
                every = size // 200     # every 0.5%
    else:
        assert every is not None, 'sequence is iterator, set every'

    if is_iterator:
        # Unknown length: show a full bar and only update the label.
        progress = IntProgress(min=0, max=1, value=1)
        progress.bar_style = 'info'
    else:
        progress = IntProgress(min=0, max=size, value=0)
    label = HTML()
    box = VBox(children=[label, progress])
    display(box)

    index = 0
    try:
        for index, record in enumerate(sequence, 1):
            if index == 1 or index % every == 0:
                if is_iterator:
                    label.value = '{index} / ?'.format(index=index)
                else:
                    progress.value = index
                    label.value = u'{index} / {size}'.format(
                        index=index,
                        size=size
                    )
            yield record
    except BaseException:
        # Mark the bar as failed, then let the error propagate unchanged.
        progress.bar_style = 'danger'
        raise
    else:
        progress.bar_style = 'success'
        progress.value = index
        label.value = str(index or '?')

# Notebook-wide job manager; the call also registers the %job line magic.
jobs = jobs_manager()

In [27]:
def kill_thread(thread):
    """Request that SystemError be raised asynchronously inside `thread`.

    Uses the C-API function PyThreadState_SetAsyncExc through ctypes, with
    `thread.ident` as the thread id.

    Raises ValueError('invalid thread id') when the call returns 0, and
    SystemError when it returns anything other than 0 or 1. In the latter
    case the call is first repeated with 0 in place of the exception.
    """
    import ctypes

    set_async_exc = ctypes.pythonapi.PyThreadState_SetAsyncExc
    thread_id = ctypes.c_long(thread.ident)

    affected = set_async_exc(thread_id, ctypes.py_object(SystemError))
    if affected == 1:
        return
    if affected == 0:
        raise ValueError('invalid thread id')

    # Unexpected result: withdraw the request before reporting the failure.
    set_async_exc(thread_id, ctypes.c_long(0))
    raise SystemError('PyThreadState_SetAsyncExc failed')

In [28]:
json_metas = []

def build_json_meta(i):
    """Collect key statistics for column "json<i>" and append them to `json_metas`.

    The column is scanned with analyze_keys in the global train frame `dft`
    and the global test frame `dftst`. The resulting `meta` dict holds:
      train / test        -- keys seen in each frame
      total / cross       -- their union / intersection
      key_counter         -- occurrences per key (both frames)
      valued_key_counter  -- occurrences per key with a non-null value > 0
      non_trash           -- keys kept by GetNonTrashKeys(key_counter)
      important_keys      -- non_trash keys present in both frames
      vectorizer          -- sparse DictVectorizer fitted on important_keys
      i / column          -- the column number and name
    """
    column = "json"+str(i)
    meta = {
        "train": set(),
        "test": set(),
        "total": set(),
        "cross": set(),
        "key_counter": {},
        "valued_key_counter": {},
        "non_trash": set(),
        "important_keys": set(),
        "i": i,
        "column": column
    }

    print("Counting keys:",i)
    dft[column].apply(lambda cell: analyze_keys(cell, meta, "train"))
    dftst[column].apply(lambda cell: analyze_keys(cell, meta, "test"))

    meta["total"] = meta["train"] | meta["test"]
    meta["cross"] = meta["train"] & meta["test"]

    meta["non_trash"] = GetNonTrashKeys(meta["key_counter"])
    meta["important_keys"] = meta["non_trash"] & meta["cross"]

    print("DictVectorizer fiting:",i)
    # A single sample that contains every important key fixes the feature set.
    template = dict.fromkeys(meta["important_keys"], 1)
    meta["vectorizer"] = DictVectorizer(sparse=True).fit([template])

    json_metas.append(meta)

In [29]:
%%time
for chunk in get_chunks(range(1,4), 3):
    %job [build_json_meta(index) for index in log_progress(chunk, every=1)]    
    
for thread in jobs.running:
    thread.join()

Starting job # 0 in a separate thread.
Starting job # 2 in a separate thread.
Starting job # 3 in a separate thread.


Counting keys: 2
Counting keys: 3
Counting keys: 1
DictVectorizer fiting: 3
DictVectorizer fiting: 1
DictVectorizer fiting: 2
CPU times: user 14min 50s, sys: 2.22 s, total: 14min 53s
Wall time: 14min 51s


In [30]:
# Apply the same threshold filter to the counters of keys that carried a value > 0.
# json_metas is addressed by list position here, not by meta["i"].
for i in range(0,3):
    json_metas[i]["valued_non_trash"] = GetNonTrashKeys(json_metas[i]["valued_key_counter"])

In [31]:
# Sizes of the three key sets for each entry of json_metas, in list order.
# The JSON column an entry belongs to is in json_metas[i]["column"]; it is not printed here.
for i in range(0,3):
    print("non_trash",len(json_metas[i]["non_trash"]))
    print("valued_non_trash",len(json_metas[i]["valued_non_trash"]))
    print("important_keys",len(json_metas[i]["important_keys"]))    

non_trash 88675
valued_non_trash 88675
important_keys 54542
non_trash 425587
valued_non_trash 425587
important_keys 296752
non_trash 20275
valued_non_trash 20275
important_keys 20268


In [32]:
%%time
# Persist the per-column key metadata, including the fitted DictVectorizers.
Save(json_metas, json_metas_fn)

CPU times: user 49.8 s, sys: 800 ms, total: 50.6 s
Wall time: 50.6 s


## Splitting into folds

In [33]:
# Cut the distinct train ids into nfolds groups; folds are defined by id, not by row.
train_indexes = np.array_split(dft.index.unique(), nfolds)

# The target column is removed from the train frame before the folds are written.
# NOTE(review): dft is overwritten, so a second run of this cell would no longer find the column.
dft = dft.drop([target_name], axis=1)

In [34]:
# Write one file per fold with all train rows whose id belongs to that fold;
# enumerate supplies the fold number used in the file name.
for i, index in enumerate(train_indexes):
    dfi = dft[dft.index.isin(index)]

    dfi_fn = train_folds_folder+"dfi{!s}.pkl".format(i)

    Save(dfi, dfi_fn)

# The id arrays themselves are saved too, so the fold membership can be recovered.
Save(train_indexes, train_indexes_fn)

In [35]:
# Order the test rows by their row number in the test file, then drop that helper column.
dftst = dftst.sort_values(test_index_name)
dftst = dftst.drop([test_index_name], axis=1)
# Distinct ids of the sorted frame, cut into nfolds groups.
test_indexes = np.array_split(dftst.index.unique(), nfolds)

In [36]:
# Preview the first rows of the sorted test frame.
dftst.head(10)

Unnamed: 0_level_0,cat_feature,json1,json2,json3,dt_diff,id,cat_feature__length,cat_feature__maximum,cat_feature__mean,cat_feature__median,cat_feature__minimum,cat_feature__standard_deviation,cat_feature__sum_values,cat_feature__variance
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
324768,5,"{""664922"":1,""812686"":2,""1479433"":1,""709068"":1,...","{""165949"":1,""362"":1,""260639"":5,""19776"":2,""1435...",{},1,324768,86.0,5.0,2.325581,0.0,0.0,2.493908,200.0,6.219578
324768,0,"{""572620"":1,""159474"":1,""809001"":1,""1657077"":1,...","{""16370"":1,""87041"":1,""138296"":1,""143207"":1,""13...",{},17,324768,86.0,5.0,2.325581,0.0,0.0,2.493908,200.0,6.219578
324768,5,"{""806039"":1,""1539087"":1,""809001"":2,""844920"":1,...","{""11835"":1,""586"":1,""74873"":1,""233680"":1,""20410...",{},18,324768,86.0,5.0,2.325581,0.0,0.0,2.493908,200.0,6.219578
324768,0,"{""793814"":3,""844920"":2,""955650"":2,""806039"":1,""...","{""5631"":1,""22165"":2,""1078720"":2,""225454"":2,""91...",{},18,324768,86.0,5.0,2.325581,0.0,0.0,2.493908,200.0,6.219578
324768,0,"{""1353000"":1,""1376123"":2,""2015951"":2,""1436354""...","{""5458"":1,""14681"":2,""120797"":1,""1078720"":2,""53...",{},19,324768,86.0,5.0,2.325581,0.0,0.0,2.493908,200.0,6.219578
324768,5,"{""1353000"":1,""1376123"":2,""2015951"":2,""1436354""...","{""5458"":1,""14681"":2,""120797"":1,""1078720"":2,""53...",{},19,324768,86.0,5.0,2.325581,0.0,0.0,2.493908,200.0,6.219578
324768,5,"{""1856814"":1,""955650"":1,""1539087"":1}","{""24624"":1,""1643"":1,""2399"":1,""235314"":1,""10034...",{},20,324768,86.0,5.0,2.325581,0.0,0.0,2.493908,200.0,6.219578
324768,0,"{""1183551"":1,""868991"":1,""1331393"":1,""1856814"":...","{""47413"":1,""1643"":3,""938"":1,""17805"":1,""538"":1,...",{},20,324768,86.0,5.0,2.325581,0.0,0.0,2.493908,200.0,6.219578
324768,5,"{""1006855"":1,""241767"":1,""1196877"":1,""1082055"":...","{""17382"":1,""5235"":1,""6700"":1,""1078720"":4,""5388...","{""595375"":1,""49144"":1,""7471"":1}",21,324768,86.0,5.0,2.325581,0.0,0.0,2.493908,200.0,6.219578
324768,0,"{""1006855"":1,""241767"":1,""1196877"":1,""1082055"":...","{""17382"":1,""5235"":1,""6700"":1,""1078720"":4,""5388...","{""595375"":1,""49144"":1,""7471"":1}",21,324768,86.0,5.0,2.325581,0.0,0.0,2.493908,200.0,6.219578


In [37]:
# Write one file per fold with all test rows whose id belongs to that fold;
# enumerate supplies the fold number used in the file name.
for i, index in enumerate(test_indexes):
    dfi = dftst[dftst.index.isin(index)]

    dfi_fn = test_folds_folder+"dfi{!s}.pkl".format(i)

    Save(dfi, dfi_fn)

# The id arrays themselves are saved too, so the fold membership can be recovered.
Save(test_indexes, test_indexes_fn)