# Introduction
Hey, thanks for viewing my Kernel!

If you like my work, please leave an upvote: it is really appreciated and will motivate me to offer more content to the Kaggle community! :)

EDA was done in this [notebook](https://www.kaggle.com/code/hasanbasriakcay/tpsapr22-eda-fe-baseline)<br>
Pseudo Labeling was done in this [notebook](https://www.kaggle.com/code/hasanbasriakcay/tpsapr22-fe-pseudo-labels-baseline)<br>
Prediction without NN is in this [notebook](https://www.kaggle.com/code/hasanbasriakcay/tpsapr22-optuna-lgbm-blend)

In [46]:
import pandas as pd
import numpy as np
import warnings 

# Silence every warning raised from here on.
warnings.simplefilter("ignore")

# Open question from the author: do PyCaret's predictions get scrambled when
# no index is specified?
step_index = ['sequence', 'subject', 'step']
train = pd.read_csv("../input/tabular-playground-series-apr-2022/train.csv", index_col=step_index)
test = pd.read_csv("../input/tabular-playground-series-apr-2022/test.csv", index_col=step_index)
test_pseudo = pd.read_csv("../input/tpsapr22-pseudo-labels/pseudo_labeled_test.csv")
labels = pd.read_csv("../input/tabular-playground-series-apr-2022/train_labels.csv")
sub = pd.read_csv("../input/tabular-playground-series-apr-2022/sample_submission.csv", index_col=['sequence'])

# Preview the first rows of each frame.
for frame in (train, test, labels, sub):
    display(frame.head())

In [47]:
# Keep only the pseudo-labelled test sequences whose predicted probability is
# confident in either direction (>= th or <= 1 - th), then harden that
# probability into a 0/1 label by rounding.
th = 0.95
confident = (test_pseudo['state_proba'] >= th) | (test_pseudo['state_proba'] <= (1 - th))
# Take an explicit copy so that overwriting the 'state' column below acts on an
# independent frame and is never a chained assignment on a slice of test_pseudo.
test_pseudo_selected = (
    test_pseudo.loc[confident, ['sequence', 'state_proba']]
    .copy()
    .rename(columns={'state_proba': 'state'})
)
test_pseudo_selected['state'] = test_pseudo_selected['state'].round()
test_pseudo_selected.head()

# Feature Engineering

In [48]:
# The prediction target is the state of each sequence, but every sequence holds 60 steps of data, so aggregate them with summary statistics
from scipy.stats import kurtosis
def kurtosis_func(series):
    """Return the kurtosis of ``series`` via ``scipy.stats.kurtosis``.

    SciPy's default settings are used unchanged; this thin wrapper only gives
    the statistic its own function name.
    """
    value = kurtosis(series)
    return value

def q01(series):
    """Return the 0.01 quantile (1st percentile) of ``series``."""
    level = 0.01
    return np.quantile(series, q=level)

def q05(series):
    """Return the 0.05 quantile (5th percentile) of ``series``."""
    level = 0.05
    return np.quantile(series, q=level)

def q95(series):
    """Return the 0.95 quantile (95th percentile) of ``series``."""
    level = 0.95
    return np.quantile(series, q=level)

def q99(series):
    """Return the 0.99 quantile (99th percentile) of ``series``."""
    level = 0.99
    return np.quantile(series, q=level)

def aggregated_features(df, aggregation_cols=None, prefix=''):
    """Collapse the sensor readings of each group into one row of statistics.

    For every column ``sensor_00`` .. ``sensor_12`` the following statistics
    are computed per group: mean, max, min, var, mad (mean absolute deviation),
    sum, median, skew, kurtosis_func, q01, q05, q95 and q99.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the 13 sensor columns and the grouping keys (as
        columns or index levels).
    aggregation_cols : list of str, optional
        Keys to group by. Defaults to ``['sequence']``.
    prefix : str, optional
        Text prepended to every generated column name.

    Returns
    -------
    pandas.DataFrame
        One row per group: the grouping keys, ``<prefix>size`` (row count of
        the group) and one ``<prefix><sensor>_<statistic>`` column for each
        sensor/statistic pair.
    """
    # A None sentinel avoids sharing one mutable default list across calls.
    if aggregation_cols is None:
        aggregation_cols = ['sequence']

    def mad(series):
        # Mean absolute deviation around the mean. The 'mad' string
        # aggregation was removed in pandas 2.0; the function is named `mad`
        # so the generated column names stay '<sensor>_mad'.
        return (series - series.mean()).abs().mean()

    statistics = ['mean', 'max', 'min', 'var', mad, 'sum', 'median', 'skew',
                  kurtosis_func, q01, q05, q95, q99]
    # Same statistic list for every sensor column sensor_00 .. sensor_12.
    agg_strategy = {'sensor_{:02d}'.format(i): list(statistics) for i in range(13)}

    grouped = df.groupby(aggregation_cols)
    group = grouped.aggregate(agg_strategy)
    # Flatten the (sensor, statistic) column pairs into '<prefix><sensor>_<statistic>'.
    group.columns = [str(prefix) + '_'.join(col).strip() for col in group.columns]
    group.reset_index(inplace=True)

    # Group size goes first so it directly follows the grouping keys.
    temp = grouped.size().reset_index(name=str(prefix) + 'size')
    return pd.merge(temp, group, how='left', on=aggregation_cols)

* Let's try out PyCaret

In [49]:
!pip install pycaret -full
import pycaret
from pycaret.classification import *
from pycaret.classification import setup, compare_models, blend_models, finalize_model, predict_model, plot_model

In [50]:
# Replace the per-step frames with one row of statistics per (sequence, subject).
group_keys = ['sequence', 'subject']
train = aggregated_features(train, aggregation_cols=group_keys)
test = aggregated_features(test, aggregation_cols=group_keys)

In [51]:
# Attach the prediction target (state) to the aggregated training features.
train_df_label = train.merge(labels, on="sequence")

In [52]:
# Show the last rows of the merged training frame.
train_df_label.tail()

In [53]:
# Configure the PyCaret classification experiment on the labelled training frame.
setup_options = dict(
    data=train_df_label,
    target='state',              # target variable
    ignore_features=['sequence'],
    numeric_imputation='mean',   # fill missing values with the mean
    fold_strategy='groupkfold',
    fold_groups='sequence',
    use_gpu=True,
    fold=3,
    remove_outliers=True,
    normalize=True,
    normalize_method='minmax',
    silent=True,
)
clf1 = setup(**setup_options)

In [54]:
# Run PyCaret's model comparison with the experiment configured above.
compare_models()

In [55]:
# Build the LightGBM model that is used for the final predictions below.
lightgbm_model = create_model('lightgbm')

In [56]:
# Display PyCaret's interpretation output for the LightGBM model.
interpret_model(lightgbm_model)

In [57]:
# Finalize the LightGBM model, then predict on the aggregated test features.
# NOTE(review): finalize_model presumably refits on the full training data — confirm against the PyCaret docs.
final_model = finalize_model(lightgbm_model)
pred = predict_model(final_model, data=test)

In [2]:
# Preview the first rows of the prediction frame.
pred.head()

In [59]:
# Preview the submission template (indexed by sequence).
sub.head()

In [60]:
# Show the predicted labels that are written to the submission below.
pred['Label']

In [1]:
# `sub` is indexed by sequence id (read with index_col=['sequence']), while `pred`
# comes from `test`, whose index was reset to 0..N-1 in aggregated_features.
# Assigning pred['Label'] directly would align on those mismatched labels and
# produce NaN, so re-key the predictions by sequence id before assigning.
# Assumes `pred` keeps the row order of `test` (it was built with data=test) — confirm.
sub['state'] = pd.Series(pred['Label'].to_numpy(), index=test['sequence'].to_numpy())

In [61]:
# The sequence id lives in the index of `sub` (read with index_col=['sequence']),
# so the index must be written out; index=False would drop the id column and
# leave only 'state' in the file.
sub.to_csv('submission.csv', index=True)
sub