In [3]:
import sys
import logging
import math
import re

import glob
import os
import sys

import pandas as pd
import numpy as np

pd.options.display.max_rows = 1000
pd.options.display.max_columns = 1000

sys.path.insert(0, "/opt/vssexclude/personal/kaggle/k_tab_aug")

In [4]:
%load_ext autoreload
%autoreload 2
%config Completer.use_jedi = False

In [5]:
import src.config.constants as constants
import src.munging as process_data
import src.viz as viz
import src.common as common
import src.config.tsfresh_config as tsfresh_config

In [9]:
logger = logging.getLogger()

### List

In [5]:
file_names = ["mixed_1_set","symmetry_large_std_quantile_set","acf_pacf_set","cwt_coeff_set","change_quantile_set","fft_real_set","fft_imag_set","fft_abs_set","fft_angle_set","fft_agg_set","liner_agg_linear_set","mixed_2_set","mixed_3_set","mixed_4_set"]

In [6]:
file_names

['mixed_1_set',
 'symmetry_large_std_quantile_set',
 'acf_pacf_set',
 'cwt_coeff_set',
 'change_quantile_set',
 'fft_real_set',
 'fft_imag_set',
 'fft_abs_set',
 'fft_angle_set',
 'fft_agg_set',
 'liner_agg_linear_set',
 'mixed_2_set',
 'mixed_3_set',
 'mixed_4_set']

In [7]:
no_of_columns = 0
for name in file_names:
    df = pd.read_parquet(f"{constants.FEATURES_DATA_DIR}/cast/{name}_cast.parquet")
    print(f"Shape of {name}: {df.shape[1]}")
    no_of_columns = no_of_columns + df.shape[1]
no_of_columns

Shape of mixed_1_set: 44
Shape of symmetry_large_std_quantile_set: 47
Shape of acf_pacf_set: 23
Shape of cwt_coeff_set: 60
Shape of change_quantile_set: 60
Shape of fft_real_set: 100
Shape of fft_imag_set: 100
Shape of fft_abs_set: 100
Shape of fft_angle_set: 100
Shape of fft_agg_set: 4
Shape of liner_agg_linear_set: 53
Shape of mixed_2_set: 24
Shape of mixed_3_set: 30
Shape of mixed_4_set: 42


787

In [9]:
def select_features(df, features_to_drop):
    logger.info(f"Shape of the features {df.shape}")
    df = df.drop(features_to_drop, axis=1)
    logger.info(f"Shape of the features after dropping {df.shape}")
    return df

def load_data(name, features_to_drop):
    df = pd.read_parquet(f"{constants.FEATURES_DATA_DIR}/cast/{name}_cast.parquet")
    print(f"Shape of {name} before droipping {df.shape}")
    df = select_features(df, features_to_drop)
    print(f"Shape of {name} after droipping {df.shape}")
    return df

In [10]:
name = "mixed_1_set"
features_to_drop = ['loan__has_duplicate_min', 'loan__length', 'loan__sample_entropy']
df_mixed_1_set = load_data(name, features_to_drop)

name = "symmetry_large_std_quantile_set"
features_to_drop = features_to_drop = [
    "loan__symmetry_looking__r_0.0",
    "loan__symmetry_looking__r_0.1",
    "loan__symmetry_looking__r_0.15000000000000002",
    "loan__symmetry_looking__r_0.2",
    "loan__symmetry_looking__r_0.25",
    "loan__symmetry_looking__r_0.30000000000000004",
    "loan__symmetry_looking__r_0.35000000000000003",
    "loan__symmetry_looking__r_0.4",
    "loan__symmetry_looking__r_0.45",
    "loan__symmetry_looking__r_0.5",
    "loan__symmetry_looking__r_0.55",
    "loan__symmetry_looking__r_0.6000000000000001",
    "loan__symmetry_looking__r_0.65",
    "loan__symmetry_looking__r_0.7000000000000001",
    "loan__symmetry_looking__r_0.75",
    "loan__symmetry_looking__r_0.8",
    "loan__symmetry_looking__r_0.8500000000000001",
    "loan__symmetry_looking__r_0.9",
    "loan__symmetry_looking__r_0.9500000000000001",
    "loan__large_standard_deviation__r_0.05",
    "loan__large_standard_deviation__r_0.1",
    "loan__large_standard_deviation__r_0.15000000000000002",
    "loan__large_standard_deviation__r_0.30000000000000004",
    "loan__large_standard_deviation__r_0.35000000000000003",
    "loan__large_standard_deviation__r_0.4",
    "loan__large_standard_deviation__r_0.45",
    "loan__large_standard_deviation__r_0.5",
    "loan__large_standard_deviation__r_0.55",
    "loan__large_standard_deviation__r_0.6000000000000001",
    "loan__large_standard_deviation__r_0.65",
    "loan__large_standard_deviation__r_0.7000000000000001",
    "loan__large_standard_deviation__r_0.75",
    "loan__large_standard_deviation__r_0.8",
    "loan__large_standard_deviation__r_0.8500000000000001",
    "loan__large_standard_deviation__r_0.9",
    "loan__large_standard_deviation__r_0.9500000000000001",
]
df_sym = load_data(name, features_to_drop)

name = "acf_pacf_set"
features_to_drop = ["loan__partial_autocorrelation__lag_0"]
df_acf_pacf_set = load_data(name, features_to_drop)

name = "cwt_coeff_set"
features_to_drop = []
df_cwt_coeff_set = load_data(name, features_to_drop)

name = "change_quantile_set"
features_to_drop = []
df_change_quantile_set = load_data(name, features_to_drop)

name = "liner_agg_linear_set"
features_to_drop = [
    "loan__agg_linear_trend__attr_stderr__chunk_len_50__f_agg_max",
    "loan__agg_linear_trend__attr_stderr__chunk_len_50__f_agg_min",
    "loan__agg_linear_trend__attr_stderr__chunk_len_50__f_agg_mean",
    "loan__agg_linear_trend__attr_stderr__chunk_len_50__f_agg_var",
]
df_liner_agg_linear_set = load_data(name, features_to_drop)

name = "mixed_2_set"
features_to_drop = [
    "loan__count_above__t_0",
    "loan__query_similarity_count__query_None__threshold_00",
    "loan__matrix_profile__feature_min__threshold_098",
    "loan__matrix_profile__feature_max__threshold_098",
    "loan__matrix_profile__feature_mean__threshold_098",
    "loan__matrix_profile__feature_median__threshold_098",
    "loan__matrix_profile__feature_25__threshold_098",
    "loan__matrix_profile__feature_75__threshold_098",
]
df_mixed_2_set = load_data(name, features_to_drop)

name = "mixed_3_set"
features_to_drop = []
df_mixed_3_set = load_data(name, features_to_drop)

name = "mixed_4_set"
features_to_drop = [
        "loan__value_count__value_minus1",
        "loan__range_count__max_0__min_10000000000000",
        "loan__range_count__max_10000000000000__min_0",
        "loan__number_crossing_m__m_minus1",
        "loan__ratio_beyond_r_sigma__r_5",
        "loan__ratio_beyond_r_sigma__r_6",
        "loan__ratio_beyond_r_sigma__r_7",
        "loan__ratio_beyond_r_sigma__r_10",
]
df_mixed_4_set = load_data(name, features_to_drop)

name = "fft_real_set"
features_to_drop = [
    "loan__fft_coefficient__attr_real__coeff_51",
    "loan__fft_coefficient__attr_real__coeff_52",
    "loan__fft_coefficient__attr_real__coeff_53",
    "loan__fft_coefficient__attr_real__coeff_54",
    "loan__fft_coefficient__attr_real__coeff_55",
    "loan__fft_coefficient__attr_real__coeff_56",
    "loan__fft_coefficient__attr_real__coeff_57",
    "loan__fft_coefficient__attr_real__coeff_58",
    "loan__fft_coefficient__attr_real__coeff_59",
    "loan__fft_coefficient__attr_real__coeff_60",
    "loan__fft_coefficient__attr_real__coeff_61",
    "loan__fft_coefficient__attr_real__coeff_62",
    "loan__fft_coefficient__attr_real__coeff_63",
    "loan__fft_coefficient__attr_real__coeff_64",
    "loan__fft_coefficient__attr_real__coeff_65",
    "loan__fft_coefficient__attr_real__coeff_66",
    "loan__fft_coefficient__attr_real__coeff_67",
    "loan__fft_coefficient__attr_real__coeff_68",
    "loan__fft_coefficient__attr_real__coeff_69",
    "loan__fft_coefficient__attr_real__coeff_70",
    "loan__fft_coefficient__attr_real__coeff_71",
    "loan__fft_coefficient__attr_real__coeff_72",
    "loan__fft_coefficient__attr_real__coeff_73",
    "loan__fft_coefficient__attr_real__coeff_74",
    "loan__fft_coefficient__attr_real__coeff_75",
    "loan__fft_coefficient__attr_real__coeff_76",
    "loan__fft_coefficient__attr_real__coeff_77",
    "loan__fft_coefficient__attr_real__coeff_78",
    "loan__fft_coefficient__attr_real__coeff_79",
    "loan__fft_coefficient__attr_real__coeff_80",
    "loan__fft_coefficient__attr_real__coeff_81",
    "loan__fft_coefficient__attr_real__coeff_82",
    "loan__fft_coefficient__attr_real__coeff_83",
    "loan__fft_coefficient__attr_real__coeff_84",
    "loan__fft_coefficient__attr_real__coeff_85",
    "loan__fft_coefficient__attr_real__coeff_86",
    "loan__fft_coefficient__attr_real__coeff_87",
    "loan__fft_coefficient__attr_real__coeff_88",
    "loan__fft_coefficient__attr_real__coeff_89",
    "loan__fft_coefficient__attr_real__coeff_90",
    "loan__fft_coefficient__attr_real__coeff_91",
    "loan__fft_coefficient__attr_real__coeff_92",
    "loan__fft_coefficient__attr_real__coeff_93",
    "loan__fft_coefficient__attr_real__coeff_94",
    "loan__fft_coefficient__attr_real__coeff_95",
    "loan__fft_coefficient__attr_real__coeff_96",
    "loan__fft_coefficient__attr_real__coeff_97",
    "loan__fft_coefficient__attr_real__coeff_98",
    "loan__fft_coefficient__attr_real__coeff_99",
]
df_fft_real_set = load_data(name, features_to_drop)

name = "fft_imag_set"
features_to_drop = [
    "loan__fft_coefficient__attr_imag__coeff_0",
    "loan__fft_coefficient__attr_imag__coeff_50",
    "loan__fft_coefficient__attr_imag__coeff_51",
    "loan__fft_coefficient__attr_imag__coeff_52",
    "loan__fft_coefficient__attr_imag__coeff_53",
    "loan__fft_coefficient__attr_imag__coeff_54",
    "loan__fft_coefficient__attr_imag__coeff_55",
    "loan__fft_coefficient__attr_imag__coeff_56",
    "loan__fft_coefficient__attr_imag__coeff_57",
    "loan__fft_coefficient__attr_imag__coeff_58",
    "loan__fft_coefficient__attr_imag__coeff_59",
    "loan__fft_coefficient__attr_imag__coeff_60",
    "loan__fft_coefficient__attr_imag__coeff_61",
    "loan__fft_coefficient__attr_imag__coeff_62",
    "loan__fft_coefficient__attr_imag__coeff_63",
    "loan__fft_coefficient__attr_imag__coeff_64",
    "loan__fft_coefficient__attr_imag__coeff_65",
    "loan__fft_coefficient__attr_imag__coeff_66",
    "loan__fft_coefficient__attr_imag__coeff_67",
    "loan__fft_coefficient__attr_imag__coeff_68",
    "loan__fft_coefficient__attr_imag__coeff_69",
    "loan__fft_coefficient__attr_imag__coeff_70",
    "loan__fft_coefficient__attr_imag__coeff_71",
    "loan__fft_coefficient__attr_imag__coeff_72",
    "loan__fft_coefficient__attr_imag__coeff_73",
    "loan__fft_coefficient__attr_imag__coeff_74",
    "loan__fft_coefficient__attr_imag__coeff_75",
    "loan__fft_coefficient__attr_imag__coeff_76",
    "loan__fft_coefficient__attr_imag__coeff_77",
    "loan__fft_coefficient__attr_imag__coeff_78",
    "loan__fft_coefficient__attr_imag__coeff_79",
    "loan__fft_coefficient__attr_imag__coeff_80",
    "loan__fft_coefficient__attr_imag__coeff_81",
    "loan__fft_coefficient__attr_imag__coeff_82",
    "loan__fft_coefficient__attr_imag__coeff_83",
    "loan__fft_coefficient__attr_imag__coeff_84",
    "loan__fft_coefficient__attr_imag__coeff_85",
    "loan__fft_coefficient__attr_imag__coeff_86",
    "loan__fft_coefficient__attr_imag__coeff_87",
    "loan__fft_coefficient__attr_imag__coeff_88",
    "loan__fft_coefficient__attr_imag__coeff_89",
    "loan__fft_coefficient__attr_imag__coeff_90",
    "loan__fft_coefficient__attr_imag__coeff_91",
    "loan__fft_coefficient__attr_imag__coeff_92",
    "loan__fft_coefficient__attr_imag__coeff_93",
    "loan__fft_coefficient__attr_imag__coeff_94",
    "loan__fft_coefficient__attr_imag__coeff_95",
    "loan__fft_coefficient__attr_imag__coeff_96",
    "loan__fft_coefficient__attr_imag__coeff_97",
    "loan__fft_coefficient__attr_imag__coeff_98",
    "loan__fft_coefficient__attr_imag__coeff_99",
]
df_fft_imag_set = load_data(name, features_to_drop)

name = "fft_abs_set"
features_to_drop = [
    "loan__fft_coefficient__attr_abs__coeff_51",
    "loan__fft_coefficient__attr_abs__coeff_52",
    "loan__fft_coefficient__attr_abs__coeff_53",
    "loan__fft_coefficient__attr_abs__coeff_54",
    "loan__fft_coefficient__attr_abs__coeff_55",
    "loan__fft_coefficient__attr_abs__coeff_56",
    "loan__fft_coefficient__attr_abs__coeff_57",
    "loan__fft_coefficient__attr_abs__coeff_58",
    "loan__fft_coefficient__attr_abs__coeff_59",
    "loan__fft_coefficient__attr_abs__coeff_60",
    "loan__fft_coefficient__attr_abs__coeff_61",
    "loan__fft_coefficient__attr_abs__coeff_62",
    "loan__fft_coefficient__attr_abs__coeff_63",
    "loan__fft_coefficient__attr_abs__coeff_64",
    "loan__fft_coefficient__attr_abs__coeff_65",
    "loan__fft_coefficient__attr_abs__coeff_66",
    "loan__fft_coefficient__attr_abs__coeff_67",
    "loan__fft_coefficient__attr_abs__coeff_68",
    "loan__fft_coefficient__attr_abs__coeff_69",
    "loan__fft_coefficient__attr_abs__coeff_70",
    "loan__fft_coefficient__attr_abs__coeff_71",
    "loan__fft_coefficient__attr_abs__coeff_72",
    "loan__fft_coefficient__attr_abs__coeff_73",
    "loan__fft_coefficient__attr_abs__coeff_74",
    "loan__fft_coefficient__attr_abs__coeff_75",
    "loan__fft_coefficient__attr_abs__coeff_76",
    "loan__fft_coefficient__attr_abs__coeff_77",
    "loan__fft_coefficient__attr_abs__coeff_78",
    "loan__fft_coefficient__attr_abs__coeff_79",
    "loan__fft_coefficient__attr_abs__coeff_80",
    "loan__fft_coefficient__attr_abs__coeff_81",
    "loan__fft_coefficient__attr_abs__coeff_82",
    "loan__fft_coefficient__attr_abs__coeff_83",
    "loan__fft_coefficient__attr_abs__coeff_84",
    "loan__fft_coefficient__attr_abs__coeff_85",
    "loan__fft_coefficient__attr_abs__coeff_86",
    "loan__fft_coefficient__attr_abs__coeff_87",
    "loan__fft_coefficient__attr_abs__coeff_88",
    "loan__fft_coefficient__attr_abs__coeff_89",
    "loan__fft_coefficient__attr_abs__coeff_90",
    "loan__fft_coefficient__attr_abs__coeff_91",
    "loan__fft_coefficient__attr_abs__coeff_92",
    "loan__fft_coefficient__attr_abs__coeff_93",
    "loan__fft_coefficient__attr_abs__coeff_94",
    "loan__fft_coefficient__attr_abs__coeff_95",
    "loan__fft_coefficient__attr_abs__coeff_96",
    "loan__fft_coefficient__attr_abs__coeff_97",
    "loan__fft_coefficient__attr_abs__coeff_98",
    "loan__fft_coefficient__attr_abs__coeff_99",
]
df_fft_abs_set = load_data(name, features_to_drop)

name = "fft_angle_set"
features_to_drop = [
    "loan__fft_coefficient__attr_angle__coeff_0",
    "loan__fft_coefficient__attr_angle__coeff_51",
    "loan__fft_coefficient__attr_angle__coeff_52",
    "loan__fft_coefficient__attr_angle__coeff_53",
    "loan__fft_coefficient__attr_angle__coeff_54",
    "loan__fft_coefficient__attr_angle__coeff_55",
    "loan__fft_coefficient__attr_angle__coeff_56",
    "loan__fft_coefficient__attr_angle__coeff_57",
    "loan__fft_coefficient__attr_angle__coeff_58",
    "loan__fft_coefficient__attr_angle__coeff_59",
    "loan__fft_coefficient__attr_angle__coeff_60",
    "loan__fft_coefficient__attr_angle__coeff_61",
    "loan__fft_coefficient__attr_angle__coeff_62",
    "loan__fft_coefficient__attr_angle__coeff_63",
    "loan__fft_coefficient__attr_angle__coeff_64",
    "loan__fft_coefficient__attr_angle__coeff_65",
    "loan__fft_coefficient__attr_angle__coeff_66",
    "loan__fft_coefficient__attr_angle__coeff_67",
    "loan__fft_coefficient__attr_angle__coeff_68",
    "loan__fft_coefficient__attr_angle__coeff_69",
    "loan__fft_coefficient__attr_angle__coeff_70",
    "loan__fft_coefficient__attr_angle__coeff_71",
    "loan__fft_coefficient__attr_angle__coeff_72",
    "loan__fft_coefficient__attr_angle__coeff_73",
    "loan__fft_coefficient__attr_angle__coeff_74",
    "loan__fft_coefficient__attr_angle__coeff_75",
    "loan__fft_coefficient__attr_angle__coeff_76",
    "loan__fft_coefficient__attr_angle__coeff_77",
    "loan__fft_coefficient__attr_angle__coeff_78",
    "loan__fft_coefficient__attr_angle__coeff_79",
    "loan__fft_coefficient__attr_angle__coeff_80",
    "loan__fft_coefficient__attr_angle__coeff_81",
    "loan__fft_coefficient__attr_angle__coeff_82",
    "loan__fft_coefficient__attr_angle__coeff_83",
    "loan__fft_coefficient__attr_angle__coeff_84",
    "loan__fft_coefficient__attr_angle__coeff_85",
    "loan__fft_coefficient__attr_angle__coeff_86",
    "loan__fft_coefficient__attr_angle__coeff_87",
    "loan__fft_coefficient__attr_angle__coeff_88",
    "loan__fft_coefficient__attr_angle__coeff_89",
    "loan__fft_coefficient__attr_angle__coeff_90",
    "loan__fft_coefficient__attr_angle__coeff_91",
    "loan__fft_coefficient__attr_angle__coeff_92",
    "loan__fft_coefficient__attr_angle__coeff_93",
    "loan__fft_coefficient__attr_angle__coeff_94",
    "loan__fft_coefficient__attr_angle__coeff_95",
    "loan__fft_coefficient__attr_angle__coeff_96",
    "loan__fft_coefficient__attr_angle__coeff_97",
    "loan__fft_coefficient__attr_angle__coeff_98",
    "loan__fft_coefficient__attr_angle__coeff_99",
]
df_fft_angle_set = load_data(name, features_to_drop)

Shape of mixed_1_set before droipping (400000, 44)
Shape of mixed_1_set after droipping (400000, 41)
Shape of symmetry_large_std_quantile_set before droipping (400000, 47)
Shape of symmetry_large_std_quantile_set after droipping (400000, 11)
Shape of acf_pacf_set before droipping (400000, 23)
Shape of acf_pacf_set after droipping (400000, 22)
Shape of cwt_coeff_set before droipping (400000, 60)
Shape of cwt_coeff_set after droipping (400000, 60)
Shape of change_quantile_set before droipping (400000, 60)
Shape of change_quantile_set after droipping (400000, 60)
Shape of liner_agg_linear_set before droipping (400000, 53)
Shape of liner_agg_linear_set after droipping (400000, 49)
Shape of mixed_2_set before droipping (400000, 24)
Shape of mixed_2_set after droipping (400000, 16)
Shape of mixed_3_set before droipping (400000, 30)
Shape of mixed_3_set after droipping (400000, 30)
Shape of mixed_4_set before droipping (400000, 42)
Shape of mixed_4_set after droipping (400000, 34)
Shape of ff

In [11]:
dfs= [df_acf_pacf_set, df_change_quantile_set, df_cwt_coeff_set, df_fft_abs_set, df_fft_angle_set, df_fft_imag_set, df_fft_real_set, df_liner_agg_linear_set, df_mixed_1_set, df_mixed_2_set, df_mixed_3_set, df_mixed_4_set, df_sym]

result_df = pd.concat(dfs, axis=1)

In [13]:
result_df.to_parquet(f"{constants.FEATURES_DATA_DIR}/cast/tsfresh_f_merged.parquet", index=True)

In [22]:
features_df = pd.read_parquet(f"{constants.FEATURES_DATA_DIR}/cast/tsfresh_f_merged.parquet")

In [23]:
train_df, test_df, sample_submission_df = process_data.read_processed_data(
    logger,
    constants.PROCESSED_DATA_DIR,
    train=True,
    test=True,
    sample_submission=True,
)

In [24]:
len(train_df)

250000

In [25]:
train_df_f = features_df.iloc[0: len(train_df)]

In [28]:
train_df_f["loss"] = train_df.loss

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df_f["loss"] = train_df.loss


In [30]:
c = train_df_f.corr()

In [36]:
pd.set_option('display.float_format', '{:.3f}'.format)

In [37]:
c["loss"].sort_values(ascending=False)

loss                                                                1.000
loan__fft_coefficient__attr_imag__coeff_21                          0.025
loan__fft_coefficient__attr_real__coeff_13                          0.021
loan__fft_coefficient__attr_real__coeff_37                          0.020
loan__fft_coefficient__attr_imag__coeff_37                          0.020
loan__fft_coefficient__attr_real__coeff_31                          0.019
loan__fft_coefficient__attr_real__coeff_46                          0.018
loan__fft_coefficient__attr_imag__coeff_43                          0.017
loan__fft_coefficient__attr_angle__coeff_21                         0.017
loan__cwt_coefficients__coeff_0__w_2__widths_251020                 0.016
loan__energy_ratio_by_chunks__num_segments_10__segment_focus_7      0.016
loan__cwt_coefficients__coeff_14__w_5__widths_251020                0.015
loan__agg_linear_trend__attr_intercept__chunk_len_5__f_agg_min      0.015
loan__fft_coefficient__attr_real__coef