In [1]:
import numpy as np
import pandas as pd
import os

import matplotlib.pyplot as plt
%matplotlib inline
from tqdm import tqdm_notebook
from sklearn.preprocessing import StandardScaler
from sklearn.svm import NuSVC, SVC
from sklearn.metrics import accuracy_score, confusion_matrix
pd.options.display.precision = 15

import lightgbm as lgb
import xgboost as xgb
import time
import datetime

import json
import ast
import eli5
import shap
from eli5.sklearn import PermutationImportance
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold, train_test_split
from sklearn.linear_model import Ridge, RidgeCV
import gc
from catboost import CatBoostClassifier
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

import altair as alt
from  altair.vega import v3
from IPython.display import HTML
from sklearn.linear_model import LinearRegression

from scipy.signal import hilbert
from scipy.signal import hann
from scipy.signal import convolve
from scipy import stats
from sklearn.kernel_ridge import KernelRidge

In [2]:
train = pd.read_csv('../input/X_train.csv')
test = pd.read_csv('../input/X_test.csv')
y_train = pd.read_csv('../input/y_train.csv')
sample_submission = pd.read_csv('../input/sample_submission.csv')

In [3]:
sub = pd.read_csv('../input/sample_submission.csv')
test_df = sub[['series_id']]

In [4]:
def calc_change_rate(x):
    change = (np.diff(x) / x[:-1]).values
    change = change[np.nonzero(change)[0]]
    change = change[~np.isnan(change)]
    change = change[change != -np.inf]
    change = change[change != np.inf]
    return np.mean(change)

def add_trend_feature(arr, abs_values=False):
    idx = np.array(range(len(arr)))
    if abs_values:
        arr = np.abs(arr)
    lr = LinearRegression()
    lr.fit(idx.reshape(-1, 1), arr)
    return lr.coef_[0]

def classic_sta_lta(x, length_sta, length_lta):
    
    sta = np.cumsum(x ** 2)

    # Convert to float
    sta = np.require(sta, dtype=np.float)

    # Copy for LTA
    lta = sta.copy()

    # Compute the STA and the LTA
    sta[length_sta:] = sta[length_sta:] - sta[:-length_sta]
    sta /= length_sta
    lta[length_lta:] = lta[length_lta:] - lta[:-length_lta]
    lta /= length_lta

    # Pad zeros
    sta[:length_lta - 1] = 0

    # Avoid division by zero by setting zero values to tiny float
    dtiny = np.finfo(0.0).tiny
    idx = lta < dtiny
    lta[idx] = dtiny

    return sta / lta

In [5]:
train_df = train[['series_id']].drop_duplicates().reset_index(drop=True)
train_df = train_df.drop(['series_id'], axis=1)
# test_df = test_df.drop(['series_id'], axis=1)


In [6]:
train_df.fillna(0, inplace = True)
test_df.fillna(0, inplace = True)
train_df.replace(-np.inf, 0, inplace = True)
train_df.replace(np.inf, 0, inplace = True)
test_df.replace(-np.inf, 0, inplace = True)
test_df.replace(np.inf, 0, inplace = True)
train_labels= y_train['surface']
# train_df.isnull().values.any()
# train_df.fillna(train_df.mean())
# train_df = train_df.astype(np.float64)
# train_df.dtypes


In [7]:
for col in tqdm_notebook(train.columns[3:]):
    train_df[col + '_mean'] = train.groupby(['series_id'])[col].mean()
    train_df[col + '_std'] = train.groupby(['series_id'])[col].std()
    train_df[col + '_max'] = train.groupby(['series_id'])[col].max()
    train_df[col + '_min'] = train.groupby(['series_id'])[col].min()
    train_df[col + '_max_to_min'] = train_df[col + '_max'] / train_df[col + '_min']

    for i in train_df['series_id']:
        train_df.loc[i, col + '_mean_change_abs'] = np.mean(np.diff(train.loc[train['series_id'] == i, col]))
        train_df.loc[i, col + '_mean_change_rate'] = calc_change_rate(train.loc[train['series_id'] == i, col])
        
        train_df.loc[i, col + '_q95'] = np.quantile(train.loc[train['series_id'] == i, col], 0.95)
        train_df.loc[i, col + '_q99'] = np.quantile(train.loc[train['series_id'] == i, col], 0.99)
        train_df.loc[i, col + '_q05'] = np.quantile(train.loc[train['series_id'] == i, col], 0.05)
        
        train_df.loc[i, col + '_abs_min'] = np.abs(train.loc[train['series_id'] == i, col]).min()
        train_df.loc[i, col + '_abs_max'] = np.abs(train.loc[train['series_id'] == i, col]).max()
        
        train_df.loc[i, col + '_trend'] = add_trend_feature(train.loc[train['series_id'] == i, col].values)
        train_df.loc[i, col + '_abs_trend'] = add_trend_feature(train.loc[train['series_id'] == i, col].values, abs_values=True)
        train_df.loc[i, col + '_abs_mean'] = np.abs(train.loc[train['series_id'] == i, col]).mean()
        train_df.loc[i, col + '_abs_std'] = np.abs(train.loc[train['series_id'] == i, col]).std()
        
        train_df.loc[i, col + '_mad'] = train.loc[train['series_id'] == i, col].mad()
        train_df.loc[i, col + '_kurt'] = train.loc[train['series_id'] == i, col].kurtosis()
        train_df.loc[i, col + '_skew'] = train.loc[train['series_id'] == i, col].skew()
        train_df.loc[i, col + '_med'] = train.loc[train['series_id'] == i, col].median()
        
        train_df.loc[i, col + '_Hilbert_mean'] = np.abs(hilbert(train.loc[train['series_id'] == i, col])).mean()
        
        train_df.loc[i, col + '_Hann_window_mean'] = (convolve(train.loc[train['series_id'] == i, col], hann(15), mode='same') / sum(hann(15))).mean()
        train_df.loc[i, col + '_classic_sta_lta1_mean'] = classic_sta_lta(train.loc[train['series_id'] == i, col], 10, 50).mean()

        train_df.loc[i, col + '_Moving_average_10_mean'] = train.loc[train['series_id'] == i, col].rolling(window=10).mean().mean(skipna=True)
        train_df.loc[i, col + '_Moving_average_16_mean'] = train.loc[train['series_id'] == i, col].rolling(window=16).mean().mean(skipna=True)
        train_df.loc[i, col + '_Moving_average_10_std'] = train.loc[train['series_id'] == i, col].rolling(window=10).std().mean(skipna=True)
        train_df.loc[i, col + '_Moving_average_16_std'] = train.loc[train['series_id'] == i, col].rolling(window=16).std().mean(skipna=True)
        
        train_df.loc[i, col + 'iqr'] = np.subtract(*np.percentile(train.loc[train['series_id'] == i, col], [75, 25]))
        train_df.loc[i, col + 'ave10'] = stats.trim_mean(train.loc[train['series_id'] == i, col], 0.1)

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




KeyError: 'series_id'

In [8]:
from sklearn.ensemble import RandomForestClassifier
random_forest = RandomForestClassifier(n_estimators = 100, random_state = 50)

In [9]:
random_forest.fit(train_df, train_labels)
# predictions = random_forest.predict_proba(test_df)[:,1]
# predict_class = random_forest.predict(test_df)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=50, verbose=0, warm_start=False)

In [10]:
for col in tqdm_notebook(test.columns[3:]):
    test_df[col + '_mean'] = test.groupby(['series_id'])[col].mean()
    test_df[col + '_std'] = test.groupby(['series_id'])[col].std()
    test_df[col + '_max'] = test.groupby(['series_id'])[col].max()
    test_df[col + '_min'] = test.groupby(['series_id'])[col].min()
    test_df[col + '_max_to_min'] = test_df[col + '_max'] / test_df[col + '_min']

    for i in test_df['series_id']:
        test_df.loc[i, col + '_mean_change_abs'] = np.mean(np.diff(test.loc[test['series_id'] == i, col]))
        test_df.loc[i, col + '_mean_change_rate'] = calc_change_rate(test.loc[test['series_id'] == i, col])
        
        test_df.loc[i, col + '_q95'] = np.quantile(test.loc[test['series_id'] == i, col], 0.95)
        test_df.loc[i, col + '_q99'] = np.quantile(test.loc[test['series_id'] == i, col], 0.99)
        test_df.loc[i, col + '_q05'] = np.quantile(test.loc[test['series_id'] == i, col], 0.05)
        
        test_df.loc[i, col + '_abs_min'] = np.abs(test.loc[test['series_id'] == i, col]).min()
        test_df.loc[i, col + '_abs_max'] = np.abs(test.loc[test['series_id'] == i, col]).max()
        
        test_df.loc[i, col + '_trend'] = add_trend_feature(test.loc[test['series_id'] == i, col].values)
        test_df.loc[i, col + '_abs_trend'] = add_trend_feature(test.loc[test['series_id'] == i, col].values, abs_values=True)
        test_df.loc[i, col + '_abs_mean'] = np.abs(test.loc[test['series_id'] == i, col]).mean()
        test_df.loc[i, col + '_abs_std'] = np.abs(test.loc[test['series_id'] == i, col]).std()
        
        test_df.loc[i, col + '_mad'] = test.loc[test['series_id'] == i, col].mad()
        test_df.loc[i, col + '_kurt'] = test.loc[test['series_id'] == i, col].kurtosis()
        test_df.loc[i, col + '_skew'] = test.loc[test['series_id'] == i, col].skew()
        test_df.loc[i, col + '_med'] = test.loc[test['series_id'] == i, col].median()
        
        test_df.loc[i, col + '_Hilbert_mean'] = np.abs(hilbert(test.loc[test['series_id'] == i, col])).mean()
        
        test_df.loc[i, col + '_Hann_window_mean'] = (convolve(test.loc[test['series_id'] == i, col], hann(15), mode='same') / sum(hann(15))).mean()
        test_df.loc[i, col + '_classic_sta_lta1_mean'] = classic_sta_lta(test.loc[test['series_id'] == i, col], 10, 50).mean()

        test_df.loc[i, col + '_Moving_average_10_mean'] = test.loc[test['series_id'] == i, col].rolling(window=10).mean().mean(skipna=True)
        test_df.loc[i, col + '_Moving_average_16_mean'] = test.loc[test['series_id'] == i, col].rolling(window=16).mean().mean(skipna=True)
        test_df.loc[i, col + '_Moving_average_10_std'] = test.loc[test['series_id'] == i, col].rolling(window=10).std().mean(skipna=True)
        test_df.loc[i, col + '_Moving_average_16_std'] = test.loc[test['series_id'] == i, col].rolling(window=16).std().mean(skipna=True)
        
        test_df.loc[i, col + 'iqr'] = np.subtract(*np.percentile(test.loc[test['series_id'] == i, col], [75, 25]))
        test_df.loc[i, col + 'ave10'] = stats.trim_mean(test.loc[test['series_id'] == i, col], 0.1)

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




In [11]:
submit_df = pd.DataFrame(test.reset_index()['series_id'],columns=['series_id'])
submit_df['surface']= predict_class

NameError: name 'predict_class' is not defined

In [12]:
from IPython.display import HTML
import base64

In [13]:
def create_download_link(df, title = "Download CSV file", filename = "data.csv"):  
    csv = df.to_csv(index=False)
    b64 = base64.b64encode(csv.encode())
    payload = b64.decode()
    html = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'
    html = html.format(payload=payload,title=title,filename=filename)
    return HTML(html)

In [14]:
create_download_link(submit_df, filename="8_andrew_lukyanenko_random_forest.csv")
