In [1]:
import numpy as np
import pandas as pd
import os

import matplotlib.pyplot as plt
%matplotlib inline
from tqdm import tqdm_notebook
from sklearn.preprocessing import StandardScaler
from sklearn.svm import NuSVR
from sklearn.metrics import mean_absolute_error
pd.options.display.precision = 15

import lightgbm as lgb
import xgboost as xgb
import time
import datetime

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.linear_model import Ridge, RidgeCV
import gc
from catboost import CatBoostRegressor
import seaborn as sns

In [5]:
def preprocess_train(seg_id, train):
    d = {'sma_10' : [0],
         'ewma_10' : [0],
         'ave' : [0],
         'std' : [0],
         'max' : [0],
         'min' : [0],
         'time_to_failure' : [0]}
    result = pd.DataFrame(d, dtype=np.float64)
    
    x = train["acoustic_data"].values
    
    train["sma_3"] = train["acoustic_data"].rolling(window = 3).mean()
    train['ewma_3'] = pd.Series.ewm(train["acoustic_data"], span=3).mean()
    train["sma_5"] = train["acoustic_data"].rolling(window = 5).mean()
    train['ewma_5'] = pd.Series.ewm(train["acoustic_data"], span=5).mean()
    train["sma_10"] = train["acoustic_data"].rolling(window = 10).mean()
    train['ewma_10'] = pd.Series.ewm(train["acoustic_data"], span=10).mean()
    train["sma_50"] = train["acoustic_data"].rolling(window = 50).mean()
    train['ewma_50'] = pd.Series.ewm(train["acoustic_data"], span=50).mean()
    train["sma_100"] = train["acoustic_data"].rolling(window = 100).mean()
    train['ewma_100'] = pd.Series.ewm(train["acoustic_data"], span=100).mean()
    
    result['time_to_failure'] = train["time_to_failure"].values[-1]
    result["seg_id"] = seg_id
    result["ewma_3"] = train["ewma_3"].mean()
    result["sma_3"] = train["sma_3"].mean()
    result["ewma_5"] = train["ewma_5"].mean()
    result["sma_5"] = train["sma_5"].mean()
    result["ewma_10"] = train["ewma_10"].mean()
    result["sma_10"] = train["sma_10"].mean()
    result["ewma_50"] = train["ewma_50"].mean()
    result["sma_50"] = train["sma_50"].mean()
    result["ewma_100"] = train["ewma_100"].mean()
    result["sma_100"] = train["sma_100"].mean()
    result["ewma_3_last"] = train.loc[train.index[-1], "ewma_3"]
    result["sma_3_last"] = train.loc[train.index[-1], "sma_3"]
    result["ewma_5_last"] = train.loc[train.index[-1], "ewma_5"]
    result["sma_5_last"] = train.loc[train.index[-1], "sma_5"]
    result["ewma_10_last"] = train.loc[train.index[-1], "ewma_10"]
    result["sma_10_last"] = train.loc[train.index[-1], "sma_10"]
    result["ewma_50_last"] = train.loc[train.index[-1], "ewma_50"]
    result["sma_50_last"] = train.loc[train.index[-1], "sma_50"]
    result["ewma_100_last"] = train.loc[train.index[-1], "ewma_100"]
    result["sma_100_last"] = train.loc[train.index[-1], "sma_100"]
    result['ave'] = x.mean()
    result['std'] = x.std()
    result['max'] = x.max()
    result['min'] = x.min()
    result['q01'] = np.quantile(x,0.01)
    result['q05'] = np.quantile(x,0.05)
    result['q95'] = np.quantile(x,0.95)
    result['q99'] = np.quantile(x,0.99)
    

    return result

In [6]:
frames = []
rows = 150_000
segments = 1
#train = pd.read_csv('../input/train.csv', dtype={'acoustic_data': np.int16, 'time_to_failure': np.float64})

for df in pd.read_csv('../input/train.csv', chunksize=rows, dtype={'acoustic_data': np.int16, 'time_to_failure': np.float64}):
    _ = preprocess_train(segments, df)
    frames.append(_)
    segments += 1

X_train = pd.concat(frames)
X_train = X_train.set_index("seg_id")

In [7]:
X_train.head()

Unnamed: 0_level_0,sma_10,ewma_10,ave,std,max,min,time_to_failure,ewma_3,sma_3,ewma_5,sma_5,ewma_50,sma_50,ewma_100,sma_100,ewma_3_last,sma_3_last,ewma_5_last,sma_5_last,ewma_10_last,sma_10_last,ewma_50_last,sma_50_last,ewma_100_last,sma_100_last,q01,q05,q95,q99
seg_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1
1,4.884088378635989,4.884241616265863,4.884113333333334,5.101089126891323,104,-98,1.4307971859,4.88416746243538,4.884107343653485,4.884201080817154,4.884095575881966,4.884270544770727,4.883969296636922,4.884335546276895,4.88386354994299,1.728049414019365,2.666666666666667,2.508857052871273,3.0,3.699439328235981,4.4,5.14777368465166,5.08,5.074976099715364,5.22,-8.0,-2.0,11.0,18.0
2,4.725731543892671,4.725763269986679,4.725766666666667,6.588801819164257,181,-154,1.3914988931,4.725763863125676,4.72575634341787,4.725760922746515,4.725730019467171,4.725809832481752,4.725729338250502,4.725842751947517,4.725623311385454,5.847922518235241,6.333333333333333,5.785861082950227,6.2,5.290293656235559,4.8,4.782703153578837,4.76,4.819991534413204,5.02,-11.0,-2.0,12.0,21.0
3,4.906229040409174,4.906390489500286,4.906393333333333,6.967373808828945,140,-106,1.3531960947,4.906395764152028,4.906394307479615,4.906392510202864,4.906374836662358,4.906429776004419,4.906072250268403,4.906416869434004,4.906088151513369,5.289966499168143,5.333333333333333,6.263221040726829,7.4,6.79561130419279,7.8,5.915605812051815,5.72,5.612059494272287,5.54,-15.0,-3.0,13.0,26.0
4,4.902290804114878,4.902126943248564,4.90224,6.922282112791032,197,-199,1.3137978019,4.902213081352357,4.902254252278854,4.902180699534021,4.902297394597307,4.902043049787403,4.902059206007244,4.902052266336941,4.901946217837145,5.881369409386302,5.333333333333333,6.166403212899276,6.8,6.272398460567697,6.5,5.631434942554828,5.5,5.358835646046716,5.12,-12.0,-2.0,12.0,22.0
5,4.908727857004753,4.908785382625916,4.90872,7.301085852684289,145,-126,1.2743995091,4.908729341458675,4.908658782117159,4.908744462593931,4.908658897570609,4.908880179258194,4.908957859567479,4.908872987383111,4.909174788693883,7.607461545685376,7.333333333333333,5.929508508329639,5.0,4.222870197071386,2.5,3.914656499374436,3.86,4.231041225100518,4.44,-15.0,-2.0,12.0,26.0


In [8]:
X_train.shape

(4195, 29)

In [9]:
params = {'num_leaves': 54,
          'min_data_in_leaf': 79,
          'objective': 'huber',
          'max_depth': -1,
          'learning_rate': 0.01,
          "boosting": "gbdt",
          "bagging_freq": 3,
          "bagging_fraction": 0.8126672064208567,
          "bagging_seed": 11,
          "metric": 'mae',
          "verbosity": -1,
          'reg_alpha': 1.1302650970728192,
          'reg_lambda': 0.3603427518866501,
          'num_iterations' : 5000}

In [10]:
train_features = list()
TARGET = 'time_to_failure'
train_features = [f for f in X_train.columns if f != TARGET]

In [11]:
def sma (x, N):
    cumsum = np.cumsum(np.insert(x, 0, 0)) 
    temp = (cumsum[N:] - cumsum[:-N]) / float(N)
    result = np.zeros(len(x) - len(temp))
    result = np.concatenate((result, temp))
    #result = np.reshape(result, (len(result),1))
    return result

In [13]:
def preprocess(seg_id):
    d = {'sma_10' : [0],
         'ewma_10' : [0],
         'ave' : [0],
         'std' : [0],
         'max' : [0],
         'min' : [0]}
    test = pd.read_csv('../input/test/' + seg_id + '.csv')
    
    x = test["acoustic_data"].values
    
    test["sma_3"] = test["acoustic_data"].rolling(window = 3).mean()
    test['ewma_3'] = pd.Series.ewm(test["acoustic_data"], span=3).mean()
    test["sma_5"] = test["acoustic_data"].rolling(window = 5).mean()
    test['ewma_5'] = pd.Series.ewm(test["acoustic_data"], span=5).mean()
    test["sma_10"] = test["acoustic_data"].rolling(window = 10).mean()
    test['ewma_10'] = pd.Series.ewm(test["acoustic_data"], span=10).mean()
    test["sma_50"] = test["acoustic_data"].rolling(window = 50).mean()
    test['ewma_50'] = pd.Series.ewm(test["acoustic_data"], span=50).mean()
    test["sma_100"] = test["acoustic_data"].rolling(window = 100).mean()
    test['ewma_100'] = pd.Series.ewm(test["acoustic_data"], span=100).mean()
    
    
    result = pd.DataFrame(d, dtype=np.float64)
    result["seg_id"] = seg_id
    result["sma_3"] = test["sma_3"].mean()
    result["ewma_3"] = test["ewma_3"].mean()
    result["sma_5"] = test["sma_5"].mean()
    result["ewma_5"] = test["ewma_5"].mean()
    result["sma_10"] = test["sma_10"].mean()
    result["ewma_10"] = test["ewma_10"].mean()
    result["sma_50"] = test["sma_50"].mean()
    result["ewma_50"] = test["ewma_50"].mean()
    result["sma_100"] = test["sma_100"].mean()
    result["ewma_100"] = test["ewma_100"].mean()
    result["sma_3_last"] = test.loc[test.index[-1], "sma_3"]
    result["ewma_3_last"] = test.loc[test.index[-1], "ewma_3"]
    result["sma_5_last"] = test.loc[test.index[-1], "sma_5"]
    result["ewma_5_last"] = test.loc[test.index[-1], "ewma_5"]
    result["sma_10_last"] = test.loc[test.index[-1], "sma_10"]
    result["ewma_10_last"] = test.loc[test.index[-1], "ewma_10"]
    result["sma_50_last"] = test.loc[test.index[-1], "sma_50"]
    result["ewma_50_last"] = test.loc[test.index[-1], "ewma_50"]
    result["sma_100_last"] = test.loc[test.index[-1], "sma_100"]
    result["ewma_100_last"] = test.loc[test.index[-1], "ewma_100"]
    result['ave'] = x.mean()
    result['std'] = x.std()
    result['max'] = x.max()
    result['min'] = x.min()
    result['q01'] = np.quantile(x,0.01)
    result['q05'] = np.quantile(x,0.05)
    result['q95'] = np.quantile(x,0.95)
    result['q99'] = np.quantile(x,0.99)
    
    return result
    

In [14]:
submission = pd.read_csv('../input/sample_submission.csv', index_col='seg_id')
index = pd.DataFrame(columns=X_train.columns, dtype=np.float64, index=submission.index)
frames = []
for seg_id in tqdm_notebook(index.index):
        _ = preprocess(seg_id)
        frames.append(_)
X_test = pd.concat(frames)
X_test = X_test.set_index("seg_id")

HBox(children=(IntProgress(value=0, max=2624), HTML(value='')))




In [15]:
print(X_test.columns)
print(X_train.columns)

Index(['sma_10', 'ewma_10', 'ave', 'std', 'max', 'min', 'sma_3', 'ewma_3',
       'sma_5', 'ewma_5', 'sma_50', 'ewma_50', 'sma_100', 'ewma_100',
       'sma_3_last', 'ewma_3_last', 'sma_5_last', 'ewma_5_last', 'sma_10_last',
       'ewma_10_last', 'sma_50_last', 'ewma_50_last', 'sma_100_last',
       'ewma_100_last', 'q01', 'q05', 'q95', 'q99'],
      dtype='object')
Index(['sma_10', 'ewma_10', 'ave', 'std', 'max', 'min', 'time_to_failure',
       'ewma_3', 'sma_3', 'ewma_5', 'sma_5', 'ewma_50', 'sma_50', 'ewma_100',
       'sma_100', 'ewma_3_last', 'sma_3_last', 'ewma_5_last', 'sma_5_last',
       'ewma_10_last', 'sma_10_last', 'ewma_50_last', 'sma_50_last',
       'ewma_100_last', 'sma_100_last', 'q01', 'q05', 'q95', 'q99'],
      dtype='object')


In [16]:
print(X_test.head(10))
print(X_train.head(10))

                       sma_10            ewma_10  ...    q95   q99
seg_id                                            ...             
seg_00030f  4.491922848704198  4.491684314669440  ...   11.0  18.0
seg_0012b5  4.171169603509576  4.171217794543344  ...   11.0  20.0
seg_00184e  4.610255281983631  4.610334498190113  ...   11.0  20.0
seg_003339  4.531453887233320  4.531458551336902  ...   10.0  14.0
seg_0042cc  4.128355701342070  4.128344913932700  ...   10.0  19.0
seg_004314  4.148557580121454  4.148675666789552  ...   20.0  58.0
seg_004cd2  4.114267522717971  4.114147240723185  ...   10.0  15.0
seg_004ee5  4.328192358208076  4.328338470344701  ...   12.0  21.0
seg_004f1f  4.000696708469206  4.000707087581792  ...   11.0  20.0
seg_00648a  4.459079544772641  4.458783299047529  ...   12.0  26.0

[10 rows x 28 columns]
                   sma_10            ewma_10  ...    q95   q99
seg_id                                        ...             
1       4.884088378635989  4.884241616265863  

In [17]:
def train_model(X=X_train[train_features], X_test=X_test, y=X_train[TARGET], params=None, model_type='lgb', plot_feature_importance=False):
    
    n_fold = 5
    folds = KFold(n_splits=n_fold, shuffle=True, random_state=11)
    oof = np.zeros(len(X))
    prediction = np.zeros(len(X_test))
    scores = []
    feature_importance = pd.DataFrame()
    for fold_n, (train_index, valid_index) in enumerate(folds.split(X)):
        print('Fold', fold_n, 'started at', time.ctime())
        X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
        
        if model_type == 'lgb':
            model = lgb.LGBMRegressor(**params, n_estimators = 20000, nthread = 4, n_jobs = -1)
            model.fit(X_train, y_train, 
                    eval_set=[(X_train, y_train), (X_valid, y_valid)], eval_metric='mae',
                    verbose=1000, early_stopping_rounds=200)
            
            y_pred_valid = model.predict(X_valid)
            #print(X_test.head())
            #print(train.head())
            
            y_pred = model.predict(X_test, num_iteration=model.best_iteration_)
           
        if model_type == 'svr':
            scaler = StandardScaler()
            scaled_train = scaler.fit_transform(X)
            scaled_valid = scaler.transform(X_valid)
            scaled_test = scaler.transform(X_test)
            
            
            svm1 = NuSVR(nu=0.6, C=1.3, kernel='rbf', gamma=10, tol=0.01)
            svm1.fit(scaled_train, y)
            
            y_pred_valid = svm1.predict(scaled_valid).reshape(-1,)
            score = mean_absolute_error(y_valid, y_pred_valid)
            print(f'Fold {fold_n}. MAE: {score:.4f}.')
            print('')
            
            y_pred = svm1.predict(scaled_test).reshape(-1,)
        
        oof[valid_index] = y_pred_valid.reshape(-1,)
        scores.append(mean_absolute_error(y_valid, y_pred_valid) ** 0.5)
        
        prediction += y_pred    
        
        if model_type == 'lgb':
            # feature importance
            fold_importance = pd.DataFrame()
            fold_importance["feature"] = X.columns
            fold_importance["importance"] = model.feature_importances_
            fold_importance["fold"] = fold_n + 1
            feature_importance = pd.concat([feature_importance, fold_importance], axis=0)

    prediction /= n_fold
    
    print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))
    
    if model_type == 'lgb':
        feature_importance["importance"] /= n_fold
        if plot_feature_importance:
            cols = feature_importance[["feature", "importance"]].groupby("feature").mean().sort_values(
                by="importance", ascending=False)[:50].index

            best_features = feature_importance.loc[feature_importance.feature.isin(cols)]

            plt.figure(figsize=(16, 12));
            sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False));
            plt.title('LGB Features (avg over folds)');
        
            return oof, prediction, feature_importance
        return oof, prediction
    
    else:
        return oof, prediction

In [None]:
#oof_lgb, prediction_svr, = train_model(params=None, model_type='svr')
oof_lgb, prediction_lgb, feature_importance = train_model(params=params, model_type='lgb', plot_feature_importance=True)

Fold 0 started at Tue Jan 15 17:44:50 2019
Training until validation scores don't improve for 200 rounds.




In [None]:
#print(submission.head())
print('LightGBM Predictions')
print(prediction_lgb)

In [None]:
# Ensembles with and single models of SVR do not perform well.
#print('Support Vector Regression Predictions')
#print(prediction_svr)

In [None]:
submission['time_to_failure'] = prediction_lgb
#submission['time_to_failure_2'] = prediction_svr

In [None]:
#submission['time_to_failure'] = (submission['time_to_failure_1'] + submission['time_to_failure_2']) / 2

In [None]:
submission.head()

In [None]:
#submission = submission.drop(labels=["time_to_failure_1", "time_to_failure_2"], axis=1)
submission.to_csv('submission.csv')