In [2]:
import numpy as np
import pandas as pd
import os

import matplotlib.pyplot as plt
%matplotlib inline
from tqdm import tqdm_notebook
from sklearn.preprocessing import StandardScaler
from sklearn.svm import NuSVR
from sklearn.metrics import mean_absolute_error
pd.options.display.precision = 15

import lightgbm as lgb
import xgboost as xgb
import time
import datetime

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.linear_model import Ridge, RidgeCV
import gc
from catboost import CatBoostRegressor
import seaborn as sns

In [3]:
train = pd.read_csv('../input/train.csv', nrows = 1000000, dtype={'acoustic_data': np.int16, 'time_to_failure': np.float64})

In [4]:
train.head(20)

Unnamed: 0,acoustic_data,time_to_failure
0,12,1.4690999832
1,6,1.4690999821
2,8,1.469099981
3,5,1.4690999799
4,8,1.4690999788
5,8,1.4690999777
6,9,1.4690999766
7,7,1.4690999755
8,-5,1.4690999744
9,3,1.4690999733


In [5]:
train.describe()

Unnamed: 0,acoustic_data,time_to_failure
count,1000000.0,1000000.0
mean,4.876248,1.339787547164501
std,6.380823156725169,0.074985946080878
min,-199.0,1.2094993507
25%,2.0,1.274399508825
50%,5.0,1.33929966695
75%,7.0,1.404199825075
max,197.0,1.4690999832


In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 2 columns):
acoustic_data      1000000 non-null int16
time_to_failure    1000000 non-null float64
dtypes: float64(1), int16(1)
memory usage: 9.5 MB


In [8]:
train.shape

(1000000, 2)

In [10]:
train.columns

Index(['acoustic_data', 'time_to_failure'], dtype='object')

In [12]:
train["sma"] = train["acoustic_data"].rolling(window = 5).mean()
train.head(10)

Unnamed: 0,acoustic_data,time_to_failure,sma
0,12,1.4690999832,
1,6,1.4690999821,
2,8,1.469099981,
3,5,1.4690999799,
4,8,1.4690999788,7.8
5,8,1.4690999777,7.0
6,9,1.4690999766,7.6
7,7,1.4690999755,7.4
8,-5,1.4690999744,5.4
9,3,1.4690999733,4.4


In [15]:
train["ewma"] = pd.Series.ewm(train['acoustic_data'], span=5).mean()
train.head(10)

Unnamed: 0,acoustic_data,time_to_failure,sma,ewma
0,12,1.4690999832,,12.0
1,6,1.4690999821,,8.4
2,8,1.469099981,,8.210526315789473
3,5,1.4690999799,,6.876923076923076
4,8,1.4690999788,7.8,7.308056872037914
5,8,1.4690999777,7.0,7.560902255639098
6,9,1.4690999766,7.6,8.070422535211268
7,7,1.4690999755,7.4,7.699127676447264
8,-5,1.4690999744,5.4,3.353033227270357
9,3,1.4690999733,4.4,3.233278759155537


In [23]:
def modeling_cross_validation(params, X, y, nr_folds=5):
    clfs = list()
    oof_preds = np.zeros(X.shape[0])
    # Split data with kfold
    kfolds =KFold(n_splits=nr_folds, shuffle=False, random_state=42)
    for n_fold, (trn_idx, val_idx) in enumerate(kfolds.split(X, y)):
        X_train, y_train = X.iloc[trn_idx], y.iloc[trn_idx]
        X_valid, y_valid = X.iloc[val_idx], y.iloc[val_idx]

        print("Fold {}".format(n_fold+1))
        
        model = lgb.LGBMRegressor(**params)
        model.fit(
            X_train, y_train,
            eval_set=[(X_valid, y_valid)],
            verbose=200, eval_metric='mae',
            early_stopping_rounds=150
        )

        clfs.append(model)
        oof_preds[val_idx] = model.predict(X_valid, num_iteration=model.best_iteration_)
        
    score = (y, oof_preds)
    print(score)
    return clfs, score

In [18]:
def predict_cross_validation(test, clfs):
    sub_preds = np.zeros(test.shape[0])
    for i, model in enumerate(clfs, 1):    
        test_preds = model.predict_proba(test, num_iteration=model.best_iteration_)
        sub_preds += test_preds[:,1]

    sub_preds = sub_preds / len(clfs)
    ret = pd.Series(sub_preds, index=test.index)
    ret.index.name = test.index.name
    return ret


def predict_test_chunk(features, clfs, dtypes, filename='tmp.csv', chunks=100000):
    
    for i_c, df in enumerate(pd.read_csv('..test.csv', 
                                         chunksize=chunks, 
                                         dtype=dtypes, 
                                         iterator=True)):
        
        df.set_index(TARGET_INDEX, inplace=True)

        preds_df = predict_cross_validation(df[features], clfs)
        preds_df = preds_df.to_frame(TARGET)
        
        print("Writing test predictions to file")
        
        if i_c == 0:
            preds_df.to_csv(filename, header=True, mode='a', index=True)
        else:
            preds_df.to_csv(filename, header=False, mode='a', index=True)
        
        del preds_df
        gc.collect()
        print("Grabbin mode tests")
    print("Done")

In [25]:
params = {'num_leaves': 54,
         'min_data_in_leaf': 79,
         'objective': 'regression_l1',
         'max_depth': 15,
         'learning_rate': 0.018545526395058548,
         "boosting": "gbdt",
         "feature_fraction": 0.8354507676881442,
         "bagging_freq": 3,
         "bagging_fraction": 0.8126672064208567,
         "bagging_seed": 11,
         "metric": 'mae',
         "lambda_l1": 0.1,
         "verbosity": -1,
         'min_child_weight': 5.343384366323818,
         'reg_alpha': 1.1302650970728192,
         'reg_lambda': 0.3603427518866501,
         'subsample': 0.8767547959893627,
         'num_iterations' : 2000}

In [26]:
train_features = list()

TARGET = 'time_to_failure'

train_features = [f for f in train.columns if f != TARGET]
    
clfs, score = modeling_cross_validation(params, train[train_features], train[TARGET], nr_folds=5)

Fold 1




Training until validation scores don't improve for 150 rounds.
[200]	valid_0's l1: 0.129811
[400]	valid_0's l1: 0.129808
Early stopping, best iteration is:
[276]	valid_0's l1: 0.129806
Fold 2




Training until validation scores don't improve for 150 rounds.
[200]	valid_0's l1: 0.0778798
Early stopping, best iteration is:
[117]	valid_0's l1: 0.0778672
Fold 3




Training until validation scores don't improve for 150 rounds.
Early stopping, best iteration is:
[40]	valid_0's l1: 0.0208845
Fold 4




Training until validation scores don't improve for 150 rounds.
[200]	valid_0's l1: 0.0780595
Early stopping, best iteration is:
[75]	valid_0's l1: 0.0780409
Fold 5




Training until validation scores don't improve for 150 rounds.
Early stopping, best iteration is:
[2]	valid_0's l1: 0.130012
(0         1.4690999832
1         1.4690999821
2         1.4690999810
3         1.4690999799
4         1.4690999788
5         1.4690999777
6         1.4690999766
7         1.4690999755
8         1.4690999744
9         1.4690999733
10        1.4690999722
11        1.4690999711
12        1.4690999700
13        1.4690999689
14        1.4690999678
15        1.4690999667
16        1.4690999656
17        1.4690999645
18        1.4690999634
19        1.4690999623
20        1.4690999612
21        1.4690999601
22        1.4690999590
23        1.4690999579
24        1.4690999568
25        1.4690999557
26        1.4690999546
27        1.4690999535
28        1.4690999524
29        1.4690999513
              ...     
999970    1.2094993826
999971    1.2094993815
999972    1.2094993804
999973    1.2094993793
999974    1.2094993782
999975    1.2094993771
999976    1.2094993760


Test Code below

In [90]:
def sma (x, N):
    cumsum = np.cumsum(np.insert(x, 0, 0)) 
    temp = (cumsum[N:] - cumsum[:-N]) / float(N)
    result = np.zeros(len(x) - len(temp))
    result = np.concatenate((result, temp))
    #result = np.reshape(result, (len(result),1))
    return result

In [102]:
def preprocess(seg_id):
    test = pd.read_csv('../input/test/' + seg_id + '.csv')

    test["sma"] = test["acoustic_data"].rolling(window = 5).mean()
    test['ewma'] = pd.Series.ewm(test["acoustic_data"], span=5).mean()
    test["seg_id"] = [seg_id] * test.shape[0]
    
    return test
    

In [103]:
submission = pd.read_csv('../input/sample_submission.csv', index_col='seg_id')
index = pd.DataFrame(columns=train.columns, dtype=np.float64, index=submission.index)
X_test = None
idx = 0
for seg_id in tqdm_notebook(index.index):
    if idx == 0:
        X_test = preprocess(seg_id)
    else:
        _ = preprocess(seg_id)
        frames = [X_test, _]
        X_test = pd.concat(frames)
        idx = 1
X_test = X_test.set_index("seg_id")

HBox(children=(IntProgress(value=0, max=2624), HTML(value='')))

In [104]:
X_test.head()

Unnamed: 0_level_0,acoustic_data,sma,ewma
seg_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
seg_ffe7cc,6,,6.0
seg_ffe7cc,5,,5.399999999999999
seg_ffe7cc,5,,5.210526315789473
seg_ffe7cc,3,,4.292307692307691
seg_ffe7cc,1,4.0,3.028436018957346


In [105]:
def train_model(X=train[train_features], X_test=X_test, y=train[TARGET], params=None, model_type='lgb', plot_feature_importance=False):
    
    n_fold = 5
    folds = KFold(n_splits=n_fold, shuffle=True, random_state=11)
    oof = np.zeros(len(X))
    prediction = np.zeros(len(X_test))
    scores = []
    feature_importance = pd.DataFrame()
    for fold_n, (train_index, valid_index) in enumerate(folds.split(X)):
        print('Fold', fold_n, 'started at', time.ctime())
        X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
        
        if model_type == 'lgb':
            model = lgb.LGBMRegressor(**params, n_estimators = 20000, nthread = 4, n_jobs = -1)
            model.fit(X_train, y_train, 
                    eval_set=[(X_train, y_train), (X_valid, y_valid)], eval_metric='mae',
                    verbose=1000, early_stopping_rounds=200)
            
            y_pred_valid = model.predict(X_valid)
            print(X_test.head())
            print(train.head())
            
            y_pred = model.predict(X_test, num_iteration=model.best_iteration_)
            
        
        oof[valid_index] = y_pred_valid.reshape(-1,)
        scores.append(mean_absolute_error(y_valid, y_pred_valid) ** 0.5)
        
        prediction += y_pred    
        
        if model_type == 'lgb':
            # feature importance
            fold_importance = pd.DataFrame()
            fold_importance["feature"] = X.columns
            fold_importance["importance"] = model.feature_importances_
            fold_importance["fold"] = fold_n + 1
            feature_importance = pd.concat([feature_importance, fold_importance], axis=0)

    prediction /= n_fold
    
    print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))
    
    if model_type == 'lgb':
        feature_importance["importance"] /= n_fold
        if plot_feature_importance:
            cols = feature_importance[["feature", "importance"]].groupby("feature").mean().sort_values(
                by="importance", ascending=False)[:50].index

            best_features = feature_importance.loc[feature_importance.feature.isin(cols)]

            plt.figure(figsize=(16, 12));
            sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False));
            plt.title('LGB Features (avg over folds)');
        
            return oof, prediction, feature_importance
        return oof, prediction
    
    else:
        return oof, prediction

In [None]:
oof_lgb, prediction_lgb, feature_importance = train_model(params=params, model_type='lgb', plot_feature_importance=True)

Fold 0 started at Sat Jan 12 01:15:21 2019




In [None]:
submission['time_to_failure'] = prediction_lgb
print(submission.head())
submission.to_csv('submission.csv')