In [None]:
import numpy as np
import pandas as pd
import os

import matplotlib.pyplot as plt
%matplotlib inline
from tqdm import tqdm_notebook
from sklearn.preprocessing import StandardScaler
from sklearn.svm import NuSVR
from sklearn.metrics import mean_absolute_error
pd.options.display.precision = 15

import lightgbm as lgb
import xgboost as xgb
import time
import datetime

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.linear_model import Ridge, RidgeCV
import gc
from catboost import CatBoostRegressor
import seaborn as sns

rows = 150_000
# NOTE(review): no cell above this one in the visible notebook creates `train`,
# so on a fresh kernel the next line raises NameError. This cell needs a
# DataFrame with an `acoustic_data` column to be loaded first.
segments = int(np.floor(train.shape[0] / rows))

# Smoothed copies of the raw signal, both with a 5-sample horizon.
# (The rolling mean used to be computed twice in a row; once is enough.)
train["sma"] = train["acoustic_data"].rolling(window=5).mean()
train["ewma"] = train["acoustic_data"].ewm(span=5).mean()

# Whole-series summary statistics: each scalar is broadcast to every row.
x = train['acoustic_data'].values
train['ave'] = x.mean()
train['std'] = x.std()
train['max'] = x.max()
train['min'] = x.min()

# Everything except the raw signal.
# NOTE(review): `rows`, `segments` and `X_train` are all reassigned by a later
# cell, so the values produced here do not reach the model.
X_train = train.drop(labels="acoustic_data", axis=1)

In [None]:
def preprocess_train(seg_id, train, window=10):
    """Collapse one chunk of raw training rows into a single feature row.

    Parameters
    ----------
    seg_id : hashable
        Identifier stored in the ``seg_id`` column of the returned row.
    train : pandas.DataFrame
        Chunk with an ``acoustic_data`` column and a ``time_to_failure``
        column. The frame is only read; no columns are added to it.
    window : int, default 10
        Window of the rolling mean and span of the exponentially weighted
        mean computed over ``acoustic_data``.

    Returns
    -------
    pandas.DataFrame
        One row with the columns ``sma``, ``ewma``, ``ave``, ``std``, ``max``,
        ``min``, ``time_to_failure`` and ``seg_id``, in that order.

    Raises
    ------
    IndexError
        If ``train`` has no rows (the target is taken from the last row).
    """
    signal = train["acoustic_data"]
    x = signal.values

    # Work on local Series so the caller's chunk is left untouched.
    sma = signal.rolling(window=window).mean()
    ewma = signal.ewm(span=window).mean()

    columns = ['sma', 'ewma', 'ave', 'std', 'max', 'min', 'time_to_failure', 'seg_id']
    features = {
        # Mean of the smoothed series; NaN warm-up values of the rolling mean
        # are skipped by Series.mean().
        'sma': [sma.mean()],
        'ewma': [ewma.mean()],
        # NOTE(review): constant placeholder, not x.mean(). The test-side
        # `preprocess` writes the same constant, so change both together or
        # neither, otherwise train and test features diverge.
        'ave': [1],
        'std': [x.std()],
        'max': [x.max()],
        'min': [x.min()],
        # The target for the chunk is the time to failure at its last sample.
        'time_to_failure': [train["time_to_failure"].values[-1]],
        'seg_id': [seg_id],
    }
    return pd.DataFrame(features, columns=columns)

In [None]:
frames = []
rows = 150_000
segments = 1  # 1-based id given to the next feature row

# Stream the training file in chunks of `rows` samples instead of loading it
# whole; each chunk becomes one row of engineered features.
train_chunks = pd.read_csv(
    '../input/train.csv',
    chunksize=rows,
    dtype={'acoustic_data': np.int16, 'time_to_failure': np.float64},
)
for df in train_chunks:
    if len(df) < rows:
        # Only the trailing chunk can be short. Its statistics would come from
        # fewer than `rows` samples, unlike every other training row, so it is
        # left out rather than mixed in.
        continue
    frames.append(preprocess_train(segments, df))
    segments += 1

# pd.concat raises ValueError if the file held no complete chunk at all.
X_train = pd.concat(frames).set_index("seg_id")

In [None]:
# Preview the first rows of the per-segment training feature table.
X_train.head()

In [None]:
# (number of feature rows, number of columns) of the training feature table.
X_train.shape

In [None]:
def modeling_cross_validation(params, X, y, nr_folds=5):
    """Train one LightGBM regressor per fold and score the out-of-fold predictions.

    Parameters
    ----------
    params : dict
        Keyword arguments forwarded to ``lgb.LGBMRegressor``.
    X : pandas.DataFrame
        Feature table; rows are selected positionally with ``.iloc``.
    y : pandas.Series
        Target aligned row-for-row with ``X``.
    nr_folds : int, default 5
        Number of contiguous, unshuffled folds.

    Returns
    -------
    clfs : list
        The fitted models, one per fold, in fold order.
    score : float
        Mean absolute error between ``y`` and the out-of-fold predictions.
    """
    clfs = []
    oof_preds = np.zeros(X.shape[0])

    # `random_state` is intentionally not passed: it has no effect when
    # shuffle=False, and recent scikit-learn releases raise ValueError for
    # that combination.
    kfolds = KFold(n_splits=nr_folds, shuffle=False)

    for n_fold, (trn_idx, val_idx) in enumerate(kfolds.split(X, y), start=1):
        X_trn, y_trn = X.iloc[trn_idx], y.iloc[trn_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

        print("Fold {}".format(n_fold))

        model = lgb.LGBMRegressor(**params)
        model.fit(
            X_trn, y_trn,
            eval_set=[(X_val, y_val)],
            verbose=200, eval_metric='mae',
            early_stopping_rounds=150
        )

        clfs.append(model)
        # Each row is predicted exactly once, by the model that did not see it.
        oof_preds[val_idx] = model.predict(X_val, num_iteration=model.best_iteration_)

    # Previously this was the bare tuple (y, oof_preds); the metric call was missing.
    score = mean_absolute_error(y, oof_preds)
    print(score)
    return clfs, score

In [None]:
def predict_cross_validation(test, clfs):
    """Average the predictions of several fitted fold models.

    Parameters
    ----------
    test : pandas.DataFrame
        Feature rows to predict; its index is reused for the result.
    clfs : sequence
        Fitted models exposing ``best_iteration_`` and either ``predict`` or
        ``predict_proba`` with a ``num_iteration`` keyword.

    Returns
    -------
    pandas.Series
        Mean prediction per row, indexed like ``test``.

    Raises
    ------
    ValueError
        If ``clfs`` is empty (the average would otherwise be NaN).
    """
    if len(clfs) == 0:
        raise ValueError("clfs must contain at least one fitted model")

    sub_preds = np.zeros(test.shape[0])
    for model in clfs:
        if hasattr(model, 'predict_proba'):
            # Classifier: keep the second probability column, as before.
            fold_preds = model.predict_proba(test, num_iteration=model.best_iteration_)[:, 1]
        else:
            # Regressor (e.g. the LGBMRegressor models trained in this
            # notebook): there is no predict_proba, so use the prediction itself.
            fold_preds = model.predict(test, num_iteration=model.best_iteration_)
        sub_preds += fold_preds

    sub_preds = sub_preds / len(clfs)
    ret = pd.Series(sub_preds, index=test.index)
    ret.index.name = test.index.name
    return ret


def predict_test_chunk(features, clfs, dtypes, filename='tmp.csv', chunks=100000,
                       test_path='..test.csv'):
    """Predict a test CSV chunk by chunk and write the predictions to a CSV file.

    Parameters
    ----------
    features : list
        Column labels passed to the models.
    clfs : sequence
        Fitted models handed to ``predict_cross_validation``.
    dtypes : dict or None
        ``dtype`` argument for ``pd.read_csv``.
    filename : str, default 'tmp.csv'
        Output file. It is overwritten on each call; within a call the first
        chunk writes the header and later chunks are appended.
    chunks : int, default 100000
        Number of rows read per chunk.
    test_path : str, default '..test.csv'
        CSV file to read.
        NOTE(review): the default is the original hard-coded path and looks
        like a typo; pass the real location explicitly.

    Relies on the notebook-level names ``TARGET_INDEX`` (column used as index)
    and ``TARGET`` (name of the prediction column).
    NOTE(review): ``TARGET_INDEX`` is not defined in the visible cells.
    """
    reader = pd.read_csv(test_path, chunksize=chunks, dtype=dtypes)
    for i_c, df in enumerate(reader):
        df = df.set_index(TARGET_INDEX)

        preds_df = predict_cross_validation(df[features], clfs)
        preds_df = preds_df.to_frame(TARGET)

        print("Writing test predictions to file")

        # Start a fresh file on the first chunk. Appending there meant every
        # rerun added a second header and duplicate rows to `filename`.
        first_chunk = i_c == 0
        preds_df.to_csv(filename,
                        header=first_chunk,
                        mode='w' if first_chunk else 'a',
                        index=True)

        del preds_df
        gc.collect()
        print("Grabbin mode tests")
    print("Done")

In [None]:
# Keyword arguments for the LightGBM regressors trained in this notebook.
# NOTE(review): the LightGBM parameter docs list `reg_alpha` as an alias of
# `lambda_l1` and `subsample` as an alias of `bagging_fraction`; both spellings
# are set below with different values - confirm which one is meant to apply.
params = dict(
    num_leaves=54,
    min_data_in_leaf=79,
    objective='regression_l1',
    max_depth=15,
    learning_rate=0.018545526395058548,
    boosting="gbdt",
    feature_fraction=0.8354507676881442,
    bagging_freq=3,
    bagging_fraction=0.8126672064208567,
    bagging_seed=11,
    metric='mae',
    lambda_l1=0.1,
    verbosity=-1,
    min_child_weight=5.343384366323818,
    reg_alpha=1.1302650970728192,
    reg_lambda=0.3603427518866501,
    subsample=0.8767547959893627,
    num_iterations=2000,
)

In [None]:
TARGET = 'time_to_failure'

# Every column of the feature table except the regression target is a model input.
train_features = [column for column in X_train.columns if column != TARGET]

# Five-fold cross-validation with the LightGBM settings in `params`.
clfs, score = modeling_cross_validation(
    params,
    X_train[train_features],
    X_train[TARGET],
    nr_folds=5,
)

Test Code below

In [None]:
def sma(x, N):
    """Trailing simple moving average of ``x`` over ``N`` samples.

    The result has as many entries as ``x``. Positions that do not yet have a
    full window of ``N`` samples behind them are filled with 0.0 rather than
    NaN; every other position holds the mean of the last ``N`` values.
    """
    # Prefix sums with a leading zero, so that running[i + N] - running[i]
    # is the sum of the N values starting at position i.
    running = np.cumsum(np.insert(x, 0, 0))
    window_sums = running[N:] - running[:-N]
    window_means = window_sums / float(N)

    # Left-pad with zeros up to the input length.
    warm_up = np.zeros(len(x) - len(window_means))
    return np.concatenate((warm_up, window_means))

In [None]:
def preprocess(seg_id, window=10, test_dir='../input/test/'):
    """Read one test segment from disk and reduce it to a single feature row.

    Parameters
    ----------
    seg_id : str
        Segment name; the file read is ``<test_dir>/<seg_id>.csv`` and must
        contain an ``acoustic_data`` column.
    window : int, default 10
        Window of the rolling mean and span of the exponentially weighted
        mean. The default is 10 because ``preprocess_train`` builds the
        training features with 10; the previous value of 5 gave the model
        test features on a different scale from the ones it was fitted on.
    test_dir : str, default '../input/test/'
        Directory holding the per-segment CSV files.

    Returns
    -------
    pandas.DataFrame
        One row with the columns ``sma``, ``ewma``, ``ave``, ``std``, ``max``,
        ``min`` and ``seg_id``, in that order.
    """
    test = pd.read_csv(os.path.join(test_dir, seg_id + '.csv'))

    signal = test["acoustic_data"]
    x = signal.values

    sma = signal.rolling(window=window).mean()
    ewma = signal.ewm(span=window).mean()

    columns = ['sma', 'ewma', 'ave', 'std', 'max', 'min', 'seg_id']
    features = {
        # Mean of the smoothed series; NaN warm-up values of the rolling mean
        # are skipped by Series.mean().
        'sma': [sma.mean()],
        'ewma': [ewma.mean()],
        # NOTE(review): constant placeholder, not x.mean(). The train-side
        # `preprocess_train` writes the same constant, so change both
        # together or neither, otherwise train and test features diverge.
        'ave': [1],
        'std': [x.std()],
        'max': [x.max()],
        'min': [x.min()],
        'seg_id': [seg_id],
    }
    return pd.DataFrame(features, columns=columns)
    

In [None]:
# The sample submission lists every test segment id, in submission order.
submission = pd.read_csv('../input/sample_submission.csv', index_col='seg_id')

# Empty frame keyed by segment id; only its index is used below.
index = pd.DataFrame(index=submission.index, columns=X_train.columns, dtype=np.float64)

# One feature row per test segment, then stacked into a table keyed by seg_id.
frames = [preprocess(seg_id) for seg_id in tqdm_notebook(index.index)]
X_test = pd.concat(frames).set_index("seg_id")

In [None]:
# Column labels of the test and training feature tables, test first.
print(X_test.columns, X_train.columns, sep='\n')

In [None]:
# First ten rows of the test and training feature tables, test first.
print(X_test.head(10), X_train.head(10), sep='\n')

In [None]:
def train_model(X=None, X_test=None, y=None, params=None, model_type='lgb', plot_feature_importance=False):
    """Fit one LightGBM regressor per shuffled fold and average the test predictions.

    Parameters
    ----------
    X : pandas.DataFrame, optional
        Training features. Defaults to the notebook-level
        ``X_train[train_features]``, looked up when the function is called.
    X_test : pandas.DataFrame, optional
        Test features. Defaults to the notebook-level ``X_test``, looked up
        when the function is called.
    y : pandas.Series, optional
        Training target. Defaults to the notebook-level ``X_train[TARGET]``,
        looked up when the function is called.
    params : dict, optional
        Extra keyword arguments for ``lgb.LGBMRegressor``; ``None`` means none.
    model_type : str, default 'lgb'
        Only ``'lgb'`` is implemented.
    plot_feature_importance : bool, default False
        When true, draw a bar plot of the most important features and also
        return the per-fold importance table.

    Returns
    -------
    tuple
        ``(oof, prediction)``, or ``(oof, prediction, feature_importance)``
        when ``plot_feature_importance`` is true. ``oof`` holds the
        out-of-fold prediction for every training row and ``prediction`` the
        test prediction averaged over folds.

    Raises
    ------
    ValueError
        If ``model_type`` is anything other than ``'lgb'``.
    """
    if model_type != 'lgb':
        # No other branch ever produced predictions; fail up front with a
        # clear message instead of an UnboundLocalError inside the fold loop.
        raise ValueError("unsupported model_type {!r}; only 'lgb' is implemented".format(model_type))

    # Defaults are resolved per call. As default-argument expressions they
    # were frozen when the `def` cell ran and went stale if the feature
    # tables were rebuilt afterwards.
    if X is None:
        X = X_train[train_features]
    if y is None:
        y = X_train[TARGET]
    if X_test is None:
        # The parameter shadows the notebook variable of the same name, so the
        # global has to be fetched explicitly.
        X_test = globals()['X_test']
    if params is None:
        params = {}

    n_fold = 5
    folds = KFold(n_splits=n_fold, shuffle=True, random_state=11)
    oof = np.zeros(len(X))
    prediction = np.zeros(len(X_test))
    scores = []
    fold_importances = []

    for fold_n, (train_index, valid_index) in enumerate(folds.split(X)):
        print('Fold', fold_n, 'started at', time.ctime())
        X_fit, X_valid = X.iloc[train_index], X.iloc[valid_index]
        y_fit, y_valid = y.iloc[train_index], y.iloc[valid_index]

        # NOTE(review): `nthread` and `n_jobs` both set the thread count, and
        # `params` may carry its own iteration count next to `n_estimators` -
        # confirm which values LightGBM ends up using.
        model = lgb.LGBMRegressor(**params, n_estimators = 20000, nthread = 4, n_jobs = -1)
        model.fit(X_fit, y_fit,
                  eval_set=[(X_fit, y_fit), (X_valid, y_valid)], eval_metric='mae',
                  verbose=1000, early_stopping_rounds=200)

        y_pred_valid = model.predict(X_valid)
        y_pred = model.predict(X_test, num_iteration=model.best_iteration_)

        oof[valid_index] = y_pred_valid.reshape(-1,)
        # Plain MAE per fold. The square root that used to be applied here
        # belongs to MSE -> RMSE and distorted the reported score.
        scores.append(mean_absolute_error(y_valid, y_pred_valid))

        prediction += y_pred

        fold_importances.append(pd.DataFrame({
            "feature": X.columns,
            "importance": model.feature_importances_,
            "fold": fold_n + 1,
        }))

    prediction /= n_fold

    print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))

    feature_importance = pd.concat(fold_importances, axis=0)
    # NOTE(review): importances are divided by n_fold here and then averaged
    # per feature again for the plot; kept as-is so returned values do not change.
    feature_importance["importance"] /= n_fold

    if plot_feature_importance:
        # Up to 50 features with the highest mean importance across folds.
        cols = feature_importance[["feature", "importance"]].groupby("feature").mean().sort_values(
            by="importance", ascending=False)[:50].index

        best_features = feature_importance.loc[feature_importance.feature.isin(cols)]

        plt.figure(figsize=(16, 12))
        sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False))
        plt.title('LGB Features (avg over folds)')

        return oof, prediction, feature_importance

    return oof, prediction

In [None]:
# Train the LightGBM folds on the default feature tables and keep the
# out-of-fold predictions, the averaged test predictions and the importances.
oof_lgb, prediction_lgb, feature_importance = train_model(
    params=params,
    model_type='lgb',
    plot_feature_importance=True,
)

In [None]:
# Show the fold-averaged LightGBM predictions for the test segments.
#print(submission.head())
#print(submission.shape)
print(prediction_lgb)

In [None]:
# Fill in the predictions; they are assigned by position, so their order has
# to match the order of the rows in `submission`.
submission = submission.assign(time_to_failure=prediction_lgb)
print(submission.head())

# Written with the seg_id index as the first column.
submission.to_csv('submission.csv')