In [None]:

import math
import os
from glob import glob

import matplotlib.pyplot as plt
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import xgboost as xgb
from sklearn.metrics import auc, accuracy_score, confusion_matrix
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold, RandomizedSearchCV, RepeatedKFold, train_test_split
from xgboost import XGBRFRegressor

# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))


In [None]:
# Plotting setup cell: brings in seaborn (used by the figures below) and
# configures matplotlib for notebook use.
import pandas as pd
# Register pandas' formatters/converters with matplotlib.
pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
# IPython magic: render figures inline in the cell output.
%matplotlib inline
import seaborn as sns
print("Setup Complete")

In [None]:
# Competition metadata tables. `train` is later merged on `segment_id` and is
# presumably the source of the `time_to_eruption` target (confirm against the CSV).
train = pd.read_csv("../input/predict-volcanic-eruptions-ingv-oe/train.csv")
# NOTE(review): `sample_submission` is not used anywhere in the visible cells.
sample_submission = pd.read_csv("../input/predict-volcanic-eruptions-ingv-oe/sample_submission.csv")

# **Feature Engineering** (adapted from the notebook "INGV - Volcanic Eruption Prediction. EDA. Modeling")

In [3]:
# Paths of every per-segment file in the train and test folders.
train_frags = glob("../input/predict-volcanic-eruptions-ingv-oe/train/*")
test_frags = glob("../input/predict-volcanic-eruptions-ingv-oe/test/*")
# Load one training segment to inspect its layout (one column per sensor).
check = pd.read_csv('../input/predict-volcanic-eruptions-ingv-oe/train/2037160701.csv')
check

Unnamed: 0,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,sensor_6,sensor_7,sensor_8,sensor_9,sensor_10
0,-343.0,-110.0,-274.0,367.0,137.0,-106.0,-578.0,339.0,-506.0,1389.0
1,-298.0,-92.0,-159.0,288.0,76.0,-251.0,-713.0,352.0,-407.0,1451.0
2,-503.0,195.0,-140.0,266.0,-100.0,-162.0,-450.0,443.0,-406.0,1295.0
3,-153.0,-68.0,-78.0,301.0,-143.0,-86.0,-378.0,66.0,-472.0,1127.0
4,-320.0,-348.0,-27.0,283.0,-26.0,-255.0,-595.0,-127.0,-243.0,950.0
...,...,...,...,...,...,...,...,...,...,...
59996,329.0,-1.0,573.0,155.0,76.0,108.0,-727.0,-408.0,-232.0,1015.0
59997,-125.0,32.0,599.0,243.0,99.0,105.0,-788.0,-772.0,-239.0,951.0
59998,403.0,17.0,555.0,360.0,98.0,-66.0,-773.0,-888.0,-270.0,732.0
59999,-212.0,-92.0,548.0,163.0,71.0,-13.0,-207.0,-1237.0,-127.0,790.0


In [4]:
# Scan every training segment and record how much data each sensor is missing.
#   sensors       - distinct column counts seen across segment files
#   observations  - distinct row counts seen across segment files
#   nan_columns   - one entry per (segment, fully-empty column)
#   missed_groups - per segment, the list of fully-empty columns (only if non-empty)
#   for_df        - rows of [segment_id, has_missed_sensors, <missing fraction per column>]
sensors = set()
observations = set()
nan_columns = list()
missed_groups = list()
for_df = list()

for item in train_frags:
    # The segment id is the file name without directory or extension;
    # os.path handles any separator and extra dots in parent directories.
    name = int(os.path.splitext(os.path.basename(item))[0])
    frag = pd.read_csv(item)

    # One boolean mask serves both the per-column missing fraction and the
    # "column is entirely empty" check.
    null_mask = frag.isnull()
    missed_percents = null_mask.mean().tolist()
    missed_group = [col for col in frag.columns if null_mask[col].all()]

    at_least_one_missed = int(bool(missed_group))
    nan_columns.extend(missed_group)
    if missed_group:
        missed_groups.append(missed_group)
    sensors.add(len(frag.columns))
    observations.add(len(frag))
    for_df.append([name, at_least_one_missed] + missed_percents)

ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [None]:
# Count how many training segments share each combination of fully-missing
# sensors. `missed_groups` holds one list of column names per affected segment;
# the count is derived here so the cell does not depend on a variable that is
# never defined in the notebook.
group_labels = pd.Series([', '.join(group) for group in missed_groups], dtype='object')
absent_df = (
    group_labels.value_counts()
    .rename_axis('Group')
    .reset_index(name='Missed number')
    .sort_values('Missed number')
)

# Style/context are set before the axes are created so they take effect.
sns.set_style("ticks")
sns.set_context("paper", font_scale = 0.75)
fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(x = absent_df['Group'], y = absent_df['Missed number'], ax = ax)
ax.set_title("Number of Missed Sensor Groups in Training Dataset");

In [None]:
# Turn the per-segment rows collected above into a labelled table:
# segment id, missing-sensor flag, then one missing-fraction column per sensor.
for_df = pd.DataFrame(
    for_df,
    columns=[
        'segment_id',
        'has_missed_sensors',
        *(f'missed_percent_sensor{k}' for k in range(1, 11)),
    ],
)

for_df

In [None]:
# Attach the missing-data statistics to the metadata table; the join uses the
# columns the two frames have in common (pandas' default merge keys).
train = train.merge(for_df)
train

In [None]:
def build_features(signal, ts, sensor_id):
    """Summarise one sensor trace as a single-row feature table.

    Parameters
    ----------
    signal : pandas.Series
        Readings of one sensor for one segment (callers fill NaNs beforehand).
    ts : hashable
        Row label of the returned frame (callers pass the segment id).
    sensor_id : str
        Prefix for every column name, e.g. ``'sensor_1'``.

    Returns
    -------
    pandas.DataFrame
        One row indexed by ``ts`` with 23 columns named ``f'{sensor_id}_<stat>'``:
        time-domain moments, mean absolute deviation, ten quantiles, and
        mean/std/max/min of the real part of the FFT.
    """
    f_real = np.real(np.fft.fft(signal))

    features = {
        'sum': signal.sum(),
        'mean': signal.mean(),
        'std': signal.std(),
        'var': signal.var(),
        'max': signal.max(),
        'min': signal.min(),
        'skew': signal.skew(),
        # Series.mad() was removed in pandas 2.0; this is the same statistic
        # (mean absolute deviation around the mean).
        'mad': (signal - signal.mean()).abs().mean(),
        'kurtosis': signal.kurtosis(),
    }

    # Integer percents keep the column suffixes exact ('quantile05', ...).
    percents = (99, 95, 85, 75, 55, 45, 25, 15, 5, 1)
    quantiles = np.quantile(signal, [pct / 100 for pct in percents])
    for pct, value in zip(percents, quantiles):
        features[f'quantile{pct:02d}'] = value

    features['fft_real_mean'] = f_real.mean()
    features['fft_real_std'] = f_real.std()
    features['fft_real_max'] = f_real.max()
    features['fft_real_min'] = f_real.min()

    # Build the row in one go instead of enlarging an empty frame cell by cell.
    return pd.DataFrame(
        {f'{sensor_id}_{stat}': value for stat, value in features.items()},
        index=[ts],
    )

In [None]:
# Build one feature row per training segment: read the segment file, compute
# the per-sensor features for sensor_1..sensor_10 (NaNs replaced by 0), and
# place the ten per-sensor frames side by side.
train_set = list()
for j, seg in enumerate(train.segment_id):
    signals = pd.read_csv(f'/kaggle/input/predict-volcanic-eruptions-ingv-oe/train/{seg}.csv')
    if j % 500 == 0:
        # Progress marker every 500 segments.
        print(j)
    per_sensor = [
        build_features(signals[f'sensor_{k}'].fillna(0), seg, f'sensor_{k}')
        for k in range(1, 11)
    ]
    train_set.append(pd.concat(per_sensor, axis=1))

# Stack the per-segment rows into a single table indexed by segment id.
train_set = pd.concat(train_set)

In [None]:
# Annotated heatmap of a small corner of the feature table
# (first 5 segments x first 20 features) as a quick sanity check.
plt.figure(figsize=(16, 6))
sns.set_context("paper", font_scale=0.75)
sample_set = train_set.iloc[:5, :20]
sns.heatmap(sample_set, annot=True)

In [None]:
# Move the segment id out of the index into a `segment_id` column, then join
# the metadata table onto the engineered features.
train_set = (
    train_set
    .reset_index()
    .rename(columns={'index': 'segment_id'})
    .merge(train, on='segment_id')
)
train_set

In [None]:
# Columns whose absolute correlation with the target is below 0.01 are marked
# for removal; `segment_id` is an identifier and is never considered.
drop_cols = [
    col
    for col in train_set.columns
    if col != 'segment_id'
    and abs(train_set[col].corr(train_set['time_to_eruption'])) < 0.01
]

In [None]:
# Prune near-duplicate features: for each pair with |correlation| > 0.98,
# mark the second column for removal and remember the first as its keeper.
not_to_drop_cols = list()

# The identifier and the target take no part in the redundancy check.
feature_cols = [
    col for col in train_set.columns
    if col not in ('segment_id', 'time_to_eruption')
]

# Compute all pairwise correlations once instead of calling Series.corr for
# every (col1, col2) pair inside the double loop.
corr_matrix = train_set[feature_cols].corr().abs()

for col1 in feature_cols:
    for col2 in feature_cols:
        if col1 == col2:
            continue
        # NaN correlations (e.g. constant columns) compare False and are skipped.
        if corr_matrix.at[col1, col2] > 0.98:
            # NOTE(review): once col1 is recorded as a keeper, no further
            # partners of col1 are dropped - kept as in the original logic.
            if col2 not in drop_cols and col1 not in not_to_drop_cols:
                drop_cols.append(col2)
                not_to_drop_cols.append(col1)

In [None]:
# Separate the target from the predictors; the segment id is not a feature.
y = train_set['time_to_eruption']
train = train_set.drop(columns=['segment_id', 'time_to_eruption'])

In [None]:
# Working copies for modelling: the target as-is, and the predictors without
# the columns collected in `drop_cols` (drop returns a new frame).
reduced_y = y.copy()
reduced_train = train.drop(columns=drop_cols)
reduced_train

In [None]:
# Reduced predictors and the target side by side in one table.
dataset = pd.concat(objs=[reduced_train, reduced_y], axis="columns")

# XGBoost!

In [None]:
def test_models_cs(): 
    models_cs = dict
    for c in np.arrange(0.05, 0.1, 0.01)
        key = 
        models_cs[key] = xgb.XGBRFRegressor(n_estimators = 500,
                           learning_rate = 0.05, 
                           reg_lambda = 0.1,
                           eval_metric = mean_absolute_error, 
                           subsample = 0.9, 
                           colsample_bynode = c)
    return models_cs

def test_models_ne(): 
    models_ne = dict
    for n in arrange(100, 1000, 100)
        key = 
        models_ne[key] = xgb.XGBRFRegressor(n_estimators = n,
                           learning_rate = 0.05, 
                           reg_lambda = 0.1,
                           eval_metric = mean_absolute_error, 
                           subsample = 0.9, 
                           colsample_bynode = 0.1)
    return models_ne

def test_models_lr(): 
    models_lr = dict()
    for l in arrange(0.05, 0.2, 0.05)
        key = 
        models_lr[key] = xgb.XGBRFRegressor(n_estimators = 500,
                           learning_rate = l, 
                           reg_lambda = 0.1,
                           eval_metric = mean_absolute_error, 
                           subsample = 0.9, 
                           colsample_bynode = 0.1)
    return models_lr
    
def model_eval(model, X, y, n_splits=10, n_repeats=3, random_state=42):
    """Cross-validate a regressor and return its per-fold scores.

    The target is continuous, so plain repeated K-fold is used (stratified
    splitting needs class labels) and folds are scored with negative mean
    absolute error (accuracy is a classification metric).

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn compatible regressor.
    X, y : array-like
        Feature table and target.
    n_splits, n_repeats : int
        Folds per repetition and number of repetitions.
    random_state : int or None
        Seed for the fold shuffling, so every model sees the same splits.

    Returns
    -------
    array of ``n_splits * n_repeats`` scores; values are negative MAE, so
    closer to zero is better.
    """
    cross_val = RepeatedKFold(n_splits = n_splits, n_repeats = n_repeats,
                              random_state = random_state)
    scores = cross_val_score(model, X, y, scoring = 'neg_mean_absolute_error', cv = cross_val)
    return scores
    
# Evaluate each hyper-parameter sweep with cross-validation.
# For every sweep, names_* holds the swept values and results_* the matching
# arrays of per-fold scores, in the same order.

# Sweep 1: colsample_bynode
models_cs = test_models_cs()
results_cs, names_cs = list(), list()
for name, model in models_cs.items():
    scores_cs = model_eval(model, reduced_train, reduced_y)
    results_cs.append(scores_cs)
    names_cs.append(name)

# Sweep 2: n_estimators
models_ne = test_models_ne()
results_ne, names_ne = list(), list()
for name, model in models_ne.items():
    scores_ne = model_eval(model, reduced_train, reduced_y)
    results_ne.append(scores_ne)
    names_ne.append(name)

# Sweep 3: learning_rate (this previously re-ran the n_estimators sweep by
# calling test_models_ne()).
models_lr = test_models_lr()
results_lr, names_lr = list(), list()
for name, model in models_lr.items():
    scores_lr = model_eval(model, reduced_train, reduced_y)
    results_lr.append(scores_lr)
    names_lr.append(name)

In [None]:
# Report mean and standard deviation of the cross-validation scores for every
# configuration of every sweep (not just the last model evaluated).
for label, names, results in (
    ('colsample_bynode', names_cs, results_cs),
    ('n_estimators', names_ne, results_ne),
    ('learning_rate', names_lr, results_lr),
):
    for name, scores in zip(names, results):
        print('>%s %.3f (%.3f)' % ('%s=%s' % (label, name), np.mean(scores), np.std(scores)))

In [None]:
# One panel per hyper-parameter sweep: mean cross-validation score against the
# swept value, with a linear trend line.
fig = plt.figure(figsize = (8,8))
gs = fig.add_gridspec(2,2)

panels = [
    (gs[0,0], names_cs, results_cs, "Scores by Colsample Value"),
    (gs[0,1], names_ne, results_ne, "Scores by Num Estimators"),
    (gs[1,0], names_lr, results_lr, "Scores by Learning Rate"),
]

for cell, names, results, title in panels:
    ax = fig.add_subplot(cell)
    # Each entry of `results` is an array of per-fold scores; collapse it to a
    # single mean so x and y have the same length.
    x = np.asarray(names, dtype = float)
    y = np.array([np.mean(scores) for scores in results])
    sns.scatterplot(x = x, y = y, ax = ax)
    sns.regplot(x = x, y = y, ax = ax)
    ax.set_title(title)