# LANL Earthquake Prediction

## Preparation

In [1]:
# Input: directory holding the raw data and the training CSV inside it.
DATA_DIR, TRAIN_FILE = 'data/', 'train/train.csv'

# Output: cache directory and base file name (presumably for the extracted
# per-segment features; confirm against the feature-extraction cell).
CACHE_DIR, SEGMENTS_FILE = 'cache/', 'segments.csv'

# TRAIN_ROWS is handed to pd.read_csv as `nrows` (None = no row limit);
# SEGMENT_ROWS is the number of consecutive rows that make up one segment.
TRAIN_ROWS, SEGMENT_ROWS = None, 50000

In [2]:
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm
pd.options.display.float_format = '{:,.10f}'.format

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

In [3]:
# NOTE: this cell used to start with an incomplete `from utils.file` statement,
# which is a SyntaxError and stopped the whole cell from running. No name from
# that module is used anywhere visible in this notebook, so it is dropped.

# Full path of the training CSV, built from the config constants.
train_path = '{data_dir}{train_file}'.format(data_dir=DATA_DIR, train_file=TRAIN_FILE)

# Explicit column dtypes for the two CSV columns (16-bit signal, 64-bit target).
data_type = {'acoustic_data': np.int16, 'time_to_failure': np.float64}

print('Reading file:{file_path} into dataframe:'.format(file_path=train_path))
# nrows=None reads the complete file; set TRAIN_ROWS to an int for a quick run.
data = pd.read_csv(train_path,
                   nrows=TRAIN_ROWS,
                   dtype=data_type)

Reading file:data/train/train.csv into dataframe:


In [4]:
# Rename the first two columns to the short names used below. Any columns after
# them (i.e. `id` once this cell has run) keep their names, so re-running the
# cell no longer fails with a length-mismatch error.
data.columns = ['x', 'y'] + list(data.columns[2:])

# Constant id column: tsfresh needs a `column_id` to group rows by.
data['id'] = 1
data.head(1)

Unnamed: 0,x,y,id
0,12,1.4690999832,1


In [5]:
# Number of complete SEGMENT_ROWS-sized segments; a trailing partial segment is not counted.
segments = len(data) // SEGMENT_ROWS

In [6]:
# X_train = pd.DataFrame(dtype=np.float64)
# y_train = pd.DataFrame(dtype=np.float64)

In [7]:
from tsfresh import extract_features

def extract_tf_features(tupl):
    """Extract tsfresh features for one segment and write them to a CSV file.

    Parameters
    ----------
    tupl : tuple
        ``(segment, seg)``: ``segment`` is the segment number and ``seg`` is the
        DataFrame slice for that segment. ``seg`` must provide the columns
        ``id``, ``x`` and ``y``.

    Returns
    -------
    None
        The extracted features, plus a ``y`` column holding the last ``y``
        value of the segment, are written to
        ``CACHE_DIR + SEGMENTS_FILE + str(segment)``.
    """
    import os  # local import so the function does not depend on another cell's imports

    segment, seg = tupl[0], tupl[1]
    print('extract_tf_features.call.segment={segment}'.format(segment=segment))

    # Build the output prefix here rather than relying on a notebook-level
    # `segments_path`, which is only defined in a commented-out cell.
    segments_path = '{cache_dir}{segments_file}'.format(cache_dir=CACHE_DIR,
                                                        segments_file=SEGMENTS_FILE)
    # Create the cache directory up front so a missing directory fails (or is
    # fixed) before the expensive extraction, not after it.
    if CACHE_DIR:
        os.makedirs(CACHE_DIR, exist_ok=True)

    signal = seg[['id', 'x']]        # input frame for tsfresh
    target = seg['y'].values[-1]     # label: last y value of the segment

    extracted_features = extract_features(signal, column_id='id')

    # DataFrame.append was removed in pandas 2.0; attach the label directly.
    features = extracted_features.assign(y=target)
    features.to_csv(segments_path + str(segment))

    print('extract_tf_features.success.segment={segment}'.format(segment=segment))

In [None]:
import multiprocessing

# Leave two cores free for the rest of the system, but never go below one
# worker: Pool() raises ValueError for a non-positive worker count, which is
# what `cpu_count() - 2` yields on machines with one or two CPUs.
num_procs = max(1, multiprocessing.cpu_count() - 2)
num_procs

6

In [None]:
from multiprocessing.dummy import Pool

# One (segment number, segment frame) task per complete segment.
tasks = [(segment, data.iloc[segment * SEGMENT_ROWS:(segment + 1) * SEGMENT_ROWS])
         for segment in range(segments)]

# multiprocessing.dummy.Pool is a thread-backed pool with the Pool API. map()
# blocks until every task has finished; the with-block then shuts the workers
# down instead of leaving them alive for the rest of the kernel session.
with Pool(num_procs) as pool:
    pool.map(extract_tf_features, tasks)
# for segment in tqdm(range(segments)):
#     seg = data.iloc[segment*SEGMENT_ROWS:segment*SEGMENT_ROWS+SEGMENT_ROWS]
#     x = seg[['id','x']]   # pd series
#     y = seg['y'].values[-1]  # single value
    
#     extracted_features = extract_features(x, column_id='id')
    
#     X_train = X_train.append(extracted_features)
#     y_train.loc[segment, 'y'] = y
    
#     X_train['y'] = y
#     X_train.to_csv(segments_path+str(segment))

extract_tf_features.call.segment=0
extract_tf_features.call.segment=525
extract_tf_features.call.segment=1050
extract_tf_features.call.segment=1575
extract_tf_features.call.segment=2100
extract_tf_features.call.segment=2625


Feature Extraction:   0%|          | 0/1 [00:00<?, ?it/s]
Feature Extraction:   0%|          | 0/1 [00:00<?, ?it/s][A

Feature Extraction:   0%|          | 0/1 [00:00<?, ?it/s][A[A


Feature Extraction:   0%|          | 0/1 [00:00<?, ?it/s][A[A[A



Feature Extraction:   0%|          | 0/1 [00:00<?, ?it/s][A[A[A[A




Feature Extraction:   0%|          | 0/1 [00:00<?, ?it/s][A[A[A[A[A

In [None]:
# segments_path = '{cache_dir}{segments_file}'.format(cache_dir=CACHE_DIR,segments_file=SEGMENTS_FILE)

# X_train['y']=y_train['y']
# X_train.to_csv(segments_path)