# Feature Engineering Notebook

## Configure the MongoDB connection to load data

In [1]:
import numpy as np
import pandas as pd
import os
# The connector package coordinates are handed to PySpark through the
# PYSPARK_SUBMIT_ARGS environment variable, so it is set before the session is built.
pyspark_submit_args = '--packages org.mongodb.spark:mongo-spark-connector_2.11:2.4.0 pyspark-shell'
os.environ["PYSPARK_SUBMIT_ARGS"] = pyspark_submit_args
from pyspark.sql import SparkSession

# Build (or reuse) the session, registering the MongoDB input and output URIs.
spark = (
    SparkSession.builder
    .appName("myApp")
    .config("spark.mongodb.input.uri", "mongodb://18.221.66.227/project.small_data")
    .config("spark.mongodb.output.uri", "mongodb://18.221.66.227/project.test")
    .getOrCreate()
)

# No explicit options are passed to load(); presumably the connector falls back
# to the input URI configured above -- TODO confirm against the connector docs.
df = spark.read.format("com.mongodb.spark.sql.DefaultSource").load()

In [2]:
# Show the column names and types of the DataFrame loaded from MongoDB.
df.printSchema()

root
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)
 |-- acoustic_data: integer (nullable = true)
 |-- time_to_failure: double (nullable = true)



### Reading from a local file

In [4]:
# Read the local CSV and convert the whole frame to a 2-D NumPy array;
# calculate() reads column 0 as the signal and column 1 as the target.
data = np.array(pd.read_csv('small_train.csv'))

## Feature Engineering function

In [5]:
# Sizes of the leading / trailing sub-windows that get their own statistics.
_EDGE_SIZES = (50000, 10000)

# Output key -> quantile level, in the order the keys are emitted.
_QUANTILES = (
    ('q70', 0.70), ('q75', 0.75), ('q60', 0.60), ('q65', 0.65), ('q85', 0.85),
    ('q90', 0.90), ('q80', 0.80), ('q95', 0.95), ('q99', 0.99),
)


def _mean_change_rate(values):
    """Return the mean of the non-zero, finite relative changes between consecutive samples."""
    values = np.asarray(values, dtype=float)
    with np.errstate(divide='ignore', invalid='ignore'):
        # A zero sample makes the ratio inf (or nan for 0/0); those are dropped below.
        change = np.diff(values) / values[:-1]
    change = change[np.isfinite(change) & (change != 0)]
    # With no usable change left, report 0.0 instead of the nan np.mean would give.
    return float(change.mean()) if change.size else 0.0


def _edge_slice(signal, side, size):
    """Return the first or last `size` samples of `signal` (fewer if it is shorter)."""
    return signal[:size] if side == 'first' else signal[-size:]


def _window_features(segment):
    """Build the feature dict for one window of rows (column 0 = signal, column 1 = target)."""
    signal = np.asarray(segment[:, 0], dtype=float)
    abs_signal = np.abs(signal)
    edges = [(side, size) for size in _EDGE_SIZES for side in ('first', 'last')]

    feature = dict()
    # The target is the column-1 value of the last row in the window.
    feature['y_target'] = float(segment[-1, 1])
    feature['mean'] = float(signal.mean())
    feature['std'] = float(signal.std())
    feature['max1'] = float(signal.max())
    feature['min1'] = float(signal.min())
    feature['sum1'] = float(signal.sum())
    feature['abs_max'] = float(abs_signal.max())
    # Despite the key name this is the mean of the *signed* first differences.
    feature['mean_change_abs'] = float(np.mean(np.diff(signal)))
    feature['mean_change_rate'] = _mean_change_rate(signal)
    feature['abs_min'] = float(abs_signal.min())

    # Per-edge statistics, e.g. 'std_first_50000', 'avg_last_10000', ...
    for prefix, stat in (('std', np.std), ('avg', np.mean), ('min', np.min), ('max', np.max)):
        for side, size in edges:
            key = '%s_%s_%d' % (prefix, side, size)
            feature[key] = float(stat(_edge_slice(signal, side, size)))

    feature['max_to_min'] = float(signal.max() / np.abs(signal.min()))
    feature['max_to_min_diff'] = float(signal.max() - np.abs(signal.min()))
    feature['count_big'] = float(np.count_nonzero(abs_signal > 500))

    for side, size in edges:
        key = 'mean_change_rate_%s_%d' % (side, size)
        feature[key] = _mean_change_rate(_edge_slice(signal, side, size))

    # One np.quantile call for all levels instead of one call per level.
    levels = [level for _, level in _QUANTILES]
    for (key, _), value in zip(_QUANTILES, np.quantile(signal, levels)):
        feature[key] = float(value)
    return feature


def calculate(x, lines=10000, window=150000, seed=None):
    """Sample random fixed-size windows of `x` and compute summary features for each.

    Parameters
    ----------
    x : 2-D array-like
        Rows of observations; column 0 is the signal the statistics are computed
        on, column 1 supplies the target (taken from the last row of each window).
    lines : int
        Number of windows to sample, i.e. number of feature dicts returned.
    window : int
        Number of consecutive rows per window (default 150000).
    seed : int or None
        When given, window starts are drawn from a private
        ``np.random.RandomState(seed)`` so the result is reproducible; when None
        the global NumPy random state is used.

    Returns
    -------
    list of dict
        One dict of ``str -> float`` per sampled window.

    Raises
    ------
    ValueError
        If `window` is not positive or `x` has fewer than `window` rows.

    Notes
    -----
    Compared with the previous version of this function:
    * every valid start row can be drawn, including the last one, so an input of
      exactly `window` rows now works;
    * the ``mean_change_rate*`` features are the mean of the relative changes
      themselves (previously the mean *index* of the non-zero changes);
    * the abs-sum that used to be stored under 'abs_max' and then immediately
      overwritten is no longer computed.
    """
    x = np.asarray(x)
    if window < 1:
        raise ValueError("window must be a positive number of rows, got %r" % (window,))
    # Number of distinct start rows for which a full window fits.
    n_starts = len(x) - window + 1
    if n_starts < 1:
        raise ValueError(
            "need at least %d rows to sample a window, got %d" % (window, len(x)))

    rng = np.random if seed is None else np.random.RandomState(seed)
    features = []
    for _ in range(lines):
        start = rng.randint(n_starts)  # upper bound is exclusive
        features.append(_window_features(x[start:start + window]))
    return features

def filter(x, row_num, window=150000):
    """Return the `window` consecutive rows of `x` starting at `row_num`.

    Note: this function shadows Python's built-in ``filter`` for the rest of the
    notebook; the name is kept because calculate() calls it.

    Parameters
    ----------
    x : sliceable sequence (anything supporting ``len`` and slicing)
    row_num : int
        Index of the first row of the window.
    window : int
        Number of rows to return (default 150000).

    Returns
    -------
    The slice ``x[row_num:row_num + window]``.

    Raises
    ------
    ValueError
        If a full window starting at `row_num` does not fit inside `x`.
        (Previously an error string was returned instead, which made callers
        fail later with an unrelated indexing error.)
    """
    last_start = len(x) - window
    if row_num < 0 or row_num > last_start:
        raise ValueError(
            "Can't be computed: a %d-row window starting at row %d does not fit in %d rows."
            % (window, row_num, len(x)))
    return x[row_num:row_num + window]

## Calculate features and push them to MongoDB in batches of 10K data points

In [1]:
# Run 10 batches; each one computes calculate()'s default number of feature
# rows and appends them to MongoDB through the Spark connector.
for i in range(10):
    # The CSV is re-read on every pass because the array is deleted further down.
    data = np.array(pd.read_csv('small_train.csv'))
    f = calculate(data)
    # Drop the raw array before building the Spark DataFrame
    # (presumably to lower peak memory -- TODO confirm).
    del data
    # NOTE(review): `data` is rebound here from a NumPy array to a Spark DataFrame.
    data = spark.sparkContext.parallelize(f).toDF()
    # mode("append") adds to whatever is already stored, so re-running this
    # cell inserts another set of rows rather than replacing the old ones.
    data.write.format("com.mongodb.spark.sql.DefaultSource").mode("append").save()

**Each batch of 10K data points takes 13 minutes to process on a local machine.**