In [1]:
# --- Imports: standard library first, then third-party ----------------------
import os
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.express as px
from IPython.display import display, HTML
from scipy.stats import skew
from sklearn.preprocessing import RobustScaler, MinMaxScaler, StandardScaler
# tqdm.auto picks the notebook or console progress bar just like
# tqdm.autonotebook, but does not emit TqdmExperimentalWarning on import.
from tqdm.auto import tqdm

# --- Notebook configuration --------------------------------------------------
# Render floats with three decimals in every DataFrame/Series repr.
pd.set_option('display.float_format', lambda x: '%.3f' % x)

# %load_ext autoreload
# %autoreload 2
np.random.seed(42)  # seed NumPy's legacy global RNG
tqdm.pandas()       # register the tqdm progress_* methods on pandas objects

  from tqdm.autonotebook import tqdm


In [2]:
# Directory the parquet inputs are read from; the path is relative, so it is
# resolved against the kernel's current working directory.
preprocessed = Path("Data/processed/")

In [3]:
# Read the imputed train / test splits from the processed-data directory.
train_df = pd.read_parquet(preprocessed.joinpath("imputed_train.parquet"))
test_df = pd.read_parquet(preprocessed.joinpath("imputed_test.parquet"))

# Report both shapes (one per line), then preview the training frame.
print(train_df.shape, test_df.shape, sep="\n")
train_df.head()

(866, 28)
(134, 28)


Unnamed: 0,treatment_company,azimuth,md (ft),tvd (ft),operator,footage lateral length,well spacing,porpoise deviation,porpoise count,shale footage,...,s-velocity,youngs modulus,isip,breakdown pressure,pump rate,total number of stages,proppant volume,proppant fluid ratio,year,production
0,1,-14.43,14404,6437.0,1,7020.0,1152.576,34.4,9,0,...,6955.35,30.83,4491.0,6627.5,97,34,14222506.0,1.32,2016,3861.743
1,1,-16.32,10535,6035.0,1,4343.0,1852.01,6.96,3,0,...,7008.05,31.05,4176.0,7748.0,95,14,4437034.0,1.35,2013,512.233
2,8,-31.371,15111,6318.0,1,8372.0,2362.482,41.96,15,10042,...,6998.7,30.93,4650.647,6675.833,78,32,11923180.0,1.24,2013,1107.987
3,1,-48.48,12667,6714.0,1,5529.0,1052.818,10.64,3,2575,...,6877.28,30.66,4552.0,7625.25,99,36,10940004.0,1.23,2017,2943.454
4,1,-28.69,12606,6832.0,1,5121.0,902.851,46.2,6,0,...,7015.34,31.11,4939.0,7625.25,100,50,9865250.0,1.06,2017,2513.222


For simplicity, the feature engineering below is applied only to the most significant variables identified during the EDA.

In [4]:
# Features selected for the aggregation step that follows.
top_feats = [
    "md (ft)",
    "footage lateral length",
    "total number of stages",
    "proppant volume",
    "youngs modulus",
    "p-velocity",
]

# Transposed summary statistics (one row per feature), then dtypes and
# non-null counts for the same columns.
display(train_df.loc[:, top_feats].describe().transpose())
train_df.loc[:, top_feats].info()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
md (ft),866.0,13987.091,2606.818,8642.0,12174.75,13629.5,15414.0,27797.0
footage lateral length,866.0,6386.107,2533.968,1329.0,4666.0,5942.5,7710.25,19849.0
total number of stages,866.0,28.908,15.559,9.0,16.25,26.0,38.75,110.0
proppant volume,866.0,10541862.315,6234330.271,176096.0,6245604.75,9082286.0,13499181.75,41950368.0
youngs modulus,866.0,30.266,1.485,25.64,29.648,30.84,31.13,33.1
p-velocity,866.0,12761.496,711.953,10631.34,12199.06,13154.825,13273.49,13978.78


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 866 entries, 0 to 865
Data columns (total 6 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   md (ft)                 866 non-null    int64  
 1   footage lateral length  866 non-null    float64
 2   total number of stages  866 non-null    int64  
 3   proppant volume         866 non-null    float64
 4   youngs modulus          866 non-null    float64
 5   p-velocity              866 non-null    float64
dtypes: float64(4), int64(2)
memory usage: 40.7 KB


In [5]:
def _add_group_aggregations(data, feat, keys, tag):
    """Attach per-group min / max / std / skew of ``feat`` to each row of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the ``keys`` columns and the ``feat`` column. It is
        not modified; a new frame is returned.
    feat : str
        Name of the column to aggregate.
    keys : list of str
        Columns that define the groups.
    tag : str
        Infix used to build the new column names: ``{stat}_{tag}_{feat}``.

    Returns
    -------
    pandas.DataFrame
        ``data`` (same rows, same order) with four extra columns appended:
        ``min_``, ``max_``, ``std_`` and ``skew_`` + ``{tag}_{feat}``.
        Statistics that are undefined for a group (NaN) are stored as 0.
    """
    new_cols = [f"{stat}_{tag}_{feat}" for stat in ("min", "max", "std", "skew")]

    # String aggregators pin pandas' own implementations (notably the sample
    # standard deviation, ddof=1). Passing np.min / np.max / np.std is
    # deprecated in recent pandas and would eventually switch to NumPy's
    # semantics. `skew` is scipy.stats.skew called with its default arguments.
    # groupby already sorts by the keys, so no explicit pre-sort is needed.
    group_stats = data.groupby(keys, as_index=False).agg(
        {feat: ["min", "max", "std", skew]}
    )
    # The aggregation yields two-level column labels; flatten them by position.
    group_stats.columns = [*keys, *new_cols]
    group_stats = group_stats.fillna(0)

    # Drop any previous copy of these columns so that calling the function a
    # second time replaces them instead of producing `_x` / `_y` duplicates.
    base = data.drop(columns=[col for col in new_cols if col in data.columns])

    # merge returns a new frame, so the caller's `data` stays untouched.
    return pd.merge(base, group_stats, on=keys, how="left")


def create_year_operation_aggregations(data, feat):
    """Add min / max / std / skew of ``feat`` per (year, operator) group.

    New columns are named ``min_year_op_{feat}``, ``max_year_op_{feat}``,
    ``std_year_op_{feat}`` and ``skew_year_op_{feat}``. Returns a new frame;
    ``data`` is left unchanged.
    """
    return _add_group_aggregations(data, feat, keys=["year", "operator"], tag="year_op")


def create_year_aggregations(data, feat):
    """Add min / max / std / skew of ``feat`` per year.

    New columns are named ``min_year_{feat}``, ``max_year_{feat}``,
    ``std_year_{feat}`` and ``skew_year_{feat}``. Returns a new frame;
    ``data`` is left unchanged.
    """
    return _add_group_aggregations(data, feat, keys=["year"], tag="year")

In [6]:
# Columns that identify a (year, operator) group in both frames.
group_keys = ["year", "operator"]

with warnings.catch_warnings():
    # Only numeric RuntimeWarnings are silenced; deprecation and other warnings
    # stay visible instead of being hidden by a blanket "ignore".
    # NOTE(review): assumes the noise the original blanket filter was hiding
    # came from the skew computation on tiny / constant groups; confirm.
    warnings.filterwarnings("ignore", category=RuntimeWarning)
    for feat in top_feats:
        if f"min_year_op_{feat}" in train_df.columns:
            # Cell was already run for this feature; skipping keeps it re-runnable.
            continue

        cols_before = set(train_df.columns)
        train_df = create_year_operation_aggregations(train_df, feat)
        stat_cols = [col for col in train_df.columns if col not in cols_before]

        # Group statistics are learned on the training set only and then
        # looked up for the test rows, so both frames carry the same feature
        # definition and nothing is estimated from the test data itself.
        group_stats = train_df[group_keys + stat_cols].drop_duplicates(subset=group_keys)
        test_df = test_df.merge(group_stats, on=group_keys, how="left")
        # (year, operator) pairs never seen in training get 0, the same fill
        # value used for undefined statistics; dtypes are aligned with train.
        test_df[stat_cols] = (
            test_df[stat_cols].fillna(0).astype(train_df[stat_cols].dtypes.to_dict())
        )
        # Year-only aggregations (create_year_aggregations) are not applied here.

print(train_df.shape)
print(test_df.shape)
train_df.head()

(866, 52)
(134, 52)


Unnamed: 0,treatment_company,azimuth,md (ft),tvd (ft),operator,footage lateral length,well spacing,porpoise deviation,porpoise count,shale footage,...,std_year_op_proppant volume,skew_year_op_proppant volume,min_year_op_youngs modulus,max_year_op_youngs modulus,std_year_op_youngs modulus,skew_year_op_youngs modulus,min_year_op_p-velocity,max_year_op_p-velocity,std_year_op_p-velocity,skew_year_op_p-velocity
0,1,-14.43,14404,6437.0,1,7020.0,1152.576,34.4,9,0,...,4264601.167,0.499,28.12,31.17,1.397,0.017,12290.63,13595.4,546.589,0.335
1,1,-16.32,10535,6035.0,1,4343.0,1852.01,6.96,3,0,...,1633163.174,1.734,25.93,31.07,2.142,-0.512,12070.11,13593.93,536.128,-0.843
2,8,-31.371,15111,6318.0,1,8372.0,2362.482,41.96,15,10042,...,1633163.174,1.734,25.93,31.07,2.142,-0.512,12070.11,13593.93,536.128,-0.843
3,1,-48.48,12667,6714.0,1,5529.0,1052.818,10.64,3,2575,...,4406462.698,2.322,28.31,31.14,1.08,-1.225,12230.7,13591.62,412.831,-0.994
4,1,-28.69,12606,6832.0,1,5121.0,902.851,46.2,6,0,...,4406462.698,2.322,28.31,31.14,1.08,-1.225,12230.7,13591.62,412.831,-0.994


# Save to disk

In [7]:
# Write the engineered frames into the same directory the imputed inputs were
# read from, reusing `preprocessed` instead of repeating the hard-coded path.
train_df.to_parquet(preprocessed / "engineered_train.parquet")
test_df.to_parquet(preprocessed / "engineered_test.parquet")