In [1]:
import pandas as pd
import warnings
# Silence every warning category for the rest of the session.
warnings.simplefilter("ignore")

# Show all columns and all rows when a DataFrame is displayed (None = no limit).
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [2]:
# Load the cleaned monthly dataset.
# The path is relative to the notebook's folder (same "../data/..." layout the
# save step at the end of the notebook uses), so the notebook runs on any
# machine instead of depending on one user's absolute home-directory path.
# NOTE(review): the preview below shows an "Unnamed: 0" column, which looks like
# a row index written out by an earlier to_csv call; it is carried through to
# the saved datasets — confirm whether it should be dropped.
df = pd.read_csv("../data/processed/cleaned_data.csv")
df.head()

Unnamed: 0,Total_Accidents,Total_Release_BBLS,Total_Recovered_BBLS,Avg_Prop_Damage,Avg_Env_Damage,Total_Fatalities,Total_Injuries,Operational_Rigs,Distillation_Capacity,Refinery_Inputs,%_Utilization_of_Refinery_Operable_Capacity,Date
0,34,7562.18,2590.44,16872.529412,236234.205882,0,0,433.0,17597.0,14065.0,79.9,2010-01-01
1,16,13791.04,29.95,78075.0625,15665.5,0,0,446.0,17584.0,14267.0,81.1,2010-02-01
2,32,3663.14,2689.68,79171.84375,30998.28125,0,0,471.0,17584.0,14630.0,83.2,2010-03-01
3,32,3054.18,392.01,104276.6875,148810.875,0,1,508.0,17589.0,15592.0,88.7,2010-04-01
4,30,5102.98,2841.72,949891.0,75947.833333,0,0,541.0,17589.0,15510.0,88.2,2010-05-01


## Creating the Strain Index Column

### Strain_Index = 1 - (Refinery_Inputs / Distillation_Capacity)

In [3]:
# Strain_Index: share of distillation capacity not taken up by refinery inputs,
# i.e. 1 - (Refinery_Inputs / Distillation_Capacity), computed row by row.
df["Strain_Index"] = 1 - df["Refinery_Inputs"].div(df["Distillation_Capacity"])


In [4]:
# Lagged features: the value from the previous row and from 3, 6 and 12 rows earlier.
#
# shift() moves values by row position, not by calendar date, so the frame is
# put into chronological order *before* the lags are built; on an unsorted
# input the lags would silently point at the wrong months.
# sort_values() returns a new frame, so the original `df` is left untouched.
# NOTE(review): "N months ago" only holds if there is exactly one row per month
# with no gaps — confirm against the cleaned data.
engineered_df = df.sort_values("Date").reset_index(drop=True)

for col in ["Strain_Index", "Total_Accidents", "Operational_Rigs"]:
    for lag in [1, 3, 6, 12]:
        # Row t receives the value of `col` from row t - lag (NaN for the first `lag` rows).
        engineered_df[f"{col}_lag{lag}"] = engineered_df[col].shift(lag)

In [5]:
# Rolling-window mean and sum over 3, 6 and 12 rows for each selected column.
# The series is shifted down by one row first, so the window that lands on
# row t covers rows t-1 and earlier and never includes row t's own value.
for feature in ["Strain_Index", "Total_Accidents", "Total_Release_BBLS", "Operational_Rigs"]:
    previous_values = engineered_df[feature].shift(1)
    for window in [3, 6, 12]:
        trailing = previous_values.rolling(window)
        engineered_df[f"{feature}_roll{window}_mean"] = trailing.mean()
        engineered_df[f"{feature}_roll{window}_sum"] = trailing.sum()


In [6]:
# Prediction target: the Strain_Index of the following row (one step ahead).
# Shifting by -1 pulls each next value up one row, so the last row has no target (NaN).
engineered_df["target_strain_index"] = engineered_df["Strain_Index"].shift(periods=-1)


In [7]:
# Preview the first ten rows to eyeball the NaN warm-up of the lag/rolling columns.
engineered_df.head(n=10)

Unnamed: 0,Total_Accidents,Total_Release_BBLS,Total_Recovered_BBLS,Avg_Prop_Damage,Avg_Env_Damage,Total_Fatalities,Total_Injuries,Operational_Rigs,Distillation_Capacity,Refinery_Inputs,%_Utilization_of_Refinery_Operable_Capacity,Date,Strain_Index,Strain_Index_lag1,Strain_Index_lag3,Strain_Index_lag6,Strain_Index_lag12,Total_Accidents_lag1,Total_Accidents_lag3,Total_Accidents_lag6,Total_Accidents_lag12,Operational_Rigs_lag1,Operational_Rigs_lag3,Operational_Rigs_lag6,Operational_Rigs_lag12,Strain_Index_roll3_mean,Strain_Index_roll3_sum,Strain_Index_roll6_mean,Strain_Index_roll6_sum,Strain_Index_roll12_mean,Strain_Index_roll12_sum,Total_Accidents_roll3_mean,Total_Accidents_roll3_sum,Total_Accidents_roll6_mean,Total_Accidents_roll6_sum,Total_Accidents_roll12_mean,Total_Accidents_roll12_sum,Total_Release_BBLS_roll3_mean,Total_Release_BBLS_roll3_sum,Total_Release_BBLS_roll6_mean,Total_Release_BBLS_roll6_sum,Total_Release_BBLS_roll12_mean,Total_Release_BBLS_roll12_sum,Operational_Rigs_roll3_mean,Operational_Rigs_roll3_sum,Operational_Rigs_roll6_mean,Operational_Rigs_roll6_sum,Operational_Rigs_roll12_mean,Operational_Rigs_roll12_sum,target_strain_index
0,34,7562.18,2590.44,16872.529412,236234.2,0,0,433.0,17597.0,14065.0,79.9,2010-01-01,0.200716,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.188637
1,16,13791.04,29.95,78075.0625,15665.5,0,0,446.0,17584.0,14267.0,81.1,2010-02-01,0.188637,0.200716,,,,34.0,,,,433.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.167994
2,32,3663.14,2689.68,79171.84375,30998.28,0,0,471.0,17584.0,14630.0,83.2,2010-03-01,0.167994,0.188637,,,,16.0,,,,446.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.113537
3,32,3054.18,392.01,104276.6875,148810.9,0,1,508.0,17589.0,15592.0,88.7,2010-04-01,0.113537,0.167994,0.200716,,,32.0,34.0,,,471.0,433.0,,,0.185782,0.557347,,,,,27.333333,82.0,,,,,8338.786667,25016.36,,,,,450.0,1350.0,,,,,0.118199
4,30,5102.98,2841.72,949891.0,75947.83,0,0,541.0,17589.0,15510.0,88.2,2010-05-01,0.118199,0.113537,0.188637,,,32.0,16.0,,,508.0,446.0,,,0.156723,0.470168,,,,,26.666667,80.0,,,,,6836.12,20508.36,,,,,475.0,1425.0,,,,,0.093021
5,29,6210.06,1475.72,25189.896552,499956.2,0,0,566.0,17523.0,15893.0,90.7,2010-06-01,0.093021,0.118199,0.167994,,,30.0,32.0,,,541.0,471.0,,,0.133243,0.399729,,,,,31.333333,94.0,,,,,3940.1,11820.3,,,,,506.666667,1520.0,,,,,0.088326
6,29,23690.21,18651.67,133785.241379,21931820.0,1,1,591.0,17594.0,16040.0,91.2,2010-07-01,0.088326,0.093021,0.113537,0.200716,,29.0,32.0,34.0,,566.0,508.0,433.0,,0.108252,0.324756,0.147017,0.882103,,,30.333333,91.0,28.833333,173.0,,,4789.073333,14367.22,6563.93,39383.58,,,538.333333,1615.0,494.166667,2965.0,,,0.10873
7,27,4674.29,1148.4,265007.925926,193617.8,0,0,644.0,17594.0,15681.0,89.1,2010-08-01,0.10873,0.088326,0.118199,0.188637,,29.0,30.0,16.0,,591.0,541.0,446.0,,0.099848,0.299545,0.128285,0.769713,,,29.333333,88.0,28.0,168.0,,,11667.75,35003.25,9251.935,55511.61,,,566.0,1698.0,520.5,3123.0,,,0.13533
8,29,8584.16,7893.32,112700.793103,584097.2,0,0,668.0,17594.0,15213.0,86.5,2010-09-01,0.13533,0.10873,0.093021,0.167994,,27.0,29.0,32.0,,644.0,566.0,471.0,,0.096692,0.290076,0.114968,0.689806,,,28.333333,85.0,29.833333,179.0,,,11524.853333,34574.56,7732.476667,46394.86,,,600.333333,1801.0,553.5,3321.0,,,0.174692
9,10,10619.68,10035.48,5110.0,34979.9,0,0,693.0,17528.0,14466.0,82.5,2010-10-01,0.174692,0.13533,0.088326,0.113537,,29.0,29.0,32.0,,668.0,591.0,508.0,,0.110795,0.332386,0.109524,0.657142,,,28.333333,85.0,29.333333,176.0,,,12316.22,36948.66,8552.646667,51315.88,,,634.333333,1903.0,586.333333,3518.0,,,0.134756


In [8]:
# Keep only fully populated rows, then renumber the index from 0.
# This removes the leading rows whose 12-step lag / rolling windows are not yet
# filled and the final row, which has no next-step target; any row with a
# missing value in another column is dropped as well.
engineered_df = engineered_df.dropna(axis="index", how="any").reset_index(drop=True)

In [9]:
# Chronological train / evaluation / holdout split.
# The boundaries are defined once so the three masks cannot drift apart.
EVAL_START = "2021-01-01"     # first date that belongs to the evaluation set
HOLDOUT_START = "2024-01-01"  # first date that belongs to the holdout set

engineered_df = engineered_df.sort_values("Date").reset_index(drop=True)

# Train: every row dated before 2021
train_df = engineered_df[engineered_df["Date"] < EVAL_START]

# Evaluation: 2021 through 2023
eval_df = engineered_df[(engineered_df["Date"] >= EVAL_START) & (engineered_df["Date"] < HOLDOUT_START)]

# Holdout: 2024 onwards
holdout_df = engineered_df[engineered_df["Date"] >= HOLDOUT_START]

# The three slices must cover every row exactly once.
assert len(train_df) + len(eval_df) + len(holdout_df) == len(engineered_df), \
    "train/eval/holdout split does not partition engineered_df"



In [10]:
# Save the three splits as CSV files under ../data/final_datasets
# (the folder is created if it does not exist yet).
import os

data_path = os.path.join("..", "data", "final_datasets")
os.makedirs(data_path, exist_ok=True)

datasets = {
    "train_df": train_df,
    "eval_df": eval_df,
    "holdout_df": holdout_df
}

# The loop variable is deliberately NOT named `df`: that name holds the raw
# frame loaded at the top of the notebook, and rebinding it here would leave
# `df` pointing at the holdout split, breaking any earlier cell that is re-run.
for name, split_df in datasets.items():
    save_path = os.path.join(data_path, f"{name}.csv")
    # index=False: the positional index carries no information after reset_index.
    split_df.to_csv(save_path, index=False)
    print(f"Saved {name}.csv to {save_path}")
print("All datasets saved")

Saved train_df.csv to ../data/final_datasets/train_df.csv
Saved eval_df.csv to ../data/final_datasets/eval_df.csv
Saved holdout_df.csv to ../data/final_datasets/holdout_df.csv
All datasets saved
