# Edits a CSV, adding Gaussian estimates for XGBoost
Creates a column to the right using the oil data column and the xy coordinates of the training points.
Uses an external file that specifies the training indexes.

In [1]:
from google.colab import drive
import pandas as pd
import numpy as np

drive.mount('/content/drive')
# Load the training and testing data
df = pd.read_csv('/content/drive/MyDrive/Datathon 2024/Data/preprocessed_training.csv')

# Gets the testing and training indices from the csv in the drive
training_indices = np.array(pd.read_csv('/content/drive/MyDrive/Datathon 2024/Data/StandardSplit.csv')["train_test_split"])
training_indices -= 1

# Saving the target variable standard deviation and mean before normalization to convert the data back later
oil_std = df["OilPeakRate"].std()
oil_mean = df["OilPeakRate"].mean()
coord_std = (df["surface_x"].std() + df["surface_y"].std()) / 2

# Mean standard deviation normalization function
def mean_std_normalization(column):
    mean = column.mean()
    std_dev = column.std()
    if column.name not in ("surface_x", "surface_y"):
      normalized_column = (column - mean) / std_dev
    else:
      normalized_column = (column - mean) / coord_std
    return normalized_column

# Apply mean standard deviation normalization to each column
df = df.apply(mean_std_normalization)

# df_normalized now contains mean standard deviation normalized values for each column
df.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,OilPeakRate,surface_x,surface_y,standardized_operator_name,gross_perforated_length,total_proppant,total_fluid,true_vertical_depth,proppant_intensity,frac_fluid_intensity,...,Undefined,Inner Well,Outer Well,Batch-Concurrent Frac,Batch-Sequential Frac,Non-Batch Frac,Infill Child Well,Sibling Well,Standalone Well,Unknown
0,-1.112048,1.47369,-1.489933,-0.332919,-1.004684,-1.381713,-1.235418,-0.064585,-1.685204,-1.56158,...,-0.233983,-0.571134,-0.668564,-0.560283,-0.412411,-0.757834,-0.485485,-0.756733,1.171525,-0.155543
1,-1.027789,1.460921,-1.511878,-1.248849,-1.334935,-1.303728,-1.240156,-0.110165,-1.376399,-1.323028,...,-0.233983,-0.571134,1.495665,-0.560283,-0.412411,-0.757834,2.059687,-0.756733,-0.853544,-0.155543
2,-1.342079,1.459185,-1.487842,1.360165,-1.069325,-0.937889,-0.842386,-0.016885,-0.628038,-0.481106,...,-0.233983,-0.571134,-0.668564,-0.560283,-0.412411,1.319483,-0.485485,-0.756733,1.171525,-0.155543
3,-0.616681,1.473989,-1.533268,-1.248849,-1.10579,-1.31663,-1.286442,-0.105925,-1.505428,-1.522248,...,-0.233983,-0.571134,-0.668564,-0.560283,-0.412411,-0.757834,-0.485485,-0.756733,1.171525,-0.155543
4,-0.754609,1.45532,-1.549551,-1.248849,-1.068911,-1.304171,-1.216941,-0.090555,-1.488616,-1.367058,...,-0.233983,-0.571134,1.495665,-0.560283,-0.412411,-0.757834,2.059687,-0.756733,-0.853544,-0.155543


In [2]:
import torch
from sklearn.model_selection import train_test_split

X = df[df.drop(["OilPeakRate"], axis=1).columns]
X_train = X.iloc[training_indices] # Uses only the training indexes
X_test = X.drop(training_indices) # Uses all BUT the training indexes
y = df["OilPeakRate"]
y_train = y.iloc[training_indices] # Uses only the training indexes
y_test = y.drop(training_indices) # Uses all BUT the training indexes

X_train, X_test, y_train, y_test = torch.tensor(X_train.values, dtype=torch.float32), torch.tensor(X_test.values, dtype=torch.float32), torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1), torch.tensor(y_test.values, dtype=torch.float32).view(-1, 1)

len(X_train)

15442

## Defining the Gaussian estimate functions
An approximation of the oil peak rate value can be acquired by looking at the surrounding points. For each training and testing point, a Gaussian probability density function (PDF) is created. Then, the surrounding training wells will have their peak oil values multiplied by the PDF's value at the well's location. All of these products are summed and divided by the sum of all PDF values used. This is the approximation.

It is important to note that data leaks are avoided because only TRAINING wells are searched for in the surrounding area.

In [3]:
from scipy.spatial import cKDTree
from scipy.stats import norm

QUERY_RADIUS = 0.3 # Radius (in xy-coordinate standard deviations defined further above) at which the estimator will query for training points
STDEV = 0.01 # The standard deviation of the Gaussian distribution itself

def add_gaussian_approximations(train_tensor, train_y, test_tensor):
  """
  Returns the updated training tensors with a new column including the
  Gaussian estimates for each point
  """
  train_tensor, test_tensor, train_y = train_tensor.numpy(), test_tensor.numpy(), train_y.numpy()

  # Getting the xy points
  points = np.array([train_tensor[:,0], train_tensor[:,1]]).transpose()

  # Forming it into a KD tree for fast querying in the long run, very easy to search for nearby points
  kdtree = cKDTree(points)

  # Creating a lookup table for the oil values for each point
  data_dict = {}
  for i, row in enumerate(train_tensor):
    if row[0] not in data_dict:
      data_dict[row[0]] = {}
    data_dict[row[0]][row[1]] = train_y[i,0]

  def gaussian_pdf(x, mean, std_dev):
    """
    Returns the value of the Gaussian PDF at x with a distribution with a given mean and standard deviation
    """
    exponent = -0.5 * ((x - mean) / std_dev) ** 2
    pdf = (1 / (std_dev * np.sqrt(2 * np.pi))) * np.exp(exponent)
    return pdf

  def get_gaussian(point):
    """
    Gets the Gaussian estimate for a single point using the Gaussian PDF
    """
    point = np.array(point)

    # Queries for surrounding points
    other_points = points[kdtree.query_ball_point(point, QUERY_RADIUS)]

    if len(other_points) < 10:
      # If there is nothing in the query neighborhood, just use the entire training set. Should be rare so not much of a slowdown expected.
      other_points = points

    # Gets the list of oil values for each point
    vals = np.array([data_dict[pt[0]][pt[1]] for pt in other_points])

    # Calculates the Gaussian PDF value based on the distance from the point for each nearby training well
    distances = np.linalg.norm(other_points - point, axis=1)
    gaussians = gaussian_pdf(distances, 0, STDEV)
    gaussians, vals = gaussians[distances != 0], vals[distances != 0]

    # Adds the products and normalizes for the final estimate
    result = sum(vals * gaussians) / sum(gaussians)
    return result

  # Handles nan issues, adds the Gaussian column to the train and test data sets.
  train_results = torch.tensor(np.nan_to_num(np.array([get_gaussian([row[0], row[1]]) for row in torch.tensor(train_tensor)]).reshape(-1,1), nan=0))
  train_results = torch.cat((torch.tensor(train_tensor), train_results), dim=1)
  test_results = torch.tensor(np.nan_to_num(np.array([get_gaussian([row[0], row[1]]) for row in torch.tensor(test_tensor)]).reshape(-1,1), nan=0))
  test_results = torch.cat((torch.tensor(test_tensor), test_results), dim=1)
  return train_results, test_results

In [4]:
# Now that the functions are defined, we can calulate the Gaussian approximations!
X_train, X_test = add_gaussian_approximations(X_train, y_train, X_test)

  result = sum(vals * gaussians) / sum(gaussians)


In [5]:
# Validate that there is a correlation between the ground truth y data and the Gaussian estimation ( > 0.6 is amazing compared to other data points alone)
np.corrcoef(X_test[:,-1].numpy().flatten(), y_test.numpy().flatten())[0,1]

0.6598116659699832

## Creating the output file
Adds the gaussian approximation column to the dataframe and saving it as a csv file on the google drive.

In [9]:
df = pd.read_csv('/content/drive/MyDrive/Datathon 2024/Data/preprocessed_training.csv')
result = np.ones(len(df["surface_x"])) * 1000 # Sets 1000 as the arbitrary value of an empty part of the array
result[training_indices] = X_train[:,-1]
result[result == 1000] = X_test[:,-1]
result *= oil_std
result += oil_mean # Reconstructs the original scale of the target values before adding the column to the dataframe
df["gaussian_approximation"] = result
df.to_csv('/content/drive/MyDrive/Datathon 2024/Data/preprocessed_training_gaussian_approximations.csv', index=False)