# Import Dependencies

In [1]:
import csv
import pandas as pd
import os
import pyproj
import numpy as np
from datetime import datetime, timedelta
from pathlib import Path

# Initial filtering and cleaning of the text file

In [6]:
# List of substrings to be removed
remove_substrings = ["PROPERTY: Pressure", "UNITS:    psi", "<", ">", "** "]

In [9]:
# Open the file
with open("../../data/data_seis/ndb_deep_oct2023_Pressure_allLayers_starting20170101.txt", "r") as file:
    # Read the lines and filter out lines containing specified substrings
    lines = []
    for line in file:
        if "** TIME" in line:
            line = line.replace("** ", "", 1)  # Remove only the first occurrence of "** "
            line = line.replace(":", "", 1)  # Remove only the first occurrence of "** "
        elif "<" in line or ">" in line:  # Check for "<" or ">" in the line
            line = line.replace("<", "").replace(">", "")  # Remove "<" and ">"
        elif any(substring in line for substring in remove_substrings):
            continue  # Skip lines containing other specified substrings
        lines.append(line.strip())
        

In [5]:
# Export filtered lines to a new text file
with open("../../data/data_seis/filtered_output.txt", "w") as output_file:
    for line in lines:
        output_file.write(line + "\n")

# Clean the new text file

In [22]:
# Read the data from the file
with open('../../data/data_seis/filtered_output.txt', 'r') as file:
    lines = file.readlines()

In [29]:
# Iterate through the lines
for i, line in enumerate(lines):
    if "Layer    1" in line:
        lines[i] = line.replace("Layer    1", "Layer_1")
    elif "Layer    2" in line:
        lines[i] = line.replace("Layer    2", "Layer_2")
    elif "Layer    3" in line:
        lines[i] = line.replace("Layer    3", "Layer_3")
    elif "Layer    4" in line:
        lines[i] = line.replace("Layer    4", "Layer_4")
    elif "Layer    5" in line:
        lines[i] = line.replace("Layer    5", "Layer_5")
    elif "Layer    6" in line:
        lines[i] = line.replace("Layer    6", "Layer_6")
    elif "Layer    7" in line:
        lines[i] = line.replace("Layer    7", "Layer_7")
    elif "Layer    8" in line:
        lines[i] = line.replace("Layer    8", "Layer_8")
    elif "Layer    9" in line:
        lines[i] = line.replace("Layer    9", "Layer_9")
    elif "Layer   10" in line:
        lines[i] = line.replace("Layer   10", "Layer_10")
    elif "Layer   11" in line:
        lines[i] = line.replace("Layer   11", "Layer_11")
    elif "Layer   12" in line:
        lines[i] = line.replace("Layer   12", "Layer_12")
    elif "Layer   13" in line:
        lines[i] = line.replace("Layer   13", "Layer_13")
    elif "Layer   14" in line:
        lines[i] = line.replace("Layer   14", "Layer_14")
    elif "Layer   15" in line:
        lines[i] = line.replace("Layer   15", "Layer_15")
    elif "Layer   16" in line:
        lines[i] = line.replace("Layer   16", "Layer_16")
    elif "Layer   17" in line:
        lines[i] = line.replace("Layer   17", "Layer_17")
    elif "Layer   18" in line:
        lines[i] = line.replace("Layer   18", "Layer_18")
    elif "Layer   19" in line:
        lines[i] = line.replace("Layer   19", "Layer_19")
    elif "Layer   20" in line:
        lines[i] = line.replace("Layer   20", "Layer_20")
    

In [30]:
# Write the modified lines to a new file
with open('../../data/data_seis/output_file.txt', 'w') as file:
    file.writelines(lines)

print("Replacement completed!")

Replacement completed!


# Convert the text file into a CSV

In [None]:
# Read the data from the text file
with open('../../data/data_seis/output_file.txt', 'r') as file:
    lines = file.readlines()

# Initialize variables to store the CSV data
csv_data = []
current_time = None

# Process each line of the input data
for line in lines:
    line = line.strip()  # Remove leading/trailing whitespace
    if line.startswith("TIME"):
        current_time = line.split()[1]  # Extract the TIME value
    elif line:  # Non-empty line
        parts = line.split()  # Split the line by whitespace
        print(parts)
        if len(parts) >= 3:  # Check if line has enough elements
            x, y, layer_value = parts[0], parts[1], parts[2]
            layer_number = parts[-1]  # Extract layer number from the header
            csv_data.append([current_time, x, y, layer_value, layer_number])

# Write the CSV data to a new file
with open('../../data/data_seis/output.csv', 'w') as csv_file:
    # Write headers
    csv_file.write('Time,X,Y,Layer,Layer Number\n')
    # Write data
    for entry in csv_data:
        csv_file.write(','.join(entry) + '\n')


# Clean the data in the CSV file

### Create a DataFrame using the CSV file

In [3]:
# Name of your CSV file
filename = '../../data/data_seis/output.csv'

# Construct the file path using os.path.join
file_path = os.path.join(filename)

In [4]:
# Read the CSV file into a DataFrame
df = pd.read_csv(file_path)

In [5]:
# Rename columns
df = df.rename(columns={
    'Time' : 'Time',
    'X' : 'X',
    'Y' : 'Y',
    'Layer' : 'Pressure',
    'Layer Number' : 'Layer'
})

In [6]:
df.head()

Unnamed: 0,Time,X,Y,Pressure,Layer
0,41670,X,Y,Layer_1,Layer_1
1,41670,2106604.41,3434119.83,7952.25,7952.25
2,41670,2111884.40,3434119.83,7970.05,7970.05
3,41670,2037964.57,3439399.82,7658.27,7658.27
4,41670,2043244.56,3439399.82,7754,7754


### Replace the values in the Layer column with the actual Layer Numbers

In [7]:
# Duplicate the dataframe
df_dupe = df

In [8]:
# Initialize number variable
number = None

# Iterate over each row
for index, row in df_dupe.iterrows():
    if row['Pressure'].startswith('Layer_'):
        number = row['Pressure']

    df_dupe.at[index, 'Layer'] = number

In [11]:
# Display the DataFrame
df_dupe.head()

Unnamed: 0,Time,X,Y,Pressure,Layer
0,41670,X,Y,Layer_1,Layer_1
1,41670,2106604.41,3434119.83,7952.25,Layer_1
2,41670,2111884.40,3434119.83,7970.05,Layer_1
3,41670,2037964.57,3439399.82,7658.27,Layer_1
4,41670,2043244.56,3439399.82,7754,Layer_1


In [15]:
# Filter the DataFrame to remove rows where 'X' column is equal to 'X'
df_dupe = df_dupe[df_dupe['X'] != 'X']

In [19]:
# Display the DataFrame
df_dupe.head()

Unnamed: 0,Time,X,Y,Pressure,Layer
1,41670,2106604.41,3434119.83,7952.25,Layer_1
2,41670,2111884.4,3434119.83,7970.05,Layer_1
3,41670,2037964.57,3439399.82,7658.27,Layer_1
4,41670,2043244.56,3439399.82,7754.0,Layer_1
5,41670,2048524.55,3439399.82,7828.24,Layer_1


### Covert the Time column values to DateTime

In [20]:
# Duplicate the df_dupe DataFrame
df_time = df_dupe

In [21]:
# Convert time to dates
df_time['Time'] = pd.to_datetime('1902-12-01') + pd.to_timedelta(df_time['Time'], unit='D')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_time['Time'] = pd.to_datetime('1902-12-01') + pd.to_timedelta(df_time['Time'], unit='D')


In [25]:
# Display the DataFrame with the updated values
df_time.head()

Unnamed: 0,Time,X,Y,Pressure,Layer
1,2017-01-01,2106604.41,3434119.83,7952.25,Layer_1
2,2017-01-01,2111884.4,3434119.83,7970.05,Layer_1
3,2017-01-01,2037964.57,3439399.82,7658.27,Layer_1
4,2017-01-01,2043244.56,3439399.82,7754.0,Layer_1
5,2017-01-01,2048524.55,3439399.82,7828.24,Layer_1


In [26]:
# Export df_time to a CSV file
df_time.to_csv('../../data/data_seis/Updated_Pressure_Data.csv', index=False)

# Convert the Coordinate system from NAD 1927 to WGS 84

In [27]:
# Define the custom Lambert Conformal Conic projection
proj_string = """
    +proj=lcc +datum=NAD83 +ellps=GRS80 +lon_0=-100 +lat_1=27.4166666666667
    +lat_2=34.9166666666667 +lat_0=31.1666666666667 +x_0=999999.9998984
    +y_0=999999.9998984 +unit=us-ft +no_defs
"""

# Initialize the Transformer object
transformer = pyproj.Transformer.from_proj(
    pyproj.Proj(proj_string),  # Source coordinate system
    pyproj.Proj(proj='latlong', datum='WGS84'),  # Destination coordinate system
    always_xy=True  
)

# Load the CSV file in chunks
csv_file_path = '../../data/data_seis/Updated_Pressure_Data.csv' 

# Set up a CSV writer to write the output to a new file
output_csv_path = '../../data/data_seis/Updated_Pressure_Data_with_LatLon.csv'
chunksize = 10000  # Process 10,000 rows at a time

# Write the header first
first_chunk = True

for chunk in pd.read_csv(csv_file_path, chunksize=chunksize):
    # Convert from US feet to meters
    chunk['X_meters'] = chunk['X'] * 0.3048
    chunk['Y_meters'] = chunk['Y'] * 0.3048

    # Transform to lat, lon
    chunk['Longitude'], chunk['Latitude'] = transformer.transform(chunk['X_meters'].values, chunk['Y_meters'].values)  

    # Drop the intermediate columns
    chunk.drop(['X_meters', 'Y_meters', 'X', 'Y'], axis=1, inplace=True)

    # Write the chunk to a CSV
    mode = 'w' if first_chunk else 'a'
    header = first_chunk  # Write header only in the first chunk
    chunk.to_csv(output_csv_path, mode=mode, index=False, header=header)
    first_chunk = False

# Return the path to the new CSV file
output_csv_path

'Updated_Pressure_Data_with_LatLon.csv'

# Delta Pressure Calculation

### Layer 13

In [2]:
# Read the CSV file again
df = pd.read_csv('../../data/data_seis/Updated_Pressure_Data_with_LatLon.csv')

df['Time'] = pd.to_datetime(df['Time'])
layer_13_df = df[df['Layer'] == 'Layer_13']

# Group by Longitude and Latitude for unique locations within Layer_13
grouped = layer_13_df.groupby(['Longitude', 'Latitude'])

# Initialize an empty DataFrame for results
delta_pressure_df = pd.DataFrame()

# Iterate through each group
for (longitude, latitude), group in grouped:
    # Filter to get the reference pressure on 2017-01-01 for each location
    reference_pressure_row = group[group['Time'] == pd.Timestamp('2017-01-01')]
    if not reference_pressure_row.empty:
        reference_pressure = reference_pressure_row.iloc[0]['Pressure']
        # Calculate delta pressure for each row in the group by subtracting the reference pressure
        group['Delta_Pressure'] = group['Pressure'] - reference_pressure
        # Append the processed group to the results DataFrame
        delta_pressure_df = pd.concat([delta_pressure_df, group])

# Resetting index of the final DataFrame
delta_pressure_df.reset_index(drop=True, inplace=True)

# Save the results DataFrame for this layer to a CSV file
output_path = f'../../data/data_seis/delta_pressure_layer13.csv'
delta_pressure_df.to_csv(output_path, index=False)


# Checking the first few rows of the final DataFrame
delta_pressure_df.head()

Unnamed: 0,Time,Pressure,Layer,Longitude,Latitude,Delta_Pressure
0,2017-01-01,2580.6,Layer_13,-105.142041,32.563859,0.0
1,2017-02-01,2580.65,Layer_13,-105.142041,32.563859,0.05
2,2017-03-01,2580.68,Layer_13,-105.142041,32.563859,0.08
3,2017-04-01,2580.72,Layer_13,-105.142041,32.563859,0.12
4,2017-05-01,2580.76,Layer_13,-105.142041,32.563859,0.16


### Layer 9

In [3]:
# Read the CSV file again
df = pd.read_csv('../../data/data_seis/Updated_Pressure_Data_with_LatLon.csv')

df['Time'] = pd.to_datetime(df['Time'])
layer_9_df = df[df['Layer'] == 'Layer_9']

# Group by Longitude and Latitude for unique locations within Layer_9
grouped = layer_9_df.groupby(['Longitude', 'Latitude'])

# Initialize an empty DataFrame for results
delta_pressure_df = pd.DataFrame()

# Iterate through each group
for (longitude, latitude), group in grouped:
    # Filter to get the reference pressure on 2017-01-01 for each location
    reference_pressure_row = group[group['Time'] == pd.Timestamp('2017-01-01')]
    if not reference_pressure_row.empty:
        reference_pressure = reference_pressure_row.iloc[0]['Pressure']
        # Calculate delta pressure for each row in the group by subtracting the reference pressure
        group['Delta_Pressure'] = group['Pressure'] - reference_pressure
        # Append the processed group to the results DataFrame
        delta_pressure_df = pd.concat([delta_pressure_df, group])

# Resetting index of the final DataFrame
delta_pressure_df.reset_index(drop=True, inplace=True)

# Save the results DataFrame for this layer to a CSV file
output_path = f'../../data/data_seis/delta_pressure_layer9.csv'
delta_pressure_df.to_csv(output_path, index=False)


# Checking the first few rows of the final DataFrame
delta_pressure_df.head()

Unnamed: 0,Time,Pressure,Layer,Longitude,Latitude,Delta_Pressure
0,2017-01-01,2152.14,Layer_9,-105.142041,32.563859,0.0
1,2017-02-01,2152.19,Layer_9,-105.142041,32.563859,0.05
2,2017-03-01,2152.23,Layer_9,-105.142041,32.563859,0.09
3,2017-04-01,2152.27,Layer_9,-105.142041,32.563859,0.13
4,2017-05-01,2152.31,Layer_9,-105.142041,32.563859,0.17


### Layer 11

In [4]:
# Read the CSV file again
df = pd.read_csv('../../data/data_seis/Updated_Pressure_Data_with_LatLon.csv')

df['Time'] = pd.to_datetime(df['Time'])
layer_11_df = df[df['Layer'] == 'Layer_11']

# Group by Longitude and Latitude for unique locations within Layer_11
grouped = layer_11_df.groupby(['Longitude', 'Latitude'])

# Initialize an empty DataFrame for results
delta_pressure_df = pd.DataFrame()

# Iterate through each group
for (longitude, latitude), group in grouped:
    # Filter to get the reference pressure on 2017-01-01 for each location
    reference_pressure_row = group[group['Time'] == pd.Timestamp('2017-01-01')]
    if not reference_pressure_row.empty:
        reference_pressure = reference_pressure_row.iloc[0]['Pressure']
        # Calculate delta pressure for each row in the group by subtracting the reference pressure
        group['Delta_Pressure'] = group['Pressure'] - reference_pressure
        # Append the processed group to the results DataFrame
        delta_pressure_df = pd.concat([delta_pressure_df, group])

# Resetting index of the final DataFrame
delta_pressure_df.reset_index(drop=True, inplace=True)

# Save the results DataFrame for this layer to a CSV file
output_path = f'../../data/data_seis/delta_pressure_layer11.csv'
delta_pressure_df.to_csv(output_path, index=False)


# Checking the first few rows of the final DataFrame
delta_pressure_df.head()

Unnamed: 0,Time,Pressure,Layer,Longitude,Latitude,Delta_Pressure
0,2017-01-01,2368.82,Layer_11,-105.142041,32.563859,0.0
1,2017-02-01,2368.86,Layer_11,-105.142041,32.563859,0.04
2,2017-03-01,2368.9,Layer_11,-105.142041,32.563859,0.08
3,2017-04-01,2368.94,Layer_11,-105.142041,32.563859,0.12
4,2017-05-01,2368.98,Layer_11,-105.142041,32.563859,0.16


### Layer 19

In [5]:
# Read the CSV file again
df = pd.read_csv('../../data/data_seis/Updated_Pressure_Data_with_LatLon.csv')

df['Time'] = pd.to_datetime(df['Time'])
layer_19_df = df[df['Layer'] == 'Layer_19']

# Group by Longitude and Latitude for unique locations within Layer_19
grouped = layer_19_df.groupby(['Longitude', 'Latitude'])

# Initialize an empty DataFrame for results
delta_pressure_df = pd.DataFrame()

# Iterate through each group
for (longitude, latitude), group in grouped:
    # Filter to get the reference pressure on 2017-01-01 for each location
    reference_pressure_row = group[group['Time'] == pd.Timestamp('2017-01-01')]
    if not reference_pressure_row.empty:
        reference_pressure = reference_pressure_row.iloc[0]['Pressure']
        # Calculate delta pressure for each row in the group by subtracting the reference pressure
        group['Delta_Pressure'] = group['Pressure'] - reference_pressure
        # Append the processed group to the results DataFrame
        delta_pressure_df = pd.concat([delta_pressure_df, group])

# Resetting index of the final DataFrame
delta_pressure_df.reset_index(drop=True, inplace=True)

# Save the results DataFrame for this layer to a CSV file
output_path = f'../../data/data_seis/delta_pressure_layer19.csv'
delta_pressure_df.to_csv(output_path, index=False)


# Checking the first few rows of the final DataFrame
delta_pressure_df.head()

Unnamed: 0,Time,Pressure,Layer,Longitude,Latitude,Delta_Pressure
0,2017-01-01,2946.01,Layer_19,-105.142041,32.563859,0.0
1,2017-02-01,2946.03,Layer_19,-105.142041,32.563859,0.02
2,2017-03-01,2946.06,Layer_19,-105.142041,32.563859,0.05
3,2017-04-01,2946.08,Layer_19,-105.142041,32.563859,0.07
4,2017-05-01,2946.11,Layer_19,-105.142041,32.563859,0.1


### All Layers

In [6]:
# Function to calculate delta pressure
def calculate_delta_pressure(group):
    # Sort group by time
    group = group.sort_values('Time')
    # Get the reference pressure from the first date (assumed to be the earliest in the group)
    reference_pressure = group.iloc[0]['Pressure']
    # Calculate delta pressure
    group['Delta'] = group['Pressure'] - reference_pressure
    return group

# Read CSV
df_path = '../../data/data_seis/Updated_Pressure_Data_with_LatLon.csv'
df = pd.read_csv(df_path)

# Convert 'Time' to datetime
df['Time'] = pd.to_datetime(df['Time'])

# Get unique layers
layers = df['Layer'].unique()

# Initialize a DataFrame to store all the delta pressures
all_layers_delta_df = pd.DataFrame()

# Calculate delta pressure for each layer and append to the all_layers_delta_df DataFrame
for layer in layers:
    layer_df = df[df['Layer'] == layer]
    # Group by 'Longitude' and 'Latitude' to handle each location separately
    grouped = layer_df.groupby(['Longitude', 'Latitude'])
    # Apply the calculate_delta_pressure function to each group
    delta_df = grouped.apply(calculate_delta_pressure).reset_index(drop=True)
    # Append to the all_layers_delta_df DataFrame
    all_layers_delta_df = pd.concat([all_layers_delta_df, delta_df], ignore_index=True)

# Save the combined DataFrame to a CSV file without headers for subsequent rows
output_file = '../../data/data_seis/delta_pressure_all_layers.csv'
all_layers_delta_df.to_csv(output_file, index=False)

all_layers_delta_df.head(), output_file

  delta_df = grouped.apply(calculate_delta_pressure).reset_index(drop=True)
  delta_df = grouped.apply(calculate_delta_pressure).reset_index(drop=True)
  delta_df = grouped.apply(calculate_delta_pressure).reset_index(drop=True)
  delta_df = grouped.apply(calculate_delta_pressure).reset_index(drop=True)
  delta_df = grouped.apply(calculate_delta_pressure).reset_index(drop=True)
  delta_df = grouped.apply(calculate_delta_pressure).reset_index(drop=True)
  delta_df = grouped.apply(calculate_delta_pressure).reset_index(drop=True)
  delta_df = grouped.apply(calculate_delta_pressure).reset_index(drop=True)
  delta_df = grouped.apply(calculate_delta_pressure).reset_index(drop=True)
  delta_df = grouped.apply(calculate_delta_pressure).reset_index(drop=True)
  delta_df = grouped.apply(calculate_delta_pressure).reset_index(drop=True)
  delta_df = grouped.apply(calculate_delta_pressure).reset_index(drop=True)
  delta_df = grouped.apply(calculate_delta_pressure).reset_index(drop=True)
  delta_df =

(        Time  Pressure    Layer   Longitude   Latitude  Delta
 0 2017-01-01   1595.32  Layer_1 -105.142041  32.563859   0.00
 1 2017-02-01   1595.37  Layer_1 -105.142041  32.563859   0.05
 2 2017-03-01   1595.42  Layer_1 -105.142041  32.563859   0.10
 3 2017-04-01   1595.47  Layer_1 -105.142041  32.563859   0.15
 4 2017-05-01   1595.52  Layer_1 -105.142041  32.563859   0.20,
 '../../data/data_seis/delta_pressure_all_layers.csv')