In [None]:
# Data preprocessing steps: data cleaning and signal filtering

In [None]:
import pandas as pd
import numpy as np
from scipy.signal import butter, lfilter

print("---Simulating Raw Data---")
np.random.seed(42)
depth = np.arange(0, 500, 0.1)

# Synthetic Rate of Penetration (ROP): a sinusoid centred on 30 with
# Gaussian noise (standard deviation 3) added on top.
rop_signal = 30 + 10 * np.sin(depth / 50)
noise = np.random.normal(loc=0, scale=3, size=rop_signal.shape)
noisy_rop = rop_signal + noise

# Deliberately damage the trace so the cleaning steps have work to do:
# a gap of ten missing samples and two isolated spikes.
noisy_rop[100:110] = np.nan
noisy_rop[[250, 400]] = [100, -50]

df = pd.DataFrame({'TVD': depth, 'ROP': noisy_rop})

# --- 2. Data Cleaning ---
print("\n---Cleaning Data---")
# Bridge the gap of missing samples by linear interpolation.
df['ROP_cleaned'] = df['ROP'].interpolate(method='linear')
print("Missing values handled.")

# Interquartile Range (IQR) fences: anything outside
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] is capped at the nearest fence.
Q1, Q3 = df['ROP_cleaned'].quantile([0.25, 0.75])
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
df['ROP_cleaned'] = df['ROP_cleaned'].clip(lower=lower_bound, upper=upper_bound)
print("Outliers removed.")

# --- 3. Signal Processing Filters ---
print("\n---Applying Signal Filters---")
# Centred rolling mean; windows that would run past either end of the
# series are left as NaN.
window_size_mean = 20
df['ROP_rolling_mean'] = (
    df['ROP_cleaned']
    .rolling(window=window_size_mean, center=True)
    .mean()
)
print(f"Rolling Mean applied with window size: {window_size_mean}.")
# Low-pass Filter
def butter_lowpass_filter(data, cutoff, fs, order):
    """Low-pass filter ``data`` with a digital Butterworth filter.

    The filter is applied causally along the last axis. Its internal state is
    initialised to the steady state for the first sample, so the output starts
    at the level of the signal instead of ramping up from zero.

    Args:
        data: Array-like of samples (e.g. a NumPy array or a pandas Series).
        cutoff: Cutoff frequency, in the same units as ``fs``.
        fs: Sampling frequency of ``data``.
        order: Order of the Butterworth filter.

    Returns:
        numpy.ndarray of floats with the same shape as ``data``.

    Raises:
        ValueError: If ``cutoff`` is not strictly between 0 and ``fs / 2``.
    """
    from scipy.signal import lfilter_zi

    nyq = 0.5 * fs
    if not 0 < cutoff < nyq:
        raise ValueError(
            f"cutoff must lie strictly between 0 and the Nyquist frequency "
            f"({nyq}); got {cutoff}"
        )
    normal_cutoff = cutoff / nyq
    b, a = butter(order, normal_cutoff, btype='low', analog=False)

    samples = np.asarray(data, dtype=float)
    if samples.size == 0:
        # Nothing to filter; there is no first sample to seed the state with.
        return samples

    # lfilter starts from a zero state by default, which makes the first
    # output samples climb from 0 towards the signal level. Scaling the
    # step-response steady state by the first sample removes that transient.
    zi = lfilter_zi(b, a)
    y, _ = lfilter(b, a, samples, zi=zi * samples[..., :1])
    return y

# The trace is indexed by depth (TVD), not time, so the sampling rate is the
# number of samples per depth unit, taken from the TVD spacing, rather than Hz.
fs = 1.0 / (df['TVD'].iloc[1] - df['TVD'].iloc[0]) # Sample rate, samples per depth unit
cutoff = 0.5 # Desired cutoff frequency of the filter, cycles per depth unit
order = 2 # The order of the filter
df['ROP_lowpass'] = butter_lowpass_filter(df['ROP_cleaned'], cutoff, fs, order)
print(f"Low-pass filter applied with cutoff frequency: {cutoff}.")

print("\nData processing complete. You can now visualize or use the filtered data.")

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.ensemble import AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score, mean_squared_error

# --- 1. Simulate a Labeled Dataset ---
# features = ['TVD', 'SWOB', 'STOR', 'ROP', 'GR', 'RPM', 'HoleSize', 'TFLO']
# target = 'DTCO' (Sonic Log)
print("---Simulating Labeled Data---")
np.random.seed(42)
data_size = 5000
features = ['TVD', 'SWOB', 'ROP', 'GR']
target = 'DTCO'

# The random draws happen in column order (SWOB, ROP, GR, then the DTCO
# noise), so the seeded values depend on the order of the entries below.
df_ml = pd.DataFrame({
    'TVD': np.linspace(0, 500, data_size),
    'SWOB': 100 * np.random.rand(data_size),
    'ROP': 50 * np.random.rand(data_size),
    'GR': 120 * np.random.rand(data_size),
    # Dummy target (Sonic Log): a sinusoid around 100 plus Gaussian noise.
    'DTCO': (100 + 50 * np.sin(np.linspace(0, 20, data_size))) + np.random.normal(0, 5, data_size),
})

# --- 2. Data Splitting and Scaling ---
X, y = df_ml[features], df_ml[target]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42
)

# The scaler is fitted on the training rows only and then applied to both
# partitions.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- 3. Model Training and Evaluation ---
print("\n---Training and Evaluating Models---")


def fit_and_score(label, estimator, X_fit, y_fit, X_eval, y_eval):
    """Fit ``estimator``, predict on the evaluation set and print its R-squared.

    Args:
        label: Name used in the progress and score messages.
        estimator: Object exposing ``fit(X, y)`` and ``predict(X)``.
        X_fit, y_fit: Data the estimator is fitted on.
        X_eval, y_eval: Data the estimator is scored on.

    Returns:
        The predictions for ``X_eval``.
    """
    print(f"\nTraining {label}...")
    estimator.fit(X_fit, y_fit)
    predictions = estimator.predict(X_eval)
    print(f"{label} R-squared: {r2_score(y_eval, predictions):.4f}")
    return predictions


# XGBoost Regressor
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
xgb_pred = fit_and_score("XGBoost", xgb_model, X_train_scaled, y_train, X_test_scaled, y_test)

# AdaBoost Regressor
adaboost_model = AdaBoostRegressor(n_estimators=100, random_state=42)
adaboost_pred = fit_and_score("AdaBoost", adaboost_model, X_train_scaled, y_train, X_test_scaled, y_test)

# Artificial Neural Network (ANN)
ann_model = MLPRegressor(hidden_layer_sizes=(100, 50), activation='relu', solver='adam', max_iter=500, random_state=42)
ann_pred = fit_and_score("ANN", ann_model, X_train_scaled, y_train, X_test_scaled, y_test)


Hybrid CNN-LSTM Model for Time Series Prediction

In [None]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score

# --- 1. Simulate Sequential Data ---
print("---Simulating Time-Series Data---")
np.random.seed(42)
data_size = 5000
depth = np.linspace(0, 500, data_size)

# Each feature is a sinusoid of depth plus Gaussian noise. The noise is drawn
# in column order (ROP, GR, TFLO), followed by the noise for the target.
features_ts = pd.DataFrame({
    'ROP': 10 * np.sin(depth / 10) + np.random.normal(0, 1, data_size),
    'GR': 20 * np.cos(depth / 15) + np.random.normal(0, 2, data_size),
    'TFLO': 5 * np.sin(depth / 12) + np.random.normal(0, 0.5, data_size),
})

# Sequential target (Sonic Log): a sinusoid around 100 with Gaussian noise.
dtco_ts = (100 + 50 * np.sin(depth / 10)) + np.random.normal(0, 5, data_size)

# --- 2. Preprocessing for CNN-LSTM ---
print("\n---Preprocessing Data for CNN-LSTM---")
# Normalize data. Features and target each get their own scaler so that
# fitting one does not overwrite the parameters learned for the other.
feature_scaler = MinMaxScaler()
features_scaled = feature_scaler.fit_transform(features_ts)

# dtco_ts is a NumPy array (it has no `.values` attribute); np.asarray also
# accepts a pandas Series, so either input is reshaped to one column.
target_scaler = MinMaxScaler()
dtco_scaled = target_scaler.fit_transform(np.asarray(dtco_ts).reshape(-1, 1))

# `scaler` stays bound to the target scaler because the evaluation step uses
# it to inverse-transform predictions back to original DTCO values.
scaler = target_scaler

# Create sequences for the model
def create_sequences(features, target, time_steps):
    """Build sliding windows of features paired with the next target value.

    Window ``s`` holds rows ``s`` to ``s + time_steps - 1`` of ``features``
    and is paired with ``target[s + time_steps]``.

    Args:
        features: Sliceable sequence of feature rows.
        target: Indexable sequence aligned with ``features``.
        time_steps: Number of consecutive rows in each window.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays containing the windows and their
        targets. Both are empty when ``features`` has no more than
        ``time_steps`` rows.
    """
    starts = range(len(features) - time_steps)
    windows = [features[s:s + time_steps] for s in starts]
    labels = [target[s + time_steps] for s in starts]
    return np.array(windows), np.array(labels)

time_steps = 10
X_seq, y_seq = create_sequences(features_scaled, dtco_scaled, time_steps)

# shuffle=False keeps the windows in their original order, so the test set is
# the final 15% of the sequence rather than a random sample.
X_train, X_test, y_train, y_test = train_test_split(
    X_seq,
    y_seq,
    test_size=0.15,
    shuffle=False,
    random_state=42,
)

# --- 3. Build and Train Hybrid CNN-LSTM Model ---
print("\n---Building and Training Hybrid CNN-LSTM Model---")
model = Sequential([
    # Convolution over the time axis to extract local features.
    Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=X_train.shape[1:]),
    Dropout(0.2),
    # Recurrent layer to learn patterns across the convolved sequence.
    LSTM(50, activation='relu'),
    Dropout(0.2),
    # Single-unit output layer.
    Dense(1),
])

model.compile(optimizer='adam', loss='mean_squared_error')
model.fit(
    X_train,
    y_train,
    epochs=20,
    batch_size=32,
    validation_split=0.1,
    verbose=1,
)

# --- 4. Evaluate Model ---
print("\n---Evaluating Model---")
y_pred_scaled = model.predict(X_test)

# Undo the scaling on both the reference values and the predictions so the
# score is computed on original values.
y_test_orig, y_pred = (
    scaler.inverse_transform(scaled) for scaled in (y_test, y_pred_scaled)
)
print(f"Hybrid CNN-LSTM R-squared: {r2_score(y_test_orig, y_pred):.4f}")