# EDA – Volve Time-Based Telemetry

Goal: quick sanity-checks and cleaning preview for time-indexed CSVs.

- Load raw CSV samples
- Apply `clean_time_log` with `TimeCleanConfig`
- Inspect missingness, distributions, basic correlations



In [2]:
import polars as pl
from pathlib import Path
import warnings

# Lazy load for large second time CSV (avoids OOM by not collecting everything)
path2 = Path("/home/yassi/dl_real_time_drilling/data/raw/time/Norway-Statoil-15_$47$_9-F-12 time.csv")
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    lazy_df2 = pl.scan_csv(path2)

# Preview schema without full load (using collect_schema to avoid performance warning)
print("Schema:")
print(lazy_df2.collect_schema())

# Quick sample (first 1000 rows) for EDA - this is fast and low-memory
df2_sample = lazy_df2.head(1000).collect()
print("\nSample shape:", df2_sample.shape)
print("\nSample head:")
print(df2_sample.head())

# Basic stats on sample
print("\nDescribe on sample:")
print(df2_sample.describe())




Schema:
Schema({'': Int64, 'Time s': String, 'MWD Stick-Slip PKtoPK RPM rpm': String, 'Average Hookload kkgf': Float64, 'Hole depth (MD) m': Float64, 'MWD Continuous Inclination dega': String, 'IMP/ARC Non-BHcorr Phase-Shift Resistivity 28-in. at 2 MHz ohm.m': String, 'Block Velocity m/s': String, 'Weight on Bit kkgf': String, 'Block Position m': Float64, 'ARC Gamma Ray (BH corrected) gAPI': String, 'ARC Equivalent Circulating Density g/cm3': String, 'Rate of Penetration m/h': String, 'MWD Shock Peak m/s2': String, 'ARC Shock Level unitless': String, 'Trip/Ream/Drill On Bottom Stat unitless': String, 'MWD Turbine RPM rpm': String, 'Average Rotary Speed rpm': String, 'Pump 1 Stroke Rate 1/min': String, 'ARC Annular Pressure kPa': String, 'STWD_RT ': String, 'Pump 3 Stroke Rate 1/min': String, 'Pass Name unitless': String, 'Bit Depth m': Float64, 'MWD Collar RPM rpm': String, 'Average Surface Torque kN.m': Float64, 'MWD Frame ID unitless': Int64, 'Bit Depth (TVD) m': String, 'Rate of Pen

ComputeError: could not parse `-499.5` as dtype `i64` at column 'MWD Frame ID unitless' (column number 27)

The current offset in the file is 7242368 bytes.

You might want to try:
- increasing `infer_schema_length` (e.g. `infer_schema_length=10000`),
- specifying correct dtype with the `schema_overrides` argument
- setting `ignore_errors` to `True`,
- adding `-499.5` to the `null_values` list.

Original error: ```remaining bytes non-empty```

In [None]:
import fireducks.pandas as pd
from pathlib import Path
import warnings

# Minimal load for third time CSV
path3 = Path("/home/yassi/dl_real_time_drilling/data/raw/time/Norway-Statoil-15_$47$_9-F-7 time.csv")
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    df3 = pd.read_csv(path3, nrows=5000, on_bad_lines="skip", low_memory=False)
df3.head()
print(f"Shape: {df3.shape}")



In [2]:
# Preview shapes and heads using LOGGER
for key, df in list(datasets.items())[:10]:  # cap preview to first 10 entries
    try:
        LOGGER.info(f"🗂 {key} shape={df.shape}")
        display(df.head(3))
    except Exception as exc:  # noqa: BLE001
        LOGGER.error(f"⚠️ Preview failed for {key}: {exc}")


🧭 INFO | 🗂 depth/Norway-NA-15_$47$_9-F-9 A depth.csv shape=(16670, 115)


Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Measured Depth m,TOFB s,AVG_CONF unitless,MIN_CONF unitless,Average Rotary Speed rpm,STUCK_RT unitless,Corrected Surface Weight on Bit kkgf,Corrected Total Hookload kkgf,...,MWD Collar RPM rpm,IMP/ARC Non-BHcorr Phase-Shift Resistivity 28-in. at 2 MHz ohm.m,IMP/ARC Phase-Shift Conductivity 40-in. at 2 MHz mS/m,Annular Temperature degC,IMP/ARC Non-BHcorr Phase-Shift Resistivity 40-in. at 2 MHz ohm.m,ARC Gamma Ray (BH corrected) gAPI,IMP/ARC Non-BHcorr Attenuation Resistivity 40-in. at 2 MHz ohm.m,MWD Stick-Slip PKtoPK RPM rpm,IMP/ARC Non-BHcorr Attenuation Resistivity 28-in. at 2 MHz ohm.m,IMP/ARC Phase-Shift Conductivity 28-in. at 2 MHz mS/m
0,0,0,273.101,,,,,,,,...,,,,,,,,,,
1,1,1,273.253,,,,,,,,...,,,,,,,,,,
2,2,2,273.406,,,,,,,,...,,,,,,,,,,


In [1]:
import fireducks.pandas as pd 

In [4]:
data = pd.read_csv("../data/raw/depth/Norway-NA-15_$47$_9-F-9 A depth.csv")
data.head()






Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Measured Depth m,TOFB s,AVG_CONF unitless,MIN_CONF unitless,Average Rotary Speed rpm,STUCK_RT unitless,Corrected Surface Weight on Bit kkgf,Corrected Total Hookload kkgf,...,MWD Collar RPM rpm,IMP/ARC Non-BHcorr Phase-Shift Resistivity 28-in. at 2 MHz ohm.m,IMP/ARC Phase-Shift Conductivity 40-in. at 2 MHz mS/m,Annular Temperature degC,IMP/ARC Non-BHcorr Phase-Shift Resistivity 40-in. at 2 MHz ohm.m,ARC Gamma Ray (BH corrected) gAPI,IMP/ARC Non-BHcorr Attenuation Resistivity 40-in. at 2 MHz ohm.m,MWD Stick-Slip PKtoPK RPM rpm,IMP/ARC Non-BHcorr Attenuation Resistivity 28-in. at 2 MHz ohm.m,IMP/ARC Phase-Shift Conductivity 28-in. at 2 MHz mS/m
0,0,0,273.101,,,,,,,,...,,,,,,,,,,
1,1,1,273.253,,,,,,,,...,,,,,,,,,,
2,2,2,273.406,,,,,,,,...,,,,,,,,,,
3,3,3,273.558,,,,,,,,...,,,,,,,,,,
4,4,4,273.71,,,,,,,,...,,,,,,,,,,


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Assuming df2_sample from cell 1; if not, rerun cell 1
# Step 1: Parse time column to numeric (assuming seconds since start)
df2_sample = df2_sample.with_columns(
    pl.col("Time s").str.strip_chars().str.to_numeric(strict=False).alias("Time_s_numeric")
).drop("Time s")  # Drop original if parsed successfully

# Step 2: Convert obvious numeric strings (handle errors to NaN)
numeric_cols = [
    "Average Hookload kkgf",
    "Hole depth (MD) m",
    "Block Position m",
    "Bit Depth m",
    "Average Surface Torque kN.m",
    "Bit on Bottom unitless",
    "Bit Depth (MD) m",
    "Average Standpipe Pressure kPa",
    "Rate of Penetration m/h",  # Add more as needed
    "Average Rotary Speed rpm",
    # ... extend based on domain knowledge
]

for col in numeric_cols:
    if col in df2_sample.columns:
        df2_sample = df2_sample.with_columns(
            pl.col(col).str.strip_chars().str.to_numeric(strict=False).alias(col)
        )

# Check for parsing success (null counts)
print("Missing values after parsing (top 10 columns):")
print(df2_sample.null_count().head(10))

# Step 3: Basic EDA on cleaned sample
print("\nTime range:", df2_sample["Time_s_numeric"].min(), "to", df2_sample["Time_s_numeric"].max())

# Plot key time-series (e.g., ROP and depth over time)
fig, axes = plt.subplots(2, 1, figsize=(12, 8))

# ROP vs Time
rop_series = df2_sample.select(["Time_s_numeric", "Rate of Penetration m/h"]).drop_nulls()
if len(rop_series) > 0:
    axes[0].plot(rop_series["Time_s_numeric"], rop_series["Rate of Penetration m/h"])
    axes[0].set_title("Rate of Penetration over Time")
    axes[0].set_xlabel("Time (s)")
    axes[0].set_ylabel("ROP (m/h)")

# Depth progression
depth_series = df2_sample.select(["Time_s_numeric", "Hole depth (MD) m"]).drop_nulls()
if len(depth_series) > 0:
    axes[1].plot(depth_series["Time_s_numeric"], depth_series["Hole depth (MD) m"])
    axes[1].set_title("Hole Depth over Time")
    axes[1].set_xlabel("Time (s)")
    axes[1].set_ylabel("Depth (m)")

plt.tight_layout()
plt.show()

# Correlation heatmap for numeric cols (subset to avoid clutter)
numeric_df = df2_sample.select([col for col in df2_sample.columns if df2_sample[col].dtype in [pl.Float64, pl.Int64]]).to_pandas()
if not numeric_df.empty:
    plt.figure(figsize=(10, 8))
    sns.heatmap(numeric_df.corr(), annot=False, cmap="coolwarm")
    plt.title("Correlation Heatmap (Numeric Columns)")
    plt.show()
