In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
# Imported in this cell rather than with the other imports because the two
# Kaggle-generated setup cells call into kagglehub before section 1.1 runs.
import kagglehub
# NOTE(review): presumably prompts for Kaggle credentials; the banner above
# ties it to private data sources — confirm whether public ones need it.
kagglehub.login()


In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

# Fetch the dataset through kagglehub and keep whatever the call returns.
# NOTE(review): this variable is not read again anywhere in the notebook; the
# loader defined later scans '/kaggle/input' by default — confirm the download
# actually lands there in the target environment.
fairuzazaria_rtm_stuck_prediction_datasets_path = kagglehub.dataset_download('fairuzazaria/rtm-stuck-prediction-datasets')

print('Data source import complete.')


# **DATA GATHERING**

## **1. PREPARATION**

### 1.0. Install Required Libraries

In [None]:
!pip install fastparquet

Collecting fastparquet
  Downloading fastparquet-2024.11.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.2 kB)
Downloading fastparquet-2024.11.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m26.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: fastparquet
Successfully installed fastparquet-2024.11.0


### 1.1. Import Libraries

In [None]:
import os, csv, glob
import math, pyarrow
import datetime, fastparquet

In [None]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import pyarrow.parquet as pq
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

In [None]:
from tqdm.notebook import tqdm
from multiprocessing import Lock

# Give tqdm a multiprocessing Lock to use as its shared lock.
# NOTE(review): no cell in this notebook starts worker processes, so this may
# be unnecessary — confirm before removing.
tqdm.set_lock(Lock())

### 1.2. Create Required Functions

In [None]:
def get_dataset_directories(pattern: str = 'txt', base: str = '/kaggle/input') -> tuple:
    """List the entries directly under ``base`` and the matching files below each.

    Args:
        pattern: Tail of the file-name glob; files are matched with ``*{pattern}``.
        base: Directory whose immediate children are scanned.

    Returns:
        A pair ``(dataset_dir, working_dir)``. ``working_dir`` is the list of
        paths matching ``base/*`` (it is NOT filtered by ``pattern``).
        ``dataset_dir`` is a tuple with one inner tuple per ``working_dir``
        entry, in the same order, holding the files found recursively beneath
        that entry that match ``*{pattern}``.
    """
    working_dir = glob.glob(os.path.join(base, '*'), recursive=True)

    def matching_files(root: str) -> tuple:
        # '**' together with recursive=True walks every level below `root`.
        search = os.path.join(root, '**', f'*{pattern}')
        return tuple(glob.iglob(search, recursive=True))

    dataset_dir = tuple(map(matching_files, working_dir))

    return (dataset_dir, working_dir)

In [None]:
def get_content_format(content: list) -> list:
    """Merge the first two tokens of a row into a single value.

    Args:
        content: Row tokens; ``content[0]`` and ``content[1]`` are joined with
            one space (the caller passes the date and time tokens there).

    Returns:
        A new list: ``'<content[0]> <content[1]>'`` followed by ``content[2:]``.
        The list passed in is left untouched.

    Raises:
        IndexError: If ``content`` holds fewer than two items.
    """
    merged = f'{content[0]} {content[1]}'

    # Build a fresh list instead of overwriting content[1] in the caller's list.
    return [merged, *content[2:]]

In [None]:
def check_content_length(data: str) -> str:
    """Pad or re-space one raw record depending on its length.

    Lengths below refer to ``data`` as passed in:

    * shorter than 199: right-padded with spaces to 201, then two spaces are
      inserted at offset 95 (the result is 203 characters long);
    * 199: two spaces are inserted at offset 95;
    * 200 and starting with ``"2023-05-19"``: characters 30-34 are replaced
      by four spaces, then two spaces are inserted at offset 95;
    * any other 200: one space is inserted at offset 95;
    * 201: returned unchanged;
    * longer than 201: printed, then returned unchanged.

    Args:
        data: One record without its trailing newline.

    Returns:
        The adjusted record.
    """
    size = len(data)

    if size < 199:
        padded = data + " " * (201 - size)
        return padded[:95] + "  " + padded[95:]

    if size == 199:
        return data[:95] + "  " + data[95:]

    if size == 200:
        if data[:10] == "2023-05-19":
            # Records from this date get five characters swapped for four spaces first.
            patched = data[:30] + "    " + data[35:]
            return patched[:95] + "  " + patched[95:]
        return data[:95] + " " + data[95:]

    if size > 201:
        # Over-long records are echoed so they can be inspected by hand.
        print(data)

    return data

In [None]:
def get_content_string_format(data: str) -> str:
    """Re-emit one fixed-width record with blank values marked by ``-``.

    The newline is stripped and the record is passed through
    ``check_content_length``. Each column is then rebuilt as its slice
    ``[start:end]`` followed by the single character at offset ``end``. For
    every column except the first (date-time), a space at that closing offset
    is replaced with ``"-"``, so an empty column still yields a token when the
    record is later split on spaces.

    Args:
        data: One raw line of the text log, with or without a trailing newline.

    Returns:
        The rebuilt columns concatenated in their original order.
    """
    # (start, end) of each column in output order: date_time, bitdepth, scfm,
    # mudcondin, blockpos, wob, ropi, bvdepth, mudcondout, torque, rpm, hkld,
    # logdepth, h2s1, mudflowoutp, totspm, sppress, mudflowin, co21, gas,
    # mudtempin, mudtempout, tankvoltot.
    field_bounds = (
        (0, 20),
        (21, 29),
        (30, 34),
        (35, 44),
        (45, 53),
        (54, 59),
        (60, 66),
        (67, 74),
        (75, 85),
        (86, 94),
        (95, 100),
        (101, 106),
        (107, 115),
        (116, 121),
        (122, 133),
        (134, 140),
        (141, 148),
        (149, 158),
        (159, 164),
        # gas: its closing character is offset 168, right after the slice,
        # like every other column. Taking it from offset 166 would repeat a
        # character of the slice and drop the column's last character.
        (165, 168),
        (169, 178),
        (179, 189),
        (190, 200),
    )

    line = check_content_length(data.replace('\n', ''))

    pieces = []
    for position, (start, end) in enumerate(field_bounds):
        closing = line[end:end + 1]
        # The date-time column (position 0) keeps its closing character as is.
        if position != 0 and closing == " ":
            closing = "-"
        pieces.append(f'{line[start:end]}{closing}')

    return "".join(pieces)

In [None]:
def get_content_data(content: str) -> list:
    """Tokenise one formatted record and merge its first two tokens.

    Args:
        content: A record whose values are separated by runs of spaces.

    Returns:
        The result of ``get_content_format`` applied to the non-empty pieces
        obtained by splitting ``content`` on single spaces.
    """
    # Splitting on ' ' yields '' for every extra space; filter(None, ...) drops them.
    tokens = list(filter(None, content.split(' ')))

    return get_content_format(tokens)

In [None]:
def get_content(filepath: str, encoding: str = 'ISO-8859-1') -> list:
    """Read one text log and return its header followed by its parsed rows.

    The first line supplies the column names: it is split on single spaces,
    duplicates (including the empty strings produced by repeated spaces) are
    collapsed while keeping order, and the first remaining item is dropped.
    The line right after the header is discarded.
    NOTE(review): that second line is presumably a units/separator row —
    confirm against the raw files.
    Every later line goes through ``get_content_string_format`` and then
    ``get_content_data``.

    Args:
        filepath: Path of the text file to read.
        encoding: Text encoding used to open the file.

    Returns:
        A list whose first item is the list of column names and whose
        remaining items are the parsed rows.

    Raises:
        IndexError: If the file contains no lines at all.
    """
    # The context manager closes the handle even when reading raises.
    with open(filepath, "r", encoding=encoding) as file:
        lines = file.readlines()

    header = list(dict.fromkeys(lines[0].replace('\n', '').split(' ')))[1:]

    # lines[0] is the header and lines[1] is skipped, so data starts at index 2.
    rows = [get_content_data(get_content_string_format(line)) for line in lines[2:]]

    return [header, *rows]

In [None]:
def set_content_csv(data: list, path: str, method: str = 'a') -> None:
    """Write a parsed table to a CSV file, creating or appending as needed.

    Args:
        data: Table whose first item is the header row and whose remaining
            items are the data rows. It is not modified.
        path: Destination CSV file.
        method: File mode passed to ``open``. It is forced to ``'w'`` when
            ``path`` does not exist yet.

    Raises:
        IndexError: If ``data`` is empty (raised before the file is opened).

    The header row is written only in ``'w'`` mode, so appending further
    tables to an existing file adds their data rows under the first header.
    """
    if not os.path.exists(path):
        method = 'w'

    # Slice rather than `del data[0]`: removing the header from the caller's
    # list would make a second call with the same table treat its first data
    # row as the header.
    header, rows = data[0], data[1:]

    with open(path, method, newline='') as csvfile:
        writer = csv.writer(csvfile)
        if method == 'w':
            writer.writerow(header)

        writer.writerows(rows)

In [None]:
def set_data_label(dataframe: pd.DataFrame, date_range: tuple, date_col: str = 'Date-Time', label_col: str = 'Stuck') -> None:
    """Set ``label_col`` to 1 on every row whose date lies inside a given window.

    Args:
        dataframe: Frame to label; it is modified in place.
        date_range: Iterable of ``(start, end)`` pairs.
        date_col: Column compared against each window.
        label_col: Column that receives the value 1 for matching rows.

    A tqdm progress bar advances once per window.
    """
    for window_start, window_end in tqdm(date_range):
        # Series.between includes both endpoints by default.
        in_window = dataframe[date_col].between(window_start, window_end)
        dataframe.loc[in_window, label_col] = 1

## **2. DATA GATHERING**

In [None]:
#-- collect the .txt paths found under the default input directory
directories = get_dataset_directories()
dataset_groups, parent_dir = directories

# only the files of the first entry under the input directory are used
dataset_dir = dataset_groups[0]

print(f'fetched {len(dataset_dir)} well data!')

fetched 163 well data!


In [None]:
#-- group the file paths by well name
dataset_obj = {}
for directory in dataset_dir:
    base_dir = os.path.normpath(directory)
    curr_dir = base_dir.split(os.sep)
    # component 3 + 1 of '/kaggle/input/<dataset>/<well>/...' is the <well> folder
    well_nme = curr_dir[3 + 1]

    dataset_obj.setdefault(well_nme, []).append(base_dir)

for well_nme, well_vle in dataset_obj.items():
    print(f'fetched {len(well_vle)} {well_nme} text data!')

fetched 55 WELL A text data!
fetched 64 WELL B text data!
fetched 44 WELL C text data!


In [None]:
#-- extract .txt contents
# NOTE: this rebinds dataset_obj from {well: [paths]} to {well: [parsed tables]},
# so the cell cannot be re-run without first re-running the grouping cell.
dataset_obj = {
    well_key: [get_content(text_path) for text_path in tqdm(text_paths)]
    for well_key, text_paths in dataset_obj.items()
}

  0%|          | 0/55 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/44 [00:00<?, ?it/s]

In [None]:
#-- write the parsed tables of each well into one .csv per well
for key, well_tables in dataset_obj.items():
    well_nme = key.replace(" ", "_").lower()
    well_pth = os.path.join(os.getcwd(), f'{well_nme}.csv')

    # start from a clean file so rows from an earlier run are not appended to
    if os.path.exists(well_pth):
        os.remove(well_pth)

    for data in tqdm(well_tables):
        set_content_csv(data, well_pth)

  0%|          | 0/55 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/44 [00:00<?, ?it/s]

In [None]:
#-- rewrite every well's .csv as a snappy-compressed .parquet
for key in tqdm(dataset_obj.keys()):
    well_nme = key.replace(" ", "_").lower()
    path_crr, path_dst = (
        os.path.join(os.getcwd(), f'{well_nme}{extension}')
        for extension in ('.csv', '.parquet')
    )

    if os.path.exists(path_dst):
        os.remove(path_dst)

    # every column is read as text; no numeric parsing happens at this stage
    df = pd.read_csv(path_crr, dtype=str)
    df.to_parquet(path_dst, engine="pyarrow", compression="snappy")

  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
#-- delete the intermediate .csv of every well
for key in tqdm(dataset_obj.keys()):
    well_nme = key.replace(" ", "_").lower()
    well_pth = os.path.join(os.getcwd(), f'{well_nme}.csv')

    if not os.path.exists(well_pth):
        continue

    os.remove(well_pth)

  0%|          | 0/3 [00:00<?, ?it/s]

## **3. DATA LABELLING**

In [None]:
#-- get directories
# Kept from the original cell; the listing below no longer depends on it.
if os.path.exists('/kaggle/working/__notebook__.ipynb'):
    os.remove('/kaggle/working/__notebook__.ipynb')

# List only the .parquet files of the working directory. The second element
# returned by get_dataset_directories() is every entry under `base`, whatever
# `pattern` says, so any stray file there would shift the positions that the
# next cell reads as wells a, b and c.
dataset_dir = sorted(glob.glob(os.path.join(os.getcwd(), '*.parquet')))

dataset_dir

['/kaggle/working/well_a.parquet',
 '/kaggle/working/well_b.parquet',
 '/kaggle/working/well_c.parquet']

In [None]:
#-- read datasets
# dataset_dir was sorted when it was built; positions 0, 1, 2 are read as wells a, b, c
df_a, df_b, df_c = (
    pq.read_table(dataset_dir[position]).to_pandas()
    for position in range(3)
)

In [None]:
# report the (rows, columns) of every frame that was just loaded
for well, frame in (("a", df_a), ("b", df_b), ("c", df_c)):
    print(f"loaded well {well} data {frame.shape}")

loaded well a data (468638, 23)
loaded well b data (545819, 23)
loaded well c data (374321, 23)


In [None]:
#-- add default label: every row starts as 0 until a stuck window marks it
for frame in (df_a, df_b, df_c):
    frame['Stuck'] = 0

In [None]:
#-- store updated stuck events
# Each entry is a (start, end) pair of naive datetimes. The lists are handed to
# set_data_label, which sets 'Stuck' to 1 on rows whose 'Date-Time' falls in
# the pair (it uses Series.between, inclusive on both ends by default).
# NOTE(review): the source of these timestamps is not recorded in the
# notebook — confirm their provenance and time zone.

# windows applied to df_a
stuck_a = [
    (datetime.datetime(2023, 5, 18, 8, 40), datetime.datetime(2023, 5, 18, 13, 1, 10)),
    (datetime.datetime(2023, 5, 21, 0, 30, 40), datetime.datetime(2023, 5, 21, 4, 21)),
    (datetime.datetime(2023, 5, 22, 14, 3), datetime.datetime(2023, 5, 22, 21, 17)),
    (datetime.datetime(2023, 5, 29, 4, 50, 30), datetime.datetime(2023, 5, 29, 6, 7, 10)),
    (datetime.datetime(2023, 5, 29, 8, 12, 50), datetime.datetime(2023, 5, 30, 23, 59, 59)),

    (datetime.datetime(2023, 6, 5, 17, 59, 40),  datetime.datetime(2023, 6, 5, 18, 54, 50)),
    (datetime.datetime(2023, 6, 17, 17, 18, 50), datetime.datetime(2023, 6, 21, 1, 30)),
    (datetime.datetime(2023, 6, 26, 15, 31), datetime.datetime(2023, 6, 26, 18, 28, 10))
]

# windows applied to df_b
stuck_b = [
    (datetime.datetime(2023, 2, 16, 22, 0, 0), datetime.datetime(2023, 2, 21, 17, 0, 0)),

    (datetime.datetime(2023, 3, 10, 22, 51), datetime.datetime(2023, 3, 11, 0, 35, 50)),
    (datetime.datetime(2023, 4, 4, 22, 0), datetime.datetime(2023, 4, 4, 22, 30))
]

# windows applied to df_c
stuck_c = [
    (datetime.datetime(2022, 7, 24, 3, 30), datetime.datetime(2022, 7, 29, 1, 0))
]

In [None]:
#-- order datasets
# parse the text timestamps first so the sort is chronological, not lexical
for frame in (df_a, df_b, df_c):
    frame['Date-Time'] = pd.to_datetime(frame['Date-Time'])

df_a, df_b, df_c = (
    frame.sort_values(by=['Date-Time'])
    for frame in (df_a, df_b, df_c)
)

In [None]:
#-- adjust label: mark the rows inside each well's stuck windows
for frame, stuck_windows in ((df_a, stuck_a), (df_b, stuck_b), (df_c, stuck_c)):
    set_data_label(frame, stuck_windows)

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# shapes after labelling: one more column ('Stuck') than at load time
for well, frame in (("a", df_a), ("b", df_b), ("c", df_c)):
    print(f"well {well} data : {frame.shape}")

well a data : (468638, 24)
well b data : (545819, 24)
well c data : (374321, 24)


In [None]:
#-- check stuck data
# Count each class by its label value. value_counts() orders its result by
# frequency, so reading it by position would swap the two numbers whenever
# stuck rows outnumber normal ones, and would raise IndexError for a frame
# that has no stuck rows at all.
labels = [
    (int((frame['Stuck'] == 0).sum()), int((frame['Stuck'] == 1).sum()))
    for frame in (df_a, df_b, df_c)
]

print(f"well a -> normal : {labels[0][0]}, stuck : {labels[0][1]}")
print(f"well b -> normal : {labels[1][0]}, stuck : {labels[1][1]}")
print(f"well c -> normal : {labels[2][0]}, stuck : {labels[2][1]}")

well a -> normal : 418046, stuck : 50592
well b -> normal : 503808, stuck : 42011
well c -> normal : 332020, stuck : 42301


## **4. SAVE DATASETS**

In [None]:
#-- set path
# Sorted on purpose: df_a, df_b and df_c were read from the sorted
# `dataset_dir` list, whereas the keys of dataset_obj follow the order in
# which glob happened to return the input files. Without the sort a frame
# could be written to another well's file.
compress_pth = sorted(
    os.path.join(os.getcwd(), f'{key.replace(" ", "_").lower()}.parquet')
    for key in dataset_obj.keys()
)

compress_pth

['/kaggle/working/well_a.parquet',
 '/kaggle/working/well_b.parquet',
 '/kaggle/working/well_c.parquet']

In [None]:
# overwrite each well's .parquet with its labelled, time-sorted frame
for position, frame in enumerate((df_a, df_b, df_c)):
    frame.to_parquet(compress_pth[position], engine="pyarrow", compression="snappy")