# Data cleaning
## Oil production

In this notebook, I clean and filter oil production data.

### Libraries

In [4]:
import pandas as pd
import numpy as np
import datetime as dt
import os
import glob
import re
import sys

In [5]:
# Put the directory two levels above the working directory on the import
# path (presumably the project root that holds the `config` package imported
# in the next cell -- TODO confirm).
# The membership check keeps the cell idempotent: re-running it does not
# stack duplicate entries on sys.path.
PROJECT_ROOT = os.path.abspath(os.path.join("..", ".."))
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [6]:
from config import config

'float64'

In [None]:
# Diagnostic: show the module search path to check that the directory added
# above is present. `sys` is already imported in the imports cell at the top
# of the notebook, so it is not imported again here.
print(sys.path)

### Import data

In [None]:
# Folder with the oil production files, relative to the working directory:
# two levels up, then data/oil_production.
PATH = os.path.join(os.pardir, os.pardir, "data", "oil_production")

In [None]:
# Column dtypes handed to pd.read_csv for the production files
# (keys are the raw CSV headers).
#
# NOTE(review): the non-associated gas header is spelled with a double
# underscore here but with a single underscore in the renaming dictionary
# further down the notebook. Both spellings are listed so the column is read
# as float64 whichever one the files actually use; read_csv does not complain
# about dtype keys that match no column. Confirm against the raw header and
# drop the spelling that does not occur.
prod_data_dtypes = {
    'Cuenca': 'object',
    'Asignación_o_Contrato': 'object',
    'Nombre_del_pozo': 'object',
    'Petróleo_(Mbd)': 'float64',
    'Gas_asociado_(MMpcd)': 'float64',
    'Gas__no_asociado_(MMpcd)': 'float64',
    'Gas_no_asociado_(MMpcd)': 'float64',
    'Agua_(Mbd)': 'float64',
    'Condensado_(Mbd)': 'float64',
}

In [None]:
# All csv file paths matching prod_*.csv inside PATH.
# Sorted because glob returns matches in an unspecified, filesystem-dependent
# order; sorting makes the row order of the concatenated frame reproducible.
file_ext = 'csv'
prod_csv_names = sorted(glob.glob(f"{PATH}/prod_*.{file_ext}"))

# Fail early with a readable message; otherwise pd.concat would raise a
# generic "No objects to concatenate" error when nothing matches.
if not prod_csv_names:
    raise FileNotFoundError(
        f"No files matching 'prod_*.{file_ext}' were found in {PATH!r}"
    )

# Production dataframe: one frame per file, stacked under a fresh 0..n-1 index.
df_prod = pd.concat(
    [
        pd.read_csv(
            csv_path,
            encoding='latin-1',
            # The first 10 lines of each file are skipped -- presumably a
            # metadata preamble before the header row; TODO confirm.
            skiprows=10,
            dtype=prod_data_dtypes,
            # 'Fecha' is parsed as day-month-year.
            parse_dates=['Fecha'],
            date_format="%d-%m-%Y",
        )
        for csv_path in prod_csv_names
    ],
    ignore_index=True,
)

In [None]:
# New column names: raw CSV header -> snake_case name used in the rest of
# the notebook.
#
# NOTE(review): the non-associated gas header is spelled with a single
# underscore here but with a double underscore in the dtype dictionary used
# when reading the files. Both spellings map to the same target so the column
# is renamed whichever one the files actually use; DataFrame.rename ignores
# keys that match no column. Confirm against the raw header and drop the
# spelling that does not occur.
prod_names = {
    'Fecha': 'fecha',
    'Cuenca': 'cuenca',
    'Asignación_o_Contrato': 'asignacion_contrato',
    'Nombre_del_pozo': 'nombre_pozo',
    'Petróleo_(Mbd)': 'petroleo_mbd',
    'Gas_asociado_(MMpcd)': 'gas_asociado_mmpcd',
    'Gas_no_asociado_(MMpcd)': 'gas_no_asociado_mmpcd',
    'Gas__no_asociado_(MMpcd)': 'gas_no_asociado_mmpcd',
    'Agua_(Mbd)': 'agua_mbd',
    'Condensado_(Mbd)': 'condensado_mbd',
}

# Rename the columns according to prod_names
df_prod = df_prod.rename(columns=prod_names)

# Shorten the names of two oil basins
df_prod['cuenca'] = df_prod['cuenca'].replace(
    {
        'CINTURON PLEGADO DE CHIAPAS': 'CHIAPAS',
        'CUENCAS DEL SURESTE': 'SURESTE',
    }
)

# Remove the whitespace from the well name 'EL TREINTA-13DES'
df_prod['nombre_pozo'] = (
    df_prod['nombre_pozo'].replace('EL TREINTA-13DES', 'ELTREINTA-13DES')
)

How many oil wells are there?

In [None]:
# Initial number of distinct wells
n0_wells = df_prod['nombre_pozo'].nunique()

# Remaining figures shown in the summary
n_basins = df_prod['cuenca'].nunique()
production_years = df_prod['fecha'].dt.year

# Print the initial counts and the first/last year present in the data
db_summary = f"""
    OIL PRODUCTION DATABASE
    -----------------------

        Wells:      {n0_wells:>7,}
        Basins:     {n_basins:>7,}

        -------------------

        Starts:     {production_years.min():>7}
        Ends:       {production_years.max():>7}

        """
print(db_summary)

In [None]:
# Display the production dataframe as it stands after the renaming steps above.
df_prod