In [53]:
import pandas as pd
import numpy as np
import regex as re

In [51]:
# Referencing a column (or variable) that the frame does not have makes DataFrame.eval
# raise pandas.errors.UndefinedVariableError.
df = pd.DataFrame({code: [1, 2, 3] for code in ("RCFD1111", "RCFD2222", "RCFD3333")})
eval_string = "RCFD1111 + RCFD5555"
try:
    df[eval_string] = df.eval(eval_string)
except Exception as err:
    # Only the exception class is shown; eval raised before any column was assigned.
    print(type(err))
df


<class 'pandas.errors.UndefinedVariableError'>


Unnamed: 0,RCFD1111,RCFD2222,RCFD3333
0,1,1,1
1,2,2,2
2,3,3,3


In [50]:
# A null operand propagates: any row in which one input is NaN evaluates to NaN.
df = pd.DataFrame(
    {
        "RCFD1111": [1, 2, 3],
        "RCFD2222": [np.nan, 2, 3],
        "RCFD3333": [1, 2, 3],
    }
)
eval_string = "RCFD1111 + RCFD2222"
try:
    df[eval_string] = df.eval(eval_string)
except Exception as err:
    print(type(err))
df


Unnamed: 0,RCFD1111,RCFD2222,RCFD3333,RCFD1111 + RCFD2222
0,1,,1,
1,2,2.0,2,4.0
2,3,3.0,3,6.0


In [49]:
# An undefined column takes precedence over null handling: the expression raises
# UndefinedVariableError even though another operand contains NaN.
df = pd.DataFrame(
    {
        "RCFD1111": [1, 2, 3],
        "RCFD2222": [np.nan, 2, 3],
        "RCFD3333": [1, 2, 3],
    }
)
eval_string = "RCFD1111 + RCFD2222 + RCFD5555"
try:
    df[eval_string] = df.eval(eval_string)
except Exception as err:
    print(type(err))
df


<class 'pandas.errors.UndefinedVariableError'>


Unnamed: 0,RCFD1111,RCFD2222,RCFD3333
0,1,,1
1,2,2.0,2
2,3,3.0,3


In [48]:
# Dividing by a zero denominator does not raise; the affected row evaluates to inf.
df = pd.DataFrame(
    {
        "RCFD1111": [1, 2, 3],
        "RCFD2222": [1, 2, 3],
        "RCFD3333": [0, 2, 3],
    }
)
eval_string = "RCFD1111 / RCFD3333"
try:
    df[eval_string] = df.eval(eval_string)
except Exception as err:
    # Message first, then the exception class.
    print(err)
    print(type(err))
df

Unnamed: 0,RCFD1111,RCFD2222,RCFD3333,RCFD1111 / RCFD3333
0,1,1,0,inf
1,2,2,2,1.0
2,3,3,3,1.0


In [47]:
# A NaN denominator does not raise either; the affected row simply evaluates to NaN.
df = pd.DataFrame(
    {
        "RCFD1111": [1, 2, 3],
        "RCFD2222": [1, 2, 3],
        "RCFD3333": [np.nan, 2, 3],
    }
)
eval_string = "RCFD1111 / RCFD3333"
try:
    df[eval_string] = df.eval(eval_string)
except Exception as err:
    # Message first, then the exception class.
    print(err)
    print(type(err))
df

Unnamed: 0,RCFD1111,RCFD2222,RCFD3333,RCFD1111 / RCFD3333
0,1,1,,
1,2,2,2.0,1.0
2,3,3,3.0,1.0


In [66]:
# 1. Using regex, extract the column names of the individual MDRM codes referenced by
#    the expression. The pattern is deliberately left unanchored: with a leading `^`,
#    findall can only match at the start of the string and would return just the first
#    code, silently dropping every other operand.
pattern = r'[A-Z]{4}[A-Z0-9]{4}'

# Division edge cases in one frame: a null denominator yields null, a non-zero
# numerator over a zero denominator yields inf, and 0 / 0 yields null.
df = pd.DataFrame({"RCFD1111": [1, 2, 0], "RCFD2222": [1, 2, 3], "RCFD3333": [np.nan, 0, 0]})
try:
    eval_string = "RCFD1111 / RCFD3333"
    # Every MDRM code in the expression, in order of appearance:
    # ["RCFD1111", "RCFD3333"].
    mdrm_columns = re.findall(pattern, eval_string)
    df[eval_string] = df.eval(eval_string)
except Exception as e:
    # Report both the message and the exception class if evaluation fails.
    print(e)
    print(type(e))
df

Unnamed: 0,RCFD1111,RCFD2222,RCFD3333,RCFD1111 / RCFD3333
0,1,1,,
1,2,2,0.0,inf
2,0,3,0.0,


### Business Logic:
  * Suppose the expression for the sum of two columns is `RCFD1111 + RCFD2222`, and the column `RCFD2222` does not exist but `RCFD1111` does. The missing column should be treated as if it exists and contains 0.
  * Suppose the expression for the sum of two columns is `RCFD1111 + RCFD2222`, and neither `RCFD1111` nor `RCFD2222` exists. This should result in a null value.
  * Suppose the expression for the quotient of two columns is `RCFD1111 / RCFD2222`, and either the numerator or the denominator column does not exist. The resulting value should be null.
  * Suppose the expression for the quotient of two columns is `RCFD1111 / RCFD2222`, and the denominator column is zero. The resulting value should be null.

### Process (Do not use)
1. Get all the individual MDRM codes (not the calculated codes) defined in the metadata.
2. For each MDRM code that does not exist yet in Attic, create a column for that MDRM code with value equal to 0.
3. When pandas.eval evaluates an expression that contains a null value, it will result in a null value. Hence, 0 is used as a replacement for null.

### Process (Current)
1. Evaluate the numerator expression available in the metadata.
2. If none of the MDRM codes in the numerator exist yet in Attic, the overall expression should return null.
3. If at least one MDRM code in the numerator exists, fill the missing MDRM codes with zero<sup>1</sup>.
4. If the numerator is not null, evaluate the denominator expression available in the metadata.
5. If none of the MDRM codes in the denominator exist yet in Attic, the overall expression should return null.
6. If at least one MDRM code in the denominator exists, fill the missing MDRM codes with zero.
7. If the denominator result is 0, the overall expression should return null.

<sup>1</sup> When pandas.eval evaluates an expression that contains a null value, it will result in a null value. Hence, 0 is used as a replacement for null.




### Solutions
1. Create columns in the metadata for numerator and denominator
2. Break down the MDRM Code expression into Numerator and Denominator
3. Example 1: "a + b" will have numerator equal to "a + b" and denominator equal to "1"
4. Example 2: "(a + b)/c" will have a numerator equal to "a + b" and denominator equal to "c"


In [None]:
df = pd.DataFrame(
    {
        "RCFD1111": [1, 2, 3],
        "RCFD2222": [1, 2, 3],
        "RCFD3333": [np.nan, 2, 3],
    }
)

pattern = r'[A-Z]{4}[A-Z0-9]{4}'
eval_string = "RCFD1111 + RCFD5555 / RCFD3333"
mdrm_columns = re.findall(pattern, eval_string)

# Pad the frame only when the expression references at least one column that is
# already present; each referenced-but-absent column is then added as all-NaN.
if any(code in df.columns for code in mdrm_columns):
    for column in mdrm_columns:
        if column in df.columns:
            continue
        df[column] = np.nan

df

Unnamed: 0,RCFD1111,RCFD2222,RCFD3333,RCFD5555
0,1,1,,
1,2,2,2.0,
2,3,3,3.0,
