In [117]:
# Examples on how to check and clean a dataframe - such as after loading from CSV


In [118]:
import pandas as pd
import logging
from datetime import date, timedelta
import numpy as np
import seaborn as sns

logging.basicConfig(level=logging.DEBUG, format="%(message)s")
LOGGER = logging.getLogger(__name__)
LOGGER.debug("log level = %s", str(LOGGER.getEffectiveLevel()))


#Columns that must be in the CSV. 
REQUIRED_COLUMNS=['label','text','int','end']

#Columns that must have non-empty values. 
REQUIRED_VALUES=['label','text','int']

df = pd.read_csv("./data/test_clean.csv", skipinitialspace=True)
df.head(12)


log level = 10


Unnamed: 0,label,text,int,end
0,r-empty-ok,,1.0,end
1,r-sp-ok,,2.0,end
2,r-leading-ok,leading_sp,3.0,end
3,r-ok-empty,val,,end
4,r-ok-spaces,val,,end
5,r-ok-leading,val,4.0,end
6,r-good,val,0.0,end
7,r-All Empty,,,
8,,Empty Label,,


In [121]:

def find_blanks(df:pd.DataFrame, required_columns:[str]=None, required_values:[str]=None,)->None:
    """
    Check and clean values in dataframe. Original df is returned unchanged or throws exception
    :param df:   raw dataframe.   
    :param required_columns: optional - Names of df columns that must be present - even if empty
    :param required_values: optional - Names of df columns that must have values.  Error if any of them have missing values. 
    :return:  None.   Throws exceptions if errors
    """
    if required_columns:
        missing_cols = set(required_columns) - set(df.columns)
        if missing_cols:
            raise Exception(f"Invalid CSV. Missing required columns: {', '.join(missing_cols)}")
    
    if required_values:
        df_req = df[required_values]
        
        num_null =  df_req.isnull().sum().sum()
        LOGGER.debug("Num empty required cells in dataframe: %d", num_null)
        
        if num_null:
            err = {row:required_values[col] for row, col in zip(*df_req.isnull().values.nonzero())}                        
            msg = f"Data error. Dataframe contained {num_null} empty cells in required columns {required_columns}. List shows the row index and column names of the FIRST ERROR IN ROWS with blanks.  {err}"
            LOGGER.warning(msg)            
            raise Exception(msg)

In [122]:
find_blanks(df,REQUIRED_COLUMNS, REQUIRED_VALUES)
df.head(20)

Num empty required cells in dataframe: 8
Data error. Dataframe contained 8 empty cells in required columns ['label', 'text', 'int', 'end']. List shows the row index and column names of the FIRST ERROR IN ROWS with blanks.  {0: 'text', 1: 'text', 3: 'int', 4: 'int', 7: 'int', 8: 'int'}


Exception: Data error. Dataframe contained 8 empty cells in required columns ['label', 'text', 'int', 'end']. List shows the row index and column names of the FIRST ERROR IN ROWS with blanks.  {0: 'text', 1: 'text', 3: 'int', 4: 'int', 7: 'int', 8: 'int'}

In [None]:
def trim_all(df:pd.DataFrame):
    df_trimmed = df.map(lambda x: x.strip() if isinstance(x, str) else x)
    return df_trimmed

In [None]:
#Does trimming after this type of CSV load make a difference?  Answer NO. 
df_trimmed = trim_all(df)
comparison = df.compare(df_trimmed)
print(comparison)


In [85]:
def find_blanks(df):
    # Print locations of null cells
    print("Null cells:")
    for row, col in zip(*df.isnull().values.nonzero()):
        print(f"Row {row + 1}, Column {col + 1}")

    # Print locations of empty strings
    print("\nEmpty strings:")
    for row in df[df.eq('')].index:
        print(f"Row {row + 1}")

    # Print locations of spaces or nulls
#    print("\nSpaces or nulls:")
#    for row, col in zip(*df.str.strip().isnull().values.nonzero()):
#        print(f"Row {row + 1}, Column {col + 1}")

In [86]:
print("before - time_all")
print(df.head(20))
find_blanks(df)

before - time_all
          label        text  int  end
0    r-empty-ok         NaN  1.0  end
1       r-sp-ok         NaN  2.0  end
2  r-leading-ok  leading_sp  3.0  end
3    r-ok-empty         val  NaN  end
4   r-ok-spaces         val  NaN  end
5  r-ok-leading         val  4.0  end
6        r-good         val  0.0  end
7   r-All Empty         NaN  NaN  NaN
Null cells:
Row 1, Column 2
Row 2, Column 2
Row 4, Column 3
Row 5, Column 3
Row 8, Column 2
Row 8, Column 3
Row 8, Column 4

Empty strings:
Row 1
Row 2
Row 3
Row 4
Row 5
Row 6
Row 7
Row 8


In [None]:
print("after - trim_all")
df_trimmed = trim_all(df)
print(df_trimmed.head(20))
find_blanks(df_trimmed)

In [None]:
# Find rows with empty strings in any column
empty_rows = df[df.eq('').any(axis=1)]
print('rows with blanks')
print(empty_rows)

# Find columns with empty strings
empty_cols = df.columns[df.eq('').any()]
print('empty_cols')
print(empty_cols)
