# Bad-data cleaning functions:

Written by Brian Mattis 4/5/2022.  https://medium.com/@brian-mattis

*Import into your notebook after saving locally with:* \
**%run ColumnCleaner.ipynb**

In [6]:
import numpy as np
import pandas as pd

**char_finder()** - Find characters or booleans in a would-be numeric column. 

In [7]:
#functionalized.  usage: char_finder(df, 'num_col')
def char_finder(data_frame, series_name):
    """Report entries of a would-be numeric column that cannot be parsed as a float.

    Prints the column name first, then one line per offending entry in the
    form ``<value> -> at row:<index label>``.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame holding the column to inspect. It is not modified.
    series_name : str
        Name of the column to inspect.

    Returns
    -------
    None
        Results are printed, not returned.

    Notes
    -----
    * NaN and ``None`` are treated as missing values and are not reported.
    * Strings such as ``'True'``/``'False'`` are reported because ``float()``
      rejects them; real Python booleans are NOT reported because
      ``float(True)`` succeeds.
    * The reported row is the frame's index label, so it can be fed straight
      to ``data_frame.loc``. With a default RangeIndex this equals the
      positional row number.
    """
    print(series_name)
    # items() yields (index label, value) pairs, so the report is correct even
    # when the index is not 0..n-1 (e.g. after filtering or sorting).
    for label, value in data_frame[series_name].items():
        if value is None:
            continue  # missing value, same status as NaN
        try:
            float(value)  # float (not int) so NaNs and decimals are accepted
        except (ValueError, TypeError):
            # ValueError: unparseable string; TypeError: non-numeric object.
            print(value, "-> at row:" + str(label))

**char_fixer()** - Remove characters or booleans from a would-be numeric column (NaNs maintained)

In [8]:
#functionalized.  usage: char_fixer(df, 'num_col')
def char_fixer(data_frame, series_name):
    """Drop every row whose entry in a would-be numeric column is not float-parseable.

    The frame is modified in place: offending rows are removed, the column is
    cast to ``float64`` and the index is reset to 0..n-1 (the previous index
    is discarded).

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame to clean in place.
    series_name : str
        Name of the column that should end up numeric.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        Propagated from ``astype`` if a surviving value still cannot be cast.

    Notes
    -----
    NaN and ``None`` count as missing values and their rows are kept. Real
    Python booleans are kept as well, because ``float(True)`` succeeds.
    """
    bad_positions = []
    for position, value in enumerate(data_frame[series_name]):
        if value is None:
            continue  # missing value, keep the row like a NaN
        try:
            float(value)  # float (not int) so NaNs and decimals are accepted
        except (ValueError, TypeError):
            bad_positions.append(position)

    # Reset the index BEFORE dropping so that positions are valid labels even
    # if the caller's frame had a filtered, sorted or non-integer index. The
    # index is reset at the end anyway, so the final result is unchanged.
    data_frame.reset_index(drop=True, inplace=True)
    data_frame.drop(index=bad_positions, inplace=True)
    data_frame[series_name] = data_frame[series_name].astype('float64', errors='raise')
    data_frame.reset_index(drop=True, inplace=True)

**char_fixer_nan()** - Remove characters or booleans from a would-be numeric column (all bad set to NaN)

In [None]:
#functionalized.  usage: char_fixer_nan(df, 'num_col')
def char_fixer_nan(data_frame, series_name):
    """Replace every non-float-parseable entry of a would-be numeric column with NaN.

    The frame is modified in place: offending entries become ``np.nan`` and
    the column is cast to ``float64``. No rows are removed and the index is
    left untouched.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame to clean in place.
    series_name : str
        Name of the column that should end up numeric.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        Propagated from ``astype`` if a remaining value still cannot be cast.

    Notes
    -----
    Real Python booleans are not replaced, because ``float(True)`` succeeds;
    only values that ``float()`` rejects (e.g. ``'abc'``, ``'True'``) are.
    """
    is_bad = []
    for value in data_frame[series_name]:
        if value is None:
            is_bad.append(False)  # already a missing value; astype maps it to NaN
            continue
        try:
            float(value)  # float (not int) so NaNs and decimals are accepted
        except (ValueError, TypeError):
            is_bad.append(True)
        else:
            is_bad.append(False)

    # A boolean array selects rows by position, so this works for any index.
    # (Assigning via .loc[<counter>] would append a new row whenever the
    # counter is not an existing label.)
    bad_mask = np.array(is_bad, dtype=bool)
    if bad_mask.any():
        data_frame.loc[bad_mask, series_name] = np.nan
    data_frame[series_name] = data_frame[series_name].astype('float64', errors='raise')

**num_finder()** - Find numbers or booleans in would-be string (character) columns

In [9]:
#functionalized.  usage: num_finder(df, 'name_col')
def num_finder(data_frame, series_name):
    """Report numeric or boolean-looking entries in a would-be string column.

    Prints one line per offending entry in the form
    ``<value> -> at row:<index label>``.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame holding the column to inspect. It is not modified.
    series_name : str
        Name of the column to inspect.

    Returns
    -------
    None
        Results are printed, not returned.

    Notes
    -----
    * An entry is reported when ``int(float(value))`` succeeds, when it is
      infinite, or when it is the string ``'True'`` or ``'False'``.
    * NaN, ``None`` and ordinary text are not reported.
    * The reported row is the frame's index label; with a default RangeIndex
      this equals the positional row number.
    """
    # items() yields (index label, value) pairs, so the report is correct even
    # when the index is not 0..n-1.
    for label, value in data_frame[series_name].items():
        if value is None:
            continue  # missing value, same status as NaN
        try:
            int(float(value))
        except OverflowError:
            # float() accepted the value but it is +/-inf: still a number.
            looks_wrong = True
        except (ValueError, TypeError):
            # ValueError: ordinary text or NaN; TypeError: non-numeric object.
            # Only would-be booleans are worth reporting here.
            looks_wrong = isinstance(value, str) and value in ('True', 'False')
        else:
            looks_wrong = True
        if looks_wrong:
            print(value, "-> at row:" + str(label))

**num_fixer()** - Remove numbers or booleans in would-be string (character) columns (NaNs maintained)

In [1]:
#functionalized.  usage: num_fixer(df, 'name_col')
def num_fixer(data_frame, series_name):
    """Drop rows whose entry in a would-be string column is numeric or boolean-looking.

    The frame is modified in place:

    * rows whose entry parses as a number (including +/-inf), or is the
      string ``'True'`` or ``'False'``, are removed;
    * the literal string ``'nan'`` is replaced by a real ``np.nan``;
    * the column is cast to pandas' ``'string'`` dtype;
    * the index is reset to 0..n-1 (the previous index is discarded).

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame to clean in place.
    series_name : str
        Name of the column that should end up as strings.

    Returns
    -------
    None

    Notes
    -----
    Rows holding NaN or ``None`` are kept.
    """
    rows_to_drop = []
    rows_to_blank = []
    for position, value in enumerate(data_frame[series_name]):
        if value is None:
            continue  # missing value, keep the row like a NaN
        try:
            int(float(value))
        except OverflowError:
            rows_to_drop.append(position)  # +/-inf is still a number
        except (ValueError, TypeError):
            if isinstance(value, str):
                if value in ('True', 'False'):  # would-be boolean
                    rows_to_drop.append(position)
                elif value == 'nan':  # the text 'nan' becomes a real NaN
                    rows_to_blank.append(position)
            # anything else is ordinary text or NaN: leave untouched
        else:
            rows_to_drop.append(position)  # parsed cleanly as a number

    # Reset the index BEFORE editing so that positions are valid labels even
    # if the caller's frame had a filtered, sorted or non-integer index. The
    # index is reset at the end anyway, so the final result is unchanged.
    data_frame.reset_index(drop=True, inplace=True)
    if rows_to_blank:
        data_frame.loc[rows_to_blank, series_name] = np.nan
    data_frame.drop(index=rows_to_drop, inplace=True)
    data_frame[series_name] = data_frame[series_name].astype('string', errors='raise')
    data_frame.reset_index(drop=True, inplace=True)

**num_fixer_nan()** - Replace numbers or booleans in would-be string (character) columns with NaN (all bad set to NaN)

In [None]:
#functionalized.  usage: num_fixer_nan(df, 'name_col')
def num_fixer_nan(data_frame, series_name):
    """Replace numeric or boolean-looking entries of a would-be string column with NaN.

    The frame is modified in place:

    * entries that parse as a number (including +/-inf), the strings
      ``'True'``/``'False'``, and the literal string ``'nan'`` are all
      replaced by ``np.nan``;
    * the column is cast to pandas' ``'string'`` dtype;
    * the index is reset to 0..n-1 (the previous index is discarded), even
      though no rows are removed.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame to clean in place.
    series_name : str
        Name of the column that should end up as strings.

    Returns
    -------
    None
    """
    is_bad = []
    for value in data_frame[series_name]:
        if value is None:
            is_bad.append(False)  # already a missing value
            continue
        try:
            int(float(value))
        except OverflowError:
            is_bad.append(True)  # +/-inf is still a number
        except (ValueError, TypeError):
            # Ordinary text and NaN stay; would-be booleans and the text 'nan'
            # are turned into a real NaN.
            is_bad.append(isinstance(value, str) and value in ('True', 'False', 'nan'))
        else:
            is_bad.append(True)  # parsed cleanly as a number

    # A boolean array selects rows by position, so this works for any index.
    # (Assigning via .loc[<counter>] would append a new row whenever the
    # counter is not an existing label.)
    bad_mask = np.array(is_bad, dtype=bool)
    if bad_mask.any():
        data_frame.loc[bad_mask, series_name] = np.nan
    data_frame[series_name] = data_frame[series_name].astype('string', errors='raise')
    data_frame.reset_index(drop=True, inplace=True)