# Functions

## Metadata Cleaning

In [5]:
def split_values(str_input, delimiter):
    """
    Purpose:
    Convert a delimited numeric range such as "25-50" to a single
    value that is the average of the numbers (37.5 in this case).

    Input:
        str_input = the string to convert; every piece between the
                    delimiters must be a number, so strip any unit
                    text (e.g. " g") before calling
        delimiter = what to split the string by

    Output:
        mean_value = the mean of the separated numbers

    Raises:
        ValueError if any separated piece is not a valid number
    """
    # Split by the delimiter
    sep_values = str_input.split(delimiter)

    # Convert each piece to a float rather than an int, so decimal
    # ranges such as "1.5-2.5" are averaged instead of raising.
    # float() tolerates surrounding whitespace (" 25 " -> 25.0).
    sep_values = [float(val) for val in sep_values]

    # Calculate the mean value
    # NOTE(review): `mean` is not defined in this function; it is
    # presumably imported elsewhere in the notebook -- confirm.
    mean_value = mean(sep_values)

    return mean_value

In [6]:
def clean_pd(value):
    """
    Purpose:
    - Clean numerical 'X_pd' columns.
    - To be used with `apply()`.

    Pipeline:
    - Return the value as a float if it already converts cleanly.
    - Otherwise convert it to a string for manipulation.
    - Strip the non-numerical text (' per week', ' for week',
      ' for mounth', '/ month', or everything from the first 'g').
    - Calculate averages of ranges as required (via `split_values`).
    - Divide weekly amounts by 7 and monthly amounts by 30 to get a
      per-day value.

    Output:
    - A float wherever a number could be extracted.
    - In the 'g' branch, the stripped text before the 'g' when that
      text is not a number.
    - The input unchanged when no cleaning rule matches.

    Raises:
    - ValueError when a matched rule leaves text that is not numeric
      (e.g. "few per week").
    """

    # Values which do not need cleaning
    try:
        return float(value)
    except (TypeError, ValueError, OverflowError):
        # Not directly numeric: fall through to the string cleaning
        pass

    # Keep the input so the catchalls can hand it back untouched
    # (e.g. None stays None instead of becoming the string "None")
    original = value

    # Convert value to a string
    value = str(value)

    ### REMOVE STRINGS ###
    # Check if 'week' is in the value
    if ' week' in value:

        # Variation #1: per week
        if ' per week' in value:
            converted = value.replace(" per week", "").strip()

        # Variation #2: for week
        elif ' for week' in value:
            converted = value.replace(" for week", "").strip()

        # Unknown weekly wording (e.g. "3 weeks"): leave it alone
        # rather than guess what the number means
        else:
            return original

        # Check for a slash-delimited range
        if '/' in converted:
            result = split_values(converted, "/")
        else:
            result = converted

        # Get the per day value; float() keeps averaged or decimal
        # amounts (e.g. 3.5) instead of truncating them
        return round(float(result) / 7, 2)

    # Check if ' for mounth' is in the value
    # ('mounth' is the spelling matched by the original rule)
    if ' for mounth' in value:

        # Remove the ' for mounth'
        converted = value.replace(" for mounth", "")

        # Check for a dash-delimited range
        if '-' in converted:
            result = split_values(converted, "-")
        else:
            result = converted

        # Get the per day value
        return round(float(result) / 30, 2)

    # Check if '/ month' is in the value
    if '/ month' in value:

        # The amount is the first space-delimited token
        amount = value.split(" ")[0]

        # Get the per day value (left unrounded, as before)
        return float(amount) / 30

    # Check if 'g' is in the value
    if 'g' in value:

        # Keep only what precedes the first 'g', minus whitespace
        converted = value[:value.find('g')].strip()

        # Check for delimiters
        if '-' in converted:
            return split_values(converted, "-")
        if '/' in converted:
            return split_values(converted, "/")

        # Return a number when possible so the column stays numeric;
        # otherwise fall back to the stripped text
        try:
            return float(converted)
        except ValueError:
            return converted

    ### CALCULATE AVERAGES ###
    if '/' in value:
        return split_values(value, "/")

    if '-' in value:
        return split_values(value, "-")

    ### OTHER CLEANING ###
    # Decimal comma -> decimal point
    if ',' in value:
        return float(value.replace(",", "."))

    # Catchall
    return original

In [7]:
def imputation(df, cat_col, num_col):
    """
    Purpose:
    Impute the missing values of a numerical column, one category
    at a time, with either the mean or the median of that category.

    If the absolute skew score > 0.5, MEDIAN.
    Else, not skewed, MEAN.

    Rows whose category is 'never' are always set to 0.

    Input:
    df = dataframe (modified in place)
    cat_col = categorical column
    num_col = numerical column

    Output:
        For every category that has both missing and non-missing
        values:
        - a histogram of the non-missing values
        - the printed skewness score and chosen mean/median
        The missing values are filled in `df` itself; nothing is
        returned.
    """

    # Get the unique values from the categorical column
    unique_values = df[cat_col].unique()
    print(unique_values)

    # If the category is 'never', set the value to `0`
    # (does not depend on the loop variable, so do it once up front)
    df.loc[df[cat_col] == 'never', num_col] = 0

    # Loop through each unique value
    for unique in unique_values:

        # Split this category's rows into missing and non-missing
        in_group = df[cat_col] == unique
        is_null = df[num_col].isnull()
        check_condition = in_group & ~is_null
        null_condition = in_group & is_null

        # Skip if there is nothing to impute, or nothing to learn from.
        # isnull() is used for the missing test so it agrees with the
        # masks above for every kind of missing marker.
        if not null_condition.any() or not check_condition.any():
            continue

        # Convert values for plotting
        check_data = df.loc[check_condition, num_col].astype(float)

        # Histogram
        check_data.hist()

        # Clean title
        title = cat_col.replace("_", " ").title()
        plt.title(f'{title} ({unique})')
        plt.xlabel(f'{title} per Day')
        plt.ylabel('Frequency')

        # Calculate the skewness
        # NOTE(review): `skew`, like `plt`, comes from the notebook
        # scope (presumably scipy.stats.skew) -- confirm.
        skew_score = skew(check_data)

        # Display the results
        plt.show()

        # Check the skew score
        if abs(skew_score) > 0.5: # skewed = use median
            chosen_value = check_data.median()
            print(f'Skewness: {round(skew_score,2)}, use MEDIAN.')
            print(f'Median value for "{unique}": {round(chosen_value, 2)}')

        else:
            chosen_value = check_data.mean()
            print(f'Skewness: {round(skew_score,2)}, use MEAN.')
            print(f'Mean value for "{unique}": {round(chosen_value, 2)}')

        # Round the chosen value
        rounded_value = round(chosen_value, 2)

        # Impute the missing values
        df.loc[null_condition, num_col] = rounded_value

In [8]:
def fruit_gram(value, std_serve=150):
    """
    Purpose:
    - Convert gram-value to number of fruits.
    - To be used with `apply()`.

    Input:
        value = the cell to convert (number, numeric string, or a
                string such as "300 gramme")
        std_serve = grams in one standard serve (default 150)

    Output:
        - Numeric values greater than 5 are treated as grams and
          returned as a number of fruits, rounded to 2 decimals.
        - Numeric values of 5 or less are returned as a float.
        - "<number> gramme" strings are converted to a number of
          fruits, rounded to 2 decimals.
        - Anything else is returned unchanged.

    Raises:
        ValueError if a ' gramme' string has a non-numeric amount
    """

    # Try to read the value as a plain number first
    try:
        amount = float(value)
    except (TypeError, ValueError, OverflowError):
        amount = None

    if amount is not None:
        # Look for values greater than 5
        if amount > 5:
            # Convert to number of fruits; float division keeps
            # decimal gram values (e.g. "225.0") working
            return round(amount / std_serve, 2)

        # Catchall for numeric values
        return amount

    # Look for 'gramme' (only strings can contain it)
    if isinstance(value, str) and ' gramme' in value:
        converted = value.replace(" gramme", "").strip()

        # Convert to number of fruits
        return round(float(converted) / std_serve, 2)

    # Catchall
    return value