# Functions

In [5]:
def split_values(str_input, delimiter):
    """
    Convert a delimited range such as "25-50" into the average
    of its numbers (37.5 in this case).

    Each piece is parsed with `float`, so decimal bounds such as
    "2.5-5" are accepted and surrounding whitespace is ignored.
    Any unit suffix (e.g. " g") must be removed by the caller first.

    Input:
        str_input = the string to convert
        delimiter = what to split the string by

    Output:
        mean_value = the mean of the parsed numbers

    Raises:
        ValueError if a piece is not a valid number.
    """
    # Split by the delimiter
    pieces = str_input.split(delimiter)

    # Parse with float rather than int so decimal values do not raise
    numbers = [float(piece) for piece in pieces]

    # NOTE: `mean` is not defined in this cell; it is expected to be
    # imported elsewhere in the notebook.
    return mean(numbers)

In [6]:
# Create a function that will convert the values
def convert_perday(value):
    """
    The purpose of this function is to clean non-null
    values in the metadata_df columns.

    This function is used with 'apply'.

    Input:
        value = a single cell value (converted to a string first)

    Output:
        - "<n> per week" / "<n> for week": n / 7 rounded to 1 decimal (float)
        - values containing a comma: the string with "," replaced by "."
        - values containing "/": the mean of the parts (via split_values)
        - "<n> for mounth": the string without the suffix, or the mean
          of the range if it contains a dash (the amount is NOT divided)
        - anything else, including unrecognised "week" wording:
          the value as a string, unchanged

    Raises:
        ValueError if a weekly amount is not a valid number.
    """

    # Convert value to a string
    value = str(value)

    # Check if 'week' is in the value
    if ' week' in value:

        # Variation #1
        if ' per week' in value:
            weekly = value.replace(" per week", "")

        # Variation #2
        elif ' for week' in value:
            weekly = value.replace(" for week", "")

        # Unrecognised weekly wording (e.g. "2 a week"): hand it back
        # unchanged like the catchall, instead of failing on an
        # unassigned variable.
        else:
            return value

        # Get the per day value; float (not int) so "3.5" is accepted
        return round(float(weekly) / 7, 1)

    # Check values with a comma
    elif ',' in value:
        # Replace the decimal comma with a full stop (still a string)
        return value.replace(",", ".")

    # Check values with a fraction
    elif '/' in value:
        return split_values(value, "/")

    # Check for `mounth` (spelling as it appears in the data)
    elif ' for mounth' in value:
        # Remove the ' for mounth'
        monthly = value.replace(" for mounth", "")

        # Check for a dash
        if '-' in monthly:
            return split_values(monthly, "-")

        return monthly

    # Catchall
    else:
        return value

In [2]:
def imputation_check(df, cat_col, num_col):
    """
    The purpose of this function is to check whether the missing
    data should be imputed with a mean or median, and to impute it.

    If the absolute skew score > 0.5 use: MEDIAN
    Else, not skewed, use: MEAN

    Missing values are the ones stored as the string 'nan' in num_col.

    Input:
        df = dataframe (modified in place)
        cat_col = categorical column
        num_col = numerical column

    Output (per category that has at least one non-missing value):
        histogram
        skewness score
        the chosen mean/median, which replaces the 'nan' entries
        of that category (rounded to 2 decimals)

    Finally, every row whose cat_col is 'never' gets num_col = 0.
    Nothing is returned.
    """
    # NOTE: `plt` and `skew` are not defined in this cell; they are
    # expected to be imported elsewhere in the notebook.

    # Loop through each unique value of the categorical column
    for unique in df[cat_col].unique():

        # Rows of this category, split into known and missing values
        in_group = df[cat_col] == unique
        check_condition = in_group & (df[num_col] != 'nan')
        null_condition = in_group & (df[num_col] == 'nan')

        check_data = df.loc[check_condition, num_col]

        # Skip if there is nothing to base an imputation on
        if check_data.empty:
            continue

        # Convert values for plotting
        check_data = check_data.astype(float)

        # Histogram
        check_data.hist()

        # Clean title
        title = cat_col.replace("_", " ").title()
        plt.title(f'{title} ({unique})')
        plt.xlabel(f'{title} per Day')
        plt.ylabel('Frequency')

        # Calculate the skewness
        skew_score = skew(check_data)

        # Display the results
        plt.show()
        print(f'Skewness: {skew_score}')

        # Check the skew score
        if abs(skew_score) > 0.5:  # skewed = use median
            chosen_value = check_data.median()
            print(f'Median value for "{unique}": {chosen_value}')
        else:
            chosen_value = check_data.mean()
            print(f'Mean value for "{unique}": {chosen_value}')

        # Impute the missing values with the rounded chosen value
        df.loc[null_condition, num_col] = round(chosen_value, 2)

    # If the category is 'never', set the value to `0`.
    # Done once after the loop so it is applied even when every
    # category was skipped, and so the result does not depend on
    # the order in which the categories appear.
    df.loc[df[cat_col] == 'never', num_col] = 0

In [7]:
# Create a function that will clean the grams values
def convert_grams(value):
    """
    The purpose of this function is to clean non-null
    values in the columns which contain "grams".

    This function is used with 'apply'.

    Input:
        value = a single cell value (converted to a string first)

    Output:
        - values without a 'g': the value as a string, unchanged
        - values with a unit ("<n> gr" or "<n> g"): the string with
          the unit removed, or the mean of the range (via
          split_values) if what remains contains a dash
    """

    # Convert value to a string
    value = str(value)

    # Catchall: nothing to strip when there is no `g` at all
    if 'g' not in value:
        return value

    # Variation #1
    if 'gr' in value:
        # Remove the ' gr' string
        cleaned = value.replace(" gr", "")

    # Variation #2
    else:
        # Remove the `g` in the string and strip whitespaces
        cleaned = value.replace("g", "").strip()

    # Check for a dash
    if '-' in cleaned:
        return split_values(cleaned, "-")

    # Plain single amount: return it for both variations (previously
    # the "g" variation fell through here and returned None)
    return cleaned