In [None]:
import polars as pl
from collections import Counter

In [None]:
# Load the raw CSV export and show its column names as a quick sanity check.
raw_csv_path = 'data_with_pimples.csv'
df = pl.read_csv(raw_csv_path)
df.columns

In [None]:
# Move the three meal columns into their own frame; they are tallied separately
# and removed from the main frame here.
meal_columns = ['breakfast', 'lunch', 'dinner']
df_food = df.select(meal_columns)
df = df.drop(meal_columns)

# Food categories that get their own count column (order defines column order).
food_list = [
    'red food', 'greens',
    'red meat', 'white meat', 'fish', 'seafood',
    'gluten', 'starch', 'lactose',
    'other type of sugar', 'nightshade', 'white sugar', 'sweetener',
    'mushrooms', 'fruits', 'sweets', 'eggs',
    'nothing',
]

In [None]:
def count_repeated_items(df, known_items):
    """Count, per row, how often each known item occurs across all columns of ``df``.

    Every non-null cell is stringified and split on ``', '``; each resulting
    token increments the counter of the matching known item, or the
    ``'others'`` counter when the token is not in ``known_items``.

    Parameters
    ----------
    df : pl.DataFrame
        Frame whose columns all hold ``', '``-separated item lists.
    known_items : sequence of str
        Items that get a dedicated count column, in output column order.

    Returns
    -------
    pl.DataFrame
        One row per input row with one Int64 column per known item followed
        by a final ``'others'`` column.
    """
    known_items = list(known_items)  # accept any sequence, not only lists
    output_columns = known_items + ['others']
    known_lookup = frozenset(known_items)

    def count_row(row):
        # Struct elements reach map_elements as dicts keyed by field name, so
        # iterating the row directly would walk the column names, not the cells.
        cells = row.values() if isinstance(row, dict) else row
        row_counter = Counter()
        for cell in cells:
            if cell is None:
                # A missing cell is not an item; do not count the text 'None'.
                continue
            for word in str(cell).split(', '):
                row_counter[word if word in known_lookup else 'others'] += 1
        return [row_counter.get(name, 0) for name in output_columns]

    # Apply the count_row function to each row
    counts = df.select(
        pl.struct(df.columns).map_elements(count_row, return_dtype=pl.List(pl.Int64))
    )

    # Each inner list is one row; state the orientation explicitly so it is
    # never guessed from the data dimensions.
    return pl.DataFrame(
        counts.to_series().to_list(),
        schema=output_columns,
        orient='row',
    )

# Per-row tally of every food_list category (plus 'others') over the meal columns.
counts_food = count_repeated_items(df=df_food, known_items=food_list)

# Put the tally columns side by side with the remaining columns of df.
df = pl.concat(
    [df, counts_food],
    how='horizontal',
)

In [None]:
def categorize_column(df, column_name, keywords_list, prefix='', other_col=''):
    """Expand a text column into one boolean flag column per keyword.

    For every keyword except ``'Other:'`` a column named
    ``prefix + keyword.replace(' ', '-')`` is added that is True where the
    keyword occurs in the cell text. One more column flags cells in which
    none of the keywords (``'Other:'`` included) occur. The source column is
    dropped afterwards.

    Parameters
    ----------
    df : pl.DataFrame
        Frame containing ``column_name``.
    column_name : str
        Name of the text column to expand.
    keywords_list : sequence of str
        Keywords to look for (plain text, case-sensitive).
    prefix : str, optional
        Text prepended to every generated flag column name.
    other_col : str, optional
        Name of the "no keyword matched" column. When left empty it defaults
        to ``f'{column_name}_other'`` so that repeated calls do not all write
        to (and overwrite) one shared column with an empty name.

    Returns
    -------
    pl.DataFrame
        ``df`` with the flag columns added and ``column_name`` removed.
    """
    source = pl.col(column_name)
    unmatched_name = other_col or f'{column_name}_other'

    # literal=True: keywords are plain text, and some of them (e.g. one ending
    # in "(") would not even be valid regular expressions.
    # NOTE(review): this is substring matching, exactly like the previous
    # `keyword in item` check, so a keyword contained in another one (e.g.
    # 'cream' inside 'eye cream') is flagged for both - confirm this is intended.
    match_exprs = [source.str.contains(keyword, literal=True) for keyword in keywords_list]

    # Keyed by output name so a repeated name keeps the last definition
    # instead of raising a duplicate-column error.
    new_columns = {}
    for keyword, is_match in zip(keywords_list, match_exprs):
        if keyword != 'Other:':
            flag_name = prefix + keyword.replace(' ', '-')
            new_columns[flag_name] = is_match.alias(flag_name)

    # Flag cells that match none of the keywords; null cells stay null.
    if match_exprs:
        no_match = ~pl.any_horizontal(*match_exprs)
    else:
        no_match = pl.when(source.is_not_null()).then(pl.lit(True))
    new_columns[unmatched_name] = no_match.alias(unmatched_name)

    # Add all flags at once, then drop the original column
    return df.with_columns(list(new_columns.values())).drop(column_name)


In [None]:
# Keywords looked for in the skincare answers.
skincare = [
    "cleaner", "tonic", "serum",
    "eye cream", "cream",
    "patch", "eye patchs",
    "sunscreen", "pilling", "scrub",
    "spot cream", "mask",
    "skin picking(",
    "nothing",
]

# Same expansion for both routines; the prefix keeps their flag columns apart.
for skincare_column in ('morning_skincare', 'evening_skincare'):
    df = categorize_column(df, skincare_column, skincare, prefix=f'{skincare_column} - ')

In [None]:
# Keywords looked for in the 'other_drinks' answers.
drinks = [
    'alcohol', 'energy drink', 'juice',
    'soft drink', 'sparkling mineral water',
    'no',
]

df = categorize_column(
    df,
    column_name='other_drinks',
    keywords_list=drinks,
    prefix='other_drinks - ',
)

In [None]:
# Keywords looked for in the 'milk_drinks' answers.
milk = ['yes, cow', 'yes, lactose free', 'yes, alternative', 'no']

df = categorize_column(
    df,
    column_name='milk_drinks',
    keywords_list=milk,
    prefix='milk_drinks - ',
)

In [None]:
# Keywords looked for in the 'hot_drinks' answers.
hot_drinks = [
    'green tea', 'black tea', 'coffee',
    'herbal tea', 'other', 'no',
]

df = categorize_column(
    df,
    column_name='hot_drinks',
    keywords_list=hot_drinks,
    prefix='hot_drinks - ',
)

In [None]:
# Remove the date column and treat a missing pimple count as zero.
df = (
    df
    .drop('date')
    .with_columns(pl.col('total_pimples').fill_null(0))
)

Polars has no built-in ordinal encoder comparable to scikit-learn's `OrdinalEncoder`, so we implement a small custom one using Polars itself.
The function encodes the specified columns and returns a new DataFrame; the result must be assigned back for the encoding to take effect.

In [None]:
def to_ordinal_encoder(df, columns):
    """Replace each listed column with integer category codes.

    For every column the distinct non-null values are sorted and numbered
    ``0..n-1`` in that order; each value is then replaced by its number.
    Null cells are not part of the mapping and remain null.

    The codes follow the sort order of the raw values (e.g. alphabetical for
    text), which is not necessarily a meaningful ranking of the categories.

    Parameters
    ----------
    df : pl.DataFrame
        Frame containing the columns to encode.
    columns : iterable of str
        Names of the columns to encode.

    Returns
    -------
    pl.DataFrame
        A new frame with the listed columns encoded as Int64; the input frame
        is not modified, so the caller has to keep the return value.
    """
    for column in columns:
        # unique() gives no ordering guarantee, so sort to make the codes
        # reproducible from run to run.
        categories = df[column].drop_nulls().unique().sort().to_list()
        value_to_ordinal = {value: idx for idx, value in enumerate(categories)}
        if not value_to_ordinal:
            # Empty or all-null column: nothing to encode.
            continue

        # Depending on the Polars version, replace() may keep the column's
        # original dtype (codes stored as text for a string column); the cast
        # makes the result an integer column either way.
        df = df.with_columns(
            pl.col(column).replace(value_to_ordinal).cast(pl.Int64).alias(column)
        )
    return df

In [None]:
# Columns to be turned into ordinal codes.
encode_columns = [
    'snacks',
    'stress',
    'supplements',
    'water',
    'workouts',
]

In [None]:
# to_ordinal_encoder returns a new frame and leaves its input untouched, so the
# result has to be assigned back; otherwise the encoding is lost and never
# reaches the frame that is written out later.
df = to_ordinal_encoder(df, encode_columns)
df

In [None]:
# Names of the columns to be cast to Int32, grouped by their current dtype.
# NOTE(review): only Float32 is matched here; Float64 columns (presumably what
# read_csv infers for decimals - confirm) are left untouched.
bool_columns = [name for name, dtype in zip(df.columns, df.dtypes) if dtype == pl.Boolean]
float_columns = [name for name, dtype in zip(df.columns, df.dtypes) if dtype == pl.Float32]

# One cast expression per column; booleans become 0/1.
bool_columns_exprs = [pl.col(name).cast(pl.Int32).alias(name) for name in bool_columns]
float_columns_exprs = [pl.col(name).cast(pl.Int32).alias(name) for name in float_columns]

# Run all casts in a single pass over the frame.
df = df.with_columns([*bool_columns_exprs, *float_columns_exprs])


In [None]:
# Persist the processed frame, then show it as the cell output.
output_csv_path = "data_full.csv"
df.write_csv(output_csv_path)
df