In [1]:
import pandas as pd
from collections import Counter

In [None]:
# Load the raw dataset; the path is relative, so it is resolved against the
# kernel's current working directory.
df = pd.read_csv('data_with_pimples.csv')
# Last expression of the cell: displays the column labels of the loaded frame.
df.columns

In [3]:
# Keep the three meal columns in their own frame so they can be turned into
# per-category counts separately from the rest of the data.
df_food = df[['breakfast', 'lunch', 'dinner']]
# NOTE: `df` is rebound without the meal columns, so this cell is not
# idempotent — running it twice raises KeyError on the selection above.
df = df.drop(columns=['breakfast', 'lunch', 'dinner'])
# Food categories that get their own count column; these strings are compared
# verbatim against the comma-separated entries of the meal columns.
food_list = [
    'red food',
    'greens',
    'red meat',
    'white meat',
    'fish',
    'seafood',
    'gluten',
    'starch',
    'lactose',
    'other type of sugar',
    'nightshade',
    'white sugar',
    'sweetener',
    'mushrooms',
    'fruits',
    'sweets',
    'eggs',
    'nothing',
]

In [4]:
def count_repeated_items(df, known_items):
    """Count, per row, how often each known item occurs across all columns.

    Every cell is converted to a string and split on ``', '``. Each resulting
    token increments the counter of the matching entry in ``known_items``;
    tokens that match no entry (including the text ``'nan'`` produced by
    missing values) increment the ``'others'`` counter instead.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose cells hold ``', '``-separated item names.
    known_items : sequence of str
        Item names that get their own count column, in output order.

    Returns
    -------
    pandas.DataFrame
        One row per input row, carrying the same index as ``df``, with one
        integer column per entry of ``known_items`` followed by ``'others'``.
    """
    # Build the output column order once instead of once per row.
    count_columns = list(known_items) + ['others']
    # Set membership keeps the per-token lookup O(1).
    known = set(known_items)

    word_counts = []
    # itertuples yields plain tuples and avoids building a Series per row.
    for row in df.itertuples(index=False, name=None):
        row_counter = Counter()
        for item in row:
            # Ensure the item is treated as a string before splitting.
            for word in str(item).split(', '):
                row_counter[word if word in known else 'others'] += 1
        # A Counter returns 0 for names that never occurred in this row.
        word_counts.append([row_counter[name] for name in count_columns])

    # Reuse the input index so a later column-wise concat with the source
    # frame lines rows up even when that index is not a default RangeIndex.
    return pd.DataFrame(word_counts, columns=count_columns, index=df.index)

# Per-row counts of each food category across breakfast, lunch and dinner.
counts_food = count_repeated_items(df_food, food_list)
# axis=1 concatenation aligns rows on the index, so `df` and `counts_food`
# must carry the same row labels for the counts to land on the right rows.
df = pd.concat([df, counts_food], axis=1)

In [5]:
def categorize_column(df, column_name, list, prefix='', other_col=''):
    """Expand a free-text column into one boolean column per keyword.

    For every keyword a column named ``prefix + keyword`` (spaces in the
    keyword replaced by ``-``) is added that is True where the keyword occurs
    as a substring of the cell text. One extra column flags the rows in which
    none of the keywords occurred. The source column is removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    column_name : str
        Name of the text column to expand.
    list : sequence of str
        Keywords to look for. The literal entry ``'Other:'`` is a placeholder
        and is never treated as a keyword. (The parameter name shadows the
        builtin inside this function; it is kept for existing callers.)
    prefix : str, optional
        Text prepended to every generated column name.
    other_col : str, optional
        Name of the "no keyword matched" column. When empty it defaults to
        ``prefix + 'Other:'`` so that repeated calls with different prefixes
        do not overwrite one shared, unnamed column.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``column_name`` and with the boolean columns
        appended in keyword order, followed by the "other" column.
    """
    # The placeholder must be excluded from the "other" test as well,
    # otherwise text containing 'Other:' would never be flagged as other.
    keywords = [keyword for keyword in list if keyword != 'Other:']

    if not other_col:
        other_col = prefix + 'Other:'

    values = df[column_name].tolist()
    # drop() returns a new frame, so the caller's frame stays untouched.
    result = df.drop(columns=[column_name])

    unmatched = [True] * len(values)
    for keyword in keywords:
        # Non-string cells (e.g. NaN) cannot contain a keyword; treating them
        # as "no match" avoids a TypeError from the `in` operator.
        hits = [isinstance(value, str) and keyword in value for value in values]
        new_column_name = prefix + keyword.replace(' ', '-')
        result[new_column_name] = pd.Series(hits, index=result.index, dtype=bool)
        unmatched = [miss and not hit for miss, hit in zip(unmatched, hits)]

    # True only where no keyword matched at all.
    result[other_col] = pd.Series(unmatched, index=result.index, dtype=bool)

    return result


In [6]:
# Skincare keywords looked up in the morning/evening routine columns.
# categorize_column tests each keyword with a substring check, so "cream" is
# also found inside "eye cream" / "spot cream" and "patch" inside "eye patchs".
# NOTE(review): spellings such as "eye patchs", "pilling" and "skin picking("
# presumably mirror the raw survey text — confirm before correcting them.
skincare = [
    "cleaner",
    "tonic",
    "serum",
    "eye cream",
    "cream",
    "patch",
    "eye patchs",
    "sunscreen",
    "pilling",
    "scrub",
    "spot cream",
    "mask",
    "skin picking(",
    "nothing",
]

# `other_col` is not passed in either call, so the function's default applies.
df = categorize_column(df, 'morning_skincare', skincare, prefix='morning_skincare - ')
df = categorize_column(df, 'evening_skincare', skincare, prefix= 'evening_skincare - ')

In [7]:
# Keywords for the `other_drinks` column.
# NOTE(review): keywords are matched as substrings, so 'no' is found in any
# entry that merely contains the letters "no" — verify against the raw values.
drinks = [
    'alcohol',
    'energy drink',
    'juice',
    'soft drink',
    'sparkling mineral water',
    'no',
]

# `other_col` is not passed, so the function's default applies.
df = categorize_column(df, 'other_drinks', drinks, prefix= 'other_drinks - ')

In [8]:
# Keywords for the `milk_drinks` column. The entries contain ', ' themselves;
# categorize_column matches each one as a whole substring and does not split
# the cell text on commas.
milk = [
    'yes, cow',
    'yes, lactose free',
    'yes, alternative',
    'no',
]

# `other_col` is not passed, so the function's default applies.
df = categorize_column(df, 'milk_drinks', milk, prefix= 'milk_drinks - ')

In [9]:
# Keywords for the `hot_drinks` column. The lowercase 'other' is an ordinary
# keyword here; only the exact entry 'Other:' is special-cased by
# categorize_column.
hot_drinks = [
    'green tea',
    'black tea',
    'coffee',
    'herbal tea',
    'other',
    'no',
]

# `other_col` is not passed, so the function's default applies.
df = categorize_column(df, 'hot_drinks', hot_drinks, prefix= 'hot_drinks - ')

In [10]:
# Remove the date column from the frame.
# NOTE: rebinding `df` makes this cell fail with KeyError if it is run twice.
df = df.drop(columns="date")
# Replace missing pimple counts with 0.
df['total_pimples'] = df['total_pimples'].fillna(0)

In [11]:
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top so all dependencies are visible in one place.
from sklearn.preprocessing import OrdinalEncoder

# Shared encoder instance, constructed with default settings.
encoderOrdinalEncoder = OrdinalEncoder()

def to_ordinal_encoder(df, encoder, columns):
    """Replace the given columns of ``df`` with their encoded values.

    The encoder is fitted on ``df[columns]`` and the transformed values are
    written back into those same columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode; it is modified in place.
    encoder : object
        Any object exposing ``fit_transform(frame)`` that returns one output
        column per input column, in the same order.
    columns : list of str
        Names of the columns to encode.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, for convenience.
    """
    encoded_data = encoder.fit_transform(df[columns])

    # Carry over df's index: column assignment aligns on row labels, so a
    # default 0..n-1 index would fill the columns with NaN whenever df is
    # indexed differently (e.g. after filtering rows).
    encoded_df = pd.DataFrame(encoded_data, columns=columns, index=df.index)

    for column in columns:
        df[column] = encoded_df[column]
    return df

In [12]:
# Columns whose values are replaced by ordinal codes in the next cell.
encode_columns = ['snacks', 'stress', 'supplements', 'water', 'workouts']

In [None]:
# to_ordinal_encoder writes the encoded columns into `df` itself and returns
# that same frame; the return value is not rebound here, it is only shown as
# the cell's output.
to_ordinal_encoder(df, encoderOrdinalEncoder, encode_columns)

In [14]:
# Convert boolean to numerical (False=0, True=1)
bool_columns = df.columns[df.dtypes == 'bool']
# Columns currently stored as floats; these are cast to integers as well.
float_columns = df.columns[df.dtypes == 'float']
df[bool_columns] = df[bool_columns].astype(int)
# NOTE(review): astype(int) truncates any fractional part and raises if a
# float column still contains NaN — confirm every float column is complete.
df[float_columns] = df[float_columns].astype(int)

In [None]:
# Write the fully numeric frame to disk; index=False leaves the row labels
# out of the file.
df.to_csv("data_full.csv", index=False)
# Last expression of the cell: displays the frame (consider df.head() to keep
# the stored notebook output small).
df