In [15]:
import pandas as pd
from collections import Counter

In [16]:
# Read the input CSV into the working DataFrame `df` that the following
# cells transform step by step.
df = pd.read_csv('data_with_pimples.csv')

In [17]:
# The three meal columns are split off into `df_food` and removed from the
# main frame.
food_columns = ['Breakfast Food', 'Lunch Food', 'Dinner Food']
df_food = df[food_columns]
df = df.drop(columns=food_columns)

# Food labels that each get their own count column. The strings are compared
# verbatim against the answers, so their spelling must stay exactly as is.
# NOTE(review): 'red meet' / 'white meet' look like typos of "meat" — confirm
# against the CSV before changing them.
food_list = [
    'red food',
    'greens',
    'red meet',
    'white meet',
    'fish',
    'seafood',
    'gluten',
    'starch',
    'lactose',
    'other type of sugar',
    'nightshade (tomatoes, potatoes, eggplant, peppers)',
    'white sugar',
    'sweetener',
    'mushrooms',
    'fruits',
    'sweets',
]

In [18]:
def count_repeated_items(df, known_items):
    """Count, per row, how often each known item occurs across all columns.

    Every cell is expected to hold a ', '-separated list of items. Items that
    are not in ``known_items`` are tallied together in an 'others' column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose cells contain ', '-separated item lists.
    known_items : sequence of str
        Items that get their own count column. An item may itself contain
        the ', ' separator (e.g. 'nightshade (tomatoes, potatoes, ...)').

    Returns
    -------
    pandas.DataFrame
        One row per input row (same index as ``df``), one column per known
        item in the given order, followed by an 'others' column.
    """
    separator = ', '
    known = set(known_items)
    columns = list(known_items) + ['others']
    # Largest number of separator-delimited pieces any known item spans, so
    # that items containing the separator can be re-assembled after the split.
    max_parts = max((len(name.split(separator)) for name in known_items), default=1)

    word_counts = []
    for _, row in df.iterrows():
        row_counter = Counter()
        for cell in row:
            # A missing cell carries no items; do not count 'nan' as "others".
            if pd.isna(cell):
                continue
            parts = str(cell).split(separator)
            position = 0
            while position < len(parts):
                # Greedily try the longest run of pieces that forms a known item.
                longest = min(max_parts, len(parts) - position)
                for span in range(longest, 0, -1):
                    candidate = separator.join(parts[position:position + span])
                    if candidate in known:
                        row_counter[candidate] += 1
                        position += span
                        break
                else:
                    row_counter['others'] += 1
                    position += 1
        word_counts.append([row_counter.get(name, 0) for name in columns])

    # Keep the caller's index so a column-wise concat lines up row by row.
    return pd.DataFrame(word_counts, columns=columns, index=df.index)

# Count the food labels per row across the meal columns held in `df_food`
# and append the resulting count columns to the main frame
# (pd.concat with axis=1 lines rows up by index).
counts_food = count_repeated_items(df_food, food_list)
df = pd.concat([df, counts_food], axis=1)

In [19]:
def categorize_column(df, column_name, list, prefix='', other_col=''):
    """Expand a free-text column into one boolean column per keyword.

    For every keyword in ``list`` (except the literal 'Other:') a column named
    ``prefix + keyword.replace(' ', '-')`` is added that is True where the
    text in ``column_name`` contains the keyword as a substring. An extra
    column ``other_col`` is True where an answer is present but contains none
    of the keywords. The source column is removed from the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    column_name : str
        Name of the text column to expand.
    list : sequence of str
        Keywords to look for. (The name shadows the builtin; it is kept so
        existing keyword-argument callers keep working.)
    prefix : str, optional
        Prefix for the generated keyword columns.
    other_col : str, optional
        Name of the "none of the keywords matched" column.

    Returns
    -------
    pandas.DataFrame
        New frame without ``column_name`` and with the boolean columns
        appended.
    """
    # 'Other:' is a placeholder option, not a keyword: it gets no column of
    # its own and must not stop an answer from being flagged as "other".
    keywords = [keyword for keyword in list if keyword != 'Other:']

    answers = df[column_name]
    # Missing cells (NaN) and other non-strings cannot be searched; they are
    # False in every generated column, including ``other_col``.
    answered = answers.apply(lambda value: isinstance(value, str)).astype(bool)

    # Work on a new frame so the caller's DataFrame is left untouched.
    result = df.drop(columns=[column_name])
    matched_any = pd.Series(False, index=df.index)
    for keyword in keywords:
        has_keyword = answers.apply(
            lambda value, kw=keyword: isinstance(value, str) and kw in value
        ).astype(bool)
        result[prefix + keyword.replace(' ', '-')] = has_keyword
        matched_any = matched_any | has_keyword

    result[other_col] = answered & ~matched_any
    return result


In [20]:
# Skincare products looked for in the routine answers.
skincare = ["cleaner", "tonic", "cream", "serum"]

# (source column, prefix for the per-product columns, name of the "other" column)
skincare_targets = [
    ('Morning Routine', 'morning_skincare - ', 'morning_skincare_other'),
    ('Evening Skincare', 'evening_skincare - ', 'evening_skincare_other'),
]
for source_column, column_prefix, other_column in skincare_targets:
    df = categorize_column(
        df,
        source_column,
        skincare,
        prefix=column_prefix,
        other_col=other_column,
    )

In [21]:
# Drink options; categorize_column turns each into its own boolean column.
# NOTE(review): categorize_column tests `keyword in answer`, i.e. a substring
# match, so the 'no' option is also flagged for any answer that merely
# contains the letters "no" — confirm this is acceptable for this column.
drinks = [
    'alcohol',
    'sparkling mineral water',
    'soda water (cola/pepsi/fanta/schweppes/other)',
    'no'
]

# NOTE(review): other_col='anther_drinks' looks like a typo; it is left
# unchanged because it becomes a column name in the output — confirm with
# downstream consumers before renaming.
df = categorize_column(df, 'Other Drinks Intake', drinks, prefix= 'other_drinks - ', other_col='anther_drinks')

In [22]:
# Remove the "Today" column, then replace missing pimple counts with 0.
df = df.drop(columns=["Today"])
df = df.assign(total_pimples=df['total_pimples'].fillna(0))

In [23]:
from sklearn.preprocessing import OrdinalEncoder

# Encoder instance that is later handed to to_ordinal_encoder.
encoderOrdinalEncoder = OrdinalEncoder()

def to_ordinal_encoder(df, encoder, columns):
    """Replace ``columns`` of ``df`` with their encoded values, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is modified in place and also returned.
    encoder : object
        Any object with a ``fit_transform(frame)`` method that returns one
        encoded column per input column (e.g. an ``OrdinalEncoder``).
    columns : list of str
        Names of the columns to encode.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``columns`` overwritten.
    """
    encoded_data = encoder.fit_transform(df[columns])

    # Reuse df's own index: column assignment aligns on index labels, so a
    # default RangeIndex here would yield NaN / shifted rows whenever df is
    # indexed by anything other than 0..n-1.
    encoded_df = pd.DataFrame(encoded_data, columns=columns, index=df.index)

    for column in columns:
        df[column] = encoded_df[column]
    return df

In [24]:
# Columns handed to to_ordinal_encoder; each one is overwritten with the
# values the encoder produces for it.
encode_columns = [
    'Sunscreen', 'Water Intake', 'Tea Intake', 'Coffee Intake',
    'Milk Intake', 'Snacks', 'Supplements', 'Stress', 'Physical Activity',
    'Mask'
]

In [None]:
# Bind the result explicitly (like the other transformation cells) instead of
# relying on to_ordinal_encoder mutating `df` as a side effect, and show only
# a short preview of the encoded columns rather than the whole frame.
df = to_ordinal_encoder(df, encoderOrdinalEncoder, encode_columns)
df[encode_columns].head()

In [26]:
# Cast boolean columns (False=0, True=1) and float columns to integers.
# NOTE(review): the float cast drops any fractional part and presumably fails
# on missing values — confirm every float column is whole-valued and complete.
column_dtypes = df.dtypes
bool_columns = df.columns[column_dtypes == 'bool']
float_columns = df.columns[column_dtypes == 'float']
for selected_columns in (bool_columns, float_columns):
    df[selected_columns] = df[selected_columns].astype(int)

In [27]:
# Write the processed frame to CSV, leaving out the index column.
df.to_csv("data_full.csv", index=False)