In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the profiles CSV (relative path) into the working DataFrame.
profiles_csv = "../data/profiles.csv"
df = pd.read_csv(profiles_csv)

First, let's drop all the columns we won't use.

In [None]:
# Drop `last_online` together with the ten `essay0` ... `essay9` columns in one call.
unused_columns = ["last_online"]
unused_columns.extend(f"essay{idx}" for idx in range(10))
df.drop(columns=unused_columns, inplace=True)

The `diet` variable combines 6 possible diet types (anything, vegetarian, vegan, kosher, halal, and other) with 2 modifiers (mostly/strictly), which would create unnecessary dummy variables that add no value to the model. So let's boil it down to the diet type without the nuance.

In [None]:
# Strip the "mostly " / "strictly " modifiers so only the diet type remains.
# The result is assigned back to the column rather than calling an in-place
# method on the `df['diet']` selection: that chained in-place form triggers a
# warning in pandas 2.2 and, under Copy-on-Write (the default from pandas 3.0),
# silently leaves `df` unchanged.
df['diet'] = df['diet'].replace(["mostly ", "strictly "], "", regex=True)
df['diet']

The same goes for the `speaks` variable. There are 3 proficiency modifiers (fluently, okay, poorly) for a multitude of languages. Combining every language with every modifier would generate thousands of unnecessary columns. Let's just keep the languages.

In [None]:
# Remove the proficiency modifiers, keeping only the language names.
# - Raw strings are used for the patterns: "\(" in a normal string literal is an
#   invalid escape sequence (SyntaxWarning on Python 3.12+).
# - The result is assigned back to the column rather than calling an in-place
#   method on the `df['speaks']` selection, which does not update `df` under
#   pandas Copy-on-Write (the default from pandas 3.0).
df['speaks'] = df['speaks'].replace(
    [r" \(fluently\)", r" \(okay\)", r" \(poorly\)"], "", regex=True
)
df["speaks"]

But filtering out the language modifiers isn't enough. There's only one column for all of a user's spoken languages, and encoding each distinct combination as-is might generate around a dozen thousand dummy variables. Therefore, each `speaks` value should be split into its individual languages, and each language should get a dummy variable of its own.

In [None]:
# Positional split of the language list (one column per position), kept for inspection.
lang_table = df['speaks'].str.split(pat=", ", expand=True)

# One indicator column per language, regardless of the position in which the
# user listed it. Calling pd.get_dummies on the positional `lang_table` with a
# single "speaks" prefix would encode every position separately and emit
# several columns sharing the same `speaks_<language>` name.
# Rows with a missing `speaks` value get 0 in every language column.
speaks_table = df['speaks'].str.get_dummies(sep=", ").add_prefix("speaks_")
assert speaks_table.columns.is_unique, "each language must map to exactly one column"

# Swap the raw `speaks` column for its indicator columns.
df_no_speaks = df.drop(["speaks"], axis=1)
df_languages = pd.concat([df_no_speaks, speaks_table], axis=1)

In [None]:
def create_dummy(df):
    """
    Creates dummy variables for all the categorical (object-dtype) columns in a DataFrame
    and concatenates them with the remaining columns.

    Each object-dtype column is expanded into one indicator column per distinct value,
    named ``<column>_<value>``, plus a ``<column>_nan`` indicator for missing values
    (``dummy_na=True``). Columns of any other dtype are passed through unchanged and
    are placed first in the result.

    Input: pandas DataFrame
    Output: pandas DataFrame (a new frame; the input is not modified)
    """
    cat_cols = df.select_dtypes(include="object").columns
    num_cols = df.select_dtypes(exclude="object").columns

    # pd.get_dummies raises "No objects to concatenate" when asked to encode zero
    # columns, so a frame without object-dtype columns is returned as a copy.
    if len(cat_cols) == 0:
        return df[num_cols].copy()

    dummy_df = pd.get_dummies(df[cat_cols],
                              prefix=cat_cols,
                              prefix_sep="_",
                              dummy_na=True,
                              columns=cat_cols)

    # Non-object columns first, then the generated indicator columns.
    df_new = pd.concat([df[num_cols], dummy_df], axis=1)
    return df_new

In [None]:
# Expand the object-dtype columns of the language-encoded frame into dummy
# variables, then preview the first rows.
df_dummies = df_languages.pipe(create_dummy)
df_dummies.head()