In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("../data/profiles.csv")

The "diet" variable has 6 possible cuisines (anything, vegetarian, vegan, kosher, halal, and other) and 2 changers (mostly/strictly), which might create unnecessary dummy variables that won't add value to the model. So let's boil it down to diet type without the nuance.

In [3]:
df['diet'].replace(["mostly ", "strictly "], "", regex=True, inplace=True)
df['diet']

0          anything
1             other
2          anything
3        vegetarian
4               NaN
            ...    
59941           NaN
59942      anything
59943      anything
59944      anything
59945           NaN
Name: diet, Length: 59946, dtype: object

The same goes for the `language` variable. There are 3 modifiers for a multitude of languages. The permutation between all possible values will generate thousands of unnecessary columns. Let's just keep the languages.

In [4]:
df['speaks'].replace([" \(fluently\)", " \(okay\)", " \(poorly\)"], "", regex=True, inplace=True)
df["speaks"]

0                                            english
1                           english, spanish, french
2                               english, french, c++
3                                    english, german
4                                            english
                            ...                     
59941                                        english
59942                                        english
59943                                        english
59944    english, spanish, chinese, korean, japanese
59945                                        english
Name: speaks, Length: 59946, dtype: object

But filtering out the language modifiers isn't enough. There's only one column for all the user's spoken languages, whose combination might generate a dozen thousands dummy variables. Therefore, each `speaks` value should be parsed and gain a dummy variable of its own.

In [14]:
lang_table = df['speaks'].str.split(pat=", ", expand=True)
speaks_table = pd.get_dummies(lang_table, prefix="speaks")
df_no_speaks = df.drop(["speaks"], axis=1)
df_languages = pd.concat([df_no_speaks, speaks_table], axis=1)

In [15]:
def create_dummy(df):
    """
    Creates dummy variables for all the categorial variables in a DataFrame and concatenates it
    with the original numerical columns.
    Input: pandas DataFrame
    Output: pandas DataFrame
    """
    cat_cols = df.select_dtypes(include="object").columns
    num_cols = df.select_dtypes(exclude="object").columns
    dummy_df = pd.get_dummies(df[cat_cols],
                              prefix=cat_cols,
                              prefix_sep="_",
                              dummy_na=True,
                              columns=cat_cols)
                              
    df_new = pd.concat([df[num_cols], dummy_df], axis=1)
    return df_new

In [16]:
df_no_essays = df_languages.drop([f"essay{n}" for n in np.arange(0,10)], axis=1)
df_dummies = create_dummy(df_no_essays)
df_dummies.head()

Unnamed: 0,age,height,income,speaks_afrikaans,speaks_afrikaans.1,speaks_afrikaans.2,speaks_afrikaans.3,speaks_afrikaans.4,speaks_english,speaks_english.1,...,smokes_trying to quit,smokes_when drinking,smokes_yes,smokes_nan,status_available,status_married,status_seeing someone,status_single,status_unknown,status_nan
0,22,75.0,-1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
1,35,70.0,80000,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
2,38,68.0,-1,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
3,23,71.0,20000,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
4,29,66.0,-1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
