In [17]:
import pandas as pd
import numpy as np

In [48]:
# Load the profiles dataset; the path is resolved relative to the current working directory.
df = pd.read_csv("../data/profiles.csv")

The "diet" variable has 6 possible diet types (anything, vegetarian, vegan, kosher, halal, and other) and 2 modifiers (mostly/strictly). Encoding every type/modifier combination would create unnecessary dummy variables that won't add value to the model, so let's boil it down to the diet type without the nuance.

In [49]:
# Strip the "mostly "/"strictly " qualifiers so only the base diet type remains.
# The result is assigned back rather than using replace(inplace=True) on the
# selected column: with pandas Copy-on-Write that chained in-place call acts on
# a temporary object and leaves `df` unchanged.
df['diet'] = df['diet'].replace(["mostly ", "strictly "], "", regex=True)
df['diet']

0          anything
1             other
2          anything
3        vegetarian
4               NaN
            ...    
59941           NaN
59942      anything
59943      anything
59944      anything
59945           NaN
Name: diet, Length: 59946, dtype: object

The same goes for the `speaks` variable. There are 3 proficiency modifiers (fluently, okay, poorly) for a multitude of languages. Every combination of language and modifier would generate thousands of unnecessary columns. Let's just keep the languages.

In [53]:
# Drop the proficiency qualifiers so only the language names remain.
# Raw strings keep the escaped parentheses as valid regex without relying on
# invalid string escapes ("\(" in a normal string literal raises a warning on
# recent Python versions).
# The result is assigned back rather than using replace(inplace=True) on the
# selected column: with pandas Copy-on-Write that chained in-place call acts on
# a temporary object and leaves `df` unchanged.
df['speaks'] = df['speaks'].replace(
    [r" \(fluently\)", r" \(okay\)", r" \(poorly\)"], "", regex=True
)
df["speaks"]

0                                            english
1                           english, spanish, french
2                               english, french, c++
3                                    english, german
4                                            english
                            ...                     
59941                                        english
59942                                        english
59943                                        english
59944    english, spanish, chinese, korean, japanese
59945                                        english
Name: speaks, Length: 59946, dtype: object

In [21]:
def create_dummy(df):
    """
    One-hot encode every object-dtype column of a DataFrame.

    Object columns are expanded into indicator columns named
    "<column>_<value>", plus a "<column>_nan" indicator for missing values.
    All remaining columns are carried over unchanged and placed before the
    indicator columns.

    Input: pandas DataFrame
    Output: pandas DataFrame
    """
    object_names = df.select_dtypes(include="object").columns
    other_names = df.select_dtypes(exclude="object").columns

    passthrough = df[other_names]
    indicators = pd.get_dummies(
        df[object_names],
        columns=object_names,
        prefix=object_names,
        prefix_sep="_",
        dummy_na=True,
    )
    return pd.concat([passthrough, indicators], axis=1)

In [22]:
# The free-text essay columns (essay0 .. essay9) are removed before encoding.
essay_columns = [f"essay{n}" for n in range(10)]
df_no_essays = df.drop(columns=essay_columns)

# One-hot encode the remaining object columns and preview the result.
df_dummies = create_dummy(df_no_essays)
df_dummies.head()

Unnamed: 0,age,height,income,body_type_a little extra,body_type_athletic,body_type_average,body_type_curvy,body_type_fit,body_type_full figured,body_type_jacked,...,"speaks_french (fluently), german (fluently), spanish (fluently), english","speaks_french (okay), english","speaks_portuguese (fluently), italian (okay), english (fluently), spanish (fluently)",speaks_nan,status_available,status_married,status_seeing someone,status_single,status_unknown,status_nan
0,22,75.0,-1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,35,70.0,80000,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,38,68.0,-1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,23,71.0,20000,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,29,66.0,-1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [51]:
# Inspect the distinct values present in the "speaks" column (missing values show up as nan).
set(df['speaks'])

{nan,
 'english (fluently), spanish (fluently), italian (okay), chinese (poorly), french (poorly)',
 'english (fluently), german (poorly), spanish (okay), italian (poorly)',
 'english (fluently), spanish (fluently), c++ (fluently), japanese (poorly)',
 'english, russian (poorly), japanese (poorly), french (okay)',
 'english, hebrew (poorly), japanese (okay), c++ (fluently)',
 'english (fluently), spanish (fluently), french (poorly), c++',
 'english (fluently), spanish (okay), korean (okay), french (poorly)',
 'english (fluently), bengali (fluently), german',
 'english (fluently), french (fluently), vietnamese (fluently)',
 'english, chinese (okay), german (okay)',
 'english (fluently), polish (fluently), spanish (poorly)',
 'english (fluently), croatian (fluently)',
 'english, italian (okay), spanish (fluently)',
 'english, russian (fluently), hebrew (poorly)',
 'english (fluently), italian (okay), french (okay)',
 'english (okay), latin (poorly), spanish (poorly)',
 'english, chinese,