In [13]:
import pandas as pd
import numpy as np

In [14]:
# Load the raw profiles table. The path is relative, so it resolves against
# the directory the notebook kernel is running in.
df = pd.read_csv("../data/profiles.csv")

## Remove unneeded variables

First, let's drop all the columns we won't use.

In [15]:
# Drop the profile metadata and the ten free-text essay columns
# (essay0 ... essay9) in a single call.
df.drop(
    columns=['last_online', 'location', 'sign',
             *(f"essay{n}" for n in range(10))],
    inplace=True,
)

## Refactor "diet" variable

The "diet" variable has 6 possible diet types (anything, vegetarian, vegan, kosher, halal, and other) and 2 modifiers (mostly/strictly), which would create unnecessary dummy variables that add no value to the model. So let's boil it down to the diet type without the nuance.

In [16]:
# Strip the "mostly " / "strictly " qualifiers so only the diet type remains.
# The result is assigned back to the column: calling an inplace method on
# df['diet'] is chained assignment, which pandas warns about and which leaves
# df unchanged once Copy-on-Write is active.
df['diet'] = df['diet'].replace(["mostly ", "strictly "], "", regex=True)
df['diet']

0          anything
1             other
2          anything
3        vegetarian
4               NaN
            ...    
59941           NaN
59942      anything
59943      anything
59944      anything
59945           NaN
Name: diet, Length: 59946, dtype: object

## Parse spoken languages

The same goes for the `speaks` variable. There are 3 proficiency modifiers (fluently, okay, poorly) for a multitude of languages. Every combination of language and modifier would generate thousands of unnecessary columns. Let's just keep the languages.

In [17]:
# Remove the proficiency qualifiers, keeping only the language names.
# Raw strings keep the regex escapes (\( and \)) from being read as invalid
# Python string escapes. The result is assigned back because an inplace call
# on df['speaks'] is chained assignment and does not reliably update df.
df['speaks'] = df['speaks'].replace(
    [r" \(fluently\)", r" \(okay\)", r" \(poorly\)"], "", regex=True
)
df["speaks"]

0                                            english
1                           english, spanish, french
2                               english, french, c++
3                                    english, german
4                                            english
                            ...                     
59941                                        english
59942                                        english
59943                                        english
59944    english, spanish, chinese, korean, japanese
59945                                        english
Name: speaks, Length: 59946, dtype: object

But filtering out the language modifiers isn't enough. There's only one column for all of a user's spoken languages, and its distinct combinations could generate tens of thousands of dummy variables. Therefore, each `speaks` value should be split into its individual languages, and each language should get a dummy variable of its own.

In [18]:
def parse_to_dummies(df, column):
    """
    Transforms a column with comma-separated values into new 0-or-1 columns.

    The values of ``df[column]`` are split on ", " and every distinct label
    becomes a ``"{column}_{label}"`` indicator column. Rows whose value is
    missing are flagged in ``"{column}_nan"``; that column is only created
    when at least one such row exists.

    Input: pandas DataFrame and the name of the column to expand
    Output: new pandas DataFrame without ``column`` and with the indicator
            columns appended (the input DataFrame is not modified)
    """
    expanded_values = df[column].str.split(pat=", ", expand=True)
    missing_rows = df[column].isna()
    has_missing = bool(missing_rows.any())

    # see https://stackoverflow.com/questions/26977076/pandas-unique-values-multiple-columns
    # Column-major traversal: labels are ordered by first appearance among the
    # first tokens, then among the second tokens, and so on.
    unique_labels = pd.unique(expanded_values.to_numpy().ravel(order="F"))

    nan_column = f"{column}_nan"
    indicators = {}
    for label in unique_labels:
        if pd.isna(label):
            # A None/NaN cell is either padding for a row with fewer tokens
            # (no indicator wanted) or a missing source value.
            if has_missing and nan_column not in indicators:
                indicators[nan_column] = missing_rows.to_numpy().astype("int")
            continue
        # One vectorized comparison per label instead of a Python loop over
        # every row and cell.
        label_present = expanded_values.eq(label).any(axis=1)
        indicators[f"{column}_{label}"] = label_present.to_numpy().astype("int")

    # Reuse the caller's index so the concat below aligns row by row even
    # when df does not carry a default RangeIndex.
    dummy_table = pd.DataFrame(indicators, index=df.index)
    df_drop_column = df.drop(column, axis=1)
    return pd.concat([df_drop_column, dummy_table], axis=1)

# Replace the comma-separated "speaks" column with one 0/1 column per language.
df = parse_to_dummies(df, column="speaks")

## Convert "drinks" values into a numerical scale

In [19]:
# Map the drinking-frequency labels onto an ordinal 0-5 scale.
# The result is assigned back because an inplace call on df['drinks'] is
# chained assignment and does not reliably update df. infer_objects() then
# turns the replaced object column into a numeric dtype explicitly, rather
# than relying on replace() to downcast it.
df['drinks'] = df['drinks'].replace({"not at all": 0,
                                     "rarely": 1,
                                     "socially": 2,
                                     "often": 3,
                                     "very often": 4,
                                     "desperately": 5}).infer_objects()
df['drinks']

0        2.0
1        3.0
2        2.0
3        2.0
4        2.0
        ... 
59941    2.0
59942    3.0
59943    0.0
59944    2.0
59945    2.0
Name: drinks, Length: 59946, dtype: float64

## Extract offspring

In [20]:
# Collapse the offspring answers into a binary flag:
# 0 = has no children (regardless of wanting them), 1 = has at least one.
# The chain is assigned back because inplace calls on df['offspring'] are
# chained assignment and do not reliably update df; infer_objects() makes
# the resulting 0/1/NaN column numeric without relying on replace() to
# downcast it.
df['offspring'] = (
    df['offspring']
    .replace(['doesn&rsquo;t have kids',
              'doesn&rsquo;t have kids, and doesn&rsquo;t want any',
              'doesn&rsquo;t have kids, but might want them',
              'doesn&rsquo;t have kids, but wants them',
              'might want kids',
              'doesn&rsquo;t want kids',
              'wants kids'], 0)
    .replace(['has a kid',
              'has a kid, and might want more',
              'has a kid, and wants more',
              'has a kid, but doesn&rsquo;t want more',
              'has kids',
              'has kids, and might want more',
              'has kids, and wants more',
              'has kids, but doesn&rsquo;t want more'], 1)
    .infer_objects()
)

## Extract ethnicities

In [21]:
# Replace the comma-separated "ethnicity" column with one 0/1 column per ethnicity.
df = parse_to_dummies(df, column="ethnicity")

## Cleaning up religion

Let's remove the religion variable modifiers.

In [None]:
# Strip the seriousness qualifiers so only the religion itself remains.
# regex=True is required: without it replace() only matches whole cell
# values, so these suffixes would never be removed. The result is assigned
# back because an inplace call on df['religion'] is chained assignment and
# does not reliably update df.
# NOTE(review): the "somewhat serious" qualifier appears to be preceded by
# " and" in the data; the optional group removes it in either spelling.
# Confirm against df['religion'].unique().
df['religion'] = df['religion'].replace([r" and laughing about it",
                                         r"(?: and)? somewhat serious about it",
                                         r" and very serious about it",
                                         r" but not too serious about it"], "", regex=True)

## Extract pet information

In [45]:
# Missing answers become the explicit label "Unknown". The result is assigned
# back because an inplace fillna on df["pets"] is chained assignment and does
# not reliably update df.
df["pets"] = df["pets"].fillna("Unknown")
# Flag owners and fans of each animal. The leading \b stops "likes dogs" from
# matching inside "dislikes dogs" (and likewise for cats), which a plain
# substring test would count as a positive.
df['dog_person'] = df['pets'].str.contains(r"\b(?:has|likes) dogs\b", regex=True).astype(int)
df['cat_person'] = df['pets'].str.contains(r"\b(?:has|likes) cats\b", regex=True).astype(int)

## Create dummy variables from categorical columns

In [10]:
def create_dummy(df):
    """
    Creates dummy variables for all the categorical (object-dtype) columns in a
    DataFrame and concatenates them with the remaining, non-object columns.

    Every categorical column also gets a ``<column>_nan`` indicator for its
    missing values. Dummy columns are stored as 0/1 ``uint8`` values.

    Input: pandas DataFrame
    Output: new pandas DataFrame (the input is not modified)
    """
    cat_cols = df.select_dtypes(include="object").columns
    num_cols = df.select_dtypes(exclude="object").columns

    if len(cat_cols) == 0:
        # Nothing to encode (for example when the cell is re-run on an already
        # encoded frame): return the data as-is instead of asking get_dummies
        # to encode an empty column selection.
        return df[num_cols].copy()

    dummy_df = pd.get_dummies(df[cat_cols],
                              prefix=cat_cols,
                              prefix_sep="_",
                              dummy_na=True,
                              columns=cat_cols,
                              # Pin the dtype so the dummies are 0/1 integers
                              # whatever the installed pandas uses as default.
                              dtype="uint8")

    df_new = pd.concat([df[num_cols], dummy_df], axis=1)
    return df_new

In [11]:
# One-hot encode every remaining object-dtype column; the other columns are
# carried over unchanged.
df = create_dummy(df)
df.head()

Unnamed: 0,age,drinks,height,income,offspring,speaks_english,speaks_nan,speaks_afrikaans,speaks_french,speaks_portuguese,...,smokes_trying to quit,smokes_when drinking,smokes_yes,smokes_nan,status_available,status_married,status_seeing someone,status_single,status_unknown,status_nan
0,22,2.0,75.0,-1,0.0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,35,3.0,70.0,80000,0.0,1,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
2,38,2.0,68.0,-1,,1,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
3,23,2.0,71.0,20000,0.0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,29,2.0,66.0,-1,,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [12]:
# List every resulting feature name, one per line, for a quick sanity check.
for column_name in df.columns:
    print(column_name)

age
drinks
height
income
offspring
speaks_english
speaks_nan
speaks_afrikaans
speaks_french
speaks_portuguese
speaks_spanish
speaks_german
speaks_chinese
speaks_sign language
speaks_c++
speaks_tagalog
speaks_other
speaks_russian
speaks_dutch
speaks_indonesian
speaks_swedish
speaks_belarusan
speaks_japanese
speaks_farsi
speaks_italian
speaks_hindi
speaks_polish
speaks_korean
speaks_czech
speaks_croatian
speaks_vietnamese
speaks_esperanto
speaks_latin
speaks_greek
speaks_norwegian
speaks_hebrew
speaks_arabic
speaks_tibetan
speaks_georgian
speaks_thai
speaks_swahili
speaks_khmer
speaks_turkish
speaks_tamil
speaks_lisp
speaks_serbian
speaks_sanskrit
speaks_bengali
speaks_catalan
speaks_hungarian
speaks_irish
speaks_urdu
speaks_romanian
speaks_finnish
speaks_bulgarian
speaks_ancient greek
speaks_yiddish
speaks_hawaiian
speaks_lithuanian
speaks_cebuano
speaks_persian
speaks_maori
speaks_danish
speaks_gujarati
speaks_albanian
speaks_frisian
speaks_ilongo
speaks_icelandic
speaks_slovenian
spea