In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
# Load the raw profiles table; the cells below transform this frame step by step.
df = pd.read_csv(filepath_or_buffer="../data/profiles.csv")

## Remove unneeded variables

First, let's drop all the columns we won't use.

In [3]:
# Columns that are not used as features: three profile fields plus the ten
# free-text essay columns (essay0 ... essay9).
unused_columns = ['last_online', 'location', 'sign']
unused_columns += [f"essay{idx}" for idx in range(10)]
df.drop(columns=unused_columns, inplace=True)

## Refactor "diet" variable

The "diet" variable combines 6 diet types (anything, vegetarian, vegan, kosher, halal, and other) with 2 optional modifiers (mostly/strictly). Encoding every combination would create many dummy variables that add little value to the model, so let's drop the nuance and keep a single binary flag: whether the profile mentions a vegetarian or vegan diet.

In [4]:
# Rows without a diet answer are discarded before the flag is derived.
df.dropna(subset=['diet'], inplace=True)
# 1 when the diet text mentions "vegetarian" or "vegan", 0 otherwise.
mentions_veg = df['diet'].str.contains("vegetarian|vegan", regex=True)
df['vegetarian'] = mentions_veg.astype("int64")
df.drop(columns="diet", inplace=True)

## Parse spoken languages

The same goes for the `speaks` variable. Each of its many languages can carry one of 3 fluency modifiers (fluently, okay, poorly), and encoding every language/modifier combination would generate thousands of unnecessary columns. Let's keep just the languages.

In [5]:
# Strip the fluency qualifiers so that only the language names remain.
# - The result is assigned back to the column: an in-place method called on
#   `df['speaks']` acts on a selected Series and is not guaranteed to update
#   `df` itself (it does not under pandas Copy-on-Write).
# - The pattern is a raw string, so `\(` is a regex escape for a literal
#   parenthesis instead of an invalid Python string escape.
# Missing values are left untouched by `.str.replace`.
df['speaks'] = df['speaks'].str.replace(r" \((?:fluently|okay|poorly)\)", "", regex=True)
df["speaks"]

0                                            english
1                           english, spanish, french
2                               english, french, c++
3                                    english, german
5                                   english, chinese
                            ...                     
59936                               english, chinese
59937                               english, spanish
59942                                        english
59943                                        english
59944    english, spanish, chinese, korean, japanese
Name: speaks, Length: 35551, dtype: object

But filtering out the language modifiers isn't enough. All of a user's spoken languages are stored in a single column, so encoding each distinct combination of languages could still generate thousands of dummy variables. Therefore, each language listed in `speaks` should be parsed out and given a dummy variable of its own.

In [6]:
def parse_to_dummies(df, column):
    """
    Transforms a column with comma-separated values into new 0-or-1 columns.

    Each distinct item found in `column` (items are separated by ", ") gets
    its own `"{column}_{item}"` indicator column, set to 1 for the rows whose
    list contains exactly that item and 0 otherwise. Rows with a missing
    value in `column` get 0 in every indicator.

    Input:
        df: pandas DataFrame. It is modified in place: the indicator columns
            are added and `column` is removed.
        column: name of the column holding the comma-separated strings.
    Output: the same DataFrame object, returned for convenience.
    """
    expanded_values = df[column].str.split(pat=", ", expand=True)
    # see https://stackoverflow.com/questions/26977076/pandas-unique-values-multiple-columns
    unique_labels = pd.unique(expanded_values.values.ravel('K'))
    for label in unique_labels:
        # `expand=True` pads shorter rows with None/NaN; padding is not a label.
        if pd.isna(label):
            continue
        # Compare whole items, not substrings of the raw text, so that e.g.
        # "greek" does not also flag rows that only list "ancient greek".
        has_label = (expanded_values == label).any(axis=1)
        # Assign positionally; the rows are in the same order as `df`.
        df[f"{column}_{label}"] = has_label.to_numpy().astype("int64")
    df.drop(column, axis=1, inplace=True)
    return df

# Profiles with no language information are discarded, then every language
# listed in `speaks` becomes its own indicator column.
df.dropna(subset=["speaks"], inplace=True)
df = parse_to_dummies(df, "speaks")

## Convert "drinks" values into a numerical scale

In [7]:
# Ordinal encoding of the drinking-frequency answers (0 = "not at all",
# 5 = "desperately").
drink_scale = {"not at all": 0,
               "rarely": 1,
               "socially": 2,
               "often": 3,
               "very often": 4,
               "desperately": 5}
# Assign the result back to the column: calling `replace(..., inplace=True)`
# on `df['drinks']` acts on a selected Series and is not guaranteed to update
# `df` (it does not under pandas Copy-on-Write).
# `pd.to_numeric` makes the column numeric regardless of how the installed
# pandas version infers the dtype after `replace`; it raises ValueError if an
# answer outside `drink_scale` is present instead of leaving text in the column.
df['drinks'] = pd.to_numeric(df['drinks'].replace(drink_scale))
df['drinks']

0        2.0
1        3.0
2        2.0
3        2.0
5        2.0
        ... 
59936    2.0
59937    2.0
59942    3.0
59943    0.0
59944    2.0
Name: drinks, Length: 35530, dtype: float64

## Extract offspring

In [8]:
# Collapse the detailed answers into a binary flag: 1 = has at least one
# child, 0 = every other answer (including the ones that only state whether
# the person wants kids). Missing answers stay missing.
# The "&rsquo;" sequences are matched literally, exactly as written here.
childless_answers = ['doesn&rsquo;t have kids',
                     'doesn&rsquo;t have kids, and doesn&rsquo;t want any',
                     'doesn&rsquo;t have kids, but might want them',
                     'doesn&rsquo;t have kids, but wants them',
                     'might want kids',
                     'doesn&rsquo;t want kids',
                     'wants kids']

parent_answers = ['has a kid',
                  'has a kid, and might want more',
                  'has a kid, and wants more',
                  'has a kid, but doesn&rsquo;t want more',
                  'has kids',
                  'has kids, and might want more',
                  'has kids, and wants more',
                  'has kids, but doesn&rsquo;t want more']

# Assign the result back to the column: `replace(..., inplace=True)` on
# `df['offspring']` acts on a selected Series and is not guaranteed to update
# `df` (it does not under pandas Copy-on-Write). `pd.to_numeric` makes the
# flag numeric and raises ValueError if an unlisted answer is still present.
df['offspring'] = pd.to_numeric(
    df['offspring'].replace(childless_answers, 0).replace(parent_answers, 1)
)

## Extract ethnicities

In [9]:
# Profiles with no ethnicity are discarded, then every ethnicity listed in the
# comma-separated field becomes its own indicator column.
df.dropna(subset=['ethnicity'], inplace=True)
df = parse_to_dummies(df, "ethnicity")

## Cleaning up religion

Let's remove the religion variable modifiers.

In [10]:
# Remove the "how serious" suffixes so only the religion name remains.
# The result is assigned back to the column: calling
# `replace(..., inplace=True)` on `df['religion']` acts on a selected Series
# and is not guaranteed to update `df` (it does not under pandas Copy-on-Write).
religion_modifiers = [" and laughing about it",
                      " and somewhat serious about it",
                      " and very serious about it",
                      " but not too serious about it"]
df['religion'] = df['religion'].replace(religion_modifiers, "", regex=True)

In [11]:
# Inspect the religion column after the modifiers have been removed.
df['religion']

0         agnosticism
1         agnosticism
3                 NaN
5             atheism
6                 NaN
             ...     
59936         atheism
59937         judaism
59942     agnosticism
59943    christianity
59944     agnosticism
Name: religion, Length: 32805, dtype: object

## Extract pet information

In [12]:
# Derive two 0/1 flags from the free-form pets answer, then drop the answer.
# Missing answers are treated as "Unknown", which yields 0 for both flags.
# A local Series is used instead of `df["pets"].fillna(..., inplace=True)`,
# because an in-place call on a selected column is not guaranteed to update
# `df` (it does not under pandas Copy-on-Write).
pets_answer = df["pets"].fillna("Unknown")
# `\b` anchors the verb at a word boundary, so "dislikes dogs" is NOT counted
# as "likes dogs" (a plain substring test would match it).
df['dog_person'] = pets_answer.str.contains(r"\b(?:has|likes) dogs\b", regex=True).astype("int64")
df['cat_person'] = pets_answer.str.contains(r"\b(?:has|likes) cats\b", regex=True).astype("int64")
df.drop(columns='pets', inplace=True)

## Create dummy variables from categorical columns

In [13]:
def create_dummy(df):
    """
    One-hot encode every object-dtype column of a DataFrame.

    The non-object columns are kept as they are and placed first; each
    object-dtype column is replaced by "<column>_<value>" indicator columns,
    plus a "<column>_nan" indicator for missing values.

    Input: pandas DataFrame
    Output: new pandas DataFrame (the input is not modified)
    """
    categorical = df.select_dtypes(include="object")
    numerical = df.select_dtypes(exclude="object")
    categorical_names = categorical.columns
    encoded = pd.get_dummies(
        categorical,
        columns=categorical_names,
        prefix=categorical_names,
        prefix_sep="_",
        dummy_na=True,
    )
    return pd.concat([numerical, encoded], axis=1)

In [14]:
# Replace the remaining object-dtype columns with indicator columns.
df = create_dummy(df)

In [15]:
# Inspect the fully encoded frame.
df

Unnamed: 0,age,drinks,height,income,offspring,vegetarian,speaks_english,speaks_spanish,speaks_french,speaks_german,...,smokes_trying to quit,smokes_when drinking,smokes_yes,smokes_nan,status_available,status_married,status_seeing someone,status_single,status_unknown,status_nan
0,22,2.0,75.0,-1,0.0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,35,3.0,70.0,80000,0.0,0,1,1,1,0,...,0,0,0,0,0,0,0,1,0,0
3,23,2.0,71.0,20000,0.0,1,1,0,0,1,...,0,0,0,0,0,0,0,1,0,0
5,29,2.0,67.0,-1,0.0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
6,32,2.0,65.0,-1,,0,1,0,0,0,...,0,0,0,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59936,25,2.0,61.0,-1,0.0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
59937,32,2.0,69.0,-1,,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
59942,24,3.0,72.0,-1,0.0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
59943,42,0.0,71.0,100000,0.0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [16]:
# Pairwise correlations between all columns of the encoded frame.
corr_table = df.corr()

In [17]:
# Mean of the 0/1 offspring flag, i.e. the share of profiles coded as having
# kids; missing values are skipped by `mean`.
df['offspring'].mean()

0.16743174679874365

In [18]:
# List every column of the encoded frame, one name per line.
for column_name in df.columns:
    print(column_name)

age
drinks
height
income
offspring
vegetarian
speaks_english
speaks_spanish
speaks_french
speaks_german
speaks_chinese
speaks_sign language
speaks_tagalog
speaks_other
speaks_indonesian
speaks_swedish
speaks_belarusan
speaks_japanese
speaks_farsi
speaks_hindi
speaks_afrikaans
speaks_c++
speaks_russian
speaks_polish
speaks_croatian
speaks_italian
speaks_vietnamese
speaks_portuguese
speaks_latin
speaks_czech
speaks_greek
speaks_norwegian
speaks_hebrew
speaks_korean
speaks_esperanto
speaks_tibetan
speaks_thai
speaks_swahili
speaks_turkish
speaks_tamil
speaks_lisp
speaks_sanskrit
speaks_arabic
speaks_hungarian
speaks_urdu
speaks_romanian
speaks_finnish
speaks_bulgarian
speaks_yiddish
speaks_dutch
speaks_irish
speaks_persian
speaks_gujarati
speaks_khmer
speaks_hawaiian
speaks_danish
speaks_cebuano
speaks_ilongo
speaks_icelandic
speaks_serbian
speaks_ancient greek
speaks_albanian
speaks_catalan
speaks_occitan
speaks_mongolian
speaks_malay
speaks_rotuman
speaks_bengali
speaks_slovak
speaks_ge

In [19]:
# Column means restricted to the profiles whose `drugs_often` indicator is 1.
df.loc[df['drugs_often'].eq(1)].mean()

age                          25.717557
drinks                        2.533074
height                       69.423664
income                   102289.511450
offspring                     0.085714
                             ...      
status_married                0.003817
status_seeing someone         0.041985
status_single                 0.904580
status_unknown                0.000000
status_nan                    0.000000
Length: 195, dtype: float64

In [21]:
# Number of profiles per value of the 0/1 vegetarian flag.
df['vegetarian'].value_counts()

0    27747
1     5058
Name: vegetarian, dtype: int64

In [None]:
# Final look at the processed frame.
df