In [None]:
import pandas as pd
import numpy as np
import seaborn as sns

In [None]:
# Load the profiles table; the path is relative to the notebook's location.
df = pd.read_csv("../data/profiles.csv")

## Remove unneeded variables

First, let's drop all the columns we won't use.

In [None]:
# Columns we won't use: some profile metadata plus the ten free-text essay
# fields essay0 through essay9.
unused_columns = ['last_online', 'location', 'sign', 'speaks']
unused_columns += [f"essay{n}" for n in range(10)]
df.drop(columns=unused_columns, inplace=True)

## Refactor "diet" variable

The "diet" variable combines 6 possible diet types (anything, vegetarian, vegan, kosher, halal, and other) with 2 optional modifiers (mostly/strictly). Encoding every combination would create many dummy variables that add little value to the model, so let's boil it down to a single binary flag (vegetarian/vegan or not) and drop the nuance.

In [None]:
# Keep only the profiles that state a diet, then collapse the answer into a
# single 0/1 flag: 1 when the text mentions "vegetarian" or "vegan".
df.dropna(subset=['diet'], inplace=True)
diet_text = df['diet'].str
mentions_veg = diet_text.contains("vegetarian", regex=False) | diet_text.contains("vegan", regex=False)
df['vegetarian'] = mentions_veg.astype("int64")
df.drop(columns="diet", inplace=True)

In [None]:
def parse_to_dummies(df, column):
    """
    Transforms a column with comma-separated values into new 0-or-1 columns.

    Every distinct entry found in ``column`` (entries are separated by ", ")
    becomes an indicator column named ``"{column}_{entry}"`` holding 1 for the
    rows that list exactly that entry and 0 for all other rows. The original
    column is dropped afterwards.

    Input: pandas DataFrame (modified in place) and the name of a column
        holding strings such as "a, b, c"
    Output: the same pandas DataFrame, with the indicator columns added
    """
    # One row per record, one column per position in the comma-separated list;
    # rows with fewer entries are padded with missing values.
    expanded_values = df[column].str.split(pat=", ", expand=True)
    # see https://stackoverflow.com/questions/26977076/pandas-unique-values-multiple-columns
    unique_labels = pd.unique(expanded_values.values.ravel('K'))
    for label in unique_labels:
        # The padding (None/NaN) is not a real category. Skipping it means no
        # placeholder column has to be created and dropped again, which used
        # to fail when no row needed padding.
        if not isinstance(label, str):
            continue
        # Compare whole entries instead of doing a substring test on the raw
        # text, so a label that is contained in a longer label is not counted.
        is_listed = expanded_values.eq(label).any(axis=1)
        df[f"{column}_{label}"] = is_listed.astype("int64")
    df.drop(column, axis=1, inplace=True)
    return df

## Convert "drinks" values into a numerical scale

In [None]:
# Ordinal encoding of the drinking-frequency answers, from 0 ("not at all")
# up to 5 ("desperately").
drinks_scale = {"not at all": 0,
                "rarely": 1,
                "socially": 2,
                "often": 3,
                "very often": 4,
                "desperately": 5}
# Assign the result back instead of calling replace(..., inplace=True) on
# df['drinks']: that call works on a temporary Series, which triggers a
# chained-assignment warning and leaves df unchanged under pandas
# Copy-on-Write. infer_objects() turns the recoded column into a numeric
# dtype on pandas versions where replace() no longer downcasts by itself.
df['drinks'] = df['drinks'].replace(drinks_scale).infer_objects()
df['drinks']

## Extract offspring

In [None]:
# Answers describing someone who currently has no children -> 0.
no_kids_answers = ['doesn&rsquo;t have kids',
                   'doesn&rsquo;t have kids, and doesn&rsquo;t want any',
                   'doesn&rsquo;t have kids, but might want them',
                   'doesn&rsquo;t have kids, but wants them',
                   'might want kids',
                   'doesn&rsquo;t want kids',
                   'wants kids']

# Answers describing someone who has at least one child -> 1.
has_kids_answers = ['has a kid',
                    'has a kid, and might want more',
                    'has a kid, and wants more',
                    'has a kid, but doesn&rsquo;t want more',
                    'has kids',
                    'has kids, and might want more',
                    'has kids, and wants more',
                    'has kids, but doesn&rsquo;t want more']

offspring_codes = {**dict.fromkeys(no_kids_answers, 0),
                   **dict.fromkeys(has_kids_answers, 1)}

# Assign the result back instead of calling replace(..., inplace=True) on
# df['offspring']: that call works on a temporary Series and leaves df
# unchanged under pandas Copy-on-Write. infer_objects() makes the recoded
# column numeric (missing answers stay NaN) on pandas versions where
# replace() no longer downcasts by itself.
df['offspring'] = df['offspring'].replace(offspring_codes).infer_objects()

## Extract ethnicities

In [None]:
# Discard the profiles without an ethnicity answer, then expand the
# comma-separated answers into one 0/1 column per ethnicity.
df.dropna(axis=0, subset=['ethnicity'], inplace=True)
df = parse_to_dummies(df, column="ethnicity") 

## Cleaning up religion

Let's remove the religion variable modifiers.

In [None]:
# Suffixes that only express how seriously the religion is taken; stripping
# them leaves the bare religion name.
religion_modifiers = [" and laughing about it",
                      " and somewhat serious about it",
                      " and very serious about it",
                      " but not too serious about it"]
# Assign the result back instead of calling replace(..., inplace=True) on
# df['religion']: that call works on a temporary Series and leaves df
# unchanged under pandas Copy-on-Write.
df['religion'] = df['religion'].replace(religion_modifiers, "", regex=True)

In [None]:
# Inspect the religion column to check the result of the modifier removal.
df['religion']

## Extract pet information

In [None]:
# Work on a filled copy of the column rather than calling
# fillna(..., inplace=True) on df["pets"]: that call works on a temporary
# Series and leaves df unchanged under pandas Copy-on-Write, after which the
# missing values would break the text matching below.
pets = df["pets"].fillna("Unknown")
# The leading \b keeps "likes dogs" from matching inside "dislikes dogs"
# (and likewise for cats), which a plain substring test would count as a hit.
df['dog_person'] = pets.str.contains(r"\b(?:has|likes) dogs\b", regex=True).astype("int64")
df['cat_person'] = pets.str.contains(r"\b(?:has|likes) cats\b", regex=True).astype("int64")
df.drop('pets', axis=1, inplace=True)

## Create dummy variables from categorical columns

In [None]:
def create_dummy(df):
    """
    Creates dummy variables for all the categorical variables in a DataFrame and concatenates them
    with the original numerical columns.

    Categorical variables are the columns with ``object`` dtype. Each of them is replaced by one
    indicator column per distinct value, named ``"{column}_{value}"``, plus a ``"{column}_nan"``
    indicator for missing values. All other columns are kept as they are and come first in the
    result. A frame without ``object`` columns is returned as an unchanged copy.

    Input: pandas DataFrame
    Output: pandas DataFrame (a new object; the input is not modified)
    """
    cat_cols = df.select_dtypes(include="object").columns
    num_cols = df.select_dtypes(exclude="object").columns
    if len(cat_cols) == 0:
        # Nothing to encode (e.g. the function is applied again to its own
        # output); skip get_dummies instead of handing it an empty frame.
        return df.copy()
    dummy_df = pd.get_dummies(df[cat_cols],
                              prefix=cat_cols,
                              prefix_sep="_",
                              dummy_na=True,
                              columns=cat_cols)

    df_new = pd.concat([df[num_cols], dummy_df], axis=1)
    return df_new

In [None]:
# Replace every remaining object-dtype column with its dummy columns.
df = create_dummy(df)

In [None]:
# Preview the frame after dummy encoding.
df

In [None]:
# Pairwise correlations between the columns of the encoded frame.
corr_table = df.corr()

In [None]:
# Mean of the offspring column; with the 0/1 recoding above this is the share
# of profiles (among those with an answer) that have kids.
df['offspring'].mean()

In [None]:
# List every column name of the encoded frame, one per line.
print(*df.columns, sep="\n")

In [None]:
# Column-wise means over the profiles whose 'drugs_often' indicator equals 1.
df.loc[df['drugs_often'].eq(1)].mean()

In [None]:
# Number of profiles for each value of the vegetarian flag.
df['vegetarian'].value_counts()

In [None]:
# Final look at the fully encoded frame.
df