# Import Libraries

In [None]:
# data manipulation
import pandas as pd
import numpy as np

# Load Dataset

In [None]:
# read the cleaned profiles CSV into the source dataframe
source_csv = "okcupid_profiles_cleaned.csv"
result = pd.read_csv(source_csv)

# Data Process

## Create New DataFrame

In [None]:
# start from an empty dataframe; the cells below add the output columns to it group by group
new_df = pd.DataFrame()

## Transform or Keep Unchanged

In [None]:
# columns carried over from the source dataframe exactly as they are
passthrough_columns = ['age', 'status', 'sex', 'orientation', 'body_type']
new_df[passthrough_columns] = result[passthrough_columns]

In [None]:
# create new column for diet
def _simplify_diet(diet_text):
    """Collapse one raw diet answer into a coarse category.

    The checks run in a fixed order and the first match wins:
    'anything', then 'vegetarian' (also used for 'vegan'), then 'halal',
    then 'kosher'. Anything else becomes 'other'.
    """
    if 'anything' in diet_text:
        return 'anything'
    if 'vegetarian' in diet_text or 'vegan' in diet_text:
        return 'vegetarian'
    for label in ('halal', 'kosher'):
        if label in diet_text:
            return label
    return 'other'


new_df['diet'] = result['diet'].apply(_simplify_diet)

# drinks and drugs are copied over unchanged
substance_columns = ['drinks', 'drugs']
new_df[substance_columns] = result[substance_columns]

In [None]:
# Lookup table from a raw `education` answer to a coarse education level.
# The status prefix ('working on', 'graduated from', 'dropped out of', or none)
# is ignored: every variant of the same institution maps to the same label.
dict_education = {
    'working on college/university': 'college',
    'graduated from college/university': 'college',
    'working on two-year college': 'college',
    'college/university': 'college',
    'two-year college': 'college',
    'graduated from two-year college': 'college',
    'dropped out of college/university': 'college',
    'dropped out of two-year college': 'college',
    'graduated from masters program': 'master',
    'working on masters program': 'master',
    'masters program': 'master',
    'dropped out of masters program': 'master',
    'graduated from ph.d program': 'phd',
    'working on ph.d program': 'phd',
    'dropped out of ph.d program': 'phd',
    'ph.d program': 'phd',
    'graduated from law school': 'law_school',
    'working on law school': 'law_school',
    'law school': 'law_school',
    'dropped out of law school': 'law_school',
    'working on space camp': 'space_camp',
    'graduated from space camp': 'space_camp',
    'dropped out of space camp': 'space_camp',
    'space camp': 'space_camp',
    'working on med school': 'med_school',
    'graduated from med school': 'med_school',
    'dropped out of med school': 'med_school',
    'med school': 'med_school',
    'graduated from high school': 'high_school',
    'dropped out of high school': 'high_school',
    'working on high school': 'high_school',
    'high school': 'high_school'}

# map education
def map_education(education):
    """Return the coarse education level for one raw ``education`` answer.

    The answer is looked up in ``dict_education`` by exact match, after
    trimming surrounding whitespace and lower-casing it.

    Parameters
    ----------
    education : object
        One cell of the ``education`` column. Usually a string, but missing
        values (NaN / None) are accepted.

    Returns
    -------
    str
        The mapped level, or ``'other'`` when the value is missing, is not a
        string, or is not a key of ``dict_education``.
    """
    # Missing values arrive as float NaN / None; they have no level to map.
    if not isinstance(education, str):
        return 'other'
    # Exact lookup: a substring scan would let '' or a fragment such as
    # 'school' match whichever key happens to come first in the table.
    return dict_education.get(education.strip().lower(), 'other')

# education: translate every raw answer into its coarse level, element by element
new_df['education'] = result['education'].map(map_education)

In [None]:
# gather every individual ethnicity label, in order of first appearance,
# from the ', '-separated combinations found in the column
ethnicity_list = (
    result['ethnicity']
    .dropna()
    .drop_duplicates()
    .str.split(', ')
    .explode()
    .dropna()
    .drop_duplicates()
    .tolist()
)

# one 0/1 indicator column per label: 1 when the label occurs in the row's
# ethnicity text, 0 otherwise (missing values count as 0)
for ethnicity in ethnicity_list:
    has_label = result['ethnicity'].str.contains(ethnicity, regex=False, na=False)
    new_df[f'{ethnicity}'] = has_label.astype('int64')

# height multiplied by 2.54 (presumably inches -> centimetres) and rounded to a whole number
new_df['height (cm)'] = result['height'].mul(2.54).round(0)

In [None]:
# create new column for job
def _normalise_job(job):
    """Fold any answer containing 'rather not say' into 'other'; keep the rest as is."""
    return 'other' if 'rather not say' in job else job


new_df['job'] = result['job'].map(_normalise_job)

# create new columns for location: split on ', ' into at most three parts
location_parts = result['location'].str.split(', ', n=2, expand=True)
new_df[['city', 'state', 'country']] = location_parts

# names treated as belonging to the United States
us_states = [
    "alabama", "alaska", "arizona", "arkansas", "california",
    "colorado", "connecticut", "delaware", "florida", "georgia",
    "hawaii", "idaho", "illinois", "indiana", "iowa",
    "kansas", "kentucky", "louisiana", "maine", "maryland",
    "massachusetts", "michigan", "minnesota", "mississippi", "missouri",
    "montana", "nebraska", "nevada", "new hampshire", "new jersey",
    "new mexico", "new york", "north carolina", "north dakota", "ohio",
    "oklahoma", "oregon", "pennsylvania", "rhode island", "south carolina",
    "south dakota", "tennessee", "texas", "utah", "vermont",
    "virginia", "washington", "west virginia", "wisconsin", "wyoming",
    'district of columbia',
]


def _country_from_state(state):
    """Derive the country from the second location part.

    'british columbia' becomes 'canada', a name listed in ``us_states`` becomes
    'united states', and any other value is returned unchanged.
    """
    if state == 'british columbia':
        return 'canada'
    if state in us_states:
        return 'united states'
    return state


# the country column from the split is replaced by the value derived from `state`
new_df['country'] = new_df['state'].map(_country_from_state)

In [None]:
# create new columns for kids
def _has_kids(offspring):
    """'yes' when the answer contains 'has kids' or 'has a kid'; otherwise (or if missing) 'no'."""
    if pd.isna(offspring):
        return 'no'
    mentions_kids = any(phrase in offspring for phrase in ('has kids', 'has a kid'))
    return 'yes' if mentions_kids else 'no'


def _wants_kids(offspring):
    """Classify the wish for kids as 'no', 'yes' or 'neutral'.

    "doesn't want" is checked first and yields 'no'. Otherwise any of the
    want/wants phrases yields 'yes'. Missing or unmatched answers are 'neutral'.
    """
    if pd.isna(offspring):
        return 'neutral'
    if "doesn't want" in offspring:
        return 'no'
    wanting_phrases = (
        'want them', 'wants them',
        'want more', 'wants more',
        'want kids', 'wants kids',
    )
    if any(phrase in offspring for phrase in wanting_phrases):
        return 'yes'
    return 'neutral'


new_df['has kids'] = result['offspring'].map(_has_kids)
new_df['wants kids'] = result['offspring'].map(_wants_kids)

In [None]:
# create new columns for pets
def _pet_affinity(pets, animal):
    """Classify one ``pets`` answer for ``animal`` ('dogs' or 'cats').

    Returns 'no' when the answer says the person dislikes the animal, 'yes'
    when it says they like or have it, and 'neutral' when the answer is
    missing or mentions neither.
    """
    if pd.isna(pets):
        return 'neutral'
    # 'dislikes' must be tested first: 'likes dogs' is itself a substring of
    # 'dislikes dogs', so the order of these checks decides the label.
    if f'dislikes {animal}' in pets:
        return 'no'
    if f'likes {animal}' in pets or f'has {animal}' in pets:
        return 'yes'
    return 'neutral'


def _pet_owner(pets, animal):
    """Return 'yes' when the answer contains 'has <animal>', otherwise 'no'."""
    return 'yes' if pd.notna(pets) and f'has {animal}' in pets else 'no'


# Both "lovers" columns share one rule, so 'yes' always means likes/has and
# 'no' always means dislikes (the dogs column previously had these swapped).
new_df['dogs lovers'] = result['pets'].apply(lambda pets: _pet_affinity(pets, 'dogs'))
new_df['cats lovers'] = result['pets'].apply(lambda pets: _pet_affinity(pets, 'cats'))
new_df['dogs owners'] = result['pets'].apply(lambda pets: _pet_owner(pets, 'dogs'))
new_df['cats owners'] = result['pets'].apply(lambda pets: _pet_owner(pets, 'cats'))

In [None]:
# create new columns for sign and religion
# Only the first word of each answer is kept; whatever follows the first space
# is discarded. The first token is taken directly, so `result` is never given
# temporary columns and the cell does not depend on the split producing
# exactly two columns.
new_df['sign'] = result['sign'].str.split(' ', n=1).str[0]
new_df['religion'] = result['religion'].str.split(' ', n=1).str[0]

In [None]:
# smokes is copied over from the source dataframe unchanged
new_df['smokes'] = result['smokes']

In [None]:
# create new columns for speaks
# Every distinct 'speaks' value is split on ', ' and the parts are flattened,
# keeping the order in which they first appear.
speaks_parts = (
    result['speaks']
    .dropna()
    .drop_duplicates()
    .str.split(', ')
    .explode()
    .dropna()
)
# keep only the first word of each part (empty parts are skipped)
first_words = [part.split()[0] for part in speaks_parts if part.split()]
# de-duplicate while preserving first-seen order, so the column order is the
# same on every run (a set would reorder the names between kernel sessions)
language_list = list(dict.fromkeys(first_words))

# One 0/1 indicator per language: 1 when the language name occurs in the row's
# 'speaks' text, 0 otherwise (missing values count as 0). The name is matched
# literally (regex=False), so names containing regex metacharacters are safe.
speaks_flags = pd.DataFrame(
    {
        f'{language} speaks': result['speaks']
        .str.contains(language, regex=False, na=False)
        .astype('int64')
        for language in language_list
    },
    index=result.index,
)

# Attach all indicator columns in a single concat instead of inserting them one
# by one, which fragments the frame and triggers pandas' PerformanceWarning.
# Columns left over from an earlier run of this cell are dropped first so that
# re-running it does not create duplicates.
new_df = pd.concat(
    [new_df.drop(columns=speaks_flags.columns, errors='ignore'), speaks_flags],
    axis=1,
)

  new_df[f'{language} speaks'] = result['speaks'].apply(lambda x: 1 if pd.notna(x) and language in x else 0).copy()
  new_df[f'{language} speaks'] = result['speaks'].apply(lambda x: 1 if pd.notna(x) and language in x else 0).copy()
  new_df[f'{language} speaks'] = result['speaks'].apply(lambda x: 1 if pd.notna(x) and language in x else 0).copy()
  new_df[f'{language} speaks'] = result['speaks'].apply(lambda x: 1 if pd.notna(x) and language in x else 0).copy()
  new_df[f'{language} speaks'] = result['speaks'].apply(lambda x: 1 if pd.notna(x) and language in x else 0).copy()
  new_df[f'{language} speaks'] = result['speaks'].apply(lambda x: 1 if pd.notna(x) and language in x else 0).copy()
  new_df[f'{language} speaks'] = result['speaks'].apply(lambda x: 1 if pd.notna(x) and language in x else 0).copy()
  new_df[f'{language} speaks'] = result['speaks'].apply(lambda x: 1 if pd.notna(x) and language in x else 0).copy()
  new_df[f'{language} speaks'] = result['speaks'].apply(lambda x: 1 if p

In [None]:
# essay_all is copied over from the source dataframe unchanged
new_df['essay_all']= result['essay_all']

  new_df['essay_all']= result['essay_all'].copy()


## Check Columns

In [None]:
# checking columns: display the column index of the new dataframe
new_df.columns

Index(['age', 'status', 'sex', 'orientation', 'body_type', 'diet', 'drinks',
       'drugs', 'education', 'asian',
       ...
       'turkish speaks', 'mongolian speaks', 'persian speaks', 'hindi speaks',
       'norwegian speaks', 'italian speaks', 'estonian speaks',
       'hawaiian speaks', 'sardinian speaks', 'essay_all'],
      dtype='object', length=110)

# Save Grouped Dataset as .csv

In [None]:
# write the new dataframe to disk, leaving the row index out of the file
output_csv = 'okcupid_profiles_output.csv'
new_df.to_csv(output_csv, index=False)