# Import Libraries

In [1]:
# data manipulation
import numpy as np
import pandas as pd
# show progress bar
from tqdm.auto import tqdm
# embedding
from sentence_transformers import SentenceTransformer

# Load the Dataset

In [2]:
# Load the cleaned profiles produced by "Data Cleanse.ipynb".
# NOTE(review): the file carries its saved row index as an "Unnamed: 0" column
# (visible in the preview of the loaded frame); it is not a profile attribute.
df = pd.read_csv("okcupid_profiles_cleaned.csv")

In [3]:
# Preview the first 5 rows (the default for DataFrame.head) to check the load
df.head()

Unnamed: 0,age,status,sex,orientation,body_type,diet,drinks,drugs,education,ethnicity,height,job,location,offspring,pets,religion,sign,smokes,speaks,essay_all
0,22.0,single,male,straight,a little extra,strictly anything,socially,never,working on college/university,"asian, white",190.0,transportation,"south san francisco, california","doesn't have kids, but might want them",likes dogs and likes cats,agnosticism and very serious about it,gemini,sometimes,english,about me: i would love to think that i was so...
1,35.0,single,male,straight,average,mostly other,often,sometimes,working on space camp,white,178.0,hospitality / travel,"oakland, california","doesn't have kids, but might want them",likes dogs and likes cats,agnosticism but not too serious about it,cancer,no,"english (fluently), spanish (poorly), french (...",i am a chef: this is what that means. 1. i am ...
2,38.0,available,male,straight,thin,anything,socially,never,graduated from masters program,"black, native american, pacific islander, white",173.0,other,"san francisco, california",no kids and neutral to kids,has cats,irreligion,pisces but it doesn't matter,no,"english, french, c++","i'm not ashamed of much, but writing public te..."
3,23.0,single,male,straight,thin,vegetarian,socially,never,working on college/university,white,180.0,student,"berkeley, california",doesn't want kids,likes cats,irreligion,pisces,no,"english, german (poorly)","i work in a library and go to school. . .,read..."
4,29.0,single,male,straight,athletic,anything,socially,never,graduated from college/university,"asian, black, other",168.0,artistic / musical / writer,"san francisco, california",no kids and neutral to kids,likes dogs and likes cats,irreligion,aquarius,no,english,hey how's it going? currently vague on the pro...


# Generate Embeddings

## Transform Data

In [4]:
def generate_demographic_sentence(row):
    """
    Render one profile's categorical demographic fields as descriptive text.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series or dict)
        Must provide every key listed in ``placeholders`` below. A value for
        which ``pd.notna`` is False is replaced by that key's placeholder.

    Returns
    -------
    str
        A fixed sequence of short sentences covering the sixteen fields.
    """
    # Field name -> text substituted when the field is missing.
    # Insertion order matches the order in which the fields are read from `row`.
    placeholders = {
        'sex': "a person",
        'status': "an unspecified relationship status",
        'orientation': "with an unspecified orientation",
        'body_type': "an unspecified body type",
        'education': "an unspecified education level",
        'job': "no specific job mentioned",
        'location': "an unspecified location",
        'ethnicity': "an unspecified ethnicity",
        'diet': "no specific diet preference",
        'drinks': "an unspecified drinking habit",
        'smokes': "an unspecified smoking habit",
        'drugs': "an unspecified stance on drugs",
        'pets': "no specific pet preference",
        'religion': "no specific religion",
        'sign': "no zodiac sign mentioned",
        'speaks': "an unspecified language proficiency",
    }

    # Resolve each field to its value, or to the placeholder when it is missing
    resolved = {}
    for field, fallback in placeholders.items():
        value = row[field]
        resolved[field] = value if pd.notna(value) else fallback

    # Single template for the whole description; filled in from `resolved`
    template = (
        "{sex}, {status}, living in {location}, sexual orientation is {orientation}. "
        "Has {body_type} body type and ethnicity is {ethnicity}. "
        "Education level: {education}. industry: {job}. "
        "Dietary preference: {diet}. Drinking habit: {drinks}. "
        "Smoking habit: {smokes}. Drug use: {drugs}. "
        "Pet preference: {pets}. Religion: {religion}. Zodiac sign: {sign}. "
        "Speaks: {speaks}."
    )
    return template.format(**resolved)

# Build the full per-user text: demographic description followed by the essay
def combine_demographics_essay(row):
    """
    Join a profile's demographic description and its essay text into one string.

    The demographic part comes from ``generate_demographic_sentence(row)``;
    the essay is ``row['essay_all']``, or an empty string when that value is
    missing. The two parts are separated by a single space.
    """
    description = generate_demographic_sentence(row)

    # A missing essay contributes nothing after the separating space
    essay_text = row['essay_all']
    if pd.isna(essay_text):
        essay_text = ""

    return description + " " + essay_text

## Convert text to embedding

In [5]:
def generate_embeddings(df, model):
    """
    Encode every profile in ``df`` with the given sentence-embedding model.

    Each row is turned into one text via ``combine_demographics_essay``; the
    resulting list of strings is handed to ``model.encode`` with the progress
    bar enabled, and whatever ``encode`` returns is passed back unchanged.
    """
    # One combined demographic + essay string per row, in row order
    profile_texts = df.apply(combine_demographics_essay, axis=1)

    return model.encode(profile_texts.tolist(), show_progress_bar=True)

## Standardize Numeric Data

In [6]:
def standardize_numeric(df):
    """
    Centre and scale the float64 / int64 columns of a DataFrame.

    Each such column is centred on its median (not its mean) and divided by
    its sample standard deviation. Columns of any other dtype are returned
    untouched. The input frame is not modified; a copy is returned.

    A column whose standard deviation is zero (constant values) or NaN
    (e.g. a single-row frame) is only centred, not scaled, because dividing
    by such a value would turn the whole column into NaN.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the float64 / int64 columns standardized.
    """
    # Work on a copy so the caller's frame is left as it was
    df = df.copy()

    for col in df.columns:
        dtype = df[col].dtype
        if dtype == 'float64' or dtype == 'int64':
            centered = df[col] - df[col].median()
            scale = df[col].std()
            # No usable spread: keep the centred values instead of dividing
            if pd.isna(scale) or scale == 0:
                df[col] = centered
            else:
                df[col] = centered / scale

    return df

## Combine All Processing Steps

In [7]:
def preprocess_data(df, model, drop_columns=("Unnamed: 0",)):
    """
    Build the feature matrix: text embeddings plus standardized numeric columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Profile data; passed as-is to ``generate_embeddings``.
    model : object
        Embedding model; passed as-is to ``generate_embeddings``.
    drop_columns : iterable of str, optional
        float64 / int64 columns to leave out of the numeric features.
        Defaults to the "Unnamed: 0" column that ``pd.read_csv`` creates from
        a saved row index: it is a row number, not a profile attribute, and
        would otherwise be standardized and appended as a feature. Names that
        are not present are ignored. Pass ``()`` to keep every numeric column.

    Returns
    -------
    numpy.ndarray
        Embeddings and standardized numeric columns joined along axis 1.
    """
    # Generate embeddings
    embeddings = generate_embeddings(df, model)

    # Pick the numeric columns, minus bookkeeping columns such as the saved index
    numeric_features = df.select_dtypes(include=['float64', 'int64'])
    numeric_features = numeric_features.drop(columns=list(drop_columns), errors="ignore")
    numeric_standardized = standardize_numeric(numeric_features)

    # Concatenate the embeddings and the standardized numeric features column-wise
    return np.concatenate([embeddings, numeric_standardized.to_numpy()], axis=1)

# Perform Embedding Generation

In [8]:
# Load the pre-trained sentence-embedding model; it is passed to
# preprocess_data, which calls its `encode` method on the profile texts.
sbert_model = SentenceTransformer("all-MiniLM-L6-v2")

In [9]:
# Preprocess the dataset
# NOTE(review): this rebinds the name `preprocess_data` from the function defined
# earlier to the array it returns. Running this cell a second time therefore fails
# (the array is not callable) until the cell defining the function is re-run.
# The cells that save and verify the result read this same name.
preprocess_data = preprocess_data(df, sbert_model)

Batches:   0%|          | 0/1874 [00:00<?, ?it/s]

# Save preprocessed as .npy

In [10]:
# Save the preprocessed feature matrix to disk in NumPy's binary .npy format.
np.save("okcupid_profiles_preprocessed.npy", preprocess_data)

In [11]:
# Round-trip check: reload the saved file and compare it element-wise with the
# in-memory array; `.all()` is True only if every element matches.
# NOTE(review): NaN never compares equal to NaN, so this check would report False
# if the matrix contained NaN values even when the file was written correctly.
(np.load("okcupid_profiles_preprocessed.npy") == preprocess_data).all()

np.True_