In [18]:
import numpy as np
import pandas as pd
import re
from tqdm.auto import tqdm
from sentence_transformers import SentenceTransformer
import ssl
# NOTE(review): this replaces the ssl module's default HTTPS context factory with the
# unverified one, i.e. it switches off TLS certificate verification for any code that
# relies on that default. It also depends on private (underscore-prefixed) ssl attributes.
# The model download later in this notebook still failed with CERTIFICATE_VERIFY_FAILED,
# so presumably that download does not go through this default context (TODO confirm).
# Prefer fixing the local CA bundle over disabling verification.
ssl._create_default_https_context = ssl._create_unverified_context

# Load & Preprocess the Dataset

In [7]:
# Load the OkCupid profiles CSV into a DataFrame.
# The path is a bare filename, so it is resolved against the kernel's working directory.
link = "okcupid_profiles.csv"
df = pd.read_csv(link)

In [8]:
# Summarize column names, non-null counts and dtypes to see which fields have missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59946 entries, 0 to 59945
Data columns (total 31 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   age          59946 non-null  int64  
 1   status       59946 non-null  object 
 2   sex          59946 non-null  object 
 3   orientation  59946 non-null  object 
 4   body_type    54650 non-null  object 
 5   diet         35551 non-null  object 
 6   drinks       56961 non-null  object 
 7   drugs        45866 non-null  object 
 8   education    53318 non-null  object 
 9   ethnicity    54266 non-null  object 
 10  height       59943 non-null  float64
 11  income       59946 non-null  int64  
 12  job          51748 non-null  object 
 13  last_online  59946 non-null  object 
 14  location     59946 non-null  object 
 15  offspring    24385 non-null  object 
 16  pets         40025 non-null  object 
 17  religion     39720 non-null  object 
 18  sign         48890 non-null  object 
 19  smok

In [9]:
# The ten free-text essay columns, essay0 through essay9
# (adjust the names if the dataset labels them differently)
text_columns = [f"essay{idx}" for idx in range(10)]

In [10]:
# Replace missing essays with a placeholder string so no NaN remains in the text columns
# before cleaning (clean_text calls .lower() on each value).
# NOTE(review): the placeholder is later embedded and averaged like a real answer, so
# profiles with many blank essays are pulled toward the placeholder's embedding.
df[text_columns] = df[text_columns].fillna("No response")

In [11]:
# Function to clean text (basic preprocessing)
def clean_text(text):
    """Normalize a free-text response for downstream processing.

    Steps, in order: lowercase, drop every character that is neither a word
    character nor whitespace (i.e. punctuation/symbols), drop digits, then trim
    leading/trailing whitespace. Interior whitespace is left as-is, so removing
    a token can leave consecutive spaces behind.

    Args:
        text: The raw response. Expected to be a ``str``.

    Returns:
        The cleaned string. Non-string input (``None``, ``NaN``, numbers)
        returns ``""`` instead of raising, so the function does not depend on
        missing values having been filled in beforehand.
    """
    # Missing values arrive as None/float NaN when the fillna step was skipped;
    # treat anything that is not text as an empty response.
    if not isinstance(text, str):
        return ""
    text = text.lower()  # Lowercase
    text = re.sub(r"[^\w\s]", "", text)  # Remove punctuation (underscore counts as \w and is kept)
    text = re.sub(r"\d+", "", text)  # Remove numbers
    return text.strip()

In [12]:
# Run clean_text over every value of every essay column and write the results back
df[text_columns] = df[text_columns].apply(lambda essay_col: essay_col.map(clean_text))

# Convert Sentences into Embeddings

In [19]:
# Load the pretrained "all-MiniLM-L6-v2" Sentence-BERT model.
# If the model is not available locally this triggers a download from huggingface.co,
# which needs network access and a working TLS certificate chain.
sbert_model = SentenceTransformer("all-MiniLM-L6-v2")


No sentence-transformers model found with name sentence-transformers/all-MiniLM-L6-v2. Creating a new one with mean pooling.


SSLError: (MaxRetryError("HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /sentence-transformers/all-MiniLM-L6-v2/resolve/main/adapter_config.json (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1002)')))"), '(Request ID: 02f5284f-d35a-4224-89f1-e62eaadd8b96)')

In [None]:
# Collapse one profile's essay responses into a single averaged embedding
def get_text_embedding(row):
    """Return one embedding vector for a profile row.

    The row's values for the ``text_columns`` fields are encoded together by the
    module-level ``sbert_model``; the per-response embeddings are then averaged
    element-wise (mean over axis 0) into a single vector.

    Every listed column contributes to the mean, including any placeholder text
    that was substituted for a missing response.
    """
    responses = row.loc[text_columns].to_list()
    response_vectors = sbert_model.encode(responses)
    return np.mean(response_vectors, axis=0)

In [10]:
# Embed all profiles in batches instead of one encode() call per row.
# A row-wise df.apply makes one tiny encode() call (10 sentences) per profile, which for
# the full dataset was slow enough that the run had to be interrupted. Feeding the encoder
# a few thousand sentences at a time lets it batch efficiently while keeping memory bounded.
# The result is the same per-profile mean of essay embeddings (values may differ from the
# row-wise version only by floating-point rounding).
PROFILES_PER_CHUNK = 1024  # profiles encoded per encode() call
ENCODE_BATCH_SIZE = 128    # sentences per forward pass inside encode()

essays = df[text_columns]
profile_vectors = []
for chunk_start in tqdm(range(0, len(essays), PROFILES_PER_CHUNK), desc="Embedding profiles"):
    essay_block = essays.iloc[chunk_start:chunk_start + PROFILES_PER_CHUNK].to_numpy()
    # Row-major flatten keeps each profile's essays contiguous in the sentence list
    sentence_vectors = sbert_model.encode(
        essay_block.ravel().tolist(),
        batch_size=ENCODE_BATCH_SIZE,
        show_progress_bar=False,  # tqdm above already reports progress per chunk
        convert_to_numpy=True,
    )
    # (profiles, essays, dim) -> average over the essay axis: one vector per profile
    per_profile = sentence_vectors.reshape(essay_block.shape[0], essay_block.shape[1], -1).mean(axis=1)
    profile_vectors.extend(per_profile)

# Object column holding one 1-D numpy vector per profile, aligned to df's index
df["text_embedding"] = pd.Series(profile_vectors, index=df.index, dtype=object)

KeyboardInterrupt: 

In [None]:
# Preview the first rows of the DataFrame
df.head()

In [20]:
# Smoke-test the embedding function on just the last rows returned by df.tail()
# rather than on the whole dataset
embedding = df.tail().apply(get_text_embedding, axis=1)

RuntimeError: Numpy is not available

In [16]:
# Display the per-profile embedding vectors computed for the tail rows
embedding

59941    [0.005218632, -0.008908999, 0.04986269, 0.0340...
59942    [-0.022675073, -0.03234982, 0.024624024, -0.01...
59943    [0.031940974, -0.032913912, 0.032944195, -0.00...
59944    [-0.0034100313, -0.008673101, 0.014722077, 0.0...
59945    [0.015553972, -0.015196681, 0.026378458, 0.033...
dtype: object

In [None]:
# Display the tail-row embeddings again (repeats an earlier cell)
embedding

In [17]:
# Re-check column names, non-null counts and dtypes after the preprocessing cells
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59946 entries, 0 to 59945
Data columns (total 31 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   age          59946 non-null  int64  
 1   status       59946 non-null  object 
 2   sex          59946 non-null  object 
 3   orientation  59946 non-null  object 
 4   body_type    54650 non-null  object 
 5   diet         35551 non-null  object 
 6   drinks       56961 non-null  object 
 7   drugs        45866 non-null  object 
 8   education    53318 non-null  object 
 9   ethnicity    54266 non-null  object 
 10  height       59943 non-null  float64
 11  income       59946 non-null  int64  
 12  job          51748 non-null  object 
 13  last_online  59946 non-null  object 
 14  location     59946 non-null  object 
 15  offspring    24385 non-null  object 
 16  pets         40025 non-null  object 
 17  religion     39720 non-null  object 
 18  sign         48890 non-null  object 
 19  smok

In [19]:
# Spot-check the essay1 column
df['essay1']

0        currently working as an international agent fo...
1        dedicating everyday to being an unbelievable b...
2        i make nerdy software for musicians artists an...
3                reading things written by old dead people
4                                work work work work  play
                               ...                        
59941    the happiest times have been when life came to...
59942    currently finishing school for film production...
59943    im a civil engineer who enjoys helping the cit...
59944    following my dreams you got a dream you gotta ...
59945    i work with elderly people psychotherapy and c...
Name: essay1, Length: 59946, dtype: object