In [1]:
import os

import pandas as pd

# Location of the CSV file. The DATA_CSV_PATH environment variable, when set,
# overrides the default so the notebook can run on machines where the file
# lives somewhere other than the original author's Downloads folder.
data_path = os.environ.get('DATA_CSV_PATH', '/Users/arnav/Downloads/data.csv')

# Load the CSV file into a DataFrame
data = pd.read_csv(data_path)

# Display the first few rows of the dataframe to understand its structure
data.head()


Unnamed: 0,user_id,username,age,status,sex,orientation,drinks,drugs,height,job,...,smokes,language,new_languages,body_profile,education_level,dropped_out,bio,interests,other_interests,location_preference
0,fffe3100,Edith Lopez,27,single,f,gay,socially,never,66.0,medicine / health,...,no,"english (fluently), spanish (poorly), sign lan...",interested,athletic,4.0,no,bottom line i love life! i work hard and i lov...,sports,instruments,same state
1,fffe3200,Travis Young,26,single,m,gay,socially,never,68.0,other,...,no,"english (fluently), tagalog (okay), french (po...",interested,fit,3.0,no,"i'm a straightforward, genuine, fun loving (i'...",painting,instruments,anywhere
2,fffe3300,Agnes Smith,20,seeing someone,f,bisexual,socially,sometimes,69.0,other,...,sometimes,"english (fluently), sign language (poorly), fr...",interested,fit,2.0,no,mmmmm yummy tacosss. yoga is where it's at. i ...,instruments,dancing,same city
3,fffe3400,Salvador Klaver,27,single,m,bisexual,socially,sometimes,68.0,computer / hardware / software,...,no,english,not interested,average,3.0,no,i'm a stealth geek. that special mix of techni...,sketching,acting,same city
4,fffe3500,Elana Sewell,22,single,f,bisexual,often,sometimes,68.0,other,...,yes,english,not interested,average,2.0,yes,with the whisper of the wind i was weaved into...,craft,designing,same city


Preprocessing

In [2]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import numpy as np

# user_id, username and bio are left out of the categorical feature set
non_feature_cols = ['user_id', 'username', 'bio']

# Split the columns by dtype: object/bool columns (minus the ones excluded
# above) are handled as categorical, int64/float64 columns as numerical.
categorical_cols = (
    data.select_dtypes(include=['object', 'bool'])
    .columns
    .drop(non_feature_cols)
)
numerical_cols = data.select_dtypes(include=['int64', 'float64']).columns

# Numerical branch: fill missing values with the column mean, then standardise
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode (categories unseen during fit are ignored at transform time)
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each group of columns through its own branch
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# Fit the preprocessor on the full dataset and transform it in one pass
preprocessed_data = preprocessor.fit_transform(data)

# The resulting shape confirms the transformation (rows, encoded features)
preprocessed_data.shape


(2001, 789)

K-Means Clustering

In [3]:
from sklearn.cluster import KMeans

# Starting point for the number of clusters
n_clusters = 10

# Build the K-Means estimator, then fit it on the preprocessed feature matrix
kmeans = KMeans(n_clusters=n_clusters, random_state=0)
kmeans.fit(preprocessed_data)

# One cluster label per individual, in the same row order as `data`
cluster_assignments = kmeans.labels_

# Work on a copy of the original data with the labels attached as a new
# 'Cluster' column, so `data` itself stays untouched
data_with_clusters = data.assign(Cluster=cluster_assignments)

# Number of individuals that landed in each cluster
cluster_distribution = data_with_clusters['Cluster'].value_counts()

cluster_distribution




Cluster
8    269
3    269
5    265
2    256
4    194
0    177
9    161
6    147
1    137
7    126
Name: count, dtype: int64

Applying and testing the clusters

In [4]:
from sklearn.metrics.pairwise import euclidean_distances

# Example: match roommates inside the most populated cluster
largest_cluster_label = cluster_distribution.idxmax()
in_largest_cluster = data_with_clusters['Cluster'] == largest_cluster_label
largest_cluster_data = data_with_clusters[in_largest_cluster]

# Re-run the fitted preprocessor on just those rows to get their feature vectors
largest_cluster_preprocessed_data = preprocessor.transform(largest_cluster_data)

# Pairwise Euclidean distances between all members of the cluster
distances = euclidean_distances(largest_cluster_preprocessed_data)

# A row's distance to itself would always win the argmin below, so push the
# diagonal to infinity to rule out self-matches
np.fill_diagonal(distances, np.inf)

# (row, column) position of the smallest remaining distance: the closest pair
min_distance_idx = np.unravel_index(distances.argmin(), distances.shape)
matched_individuals = largest_cluster_data.iloc[list(min_distance_idx)]

# Show a readable subset of attributes for the matched pair
matched_individuals[[
    'username',
    'age',
    'sex',
    'orientation',
    'drinks',
    'smokes',
    'interests',
    'location_preference',
]]


Unnamed: 0,username,age,sex,orientation,drinks,smokes,interests,location_preference
218,Roger Setchell,27,m,straight,socially,no,photography,same city
728,Patrick Geddes,25,m,straight,socially,no,photography,same city


Alternative Options

Hierarchical Clustering

In [6]:
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster

# Densify the preprocessed matrix before handing it to scipy's linkage
preprocessed_data_dense = preprocessed_data.toarray()

# Build the hierarchical merge tree using Ward linkage
Z = linkage(preprocessed_data_dense, method='ward')

# Cut the tree into flat clusters at a distance threshold of 50
# (the threshold was picked by the author from a dendrogram inspection)
clusters = fcluster(Z, 50, criterion='distance')

# Store the hierarchical cluster label of every row on the original data
data['H_Cluster'] = clusters

# Number of individuals per hierarchical cluster
hcluster_distribution = data['H_Cluster'].value_counts()

hcluster_distribution



H_Cluster
2    1053
1     948
Name: count, dtype: int64

Similarity-Based Matching (cosine similarity on profile features)

In [7]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the preprocessed feature vectors of all users
similarity_matrix = cosine_similarity(preprocessed_data)

# Mask each user's similarity to themselves before picking the best match.
# Assuming "self is always the last argsort column" breaks when another user
# has an identical feature vector (or rounding ties the scores): self can then
# land in the second-to-last column and be reported as the user's own match.
# The mask is applied to a copy so `similarity_matrix` keeps its diagonal.
masked_similarity = similarity_matrix.copy()
np.fill_diagonal(masked_similarity, -np.inf)

# Column vector of shape (n_users, 1): index of each user's most similar *other* user
most_similar_users = masked_similarity.argmax(axis=1).reshape(-1, 1)

# Example: match for the first user
match_for_first_user = most_similar_users[0, 0]
print(f"User 0's best match is User {match_for_first_user}")


User 0's best match is User 1223


Dimensionality Reduction + Clustering

In [9]:
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

# Dimensionality reduction.
# random_state is fixed because, with the default svd_solver='auto',
# scikit-learn may pick the randomized SVD solver for a matrix of this size;
# without a seed the projection (and the clusters built on it) can differ
# between runs.
pca = PCA(n_components=50, random_state=42)  # Adjust n_components based on variance ratio or specific needs
reduced_data = pca.fit_transform(preprocessed_data_dense)

# Clustering on the reduced data.
# NOTE: `kmeans` and `clusters` rebind names assigned by the earlier K-Means
# and hierarchical-clustering cells; re-run those cells if the earlier objects
# are needed again.
kmeans = KMeans(n_clusters=10, random_state=42)
clusters = kmeans.fit_predict(reduced_data)

# Assign cluster labels to the original data for analysis
data['PCA_KMeans_Cluster'] = clusters

# Number of individuals per PCA + K-Means cluster
pcacluster_distribution = data['PCA_KMeans_Cluster'].value_counts()

pcacluster_distribution




PCA_KMeans_Cluster
1    249
8    247
3    220
4    209
0    193
5    190
2    185
9    172
7    171
6    165
Name: count, dtype: int64

In [None]:
###### TEST ONLY #######


import tensorflow as tf
from tensorflow.keras import layers, models

# Define the base network for feature extraction
def create_base_network(input_shape):
    """Build the feature-extraction network used by both branches of the model.

    Args:
        input_shape: Shape of a single input sample, forwarded to
            ``layers.Input(shape=...)``.

    Returns:
        A Keras ``Model`` that passes its input through three ReLU dense
        layers of 128, 128 and 64 units and outputs the last layer's result.
    """
    inputs = layers.Input(shape=input_shape)
    hidden = inputs
    # Stack the dense layers in order; the final 64-unit layer is the embedding
    for units in (128, 128, 64):
        hidden = layers.Dense(units, activation='relu')(hidden)
    return models.Model(inputs, hidden)

# One input feature per column of the preprocessed matrix
input_shape = [preprocessed_data.shape[1]]
base_network = create_base_network(input_shape)

# Two inputs, one for each member of a candidate pair
input_a, input_b = (layers.Input(shape=input_shape) for _ in range(2))

# Both inputs go through the *same* `base_network` instance, so the two
# branches share one set of weights
processed_a, processed_b = base_network(input_a), base_network(input_b)

# Element-wise absolute difference between the two embeddings
distance = layers.Lambda(
    lambda pair: tf.abs(pair[0] - pair[1])
)([processed_a, processed_b])

# A single sigmoid unit turns the difference vector into a similarity score
prediction = layers.Dense(1, activation='sigmoid')(distance)

# Full model: a pair of inputs in, one score out
model = models.Model(inputs=[input_a, input_b], outputs=prediction)

# Configure training for a binary similar / not-similar target
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# This dataset has no labels, so the model cannot be trained on it yet.
# It was built for experimental purposes only and can be used in the future
# if a labelled dataset becomes available.
