In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

df = pd.read_csv("updated_data.csv")
# 1. Select the continuous numerical features for clustering
# We will use Age, Anxiety, and Avoidance to find "Psychological Archetypes"
features = df[['age', 'anxiety', 'avoidance']]

# 2. Standardize the data (Crucial for distance-based algorithms like K-Means)
# This ensures Age (18-40) doesn't overpower Anxiety (1-7) in the math
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# 3. Apply K-Means Clustering
# We choose K=4 because psychology defines 4 main attachment styles
# (Secure, Anxious-Preoccupied, Dismissive-Avoidant, Fearful-Avoidant)
kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)
df['cluster_id'] = kmeans.fit_predict(scaled_features)

print("K-Means Clustering Complete. Assigning Dating Personas...")

# Calculate cluster centers to see what each group represents
cluster_centers = scaler.inverse_transform(kmeans.cluster_centers_)
for i, center in enumerate(cluster_centers):
    print(f"Cluster {i}: Avg Age={center[0]:.1f}, Avg Anxiety={center[1]:.2f}, Avg Avoidance={center[2]:.2f}")

# 4. Apply PCA for Visualization
# We compress the 3 dimensions (Age, Anx, Avo) down to 2 dimensions so we can plot it
pca = PCA(n_components=2)
principal_components = pca.fit_transform(scaled_features)
df['pca_1'] = principal_components[:, 0]
df['pca_2'] = principal_components[:, 1]

# 5. Plot the Clusters
plt.figure(figsize=(10, 6))
colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728']
for i in range(4):
    cluster_data = df[df['cluster_id'] == i]
    plt.scatter(cluster_data['pca_1'], cluster_data['pca_2'],
                c=colors[i], label=f'Persona {i}', alpha=0.6, edgecolors='w', s=80)

plt.title('Unsupervised Learning: MatchMind User Segmentation (PCA)')
plt.xlabel('Principal Component 1 (Psychological Variance)')
plt.ylabel('Principal Component 2 (Demographic Variance)')
plt.legend()
plt.grid(True, linestyle='--', alpha=0.5)
plt.show()

# Optional insight printout
print(f"\nVariance explained by PCA: {sum(pca.explained_variance_ratio_) * 100:.1f}%")

ValueError: Input X contains NaN.
KMeans does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [3]:
df.info()
df.dropna()

<class 'pandas.DataFrame'>
RangeIndex: 1211 entries, 0 to 1210
Data columns (total 20 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   user_id                  1200 non-null   float64
 1   name                     1211 non-null   str    
 2   age                      1200 non-null   float64
 3   gender                   1200 non-null   str    
 4   target_gender            1200 non-null   str    
 5   location                 1200 non-null   str    
 6   occupation               1200 non-null   str    
 7   anxiety                  1200 non-null   float64
 8   avoidance                1200 non-null   float64
 9   Lifestyle                1200 non-null   str    
 10  Arts & Creativity        1200 non-null   str    
 11  Music                    1200 non-null   str    
 12  Movies & Shows           1200 non-null   str    
 13  Intellectual & Learning  1200 non-null   str    
 14  Food & Drinks            1200 non-n

Unnamed: 0,user_id,name,age,gender,target_gender,location,occupation,anxiety,avoidance,Lifestyle,Arts & Creativity,Music,Movies & Shows,Intellectual & Learning,Food & Drinks,Sports & Outdoor,Gaming & Digital,Travel & Culture,Personality & Values,Relationship Intent
0,1.0,Nethmi Bandara,37.0,Female,Male,Gampaha,Construction Worker,4.09,6.57,"[""Night Owl"", ""Meditation""]","[""Filmmaking"", ""Graphic Design"", ""Poetry"", ""In...","[""Indie""]","[""Documentaries"", ""Horror"", ""Anime"", ""Sitcoms""]","[""Self-Improvement""]","[""Street Food"", ""Spicy Food"", ""Cooking"", ""Coff...","[""Surfing"", ""Football"", ""Adventure Sports""]","[""Web3"", ""Crypto""]","[""Museums"", ""Beaches"", ""Mountains"", ""Cultural ...","[""Atheist"", ""Feminist""]","[""Open Relationship""]"
1,2.0,Malith,32.0,Male,Female,Kandy,Accountant,3.96,4.88,"[""Traveling"", ""Vegan"", ""Digital Nomad""]","[""Interior Design""]","[""Singing"", ""K-Pop"", ""Rock""]","[""Horror"", ""Documentaries"", ""K-Dramas""]","[""Science"", ""Books & Reading"", ""Self-Improveme...","[""Street Food"", ""Spicy Food"", ""Craft Beer"", ""F...","[""Cricket"", ""Swimming"", ""Hiking"", ""Surfing""]","[""Console Gaming"", ""Board Games"", ""Mobile Gami...","[""Museums"", ""Languages""]","[""Feminist"", ""Family-Oriented"", ""Career-Focuse...","[""Marriage""]"
2,3.0,Isuru,25.0,Male,Female,Gampaha,Accountant,2.63,4.24,"[""Early Bird""]","[""Poetry""]","[""Hip-Hop"", ""EDM""]","[""Sci-Fi""]","[""Technology"", ""History"", ""Science""]","[""Coffee"", ""Cooking"", ""Baking"", ""Street Food""]","[""Surfing"", ""Cycling"", ""Swimming"", ""Football""]","[""Mobile Gaming"", ""Dungeons & Dragons""]","[""Beaches""]","[""Career-Focused"", ""Family-Oriented"", ""Spiritu...","[""Long-Term Relationship""]"
3,4.0,Hashan,25.0,Male,Female,Badulla,Accountant,4.67,1.37,"[""Pet Lover"", ""Meditation"", ""Vegan""]","[""Filmmaking""]","[""Rock"", ""EDM"", ""Singing""]","[""Horror"", ""Sci-Fi""]","[""History"", ""AI & Machine Learning"", ""Technolo...","[""Street Food""]","[""Basketball"", ""Hiking"", ""Surfing"", ""Cycling""]","[""VR"", ""Console Gaming""]","[""Cultural Festivals"", ""Backpacking"", ""Road Tr...","[""Environmentalist"", ""Politically Active""]","[""Open Relationship""]"
4,5.0,Supun,23.0,Male,Female,Kandy,Doctor,3.45,1.17,"[""Traveling"", ""Pet Lover""]","[""Filmmaking"", ""Interior Design"", ""DIY & Crafts""]","[""K-Pop"", ""Pop""]","[""Thriller"", ""Sitcoms"", ""Documentaries""]","[""Self-Improvement"", ""Psychology""]","[""Wine"", ""Baking"", ""Craft Beer""]","[""Hiking"", ""Basketball"", ""Surfing"", ""Camping""]","[""Board Games"", ""Console Gaming"", ""VR""]","[""Luxury Travel"", ""Museums"", ""Road Trips""]","[""Politically Active"", ""Spiritual""]","[""Casual Dating""]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1195,1196.0,Imesha,39.0,Male,Female,Colombo,Driver,2.37,1.84,"[""Vegan"", ""Night Owl""]","[""Writing"", ""Painting"", ""Filmmaking"", ""Interio...","[""Classical"", ""EDM"", ""Rock"", ""Pop""]","[""Horror"", ""Anime"", ""Documentaries""]","[""Psychology""]","[""Fine Dining"", ""Baking""]","[""Hiking"", ""Camping""]","[""Web3""]","[""Backpacking"", ""Cultural Festivals"", ""Road Tr...","[""Family-Oriented"", ""Spiritual"", ""Feminist"", ""...","[""Marriage""]"
1196,1197.0,Hashan,27.0,Male,Female,Colombo,Doctor,1.73,4.23,"[""Vegetarian"", ""Night Owl""]","[""Interior Design"", ""Graphic Design"", ""Poetry""...","[""Playing Instruments""]","[""Romance"", ""Documentaries"", ""Sci-Fi""]","[""Psychology"", ""Books & Reading"", ""AI & Machin...","[""Street Food"", ""Baking""]","[""Cycling""]","[""Console Gaming"", ""Crypto""]","[""Languages"", ""Mountains"", ""Beaches""]","[""Atheist"", ""Spiritual"", ""Family-Oriented"", ""C...","[""Casual Dating""]"
1197,1198.0,Sahan,33.0,Male,Female,Ratnapura,Doctor,4.29,2.50,"[""Early Bird"", ""Pet Lover""]","[""Poetry"", ""Painting""]","[""Singing""]","[""Sitcoms"", ""K-Dramas"", ""Romance"", ""Thriller""]","[""Philosophy""]","[""Spicy Food"", ""Cooking"", ""Fine Dining"", ""Baki...","[""Cricket"", ""Football"", ""Camping"", ""Basketball""]","[""Dungeons & Dragons"", ""eSports"", ""Board Games...","[""Mountains""]","[""Religious"", ""Family-Oriented"", ""Career-Focus...","[""Marriage""]"
1198,1199.0,Pasan Vithanage,32.0,Female,Male,Badulla,Business Owner,3.53,3.58,"[""Vegan"", ""Gym"", ""Vegetarian"", ""Fitness""]","[""Graphic Design"", ""Interior Design""]","[""EDM"", ""Hip-Hop"", ""Jazz""]","[""K-Dramas""]","[""Books & Reading""]","[""Fine Dining"", ""Street Food""]","[""Cricket""]","[""Mobile Gaming"", ""Board Games"", ""eSports""]","[""Museums"", ""Languages"", ""Road Trips""]","[""Family-Oriented"", ""Spiritual"", ""Career-Focus...","[""Long-Term Relationship""]"


In [11]:
df.isnull().sum()
df = df.dropna()

In [9]:
df.isnull().sum()

user_id                    11
age                        11
gender                     11
target_gender              11
location                   11
occupation                 11
anxiety                    11
avoidance                  11
Lifestyle                  11
Arts & Creativity          11
Music                      11
Movies & Shows             11
Intellectual & Learning    11
Food & Drinks              11
Sports & Outdoor           11
Gaming & Digital           11
Travel & Culture           11
Personality & Values       11
Relationship Intent        11
dtype: int64