In [2]:
# imports / libraries used
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import StandardScaler

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [27]:
# load data
# Both CSVs are read the same way: parse the file, then discard its first
# column by position (everything from column 1 onward is kept).
train, test = (
    pd.read_csv(csv_path).iloc[:, 1:]
    for csv_path in ('All_Data_Gentrification.csv', 'All_Gentrification_Test.csv')
)

In [28]:
# Split by column position:
#   - columns 0..2 are skipped in both frames,
#   - the last training column becomes the target (kept as a one-column DataFrame),
#   - the test frame keeps everything from column 3 onward.
FIRST_FEATURE_COL = 3

y_train = train.iloc[:, -1:]
X_train = train.iloc[:, FIRST_FEATURE_COL:-1]
X_test = test.iloc[:, FIRST_FEATURE_COL:]

In [29]:
# scale numeric columns
# Every feature column except the last two goes through the scaler; the last
# two columns are selected separately as text in the following cells.
train_numeric = X_train.iloc[:, :-2]
test_numeric = X_test.iloc[:, :-2]

# Fit the scaler on the TRAINING split only, then reuse the learned mean/std
# for the test split. Re-fitting on the test data would standardise it with
# different statistics (inconsistent feature space) and leak test-set
# information into preprocessing.
scaler = StandardScaler()

scaled_train = scaler.fit_transform(train_numeric)
# Replace NaNs left in the scaled output with 0.0; after standardisation 0.0
# corresponds to the training mean of that column.
scaled_train = np.nan_to_num(scaled_train, nan=0.0)

scaled_test = scaler.transform(test_numeric)  # transform only: no re-fit on test data
scaled_test = np.nan_to_num(scaled_test, nan=0.0)

In [30]:
# The two trailing feature columns of each split are the text fields.
train_text, test_text = (features.iloc[:, -2:] for features in (X_train, X_test))

In [31]:
# generate embeddings for text data
train_text = X_train.iloc[:, -2:]
test_text = X_test.iloc[:, -2:]

# generate embeddings for the data with Sentence-BERT since we have full sentences and need context
model = SentenceTransformer('all-MiniLM-L6-v2')


def embed_text_columns(text_frame, encoder):
    """Encode every column of ``text_frame`` with ``encoder``.

    Each column is cast to ``str`` before encoding, so missing values are
    encoded as their string representation rather than skipped.

    Parameters
    ----------
    text_frame : pandas.DataFrame
        Frame whose columns hold the text to embed.
    encoder : object
        Anything exposing ``encode(list_of_str)``; here the SentenceTransformer
        model created above.

    Returns
    -------
    dict
        Maps column name to the value returned by ``encoder.encode`` for that
        column. The shape of each result is printed as it is produced.
    """
    embeddings = {}
    for col in text_frame.columns:
        col_texts = text_frame[col].astype(str).tolist()
        embeddings[col] = encoder.encode(col_texts)
        print(f"Generated embeddings for '{col}', shape: {embeddings[col].shape}")
    return embeddings


text_embeddings_train = embed_text_columns(train_text, model)

# combine embeddings from both columns so both can be used in clustering
all_embeddings_train = np.hstack([text_embeddings_train['Vibe'], text_embeddings_train['Activities']])
print(f"Combined embeddings shape: {all_embeddings_train.shape}")

# standardize the embeddings: fit on the TRAINING embeddings only.
# NOTE: this rebinds `scaler`, so the scaler fitted on the numeric columns
# earlier in the notebook is no longer reachable under that name.
scaler = StandardScaler()
scaled_embeddings_train = scaler.fit_transform(all_embeddings_train)


text_embeddings_test = embed_text_columns(test_text, model)

# combine embeddings from both columns so both can be used in clustering
all_embeddings_test = np.hstack([text_embeddings_test['Vibe'], text_embeddings_test['Activities']])
print(f"Combined embeddings shape: {all_embeddings_test.shape}")

# standardize the test embeddings with the statistics learned from the training
# embeddings. Fitting a fresh scaler on the test set would put the two splits
# in different feature spaces and leak test-set statistics into preprocessing.
scaled_embeddings_test = scaler.transform(all_embeddings_test)

Generated embeddings for 'Vibe', shape: (1302, 384)
Generated embeddings for 'Activities', shape: (1302, 384)
Combined embeddings shape: (1302, 768)
Generated embeddings for 'Vibe', shape: (76, 384)
Generated embeddings for 'Activities', shape: (76, 384)
Combined embeddings shape: (76, 768)


In [40]:
# Build the final feature matrices: scaled numeric columns first, followed by
# the scaled text-embedding columns, joined side by side.
train_matrix = np.concatenate((scaled_train, scaled_embeddings_train), axis=1)
test_matrix = np.concatenate((scaled_test, scaled_embeddings_test), axis=1)

# NOTE: X_train / X_test are rebound here to the processed frames, replacing
# the raw feature frames that earlier cells sliced from.
X_train = pd.DataFrame(train_matrix)
X_test = pd.DataFrame(test_matrix)

# Persist both frames with pandas' default CSV settings (the row index is
# written as the first column).
for frame, out_path in ((X_train, "X_train.csv"), (X_test, "X_test.csv")):
    frame.to_csv(out_path)