<a href="https://colab.research.google.com/github/abbasi2021/embedding/blob/main/embedding_encoding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras.layers import Input, Embedding, Flatten
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import set_random_seed

In [None]:
# Small sample mapping with two equally long columns: 'value' and 'year'.
# NOTE(review): `data` is reassigned in the following cell before it is read
# anywhere in the visible notebook, so this dict looks unused — confirm and
# drop the cell if so.
data = dict(
    value=[300, 23898, 778, 209],
    year=[1399, 1400, 1403, 1400],
)

In [None]:
# Toy dataset: six observations across three distinct years, each with one
# numeric feature. Key order here determines the DataFrame column order.
data = dict(
    year=[1990, 2000, 2010, 1990, 2000, 2010],
    feature1=[10.5, 20.3, 15.7, 10.1, 22.4, 17.3],
)
df = pd.DataFrame.from_dict(data)

In [None]:
# Display the whole frame (it only has six rows) to confirm it was built as expected.
df

Unnamed: 0,year,feature1
0,1990,10.5
1,2000,20.3
2,2010,15.7
3,1990,10.1
4,2000,22.4
5,2010,17.3


In [None]:

# Step 2: Prepare the year column for the embedding layer.
# An Embedding layer looks up rows by integer index, so every distinct year is
# first mapped to an integer code. The fitted encoder is kept in `le` so the
# codes can be mapped back to the original years later if needed.
le = LabelEncoder()
le.fit(df['year'])
df['year_encoded'] = le.transform(df['year'])

In [None]:


# Purpose of this cell: turn each encoded year into a small dense vector via a
# Keras Embedding layer, append the standardised numeric feature, and cluster
# the resulting rows with KMeans. The cluster label is written to df['cluster'].

# Tunable settings for this step.
SEED = 0           # shared by the embedding initialisation and KMeans
N_CLUSTERS = 2     # number of KMeans clusters
embedding_dim = 3  # Embedding size (can be adjusted)

# The embedding weights are never trained below, so they are exactly what the
# random initialiser produces. Seed the RNGs first; without this the embeddings
# (and the clusters derived from them) differ on every Restart & Run All.
set_random_seed(SEED)

# Define the embedding layer: one row per distinct encoded year.
num_years = df['year_encoded'].nunique()  # Number of unique years

# Keras model that maps an encoded year to its embedding vector
input_year = Input(shape=(1,))
embedding_year = Embedding(input_dim=num_years, output_dim=embedding_dim)(input_year)
year_embedding_flat = Flatten()(embedding_year)

# Dummy model to extract embeddings (no real training needed for clustering).
# compile() is not required for predict(); it is kept so the model is ready to
# be fitted if training is added later.
model = Model(inputs=input_year, outputs=year_embedding_flat)
model.compile(optimizer=Adam(), loss='mse')

# Step 3: Generate the embeddings for each year
# Feed an explicit (n_samples, 1) array so the input matches Input(shape=(1,))
# instead of relying on Keras to expand a 1-D pandas Series. verbose=0 keeps
# the progress bar out of the cell output.
year_codes = df['year_encoded'].to_numpy().reshape(-1, 1)
year_embeddings = model.predict(year_codes, verbose=0)
assert year_embeddings.shape == (len(df), embedding_dim)

# Step 4: Scale the numerical feature so it has zero mean and unit variance.
# Scaling happens before concatenation rather than by mutating a slice of the
# combined array afterwards; the resulting values are the same.
feature1 = df['feature1'].values.reshape(-1, 1)
scaler = StandardScaler()
feature1_scaled = scaler.fit_transform(feature1)

# Step 5: Concatenate the year embeddings with the scaled feature
# (embedding columns first, scaled feature1 as the last column).
# NOTE(review): the untrained embedding values are presumably much smaller in
# magnitude than the standardised feature1, so feature1 likely dominates the
# KMeans distances — confirm this weighting is intended.
concatenated_features = np.hstack((year_embeddings, feature1_scaled))

# Step 6: Apply KMeans Clustering
# n_init is pinned explicitly: its default changed between scikit-learn
# releases, and leaving it implicit triggers a FutureWarning on some versions.
kmeans = KMeans(n_clusters=N_CLUSTERS, n_init=10, random_state=SEED)
df['cluster'] = kmeans.fit_predict(concatenated_features)

# Output the results (bare last expression -> rich notebook table)
df[['year', 'feature1', 'cluster']]
