In [7]:
!pip install faiss-cpu



In [8]:
# Show the dataset schema: every column name, followed by how many there are
column_names = df.columns.tolist()
print("Available columns in dataset:")
print(column_names)
print(f"\nTotal columns: {len(column_names)}")

Available columns in dataset:
['post_id', 'timestamp', 'day_of_week', 'platform', 'user_id', 'location', 'language', 'text_content', 'hashtags', 'mentions', 'keywords', 'topic_category', 'sentiment_score', 'sentiment_label', 'emotion_type', 'toxicity_score', 'likes_count', 'shares_count', 'comments_count', 'impressions', 'engagement_rate', 'brand_name', 'product_name', 'campaign_name', 'campaign_phase', 'user_past_sentiment_avg', 'user_engagement_growth', 'buzz_change_rate', 'combined_text', 'engagement_label']

Total columns: 30


In [9]:
!pip install faiss-cpu # Install FAISS

# 1. Load processed data
import pandas as pd
import numpy as np

# Read the processed posts CSV into `df` and report how many rows were loaded
processed_posts_csv = '/content/social_media_processed.csv'
df = pd.read_csv(processed_posts_csv)
print(f"Loaded {df.shape[0]} posts")

# 2. Generate embeddings
from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')

# pd.read_csv parses empty cells as NaN (a float), which the tokenizer cannot
# handle. Replace missing text with '' so every row still gets a vector and
# embedding row i keeps corresponding to df row i.
texts = df['combined_text'].fillna('').astype(str).tolist()

# Encode all combined_text fields
print("Generating embeddings...")
embeddings = model.encode(
    texts,
    normalize_embeddings=True,  # unit-length vectors, so inner product == cosine similarity
    show_progress_bar=True,
    batch_size=32
)

# Every post must have exactly one embedding before anything is persisted
assert embeddings.shape[0] == len(df), "embedding count does not match number of posts"

# Save embeddings
np.save('/content/post_embeddings.npy', embeddings)
print(f"Embeddings shape: {embeddings.shape}")  # (number of posts, embedding dimension)

# 3. Build FAISS index
import faiss

# Take the vector width from the data instead of hard-coding 384, so the index
# stays valid if the embedding model is swapped for one with a different size.
dimension = embeddings.shape[1]

# IndexFlatIP scores by inner product. That equals cosine similarity only for
# unit-length vectors, which is why encode() is called with normalize_embeddings=True.
index = faiss.IndexFlatIP(dimension)

# Add embeddings to index (FAISS expects a C-contiguous float32 matrix)
index.add(np.ascontiguousarray(embeddings, dtype='float32'))

# Save FAISS index
faiss.write_index(index, '/content/posts_faiss.index')
print(f"FAISS index created with {index.ntotal} vectors")

Loaded 11997 posts
Generating embeddings...


Batches:   0%|          | 0/375 [00:00<?, ?it/s]

Embeddings shape: (11997, 384)
FAISS index created with 11997 vectors


In [10]:
# 4. Create metadata dictionary
import pickle

# Key each post's fields by its row position (0..n-1). The FAISS index was
# filled from the embeddings in row order, so search hits refer to positions,
# not index labels; enumerate() keeps the keys aligned even if df's index is
# not a default RangeIndex. to_dict('records') also avoids constructing a
# Series per row the way iterrows() does.
metadata = dict(enumerate(df.to_dict('records')))

# Persist the lookup table next to the index and embeddings
with open('/content/posts_metadata.pkl', 'wb') as f:
    pickle.dump(metadata, f)

print(f"✓ Metadata saved for {len(metadata)} posts")

✓ Metadata saved for 11997 posts
