In [None]:
def _feature_memory_mb(features):
    """Return the in-memory size of a feature matrix in megabytes.

    Compressed sparse matrices (CSR/CSC) keep their contents in three
    buffers: ``data``, ``indices`` and ``indptr``. All three are counted so
    that the figure is comparable with a dense array's ``nbytes``.

    Parameters
    ----------
    features : object
        A feature matrix; typically a SciPy sparse matrix or a NumPy array.

    Returns
    -------
    float
        Size in MB, or 0 when the object exposes no recognizable buffer.
    """
    if hasattr(features, 'indices') and hasattr(features, 'indptr'):
        # CSR/CSC layout: values + column/row indices + pointer array.
        total_bytes = (
            features.data.nbytes
            + features.indices.nbytes
            + features.indptr.nbytes
        )
    elif hasattr(features, 'nbytes'):
        # Dense NumPy array (or anything array-like that reports nbytes).
        total_bytes = features.nbytes
    elif hasattr(features, 'data'):
        # Other containers exposing only a value buffer.
        total_bytes = features.data.nbytes
    else:
        total_bytes = 0
    return total_bytes / 1024 / 1024


print("=" * 80)
print("FEATURE EXTRACTION SUMMARY")
print("=" * 80)

# Map each display label to the feature matrix produced earlier in the notebook,
# so every column of the summary table is derived from the same ordering.
feature_sets = {
    'Bag-of-Words': bow_features,
    'TF-IDF': tfidf_features,
    'Word2Vec': w2v_features,
}

comparison = pd.DataFrame({
    'Embedding Method': list(feature_sets.keys()),
    'Shape': [features.shape for features in feature_sets.values()],
    'Type': [type(features).__name__ for features in feature_sets.values()],
    'Memory (MB)': [_feature_memory_mb(features) for features in feature_sets.values()],
})

print(comparison.to_string(index=False))
print("\nKey Insights:")
print("- BoW: Simple, interpretable, sparse features")
print("- TF-IDF: Weighted frequency, reduces importance of common words")
print("- Word2Vec: Dense embeddings, captures semantic meaning, fixed dimension")
print("- BERT: Contextual embeddings, best semantic representation (GPU recommended)")

## Feature Comparison Summary

In [None]:
# BERT embeddings are optional in this notebook because extracting them is
# computationally expensive. To run the extraction, uncomment the block below
# (the `transformers` and `torch` packages must be installed). The example
# embeds only the first 1000 cleaned reviews, in batches of 32.

# print("Extracting BERT embeddings...")
# bert_extractor = BERTExtractor(model_name='bert-base-uncased', device='cpu')
# 
# # Extract embeddings in batches for efficiency
# bert_features = bert_extractor.transform(df['cleaned_text'].values[:1000], batch_size=32)
# 
# print(f"BERT Features Shape: {bert_features.shape}")
# print(f"Embedding dimension: {bert_features.shape[1]}")
# 
# joblib.dump(bert_extractor, r'C:\Users\admin\Documents\Innomatics\Sentiment\sentiment_analysis_project\models\bert_extractor.pkl')
# print("BERT extractor saved!")

# Tell the reader explicitly that no BERT features were produced by this cell.
print(
    "BERT extraction skipped in this notebook (requires GPU for efficient processing)"
)

## Feature Extraction: BERT (Optional - GPU Recommended)

In [None]:
print("Extracting Word2Vec embeddings...")

# The Word2Vec extractor is fed token lists, so split every cleaned review
# on whitespace before fitting.
tokenized_texts = [review.split() for review in df['cleaned_text'].values]

# Fit the extractor (300-dimensional vectors, context window of 5, words seen
# fewer than 2 times ignored) and compute the features in a single call.
w2v_extractor = Word2VecExtractor(vector_size=300, window=5, min_count=2)
w2v_features = w2v_extractor.fit_transform(tokenized_texts)

print(f"Word2Vec Features Shape: {w2v_features.shape}")
print(f"Embedding dimension: {w2v_features.shape[1]}")

# Persist the fitted extractor so later notebooks can reuse it.
w2v_extractor_path = r'C:\Users\admin\Documents\Innomatics\Sentiment\sentiment_analysis_project\models\w2v_extractor.pkl'
joblib.dump(w2v_extractor, w2v_extractor_path)
print("Word2Vec extractor saved!")

## Feature Extraction: Word2Vec

In [None]:
print("Extracting TF-IDF features...")

# Fit a TF-IDF extractor capped at 5000 features on the cleaned review text
# and compute the feature matrix in the same call.
cleaned_reviews_for_tfidf = df['cleaned_text'].values
tfidf_extractor = TFIDFExtractor(max_features=5000)
tfidf_features = tfidf_extractor.fit_transform(cleaned_reviews_for_tfidf)

# Report the matrix dimensions, learned vocabulary size and container type.
print(f"TF-IDF Features Shape: {tfidf_features.shape}")
print(f"Vocabulary size: {len(tfidf_extractor.feature_names)}")
print(f"Feature type: {type(tfidf_features)}")

# Persist the fitted extractor so later notebooks can reuse it.
tfidf_extractor_path = r'C:\Users\admin\Documents\Innomatics\Sentiment\sentiment_analysis_project\models\tfidf_extractor.pkl'
joblib.dump(tfidf_extractor, tfidf_extractor_path)
print("TF-IDF extractor saved!")

## Feature Extraction: TF-IDF

In [None]:
print("Extracting Bag-of-Words features...")

# Fit a Bag-of-Words extractor capped at 5000 features on the cleaned review
# text and compute the feature matrix in the same call.
cleaned_reviews_for_bow = df['cleaned_text'].values
bow_extractor = BagOfWordsExtractor(max_features=5000)
bow_features = bow_extractor.fit_transform(cleaned_reviews_for_bow)

# Report the matrix dimensions, learned vocabulary size and container type.
print(f"BoW Features Shape: {bow_features.shape}")
print(f"Vocabulary size: {len(bow_extractor.feature_names)}")
print(f"Feature type: {type(bow_features)}")

# Persist the fitted extractor so later notebooks can reuse it.
bow_extractor_path = r'C:\Users\admin\Documents\Innomatics\Sentiment\sentiment_analysis_project\models\bow_extractor.pkl'
joblib.dump(bow_extractor, bow_extractor_path)
print("BoW extractor saved!")

## Feature Extraction: Bag-of-Words

In [None]:
# Location of the preprocessed reviews produced by the earlier preprocessing step.
data_path = r'C:\Users\admin\Documents\Innomatics\Sentiment\sentiment_analysis_project\data\preprocessed_reviews.csv'
df = pd.read_csv(data_path)

# pandas reads empty CSV fields back as NaN (a float). A review whose cleaned
# text ended up empty would therefore break the string slicing below and the
# `.split()` / vectorizer calls in later cells, so normalize the column to
# plain strings once, right after loading.
df['cleaned_text'] = df['cleaned_text'].fillna('').astype(str)

# Quick sanity check: row count, class balance and a preview of the first review.
print(f"Dataset loaded: {len(df)} reviews")
print(f"Positive: {(df['Sentiment']==1).sum()}, Negative: {(df['Sentiment']==0).sum()}")
print(f"Cleaned text sample: {df['cleaned_text'].iloc[0][:100]}...")

## Load Preprocessed Data

In [None]:
import pandas as pd
import numpy as np
import sys
# Add the project's `src` directory to the import path so the local
# `feature_extraction` module imported below can be resolved
# (presumably that is where feature_extraction.py lives — verify on your machine).
sys.path.append(r'C:\Users\admin\Documents\Innomatics\Sentiment\sentiment_analysis_project\src')

# Project-local extractor classes used by the feature extraction cells.
from feature_extraction import BagOfWordsExtractor, TFIDFExtractor, Word2VecExtractor, BERTExtractor
# joblib is used by the extraction cells to persist the fitted extractors.
import joblib
import warnings
# Suppress every Python warning for the rest of the session to keep cell output short.
# NOTE(review): this also hides deprecation and convergence warnings.
warnings.filterwarnings('ignore')

# 3. Feature Extraction (Text Embedding)
## Extracting Numerical Features from Reviews

In this notebook, we will:
- Load preprocessed reviews
- Extract Bag-of-Words (BoW) features
- Extract TF-IDF features
- Extract Word2Vec embeddings
- Extract BERT embeddings (optional; skipped by default in this notebook)
- Compare and analyze different embedding techniques