## 6. User Interaction: Interactive Encoding Workshop

This section provides a comprehensive interactive environment to experiment with different text encoding techniques and parameters. You can:

- Try various encoding methods (Bag of Words, TF-IDF, One-Hot Encoding, Hashing Vectorizer)
- Adjust preprocessing parameters
- Compare results across different encoding approaches
- Explore how changing parameters affects the resulting encodings
- Visualize the impact of your choices on dimensionality and information preservation

This hands-on approach helps build intuition about which encoding methods work best for different NLP tasks.

In [11]:
import ipywidgets as widgets
from IPython.display import display, HTML, clear_output
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.manifold import TSNE
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import string
import time

# Ensure the NLTK resources used by this cell are available.
# Each resource is checked on its own so that a single missing package does
# not trigger a re-download of all of them.
# NOTE(review): newer NLTK releases appear to load the word tokenizer from
# 'punkt_tab' instead of 'punkt'; both are requested here so word_tokenize
# works either way -- confirm against the installed NLTK version.
_NLTK_RESOURCES = (
    ('tokenizers/punkt', 'punkt'),
    ('tokenizers/punkt_tab', 'punkt_tab'),
    ('corpora/stopwords', 'stopwords'),
    ('corpora/wordnet', 'wordnet'),
)
for _resource_path, _package_name in _NLTK_RESOURCES:
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        nltk.download(_package_name)

# Sample corpus for demonstrations: five short English sentences, one per
# document. The workshop joins them with newlines to pre-fill its text area.
sample_corpus = [
    "Natural language processing is a subfield of artificial intelligence.",
    "Machine learning algorithms can perform text classification tasks.",
    "Word embeddings capture semantic relationships between words.",
    "Text preprocessing is crucial for effective language processing.",
    "Deep learning models have revolutionized natural language understanding."
]

def preprocess_text(text, lowercase=True, remove_punctuation=True, 
                   remove_stopwords=False, lemmatize=False):
    """Tokenize ``text`` after applying the selected clean-up steps.

    Steps run in this order: lower-casing, punctuation stripping,
    tokenization, stopword removal, lemmatization.

    Args:
        text: Raw document string.
        lowercase: Convert the text to lower case before tokenizing.
        remove_punctuation: Delete every character listed in
            ``string.punctuation`` before tokenizing.
        remove_stopwords: Drop tokens that appear in NLTK's English
            stopword list (matched case-insensitively).
        lemmatize: Pass each token through ``WordNetLemmatizer.lemmatize``
            with its default arguments.

    Returns:
        List of token strings, in document order.
    """
    if lowercase:
        text = text.lower()

    if remove_punctuation:
        text = text.translate(str.maketrans('', '', string.punctuation))

    tokens = word_tokenize(text)

    if remove_stopwords:
        stop_words = set(stopwords.words('english'))
        # Compare on the lower-cased token: the stopword list is lower case,
        # so a case-sensitive test would keep "The"/"Is" when lowercase=False.
        tokens = [token for token in tokens if token.lower() not in stop_words]

    if lemmatize:
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(token) for token in tokens]

    return tokens

def encode_text(texts, method="count", max_features=None, binary=False,
               ngram_range=(1,1), min_df=1, max_df=1.0, use_idf=True):
    """Preprocess ``texts`` and encode them with the selected vectorizer.

    Preprocessing settings are read from the module-level
    ``preprocess_options`` dict, not from the arguments.

    Args:
        texts: Iterable of raw document strings.
        method: One of "count", "tfidf", "hashing" or "onehot".
        max_features: Vocabulary size cap for count/tfidf/onehot; for
            "hashing" it is the number of hash buckets (1024 when falsy).
        binary: Record presence (0/1) rather than counts. Always on for
            "onehot".
        ngram_range: ``(min_n, max_n)`` passed to the vectorizer.
        min_df: Minimum document frequency (ignored by "hashing").
        max_df: Maximum document frequency (ignored by "hashing").
        use_idf: Enable IDF weighting; only used by "tfidf".

    Returns:
        Tuple ``(X, feature_names, vectorizer)``: the encoded matrix, its
        column labels, and the fitted vectorizer. For "hashing" the labels
        are synthetic ``feature_<i>`` names.

    Raises:
        ValueError: If ``method`` is not one of the supported identifiers.
    """
    supported_methods = ("count", "tfidf", "hashing", "onehot")
    if method not in supported_methods:
        # Fail with a clear message instead of an UnboundLocalError at return.
        raise ValueError(
            f"Unknown encoding method {method!r}; expected one of {supported_methods}"
        )

    preprocessed_texts = [
        ' '.join(preprocess_text(
            text,
            lowercase=preprocess_options['lowercase'],
            remove_punctuation=preprocess_options['remove_punct'],
            remove_stopwords=preprocess_options['remove_stopwords'],
            lemmatize=preprocess_options['lemmatize'],
        ))
        for text in texts
    ]

    # Case handling is decided during preprocessing. The scikit-learn
    # vectorizers lower-case by default, which would silently override a
    # disabled "lowercase" option, so it is switched off explicitly here.
    if method in ("count", "onehot"):
        # One-hot is a count vectorizer that only records presence/absence.
        vectorizer = CountVectorizer(max_features=max_features,
                                     binary=True if method == "onehot" else binary,
                                     ngram_range=ngram_range,
                                     min_df=min_df,
                                     max_df=max_df,
                                     lowercase=False)
    elif method == "tfidf":
        vectorizer = TfidfVectorizer(max_features=max_features,
                                     binary=binary,
                                     ngram_range=ngram_range,
                                     min_df=min_df,
                                     max_df=max_df,
                                     use_idf=use_idf,
                                     lowercase=False)
    else:  # "hashing"
        vectorizer = HashingVectorizer(n_features=max_features if max_features else 1024,
                                       binary=binary,
                                       ngram_range=ngram_range,
                                       lowercase=False)

    X = vectorizer.fit_transform(preprocessed_texts)

    if method == "hashing":
        # Hashing keeps no vocabulary, so label the columns by position.
        feature_names = [f"feature_{i}" for i in range(X.shape[1])]
    else:
        feature_names = vectorizer.get_feature_names_out()

    return X, feature_names, vectorizer

def visualize_encoding_results(X, feature_names, texts, method):
    """Build and display a four-tab widget visualizing an encoded corpus.

    Tabs: encoding-matrix heatmap, cosine-similarity heatmap, t-SNE scatter
    plot, and a bar chart of the highest-scoring features.

    Args:
        X: Document-term matrix with one row per document; either a NumPy
            array or a SciPy sparse matrix.
        feature_names: Column labels for ``X``, in column order.
        texts: The original documents, in the same order as the rows of ``X``.
        method: Encoding identifier ("count", "tfidf", "onehot" or
            "hashing"); used in titles and to pick the explanatory text.

    Returns:
        None. The tab widget is shown via ``display``.
    """
    import html  # local import: escapes user text interpolated into HTML

    max_display_features = 20
    doc_labels = [f"Doc {i+1}" for i in range(len(texts))]

    # Tab 1: Matrix Visualization
    def create_matrix_viz():
        out = widgets.Output()
        with out:
            # Show at most the first 20 columns so the heatmap stays readable
            truncated = len(feature_names) > max_display_features
            display_features = feature_names[:max_display_features] if truncated else feature_names

            if isinstance(X, np.ndarray):
                display_data = X[:, :max_display_features] if truncated else X
            else:
                # Sparse input: slice columns in CSC form, then densify for pandas
                sparse_view = X.tocsc()[:, :max_display_features] if truncated else X
                display_data = sparse_view.toarray()
            df = pd.DataFrame(display_data, columns=display_features)

            # Set index for better display
            df.index = doc_labels

            plt.figure(figsize=(12, 8))
            sns.heatmap(df, annot=True, cmap="YlGnBu", fmt=".2f")
            plt.title(f"{method.upper()} Encoding Matrix" +
                      (" (showing first 20 features)" if truncated else ""))
            plt.tight_layout()
            plt.show()

            # Sparsity statistics are only available for sparse matrices
            if hasattr(X, 'nnz'):
                non_zeros = X.nnz
                total_elements = X.shape[0] * X.shape[1]
                sparsity = 100 * (1 - non_zeros / total_elements)
                display(HTML(f"<div style='margin-top:10px'><b>Sparsity:</b> {sparsity:.2f}% of elements are zero</div>"))
                display(HTML(f"<div><b>Non-zero elements:</b> {non_zeros} out of {total_elements}</div>"))

        return out

    # Tab 2: Document Similarity
    def create_similarity_viz():
        out = widgets.Output()
        with out:
            # cosine_similarity accepts dense and sparse input alike
            sim_matrix = cosine_similarity(X)

            plt.figure(figsize=(10, 8))
            sns.heatmap(sim_matrix, annot=True, cmap="YlGnBu", fmt=".2f",
                        xticklabels=doc_labels,
                        yticklabels=doc_labels)
            plt.title(f"Document Similarity Matrix ({method.upper()} Encoding)")
            plt.tight_layout()
            plt.show()

            # Most similar document pair (needs at least two documents)
            if len(texts) > 1:
                off_diagonal = sim_matrix.copy()
                np.fill_diagonal(off_diagonal, -1)  # Exclude self-similarity
                first, second = np.unravel_index(np.argmax(off_diagonal), off_diagonal.shape)
                display(HTML(f"<div style='margin-top:10px'><b>Most similar documents:</b> Doc {first+1} and Doc {second+1} (similarity: {off_diagonal[first, second]:.2f})</div>"))
                # Escape the documents: they are user input and may contain
                # characters such as '<' or '&' that would corrupt the markup.
                display(HTML(f"<div style='margin-top:5px'><b>Doc {first+1}:</b> {html.escape(texts[first])}</div>"))
                display(HTML(f"<div style='margin-top:5px'><b>Doc {second+1}:</b> {html.escape(texts[second])}</div>"))

        return out

    # Tab 3: Dimension Reduction (t-SNE)
    def create_tsne_viz():
        out = widgets.Output()
        with out:
            n_samples = X.shape[0]
            if n_samples > 2:  # Need at least 3 documents
                try:
                    # scikit-learn requires perplexity < n_samples. Keep the
                    # original heuristic (between 5 and 30, scaled by
                    # n_samples / 5) but cap it at n_samples - 1 so small
                    # corpora, including the 5-document sample, still work.
                    perplexity = float(min(30, max(5, n_samples / 5), n_samples - 1))

                    dense_X = X if isinstance(X, np.ndarray) else X.toarray()
                    X_embedded = TSNE(n_components=2, random_state=42,
                                      perplexity=perplexity).fit_transform(dense_X)

                    plt.figure(figsize=(10, 8))
                    plt.scatter(X_embedded[:, 0], X_embedded[:, 1], s=100)

                    # Add document labels
                    for i, (x, y) in enumerate(X_embedded):
                        plt.annotate(f"Doc {i+1}", (x, y), fontsize=12,
                                     alpha=0.8, xytext=(5, 5), textcoords='offset points')

                    plt.title(f"t-SNE Visualization of Document Vectors ({method.upper()} Encoding)")
                    plt.tight_layout()
                    plt.show()

                    # Explain the visualization
                    display(HTML("<div style='margin-top:10px'><b>Interpretation:</b> Documents that are semantically similar are positioned closer together in this 2D representation.</div>"))
                    display(HTML(f"<div><b>Note:</b> Using perplexity={perplexity:.1f} for {n_samples} documents.</div>"))
                except Exception as e:
                    # Best effort: report the failure in this tab without
                    # breaking the remaining tabs.
                    display(HTML(f"<div style='color:red'>Could not create t-SNE visualization: {html.escape(str(e))}</div>"))
                    display(HTML("<div><b>Explanation:</b> t-SNE requires at least 3 documents with a perplexity value less than the number of documents.</div>"))
            elif n_samples == 2:
                # With just 2 documents, show their similarity instead of t-SNE
                display(HTML("<div>t-SNE not necessary for only 2 documents. Displaying simple comparison:</div>"))

                sim = cosine_similarity(X)[0, 1]

                plt.figure(figsize=(6, 4))
                plt.bar(['Document Similarity'], [sim])
                plt.ylim(0, 1)
                plt.title("Cosine Similarity Between Documents")
                plt.ylabel("Similarity Score")
                plt.show()
            else:
                display(HTML("<div>t-SNE visualization requires at least 2 documents.</div>"))

        return out

    # Tab 4: Feature Importance (for vocabulary-based encodings)
    def create_feature_importance():
        out = widgets.Output()
        with out:
            if method in ["count", "tfidf", "onehot"]:
                # Column sums as a flat 1-D array; np.asarray(...).ravel()
                # copes with ndarray and np.matrix results alike.
                feature_sums = np.asarray(X.sum(axis=0)).ravel()

                # Create dataframe with feature importance
                importance_df = pd.DataFrame({
                    'Feature': feature_names,
                    'Importance': feature_sums
                })

                # Get top 15 features by importance
                top_features = importance_df.sort_values('Importance', ascending=False).head(15)

                plt.figure(figsize=(12, 6))
                bars = sns.barplot(x='Importance', y='Feature', data=top_features)

                # Add value labels to bars
                for i, v in enumerate(top_features['Importance']):
                    bars.text(v + 0.1, i, f"{v:.2f}", va='center')

                plt.title(f"Top 15 Most Important Features ({method.upper()} Encoding)")
                plt.tight_layout()
                plt.show()

                # Explain what importance means for each method
                if method == 'count':
                    display(HTML("<div style='margin-top:10px'><b>Importance in BoW:</b> Total count of each term across all documents. Higher values indicate more frequent terms.</div>"))
                elif method == 'tfidf':
                    display(HTML("<div style='margin-top:10px'><b>Importance in TF-IDF:</b> Sum of TF-IDF scores for each term. Higher values indicate terms that are both frequent in some documents and discriminative across the corpus.</div>"))
                elif method == 'onehot':
                    display(HTML("<div style='margin-top:10px'><b>Importance in One-Hot:</b> Number of documents containing each term. Higher values indicate more widespread terms.</div>"))
            elif method == 'hashing':
                display(HTML("<div>Feature importance visualization not applicable for hashing vectorizer (feature names are not interpretable).</div>"))

        return out

    # Assemble the tabs in display order
    tab = widgets.Tab()
    tab.children = [
        create_matrix_viz(),
        create_similarity_viz(),
        create_tsne_viz(),
        create_feature_importance(),
    ]
    tab.set_title(0, "Encoding Matrix")
    tab.set_title(1, "Document Similarity")
    tab.set_title(2, "t-SNE Visualization")
    tab.set_title(3, "Feature Importance")

    display(tab)

# Global dict to store preprocessing options.
# encode_text() reads these keys when preprocessing documents, and the
# workshop's checkbox callbacks overwrite them with the current widget values.
preprocess_options = {
    'lowercase': True,
    'remove_punct': True,
    'remove_stopwords': False,
    'lemmatize': False
}

def interactive_encoding_workshop():
    """Build and display the interactive encoding workshop UI.

    Creates the document text area, preprocessing checkboxes, encoding-method
    dropdown, advanced option controls, a "Run Encoding" button and an output
    area, wires up their callbacks, and displays them. Clicking the button
    encodes the entered documents (one per non-blank line) with
    ``encode_text`` and renders the result with ``visualize_encoding_results``.

    Side effects:
        Updates the module-level ``preprocess_options`` dict whenever a
        preprocessing checkbox changes and before each run.

    Returns:
        None.
    """
    import html  # local import: escapes error text interpolated into HTML

    # Text input
    text_area = widgets.Textarea(
        value='\n'.join(sample_corpus),
        placeholder='Enter text documents, one per line',
        description='Documents:',
        layout=widgets.Layout(width='100%', height='150px')
    )

    # Pre-processing options
    lowercase = widgets.Checkbox(value=True, description='Lowercase')
    remove_punct = widgets.Checkbox(value=True, description='Remove punctuation')
    remove_stopwords = widgets.Checkbox(value=False, description='Remove stopwords')
    lemmatize = widgets.Checkbox(value=False, description='Lemmatize')

    preprocess_box = widgets.VBox([
        widgets.HTML("<b>Text Preprocessing Options:</b>"),
        widgets.HBox([lowercase, remove_punct, remove_stopwords, lemmatize])
    ])

    # Encoding method selection
    method = widgets.Dropdown(
        options=[
            ('Bag of Words (Count)', 'count'),
            ('TF-IDF', 'tfidf'),
            ('One-Hot Encoding', 'onehot'),
            ('Hashing Vectorizer', 'hashing')
        ],
        value='count',
        description='Method:'
    )

    # Advanced options for encoding methods
    max_features = widgets.IntSlider(
        value=100,
        min=10,
        max=1000,
        step=10,
        description='Max Features:',
        disabled=False,
        continuous_update=False
    )

    binary = widgets.Checkbox(
        value=False,
        description='Binary (0/1)',
        disabled=False
    )

    ngram_min = widgets.IntSlider(
        value=1,
        min=1,
        max=3,
        step=1,
        description='N-gram Min:',
        disabled=False,
        continuous_update=False
    )

    ngram_max = widgets.IntSlider(
        value=1,
        min=1,
        max=3,
        step=1,
        description='N-gram Max:',
        disabled=False,
        continuous_update=False
    )

    use_idf = widgets.Checkbox(
        value=True,
        description='Use IDF (for TF-IDF)',
        # Start in the state on_method_change() would produce for the
        # initial method: IDF only applies when TF-IDF is selected.
        disabled=method.value != 'tfidf'
    )

    advanced_options = widgets.VBox([
        widgets.HTML("<b>Advanced Encoding Options:</b>"),
        widgets.HBox([max_features, binary]),
        widgets.HBox([ngram_min, ngram_max, use_idf])
    ])

    # Output area for results
    output = widgets.Output()

    # Button to run the encoding
    run_button = widgets.Button(
        description='Run Encoding',
        button_style='success',
        tooltip='Click to run the encoding with selected options'
    )

    # Copy the checkbox states into the shared preprocessing options
    def update_preprocess_options(*args):
        preprocess_options['lowercase'] = lowercase.value
        preprocess_options['remove_punct'] = remove_punct.value
        preprocess_options['remove_stopwords'] = remove_stopwords.value
        preprocess_options['lemmatize'] = lemmatize.value

    # Connect preprocessing widgets to the update function
    for checkbox in (lowercase, remove_punct, remove_stopwords, lemmatize):
        checkbox.observe(update_preprocess_options, names='value')

    # Keep ngram_max >= ngram_min
    def update_ngram_max(*args):
        ngram_max.min = ngram_min.value
        if ngram_max.value < ngram_min.value:
            ngram_max.value = ngram_min.value

    ngram_min.observe(update_ngram_max, names='value')

    # Enable/disable option widgets according to the selected method
    def on_method_change(change):
        # If one-hot encoding is selected, force binary to True and disable
        if change.new == 'onehot':
            binary.value = True
            binary.disabled = True
        else:
            binary.disabled = False
            # Only for count vectorizer we default to False
            if change.new == 'count':
                binary.value = False

        # Use IDF only relevant for TF-IDF
        use_idf.disabled = change.new != 'tfidf'

    method.observe(on_method_change, names='value')

    # Button click event handler
    def on_run_button_click(b):
        with output:
            clear_output()

            # One document per non-blank line; blank lines would otherwise
            # become empty documents in the encoded matrix.
            texts = [line.strip() for line in text_area.value.splitlines() if line.strip()]
            if not texts:
                display(HTML("<div style='color:red'>Please enter at least one document.</div>"))
                return

            # Update preprocessing options
            update_preprocess_options()

            # Display a progress message
            display(HTML("<div>Processing... Please wait.</div>"))

            try:
                # Time the encoding process
                start_time = time.time()

                # Perform encoding
                X, feature_names, vectorizer = encode_text(
                    texts,
                    method=method.value,
                    max_features=max_features.value if max_features.value > 0 else None,
                    binary=binary.value,
                    ngram_range=(ngram_min.value, ngram_max.value),
                    use_idf=use_idf.value
                )

                end_time = time.time()

                # Clear the progress message
                clear_output()

                # Dropdown options are (label, value) pairs; invert them so the
                # human-readable label can be looked up from the selected value.
                label_by_value = {value: label for label, value in method.options}
                method_label = label_by_value.get(method.value, method.value.upper())

                # Display statistics
                display(HTML(f"<h3>Encoding Results: {method_label}</h3>"))
                display(HTML(f"<div><b>Number of documents:</b> {len(texts)}</div>"))
                display(HTML(f"<div><b>Number of features:</b> {len(feature_names)}</div>"))
                display(HTML(f"<div><b>Matrix shape:</b> {X.shape[0]} documents × {X.shape[1]} features</div>"))
                display(HTML(f"<div><b>Processing time:</b> {end_time - start_time:.4f} seconds</div>"))

                # Display encoding method explanation
                if method.value == 'count':
                    display(HTML("<div style='margin-top:10px'><b>Bag of Words:</b> Counts word occurrences in each document. Simple but loses word order and semantic meaning.</div>"))
                elif method.value == 'tfidf':
                    display(HTML("<div style='margin-top:10px'><b>TF-IDF:</b> Weights words based on frequency in document and rarity across corpus. Highlights distinctive terms.</div>"))
                elif method.value == 'onehot':
                    display(HTML("<div style='margin-top:10px'><b>One-Hot Encoding:</b> Binary representation (0/1) indicating word presence. Loses frequency information but simple to interpret.</div>"))
                elif method.value == 'hashing':
                    display(HTML("<div style='margin-top:10px'><b>Hashing Vectorizer:</b> Uses a hash function to map terms to indices. Memory efficient but feature names are not interpretable.</div>"))

                # Display visualizations
                visualize_encoding_results(X, feature_names, texts, method.value)

            except Exception as e:
                # Top-level UI boundary: show the error in the output area
                # together with the traceback instead of failing silently.
                clear_output()
                display(HTML(f"<div style='color:red'>Error: {html.escape(str(e))}</div>"))
                import traceback
                traceback.print_exc()

    run_button.on_click(on_run_button_click)

    # Layout
    display(text_area)
    display(preprocess_box)
    display(widgets.HBox([method]))
    display(advanced_options)
    display(run_button)
    display(output)

# Run the workshop: builds the widgets and displays them below this cell
interactive_encoding_workshop()

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/samarmohanty/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/samarmohanty/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/samarmohanty/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Textarea(value='Natural language processing is a subfield of artificial intelligence.\nMachine learning algori…

VBox(children=(HTML(value='<b>Text Preprocessing Options:</b>'), HBox(children=(Checkbox(value=True, descripti…

HBox(children=(Dropdown(description='Method:', options=(('Bag of Words (Count)', 'count'), ('TF-IDF', 'tfidf')…

VBox(children=(HTML(value='<b>Advanced Encoding Options:</b>'), HBox(children=(IntSlider(value=100, continuous…

Button(button_style='success', description='Run Encoding', style=ButtonStyle(), tooltip='Click to run the enco…

Output()