In [None]:
# Install required packages (run this cell first if packages are not installed)
# !pip install pandas numpy matplotlib seaborn plotly dash dash-bootstrap-components scikit-learn nltk vaderSentiment wordcloud

# Core data manipulation and analysis
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

# NLP and text processing
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.cluster import KMeans
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
from wordcloud import WordCloud

# Interactive dashboard
import dash
from dash import dcc, html, Input, Output
import dash_bootstrap_components as dbc

# Warnings suppression for cleaner output
import warnings
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
random.seed(42)
np.random.seed(42)

# Download required NLTK data.
# Every resource is probed on its own: with a single shared try block, a
# resource that is checked first and found would hide a later missing one,
# and 'omw-1.4' was never checked at all.
# NOTE(review): recent NLTK releases load the tokenizer from 'punkt_tab'
# instead of 'punkt' -- confirm against the installed NLTK version. On older
# releases the extra download request is reported as unknown and ignored.
_NLTK_RESOURCES = {
    'punkt': 'tokenizers/punkt',
    'punkt_tab': 'tokenizers/punkt_tab',
    'stopwords': 'corpora/stopwords',
    'wordnet': 'corpora/wordnet',
    'omw-1.4': 'corpora/omw-1.4',
}
for _nltk_package, _nltk_path in _NLTK_RESOURCES.items():
    try:
        nltk.data.find(_nltk_path)
    except LookupError:
        nltk.download(_nltk_package)

# Configure plotting style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

print("All libraries imported successfully!")
print(f"Notebook initialized at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")


In [None]:
def generate_synthetic_survey_data(n_responses=800):
    """
    Generate synthetic K-12 survey responses with realistic themes and sentiments.

    Each response draws, in order: a respondent type (60% students, 40%
    teachers), a sentiment (40% positive, 35% negative, 25% neutral), one
    template sentence for that type and sentiment, and a grade level.
    Randomness comes from the global `random` and `numpy.random` states, so
    seed both beforehand for reproducible output.

    Args:
        n_responses (int): Number of survey rows to generate.

    Returns:
        pd.DataFrame: Survey data with columns response_id, response_text,
                     respondent_type, grade_level, grade_group and
                     true_sentiment (the sentiment the template was drawn
                     from, kept for validation). The columns are present
                     even when n_responses is 0.
    """

    # Define response templates for different themes and sentiments
    student_responses = {
        'positive': [
            "I really enjoy using this platform! The interactive lessons make learning fun and engaging.",
            "The videos are super helpful and easy to understand. I can learn at my own pace.",
            "Love the colorful design and smooth animations. It makes studying feel like playing games.",
            "The quizzes are challenging but fair. I feel more confident about the material now.",
            "Great job on the mobile app! I can study anywhere and sync my progress perfectly.",
            "The homework help feature is amazing. I get instant feedback and explanations.",
            "My grades have improved since using this. The personalized learning path works great.",
            "The community features let me collaborate with classmates easily. Very social and fun!",
            "Loading is super fast and the interface is intuitive. No technical problems at all.",
            "The gamification elements motivate me to complete lessons. Badges and points are awesome!"
        ],
        'negative': [
            "The interface is way too confusing. I can't find anything and get lost constantly.",
            "The content moves too fast for me. I wish there were more practice problems.",
            "The app crashes frequently and I lose my progress. Very frustrating experience.",
            "The lessons are boring and repetitive. Not engaging at all for students my age.",
            "Navigation is terrible. Too many clicks to get to simple features.",
            "The explanations are unclear and hard to follow. Need better examples.",
            "Loading takes forever and the app freezes often. Makes me not want to use it.",
            "The design looks outdated and childish. Needs a modern refresh badly.",
            "Can't access content on mobile properly. Everything is tiny and hard to read.",
            "The pacing is off - either too easy or impossibly hard. No middle ground."
        ],
        'neutral': [
            "The platform has good content but the user interface could use some improvements.",
            "Some features work well while others need more development. Mixed experience overall.",
            "It's okay for basic learning but nothing special compared to other tools.",
            "The concept is good but execution could be better. Has potential.",
            "Works fine for homework but could be more engaging for long study sessions.",
            "Average experience. Gets the job done but doesn't stand out.",
            "Some lessons are great while others feel rushed. Inconsistent quality.",
            "The platform serves its purpose but lacks innovative features.",
            "Decent tool for studying but room for improvement in user experience.",
            "It's functional but could benefit from more interactive elements."
        ]
    }

    teacher_responses = {
        'positive': [
            "Excellent tool for classroom management and student engagement tracking.",
            "The analytics dashboard provides valuable insights into student performance patterns.",
            "Easy to assign and grade assignments. The automated feedback saves me hours.",
            "Great integration with our existing curriculum. Seamless workflow integration.",
            "Students are more engaged with lessons since we started using this platform.",
            "The professional development resources are comprehensive and well-organized.",
            "Parent communication features help keep families informed about student progress.",
            "Customizable lesson plans align perfectly with our district standards.",
            "The collaboration tools make group projects much easier to manage.",
            "Robust reporting features help me identify students who need extra support."
        ],
        'negative': [
            "The learning curve is too steep for busy teachers. Need better onboarding.",
            "Too many features make the interface cluttered and overwhelming to navigate.",
            "Limited customization options for different teaching styles and preferences.",
            "Student progress tracking is confusing and hard to interpret meaningfully.",
            "The platform doesn't integrate well with our school's existing systems.",
            "Frequent technical issues during class time disrupt lesson flow significantly.",
            "Lack of adequate training materials for teachers new to the platform.",
            "The grading system is inflexible and doesn't match our rubrics.",
            "Parent portal is confusing and generates too many support requests.",
            "Performance is slow with large class sizes. System can't handle the load."
        ],
        'neutral': [
            "The platform has useful features but requires significant time investment to master.",
            "Good foundation but needs more development in key areas like assessment tools.",
            "Meets basic needs but lacks advanced features found in competitor products.",
            "Adequate for simple tasks but struggles with more complex classroom scenarios.",
            "Some students adapt well while others find it challenging to use effectively.",
            "The platform works but doesn't significantly improve upon traditional methods.",
            "Mixed results - some features are excellent while others need work.",
            "Functional tool that accomplishes its goals with room for enhancement.",
            "Reasonable option but not necessarily better than what we used before.",
            "Standard educational technology platform with typical strengths and weaknesses."
        ]
    }

    # Grade level mapping
    grade_levels = list(range(13))  # K-12 (K=0, 1-12)
    grade_groups = {
        0: 'Elementary', 1: 'Elementary', 2: 'Elementary', 3: 'Elementary', 4: 'Elementary',
        5: 'Middle', 6: 'Middle', 7: 'Middle', 8: 'Middle',
        9: 'High', 10: 'High', 11: 'High', 12: 'High'
    }

    # Relative weights for student grade levels, peaking in the middle grades.
    # The raw values add up to 1.05, and np.random.choice rejects a `p` that
    # does not sum to 1, so they are normalised into probabilities once here.
    grade_weights = np.array(
        [0.05, 0.06, 0.07, 0.08, 0.09, 0.10, 0.11, 0.12, 0.11, 0.08, 0.07, 0.06, 0.05]
    )
    grade_probabilities = grade_weights / grade_weights.sum()

    columns = [
        'response_id', 'response_text', 'respondent_type',
        'grade_level', 'grade_group', 'true_sentiment'
    ]

    # Generate survey responses
    survey_data = []

    for i in range(n_responses):
        # Determine respondent type (60% students, 40% teachers)
        respondent_type = 'Student' if random.random() < 0.6 else 'Teacher'

        # Select appropriate response templates
        responses = student_responses if respondent_type == 'Student' else teacher_responses

        # Determine sentiment distribution (40% positive, 35% negative, 25% neutral)
        sentiment_prob = random.random()
        if sentiment_prob < 0.40:
            sentiment = 'positive'
        elif sentiment_prob < 0.75:
            sentiment = 'negative'
        else:
            sentiment = 'neutral'

        # Select random response from appropriate category
        response_text = random.choice(responses[sentiment])

        # Assign grade level (weighted towards middle grades for students)
        if respondent_type == 'Student':
            # int() turns the numpy scalar into a plain Python int
            grade_level = int(np.random.choice(grade_levels, p=grade_probabilities))
        else:  # Teachers can be associated with any grade
            grade_level = random.choice(grade_levels)

        survey_data.append({
            'response_id': f'RESP_{i+1:04d}',
            'response_text': response_text,
            'respondent_type': respondent_type,
            'grade_level': f'Grade {grade_level}' if grade_level > 0 else 'Kindergarten',
            'grade_group': grade_groups[grade_level],
            'true_sentiment': sentiment  # For validation purposes
        })

    # Passing `columns` keeps the schema stable even for zero rows
    return pd.DataFrame(survey_data, columns=columns)

# Build the synthetic dataset used by every later cell
print("Generating synthetic survey data...")
survey_df = generate_synthetic_survey_data(800)

# Report the size of the dataset, then one frequency table per category column
print(f"Generated {len(survey_df)} survey responses")
for _heading, _column in (
    ("Respondent Distribution:", 'respondent_type'),
    ("\nGrade Group Distribution:", 'grade_group'),
    ("\nSentiment Distribution:", 'true_sentiment'),
):
    print(_heading)
    print(survey_df[_column].value_counts())

# Preview the first rows (rendered as the cell output)
print("\nSample Responses:")
survey_df.head()


In [None]:
class SurveyNLPProcessor:
    """
    Comprehensive NLP processor for survey text analysis

    Bundles text cleaning (NLTK tokenizer, stopword list and WordNet
    lemmatizer), VADER sentiment scoring and TF-IDF keyword extraction.
    """

    def __init__(self):
        """Initialize the NLP processor with required components"""
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()
        self.sentiment_analyzer = SentimentIntensityAnalyzer()
        # Set by extract_keywords_tfidf(); None until that method is called
        self.tfidf_vectorizer = None
        # Never assigned by any method in this class
        self.lda_model = None

    def preprocess_text(self, text):
        """
        Clean and preprocess text data

        Lowercases the text, tokenizes it, keeps purely alphabetic tokens,
        drops English stopwords and lemmatizes what is left.

        Args:
            text (str): Raw text to preprocess. Missing or non-string values
                (None, NaN, numbers) are treated as an empty response.

        Returns:
            str: Cleaned and preprocessed text (space-separated tokens)
        """
        # Survey exports often contain NaN/None for skipped questions
        if not isinstance(text, str):
            return ''

        tokens = word_tokenize(text.lower())

        # Keep alphabetic, non-stopword tokens and lemmatize them in one pass
        cleaned = [
            self.lemmatizer.lemmatize(token)
            for token in tokens
            if token.isalpha() and token not in self.stop_words
        ]

        return ' '.join(cleaned)

    def analyze_sentiment(self, text):
        """
        Perform VADER sentiment analysis

        Args:
            text (str): Text to analyze. Missing or non-string values are
                scored as an empty string.

        Returns:
            dict: Sentiment scores (positive, negative, neutral, compound)
        """
        if not isinstance(text, str):
            text = ''

        scores = self.sentiment_analyzer.polarity_scores(text)
        return {
            'positive': scores['pos'],
            'negative': scores['neg'],
            'neutral': scores['neu'],
            'compound': scores['compound']
        }

    def extract_keywords_tfidf(self, texts, max_features=100, ngram_range=(1, 2)):
        """
        Extract keywords using TF-IDF vectorization

        The fitted vectorizer is stored on self.tfidf_vectorizer.

        Args:
            texts (list): List of preprocessed texts
            max_features (int): Maximum number of features to extract
            ngram_range (tuple): Range of n-grams to consider

        Returns:
            tuple: (feature names, tfidf matrix)
        """
        self.tfidf_vectorizer = TfidfVectorizer(
            max_features=max_features,
            ngram_range=ngram_range,
            min_df=2,  # Ignore terms that appear in less than 2 documents
            max_df=0.8  # Ignore terms that appear in more than 80% of documents
        )

        tfidf_matrix = self.tfidf_vectorizer.fit_transform(texts)
        feature_names = self.tfidf_vectorizer.get_feature_names_out()

        return feature_names, tfidf_matrix

    def get_top_keywords_by_group(self, df, text_col, group_col, top_n=10):
        """
        Extract top keywords for each group using TF-IDF

        A separate vectorizer is fitted per group, and keywords are ranked by
        their mean TF-IDF score across that group's documents.

        Args:
            df (pd.DataFrame): DataFrame containing text and group columns
            text_col (str): Column name containing text data
            group_col (str): Column name containing group labels
            top_n (int): Number of top keywords to return per group

        Returns:
            dict: Group name -> list of (keyword, mean score) tuples, best
                first. A group whose texts cannot be vectorized maps to [].
        """
        group_keywords = {}

        for group in df[group_col].unique():
            # Drop missing texts: a single NaN makes the vectorizer raise,
            # which would otherwise silently empty the whole group below
            group_texts = (
                df.loc[df[group_col] == group, text_col]
                .dropna()
                .astype(str)
                .tolist()
            )

            # Create TF-IDF vectorizer for this group
            vectorizer = TfidfVectorizer(
                max_features=200,
                ngram_range=(1, 2),
                min_df=1,
                stop_words='english'
            )

            try:
                tfidf_matrix = vectorizer.fit_transform(group_texts)
            except ValueError:
                # Handle case where group has insufficient data
                group_keywords[group] = []
                continue

            feature_names = vectorizer.get_feature_names_out()

            # Calculate mean TF-IDF scores
            mean_scores = np.mean(tfidf_matrix.toarray(), axis=0)

            # Get top keywords
            top_indices = np.argsort(mean_scores)[::-1][:top_n]
            group_keywords[group] = [
                (feature_names[i], mean_scores[i]) for i in top_indices
            ]

        return group_keywords

# Initialize NLP processor
print("Initializing NLP processor...")
nlp_processor = SurveyNLPProcessor()

# Preprocess all survey responses
print("Preprocessing text data...")
survey_df['processed_text'] = survey_df['response_text'].apply(nlp_processor.preprocess_text)

# Perform sentiment analysis
print("Analyzing sentiment using VADER...")
sentiment_results = survey_df['response_text'].apply(nlp_processor.analyze_sentiment)
# Build the score table on survey_df's own index so rows stay aligned even if
# the frame no longer has a default 0..n-1 index; naming the columns keeps
# them present when there are no responses.
sentiment_df = pd.DataFrame(
    sentiment_results.tolist(),
    index=survey_df.index,
    columns=['positive', 'negative', 'neutral', 'compound']
)
# Drop score columns left over from an earlier run of this cell before
# concatenating; otherwise re-running appends duplicates and
# survey_df['compound'] becomes a DataFrame instead of a Series.
survey_df = pd.concat(
    [survey_df.drop(columns=sentiment_df.columns, errors='ignore'), sentiment_df],
    axis=1
)

# Classify sentiment based on compound score
def classify_sentiment(compound_score):
    """Map a compound score to 'Positive' (>= 0.05), 'Negative' (<= -0.05) or 'Neutral'."""
    if compound_score >= 0.05:
        return 'Positive'
    return 'Negative' if compound_score <= -0.05 else 'Neutral'

# Label every response from its VADER compound score
survey_df['predicted_sentiment'] = survey_df['compound'].map(classify_sentiment)

# Top keywords, once per respondent type and once per grade group
print("Extracting top keywords by group...")
student_teacher_keywords, grade_group_keywords = (
    nlp_processor.get_top_keywords_by_group(
        survey_df, 'processed_text', grouping_column, top_n=15
    )
    for grouping_column in ('respondent_type', 'grade_group')
)

# Display results
print("NLP processing completed!")
print("\nSentiment Analysis Accuracy:")
# true_sentiment is lowercase while predictions are title-cased, so normalise
# the case before comparing
accuracy = survey_df['true_sentiment'].str.title().eq(survey_df['predicted_sentiment']).mean()
print(f"VADER Sentiment Classification Accuracy: {accuracy:.2%}")

print("\nPredicted Sentiment Distribution:")
print(survey_df['predicted_sentiment'].value_counts())

# Show sample of processed data (rendered as the cell output)
print("\nSample of Processed Data:")
display_cols = ['response_text', 'respondent_type', 'grade_group', 'predicted_sentiment', 'compound']
survey_df.loc[:, display_cols].head()


In [None]:
# 4.1 Keyword Analysis Visualization

def plot_top_keywords(keywords_dict, title, top_n=10):
    """
    Create horizontal bar plot for top keywords, one panel per group.

    Args:
        keywords_dict (dict): Group name -> list of (keyword, score) tuples,
            best first.
        title (str): Figure-level title.
        top_n (int): Maximum number of keywords drawn per group.

    Returns:
        None. The figure is shown with plt.show(). When keywords_dict is
        empty nothing is drawn and a short notice is printed instead.
    """
    # plt.subplots cannot create a grid with zero columns
    if not keywords_dict:
        print(f"No keyword groups to plot for: {title}")
        return

    n_groups = len(keywords_dict)
    # squeeze=False always returns a 2-D array, so one group needs no special case
    fig, axes = plt.subplots(1, n_groups, figsize=(16, 8), squeeze=False)
    palette = sns.color_palette("husl", n_groups)  # built once, one color per group

    for ax, color, (group, keywords) in zip(axes[0], palette, keywords_dict.items()):
        ax.set_title(f'{group}')

        top_keywords = keywords[:top_n]
        if not top_keywords:
            # Label the panel instead of leaving an unexplained blank axis
            ax.text(0.5, 0.5, 'No keywords available',
                    ha='center', va='center', transform=ax.transAxes)
            ax.set_axis_off()
            continue

        words, scores = zip(*top_keywords)
        positions = range(len(words))
        ax.barh(positions, scores, color=color)
        ax.set_yticks(positions)
        ax.set_yticklabels(words)
        ax.set_xlabel('TF-IDF Score')
        ax.invert_yaxis()  # highest-scoring keyword at the top

    plt.suptitle(title, fontsize=16, fontweight='bold')
    plt.tight_layout()
    plt.show()

# Plot keywords by respondent type, then by grade group
for _heading, _keywords, _figure_title, _limit in (
    ("Top Keywords by Respondent Type", student_teacher_keywords,
     "Top Keywords: Students vs Teachers", 12),
    ("\nTop Keywords by Grade Group", grade_group_keywords,
     "Top Keywords by Grade Group", 10),
):
    print(_heading)
    plot_top_keywords(_keywords, _figure_title, top_n=_limit)


In [None]:
# 4.2 Sentiment Analysis Visualizations

# Create sentiment distribution plots
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# One fixed color per sentiment label. Colors are looked up by label because
# the plots order their categories differently: value_counts() sorts by
# frequency and crosstab() sorts column names alphabetically, so a positional
# color list would paint e.g. 'Negative' green.
sentiment_colors = {'Positive': '#2ecc71', 'Negative': '#e74c3c', 'Neutral': '#95a5a6'}
colors = ['#2ecc71', '#e74c3c', '#95a5a6']  # Green, Red, Gray (Positive, Negative, Neutral)

# 1. Overall sentiment distribution
sentiment_counts = survey_df['predicted_sentiment'].value_counts()
axes[0,0].pie(sentiment_counts.values, labels=sentiment_counts.index, autopct='%1.1f%%',
              colors=[sentiment_colors[label] for label in sentiment_counts.index],
              startangle=90)
axes[0,0].set_title('Overall Sentiment Distribution', fontsize=14, fontweight='bold')

# 2. Sentiment by respondent type
sentiment_by_type = pd.crosstab(survey_df['respondent_type'], survey_df['predicted_sentiment'])
sentiment_by_type.plot(kind='bar', ax=axes[0,1], rot=0,
                       color=[sentiment_colors[label] for label in sentiment_by_type.columns])
axes[0,1].set_title('Sentiment by Respondent Type', fontsize=14, fontweight='bold')
axes[0,1].set_xlabel('Respondent Type')
axes[0,1].set_ylabel('Count')
axes[0,1].legend(title='Sentiment')

# 3. Sentiment by grade group
sentiment_by_grade = pd.crosstab(survey_df['grade_group'], survey_df['predicted_sentiment'])
sentiment_by_grade.plot(kind='bar', ax=axes[1,0], rot=0,
                        color=[sentiment_colors[label] for label in sentiment_by_grade.columns])
axes[1,0].set_title('Sentiment by Grade Group', fontsize=14, fontweight='bold')
axes[1,0].set_xlabel('Grade Group')
axes[1,0].set_ylabel('Count')
axes[1,0].legend(title='Sentiment')

# 4. Sentiment score distribution, with the classification thresholds marked
axes[1,1].hist(survey_df['compound'], bins=30, alpha=0.7, color='skyblue', edgecolor='black')
axes[1,1].axvline(x=0.05, color='green', linestyle='--', label='Positive Threshold')
axes[1,1].axvline(x=-0.05, color='red', linestyle='--', label='Negative Threshold')
axes[1,1].set_title('Distribution of Compound Sentiment Scores', fontsize=14, fontweight='bold')
axes[1,1].set_xlabel('Compound Score')
axes[1,1].set_ylabel('Frequency')
axes[1,1].legend()

plt.tight_layout()
plt.show()

# Calculate sentiment statistics
print("Sentiment Analysis Summary:")
print(f"Average Sentiment Score: {survey_df['compound'].mean():.3f}")
print(f"Sentiment Standard Deviation: {survey_df['compound'].std():.3f}")

# Sentiment by respondent type analysis
print(f"\nAverage Sentiment by Respondent Type:")
sentiment_by_type_avg = survey_df.groupby('respondent_type')['compound'].agg(['mean', 'std', 'count'])
print(sentiment_by_type_avg.round(3))

print(f"\nAverage Sentiment by Grade Group:")
sentiment_by_grade_avg = survey_df.groupby('grade_group')['compound'].agg(['mean', 'std', 'count'])
print(sentiment_by_grade_avg.round(3))


In [None]:
# 4.3 Word Clouds and Advanced Visualizations

def create_wordcloud(text_data, title, colormap='viridis'):
    """
    Create and display word cloud

    Args:
        text_data (iterable): Text fragments to combine; entries that are
            not strings (e.g. NaN) are ignored.
        title (str): Title shown above the image.
        colormap (str): Matplotlib colormap name used to color the words.

    Returns:
        None. The image is shown with plt.show(). When there is no text to
        draw, a notice is printed and no figure is created.
    """
    text = ' '.join(fragment for fragment in text_data if isinstance(fragment, str))

    # WordCloud.generate() raises when it is given no words, which happens
    # whenever a filtered subset (e.g. one sentiment class) is empty
    if not text.strip():
        print(f"Skipping '{title}': no text available")
        return

    wordcloud = WordCloud(
        width=800,
        height=400,
        background_color='white',
        colormap=colormap,
        max_words=100,
        relative_scaling=0.5,
        random_state=42  # fixed layout seed so reruns give the same image
    ).generate(text)

    plt.figure(figsize=(12, 6))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.title(title, fontsize=16, fontweight='bold')
    plt.axis('off')
    plt.show()

# Render one word cloud for the full corpus, then per respondent type and
# per predicted sentiment
print("Word Cloud Visualizations")

# Select the text subsets up front
student_text = survey_df.loc[survey_df['respondent_type'] == 'Student', 'processed_text']
teacher_text = survey_df.loc[survey_df['respondent_type'] == 'Teacher', 'processed_text']
positive_text = survey_df.loc[survey_df['predicted_sentiment'] == 'Positive', 'processed_text']
negative_text = survey_df.loc[survey_df['predicted_sentiment'] == 'Negative', 'processed_text']

# All responses
create_wordcloud(survey_df['processed_text'], "Word Cloud: All Survey Responses", 'plasma')

# By respondent type
create_wordcloud(student_text, "Word Cloud: Student Responses", 'Blues')
create_wordcloud(teacher_text, "Word Cloud: Teacher Responses", 'Reds')

# By sentiment
create_wordcloud(positive_text, "Word Cloud: Positive Feedback", 'Greens')
create_wordcloud(negative_text, "Word Cloud: Negative Feedback", 'Reds')


In [None]:
# 5.1 Interactive Plotly Dashboard Components

# Create interactive sentiment analysis dashboard
def create_interactive_dashboard():
    """
    Create comprehensive interactive dashboard

    Builds and shows four Plotly figures from the module-level survey_df:
    a sunburst of sentiment by grade group and respondent type, a scatter of
    positive vs. negative scores, a heatmap of mean compound score by grade
    level and respondent type, and a box plot of compound scores.

    Returns:
        tuple: (fig1, fig2, fig3, fig4) in the order listed above
    """

    # 1. Interactive Sentiment Distribution by Grade Level
    fig1 = px.sunburst(
        survey_df,
        path=['grade_group', 'respondent_type', 'predicted_sentiment'],
        title="Interactive Sentiment Distribution: Grade Group → Respondent Type → Sentiment",
        color='compound',
        color_continuous_scale='RdYlGn',
        height=600
    )
    fig1.update_layout(title_font_size=16)
    fig1.show()

    # 2. Interactive Scatter Plot: Sentiment Scores
    fig2 = px.scatter(
        survey_df,
        x='positive',
        y='negative',
        size='neutral',
        color='predicted_sentiment',
        hover_data=['respondent_type', 'grade_group'],
        title="Sentiment Score Analysis: Positive vs Negative (Size = Neutral)",
        color_discrete_map={'Positive': '#2ecc71', 'Negative': '#e74c3c', 'Neutral': '#95a5a6'}
    )
    fig2.update_layout(title_font_size=16, height=500)
    fig2.show()

    # 3. Interactive Heatmap: Sentiment by Grade and Type
    pivot_data = survey_df.pivot_table(
        values='compound',
        index='grade_level',
        columns='respondent_type',
        aggfunc='mean'
    )

    # grade_level holds labels such as 'Grade 10', which pivot_table sorts as
    # text ('Grade 1', 'Grade 10', ..., 'Grade 2', ..., 'Kindergarten').
    # Put the rows back into real grade order; unexpected labels keep their
    # place at the end rather than being dropped.
    grade_order = ['Kindergarten'] + [f'Grade {grade}' for grade in range(1, 13)]
    known_rows = [label for label in grade_order if label in pivot_data.index]
    other_rows = [label for label in pivot_data.index if label not in grade_order]
    pivot_data = pivot_data.reindex(known_rows + other_rows)

    fig3 = px.imshow(
        pivot_data,
        title="Sentiment Heatmap: Average Compound Score by Grade Level and Respondent Type",
        color_continuous_scale='RdYlGn',
        aspect="auto",
        height=600
    )
    fig3.update_layout(title_font_size=16)
    fig3.show()

    # 4. Interactive Box Plot: Sentiment Distribution
    fig4 = px.box(
        survey_df,
        x='grade_group',
        y='compound',
        color='respondent_type',
        title="Sentiment Score Distribution by Grade Group and Respondent Type",
        points="outliers"
    )
    fig4.update_layout(title_font_size=16, height=500)
    fig4.show()

    return fig1, fig2, fig3, fig4

# Build and show the four Plotly figures; the returned tuple
# (fig1, fig2, fig3, fig4) is kept in dashboard_figs.
print("Creating Interactive Dashboard...")
dashboard_figs = create_interactive_dashboard()


In [None]:
# 5.2 Advanced Keyword and Theme Analysis

def create_keyword_analysis_dashboard():
    """
    Create advanced keyword analysis visualizations

    Reads the module-level student_teacher_keywords, grade_group_keywords,
    survey_df and nlp_processor, and shows up to three Plotly figures: a bar
    chart comparing student and teacher keywords, a scatter of keyword scores
    per grade group, and a treemap of keywords per predicted sentiment.

    Returns:
        pd.DataFrame: One row per plotted respondent-type or grade-group
            keyword, with columns keyword, score, category and either
            respondent_type or grade_group.
    """

    # Prepare keyword data for visualization
    keyword_data = []

    # Extract keywords for each respondent type (top 10 each)
    for resp_type in ['Student', 'Teacher']:
        for keyword, score in student_teacher_keywords.get(resp_type, [])[:10]:
            keyword_data.append({
                'keyword': keyword,
                'score': score,
                'respondent_type': resp_type,
                'category': 'Respondent Type'
            })

    # Extract keywords for each grade group (top 8 each)
    for grade_group in ['Elementary', 'Middle', 'High']:
        for keyword, score in grade_group_keywords.get(grade_group, [])[:8]:
            keyword_data.append({
                'keyword': keyword,
                'score': score,
                'grade_group': grade_group,
                'category': 'Grade Group'
            })

    keyword_df = pd.DataFrame(keyword_data)

    # 1. Interactive Keyword Comparison
    if not keyword_df.empty and 'respondent_type' in keyword_df.columns:
        resp_type_keywords = keyword_df[keyword_df['category'] == 'Respondent Type']
        if not resp_type_keywords.empty:
            fig1 = px.bar(
                resp_type_keywords,
                x='score',
                y='keyword',
                color='respondent_type',
                orientation='h',
                title="Top Keywords Comparison: Students vs Teachers",
                labels={'score': 'TF-IDF Score', 'keyword': 'Keywords'},
                height=600
            )
            fig1.update_layout(
                title_font_size=16,
                yaxis={'categoryorder': 'total ascending'}
            )
            fig1.show()

    # 2. Grade Group Keyword Analysis
    if 'grade_group' in keyword_df.columns:
        grade_keywords = keyword_df[keyword_df['category'] == 'Grade Group']
        if not grade_keywords.empty:
            fig2 = px.scatter(
                grade_keywords,
                x='grade_group',
                y='score',
                size='score',
                hover_name='keyword',
                title="Keyword Importance Across Grade Groups",
                labels={'score': 'TF-IDF Score', 'grade_group': 'Grade Group'},
                height=500
            )
            fig2.update_layout(title_font_size=16)
            fig2.show()

    # 3. Sentiment-based keyword analysis
    # get_top_keywords_by_group already fits one vectorizer per group, so a
    # single call grouped by predicted sentiment gives the same keywords as
    # pre-filtering survey_df once per sentiment.
    keywords_by_sentiment = nlp_processor.get_top_keywords_by_group(
        survey_df, 'processed_text', 'predicted_sentiment', top_n=10
    )

    # Flatten to rows in a fixed sentiment order, top 8 keywords each
    sentiment_keyword_data = [
        {'keyword': keyword, 'score': score, 'sentiment': sentiment}
        for sentiment in ['Positive', 'Negative', 'Neutral']
        for keyword, score in keywords_by_sentiment.get(sentiment, [])[:8]
    ]

    if sentiment_keyword_data:
        sentiment_keyword_df = pd.DataFrame(sentiment_keyword_data)
        fig3 = px.treemap(
            sentiment_keyword_df,
            path=['sentiment', 'keyword'],
            values='score',
            title="Keyword Themes by Sentiment (Treemap)",
            color='score',
            color_continuous_scale='RdYlGn',
            height=600
        )
        fig3.update_layout(title_font_size=16)
        fig3.show()

    return keyword_df

# Show the keyword/theme charts; the function returns the combined keyword
# table (respondent-type and grade-group rows) as a DataFrame.
print("Creating Advanced Keyword Analysis Dashboard...")
keyword_analysis_df = create_keyword_analysis_dashboard()


In [None]:
# 6.1 Pain Point Analysis and Insights Generation

def analyze_pain_points():
    """
    Analyze negative feedback to identify key pain points

    Reads the module-level survey_df, keeps the rows predicted as 'Negative',
    and prints their counts by respondent type and grade group, the top
    TF-IDF keywords per respondent type, and a severity split at a compound
    score of -0.5.

    Returns:
        tuple: (negative_feedback, pain_point_keywords), where
            negative_feedback is the filtered DataFrame and
            pain_point_keywords maps respondent type to a list of
            (keyword, score) tuples. The dict is empty when there is no
            negative feedback.
    """

    # Focus on negative feedback
    negative_feedback = survey_df[survey_df['predicted_sentiment'] == 'Negative']

    print("PAIN POINT ANALYSIS")
    print("=" * 50)

    # 1. Overall pain point distribution
    print(f"Negative Feedback Distribution:")
    print(f"• Total Negative Responses: {len(negative_feedback)}")
    print(f"• Students: {len(negative_feedback[negative_feedback['respondent_type'] == 'Student'])}")
    print(f"• Teachers: {len(negative_feedback[negative_feedback['respondent_type'] == 'Teacher'])}")

    # 2. Pain points by grade group
    print(f"\nNegative Sentiment by Grade Group:")
    negative_by_grade = negative_feedback['grade_group'].value_counts()
    # The loop body never runs for an empty frame, so the division is safe
    for grade, count in negative_by_grade.items():
        percentage = (count / len(negative_feedback)) * 100
        print(f"• {grade}: {count} responses ({percentage:.1f}%)")

    # 3. Extract pain point keywords from negative feedback
    # Start from an empty dict so the return below is always defined, even
    # when no response was classified as negative.
    pain_point_keywords = {}
    if len(negative_feedback) > 0:
        pain_point_keywords = nlp_processor.get_top_keywords_by_group(
            negative_feedback, 'processed_text', 'respondent_type', top_n=20
        )

        print(f"\nTop Pain Point Keywords:")
        for resp_type, keywords in pain_point_keywords.items():
            print(f"\n{resp_type} Pain Points:")
            for i, (keyword, score) in enumerate(keywords[:10], 1):
                print(f"  {i}. {keyword} (TF-IDF: {score:.3f})")

    # 4. Sentiment severity analysis
    print(f"\nSentiment Severity Analysis:")
    very_negative = negative_feedback[negative_feedback['compound'] <= -0.5]
    moderately_negative = negative_feedback[
        (negative_feedback['compound'] > -0.5) & (negative_feedback['compound'] <= -0.05)
    ]

    print(f"• Very Negative (≤ -0.5): {len(very_negative)} responses")
    print(f"• Moderately Negative (-0.5 to -0.05): {len(moderately_negative)} responses")

    return negative_feedback, pain_point_keywords

def generate_product_recommendations():
    """Generate specific product recommendations based on analysis.

    Reads the module-level ``survey_df`` (columns used: ``predicted_sentiment``,
    ``respondent_type``, ``compound``, ``grade_group`` and ``processed_text``),
    prints a prioritized recommendation report and returns the headline
    metrics behind it.

    Keyword counts use case-sensitive substring matching on ``processed_text``
    (so 'crash' also counts 'crashed'); rows with missing text never match.
    The counts cover every response that mentions a keyword, regardless of
    its predicted sentiment.

    Returns:
        dict: ``negative_rate``, ``student_sentiment``, ``teacher_sentiment``,
        ``lowest_grade_group``, ``ui_issues``, ``performance_issues`` and
        ``engagement_issues``. When ``survey_df`` has no rows, the rate and
        counts are 0, both sentiments are NaN and ``lowest_grade_group`` is
        None.
    """

    def mean_compound(respondent):
        # Average compound score for one respondent type (NaN if none present).
        is_respondent = survey_df['respondent_type'] == respondent
        return survey_df.loc[is_respondent, 'compound'].mean()

    def count_mentions(keywords):
        # Number of responses whose processed text contains any of the keywords.
        pattern = '|'.join(keywords)
        matches = survey_df['processed_text'].str.contains(pattern, na=False)
        return int(matches.sum())

    print("\n\nPRODUCT TEAM RECOMMENDATIONS")
    print("=" * 50)

    total_responses = len(survey_df)
    if total_responses == 0:
        # Without rows the rate below would divide by zero and idxmin() would
        # have nothing to choose from, so report neutral metrics instead.
        print("No survey responses available - no recommendations generated.")
        return {
            'negative_rate': 0.0,
            'student_sentiment': float('nan'),
            'teacher_sentiment': float('nan'),
            'lowest_grade_group': None,
            'ui_issues': 0,
            'performance_issues': 0,
            'engagement_issues': 0
        }

    # Calculate key metrics
    negative_count = len(survey_df[survey_df['predicted_sentiment'] == 'Negative'])
    negative_rate = negative_count / total_responses
    student_avg_sentiment = mean_compound('Student')
    teacher_avg_sentiment = mean_compound('Teacher')

    print("KEY METRICS:")
    print(f"• Overall Negative Feedback Rate: {negative_rate:.1%}")
    print(f"• Student Average Sentiment: {student_avg_sentiment:.3f}")
    print(f"• Teacher Average Sentiment: {teacher_avg_sentiment:.3f}")
    print(f"• Sentiment Gap (Teacher - Student): {teacher_avg_sentiment - student_avg_sentiment:.3f}")

    print("\nPRIORITY RECOMMENDATIONS:")

    # Recommendation 1: UI/UX Improvements
    ui_issue_count = count_mentions(['interface', 'navigation', 'design', 'confusing', 'click'])

    print("\n1. UI/UX OVERHAUL (Priority: HIGH)")
    print(f"   • {ui_issue_count} responses mention UI/navigation issues")
    print("   • Focus on simplifying navigation and reducing cognitive load")
    print("   • Implement user-centered design principles")
    print("   • A/B test new interface designs with target grade groups")

    # Recommendation 2: Performance Issues
    perf_issue_count = count_mentions(['slow', 'loading', 'crash', 'freeze', 'performance'])

    print("\n2. PERFORMANCE OPTIMIZATION (Priority: HIGH)")
    print(f"   • {perf_issue_count} responses mention performance issues")
    print("   • Implement performance monitoring and optimization")
    print("   • Optimize for mobile devices and slower connections")
    print("   • Add loading indicators and offline capabilities")

    # Recommendation 3: Engagement Features
    engagement_count = count_mentions(['boring', 'engaging', 'fun', 'interactive', 'motivation'])

    print("\n3. ENGAGEMENT ENHANCEMENT (Priority: MEDIUM)")
    print(f"   • {engagement_count} responses mention engagement aspects")
    print("   • Expand gamification elements based on positive feedback")
    print("   • Add more interactive content and collaborative features")
    print("   • Personalize learning experiences by grade level")

    # Recommendation 4: Teacher-Specific Improvements
    # Only shown when teachers score lower than students; a NaN average
    # (no rows for one respondent type) compares False and skips the section.
    if teacher_avg_sentiment < student_avg_sentiment:
        print("\n4. TEACHER EXPERIENCE FOCUS (Priority: MEDIUM)")
        print(f"   • Teachers show {abs(teacher_avg_sentiment - student_avg_sentiment):.3f} lower sentiment")
        print("   • Simplify administrative and grading workflows")
        print("   • Improve integration with existing school systems")
        print("   • Provide better onboarding and training resources")

    # Recommendation 5: Grade-Specific Optimizations
    grade_sentiment = survey_df.groupby('grade_group')['compound'].mean()
    lowest_sentiment_grade = grade_sentiment.idxmin()

    print("\n5. GRADE-SPECIFIC OPTIMIZATION (Priority: MEDIUM)")
    print(f"   • {lowest_sentiment_grade} grade group shows lowest sentiment ({grade_sentiment[lowest_sentiment_grade]:.3f})")
    print("   • Customize interface complexity for different age groups")
    print("   • Adapt content pacing and difficulty curves")
    print("   • Implement age-appropriate design patterns")

    return {
        'negative_rate': negative_rate,
        'student_sentiment': student_avg_sentiment,
        'teacher_sentiment': teacher_avg_sentiment,
        'lowest_grade_group': lowest_sentiment_grade,
        'ui_issues': ui_issue_count,
        'performance_issues': perf_issue_count,
        'engagement_issues': engagement_count
    }

def create_executive_summary():
    """Create executive summary with key findings.

    Prints an overview of the module-level ``survey_df``: total responses,
    the student/teacher split (from ``respondent_type``) and the share of
    each ``predicted_sentiment`` label. The theme and action-item lists that
    follow are fixed text; they are not computed from the data.

    An empty ``survey_df`` is reported with 0.0% shares instead of raising
    ZeroDivisionError.

    Returns:
        None. The summary is written to standard output only.
    """

    def describe_group(respondent):
        # "<count> (<share of all responses>)" for one respondent type.
        count = int((survey_df['respondent_type'] == respondent).sum())
        share = count / total_responses if total_responses else 0.0
        return f"{count} ({share:.1%})"

    print("\n\nEXECUTIVE SUMMARY")
    print("=" * 50)

    total_responses = len(survey_df)
    # Percentages per sentiment label; labels absent from the data fall back
    # to 0 through .get() below.
    sentiment_dist = survey_df['predicted_sentiment'].value_counts(normalize=True) * 100

    print("SURVEY OVERVIEW:")
    print(f"• Total Responses Analyzed: {total_responses}")
    print(f"• Students: {describe_group('Student')}")
    print(f"• Teachers: {describe_group('Teacher')}")

    print("\nKEY FINDINGS:")
    print(f"• Positive Sentiment: {sentiment_dist.get('Positive', 0):.1f}%")
    print(f"• Negative Sentiment: {sentiment_dist.get('Negative', 0):.1f}%")
    print(f"• Neutral Sentiment: {sentiment_dist.get('Neutral', 0):.1f}%")

    # Top positive and negative themes (static text)
    print("\nTOP POSITIVE THEMES:")
    print("• Interactive and engaging content")
    print("• Helpful video explanations and examples")
    print("• Gamification and motivational features")
    print("• Mobile accessibility and progress syncing")

    print("\nTOP NEGATIVE THEMES:")
    print("• Complex and confusing user interface")
    print("• Performance and loading issues")
    print("• Content pacing problems (too fast/slow)")
    print("• Limited customization and flexibility")

    print("\nIMMEDIATE ACTION ITEMS:")
    print("• Prioritize UI/UX simplification project")
    print("• Implement performance monitoring and optimization")
    print("• Develop grade-specific interface adaptations")
    print("• Enhance teacher onboarding and training programs")

# Run comprehensive analysis
# Each call prints its own report section, so the call order below is the
# order of the printed output: pain points, then product recommendations,
# then the executive summary.
# analyze_pain_points() returns (negative_feedback, pain_point_keywords);
# generate_product_recommendations() returns a dict of headline metrics.
# Both results are kept in module-level names (presumably for later cells).
negative_feedback, pain_points = analyze_pain_points()
recommendations = generate_product_recommendations()
create_executive_summary()


## 7. Interactive Dash Application

Below is a sample Dash application that could be deployed to provide real-time exploration of the survey data. This would allow stakeholders to filter and explore the data interactively in a web browser.


In [None]:
# 7.1 Sample Dash Application for Real-time Dashboard

def create_dash_app():
    """
    Create a Dash application for interactive survey data exploration.

    The layout holds three dropdown filters (respondent type, grade group,
    sentiment) and three graphs (sentiment pie chart, keyword bar chart and
    a grade-group by respondent-type heatmap of the mean compound score).
    One callback re-filters the module-level ``survey_df`` and redraws all
    three graphs whenever any filter changes.

    Note: This is a demonstration - the app is only constructed here, not
    served. To serve it, call the returned app's run method (``app.run(...)``
    on current Dash releases; ``app.run_server(...)`` on older ones).

    Returns:
        dash.Dash: The configured application with layout and callback
        registered; no server is started.
    """
    # Local import: stdlib helper needed only by the callback below.
    from collections import Counter

    def filter_column(label, component_id, choices):
        # One width-4 column: a label plus a dropdown offering 'All' and the
        # given choices, with 'All' preselected.
        return dbc.Col([
            html.Label(label),
            dcc.Dropdown(
                id=component_id,
                options=[{'label': choice, 'value': choice} for choice in ('All', *choices)],
                value='All'
            )
        ], width=4)

    # Initialize Dash app
    app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])

    # Define app layout
    app.layout = dbc.Container([
        dbc.Row([
            dbc.Col([
                html.H1("K-12 Survey NLP Insights Dashboard", className="text-center mb-4"),
                html.Hr()
            ])
        ]),

        dbc.Row([
            filter_column("Select Respondent Type:", 'respondent-filter',
                          ('Student', 'Teacher')),
            filter_column("Select Grade Group:", 'grade-filter',
                          ('Elementary', 'Middle', 'High')),
            filter_column("Select Sentiment:", 'sentiment-filter',
                          ('Positive', 'Negative', 'Neutral'))
        ], className="mb-4"),

        dbc.Row([
            dbc.Col([
                dcc.Graph(id='sentiment-distribution')
            ], width=6),
            dbc.Col([
                dcc.Graph(id='keyword-analysis')
            ], width=6)
        ]),

        dbc.Row([
            dbc.Col([
                dcc.Graph(id='sentiment-heatmap')
            ], width=12)
        ], className="mt-4")
    ], fluid=True)

    # Callback for updating charts based on filters
    @app.callback(
        [Output('sentiment-distribution', 'figure'),
         Output('keyword-analysis', 'figure'),
         Output('sentiment-heatmap', 'figure')],
        [Input('respondent-filter', 'value'),
         Input('grade-filter', 'value'),
         Input('sentiment-filter', 'value')]
    )
    def update_dashboard(respondent_type, grade_group, sentiment):
        # Filter data based on selections. Boolean indexing returns new
        # frames and nothing below mutates the data, so no copy is needed.
        filtered_df = survey_df
        selections = (
            ('respondent_type', respondent_type),
            ('grade_group', grade_group),
            ('predicted_sentiment', sentiment),
        )
        for column, choice in selections:
            # A cleared dropdown delivers None; treat it like 'All' rather
            # than filtering every row away.
            if choice is not None and choice != 'All':
                filtered_df = filtered_df[filtered_df[column] == choice]

        # Create sentiment distribution chart
        sentiment_counts = filtered_df['predicted_sentiment'].value_counts()
        fig1 = px.pie(
            values=sentiment_counts.values,
            names=sentiment_counts.index,
            title="Sentiment Distribution"
        )

        # Create keyword frequency chart (simplified): count only the first
        # five tokens of each response.
        word_freq = Counter()
        for text in filtered_df['processed_text']:
            if not isinstance(text, str):
                continue  # missing (NaN/None) text has no tokens to count
            word_freq.update(text.split()[:5])

        # most_common keeps first-seen order among equal counts, matching a
        # stable descending sort.
        top_words = word_freq.most_common(10)
        if top_words:
            keywords, freqs = zip(*top_words)
            fig2 = px.bar(x=list(freqs), y=list(keywords), orientation='h',
                         title="Top Keywords")
        else:
            fig2 = px.bar(title="No data available")

        # Create sentiment heatmap
        if len(filtered_df) > 0:
            heatmap_data = filtered_df.pivot_table(
                values='compound',
                index='grade_group',
                columns='respondent_type',
                aggfunc='mean'
            )
            fig3 = px.imshow(heatmap_data, title="Average Sentiment by Group")
        else:
            fig3 = px.imshow([[0]], title="No data available")

        return fig1, fig2, fig3

    return app

# Create the app (but don't run it in notebook)
# create_dash_app() only builds the layout and registers the callback; no
# server is started by this cell.
print("Creating Dash application...")
dash_app = create_dash_app()

print("Dash application created successfully!")
print("To run the dashboard, uncomment the following line:")
# NOTE(review): the hint printed below names run_server(); newer Dash
# releases use dash_app.run(...) instead - confirm against the installed
# Dash version before following it.
print("# dash_app.run_server(debug=True, port=8050)")

print("\nNote: The dashboard would be available at http://localhost:8050 when running.")
