In [24]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import random

In [26]:
def generate_survey_responses(n_responses=500, seed=None):
    """Generate synthetic employee survey responses.

    Each response is a template sentence drawn at random from one of five
    topics; roughly 30% of them are prefixed with a short positive modifier
    (in which case the template text is lower-cased).

    Parameters
    ----------
    n_responses : int, default 500
        Total number of responses to generate. The responses are spread as
        evenly as possible over the topics; when ``n_responses`` is not a
        multiple of the topic count, the first topics receive one extra
        response each so that exactly ``n_responses`` rows are returned.
    seed : int or None, default None
        When given, a private ``random.Random(seed)`` generator is used, so
        the result is reproducible and the global ``random`` state is left
        untouched. When ``None``, the module-level ``random`` functions are
        used, exactly as before.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``response`` (the generated sentence) and
        ``true_topic`` (the key of the topic the template came from).
        Rows are grouped by topic, in the order the topics are declared.
    """

    templates = {
        'work_life_balance': [
            "I love having flexible work hours",
            "Working from home helps with family time",
            "Four-day work week is amazing",
            "Can attend personal appointments easily",
            "Early start times help avoid traffic",
            "PTO policy is very generous",
            "Mental health days are respected",
            "No commute saves lots of time",
            "Can have family lunch at home",
            "Flexible schedule helps with classes"
        ],
        'workplace_environment': [
            "Standing desk helps my back",
            "Break room is well equipped",
            "Quiet spaces for focus work",
            "Natural lighting is great",
            "Temperature control is perfect",
            "Ergonomic chairs are comfortable",
            "Outdoor meeting space is nice",
            "Meditation room is peaceful",
            "Dual monitors boost productivity",
            "Air quality is excellent"
        ],
        'health_wellbeing': [
            "On-site gym is convenient",
            "Yoga classes reduce stress",
            "Healthy food options available",
            "Wellness program motivates",
            "Mental health support helps",
            "Meditation app is useful",
            "Regular stretch breaks help",
            "Standing options available",
            "Team walks are energizing",
            "Health workshops informative"
        ],
        'team_collaboration': [
            "Team lunches build bonds",
            "Brainstorming is productive",
            "Meeting culture is positive",
            "Mentoring helps growth",
            "Cross-team work is engaging",
            "Chat tools work well",
            "Team activities are fun",
            "Office layout aids collaboration",
            "Feedback is constructive",
            "Remote connection is strong"
        ],
        'personal_growth': [
            "Learning budget is generous",
            "Internal workshops teach skills",
            "Certification support helps",
            "Leadership opportunities exist",
            "Peer learning is valuable",
            "Book club is interesting",
            "Conference access helps",
            "Job shadowing teaches lots",
            "Mentoring system works",
            "Tech talks are informative"
        ]
    }

    # Loop-invariant, so built once rather than on every iteration.
    modifiers = ["Really like that", "Great that", "Happy that",
                 "Nice that", "Good that"]

    # The `random` module and a `random.Random` instance expose the same
    # `choice` / `random` methods, so either can serve as the generator.
    rng = random if seed is None else random.Random(seed)

    # Floor division alone would silently drop the remainder; hand the
    # leftover responses to the first `remainder` topics instead.
    per_topic, remainder = divmod(n_responses, len(templates))

    responses = []
    topics = []

    for position, (topic, template_list) in enumerate(templates.items()):
        quota = per_topic + (1 if position < remainder else 0)
        for _ in range(quota):
            base_response = rng.choice(template_list)

            if rng.random() < 0.3:
                response = f"{rng.choice(modifiers)} {base_response.lower()}"
            else:
                response = base_response

            responses.append(response)
            topics.append(topic)

    return pd.DataFrame({'response': responses, 'true_topic': topics})

# Seed Python's global RNG so the synthetic responses -- and therefore every
# result derived from them below -- are the same on each run. The LDA model
# itself is already pinned through random_state=42.
random.seed(42)

# Generate data
df = generate_survey_responses(500)

# Create document-term matrix
vectorizer = CountVectorizer(
    max_features=1000,
    stop_words='english',
    max_df=0.95,
    min_df=2
)
doc_term_matrix = vectorizer.fit_transform(df['response'])

# Create and fit LDA model (one component per survey theme in the templates)
lda_model = LatentDirichletAllocation(
    n_components=5,
    random_state=42,
    max_iter=20,
    learning_method='online'
)
lda_output = lda_model.fit_transform(doc_term_matrix)

# Collect the highest-weighted words of each topic (returns them; prints nothing)
def print_topics(model, feature_names, n_top_words=5):
    """Return the top-weighted words for every topic of a fitted model.

    Parameters
    ----------
    model : object
        Anything exposing ``components_``, iterated row by row; each row is
        treated as the per-word weights of one topic.
    feature_names : sequence
        Word for each column index of ``components_``.
    n_top_words : int, default 5
        How many of the highest-weighted words to keep per topic.

    Returns
    -------
    dict
        Maps the topic's row index to a list of ``(word, weight)`` pairs
        ordered from highest to lowest weight.
    """
    def strongest_terms(weights):
        # Ascending argsort, reversed, then truncated -> best words first.
        ranked = weights.argsort()[::-1][:n_top_words]
        return [(feature_names[idx], weights[idx]) for idx in ranked]

    return {
        topic_no: strongest_terms(weights)
        for topic_no, weights in enumerate(model.components_)
    }

# Words known to the fitted vectorizer, used to label the topic weights
feature_names = vectorizer.get_feature_names_out()

# Top (word, weight) pairs for each topic of the fitted LDA model
topics = print_topics(model=lda_model, feature_names=feature_names)

# Create visualization
def plot_topics(topics, n_words=5):
    """Build a grouped horizontal bar chart of the top words of each topic.

    Parameters
    ----------
    topics : dict
        Maps a topic identifier to a sequence of ``(word, score)`` pairs,
        as returned by ``print_topics``.
    n_words : int, default 5
        Maximum number of words plotted per topic; only the first
        ``n_words`` pairs of each topic are used.

    Returns
    -------
    plotly.graph_objects.Figure
        One bar trace per non-empty topic. Scores are rescaled so the bars
        of a single topic sum to 1 over the words shown.
    """
    fig = go.Figure()

    colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd']

    for position, (topic_id, topic_words) in enumerate(topics.items()):
        topic_words = list(topic_words)[:n_words]
        if not topic_words:
            # Nothing to draw, and zip(*[]) below could not be unpacked.
            continue

        words, scores = zip(*topic_words)

        # Normalize scores (left as-is when they sum to zero, to avoid
        # dividing by zero)
        total = sum(scores)
        scores = np.array(scores)
        if total > 0:
            scores = scores / total

        fig.add_trace(
            go.Bar(
                x=scores,
                y=words,
                orientation='h',
                # Label with the real topic key so the legend matches any
                # other report keyed on the same dict.
                name=f'Topic {topic_id}',
                # Cycle through the palette so topics beyond the fifth are
                # still plotted instead of being silently dropped.
                marker_color=colors[position % len(colors)],
                text=[f"{score:.3f}" for score in scores],
                textposition='auto',
            )
        )

    fig.update_layout(
        title="Topic Word Scores",
        barmode='group',
        height=400,
        width=800,
        showlegend=True,
        yaxis={'categoryorder':'total ascending'},
        plot_bgcolor='white'
    )

    fig.update_xaxes(
        title="Score",
        showgrid=True,
        gridwidth=1,
        gridcolor='lightgrey'
    )

    return fig

# Create and show visualization
fig = plot_topics(topics)
fig.show()

# Print topic summaries
# NOTE: each value is the word's share of the weight of the listed top words
# only, so the figures of one topic sum to 1 across the words shown.
print("\nTopic Word Probabilities:")
print("=======================")
for topic_idx, topic_words in topics.items():
    print(f"\nTopic {topic_idx}:")
    # Same for every word of the topic, so compute it once per topic.
    total_prob = sum(p for _, p in topic_words)
    for word, prob in topic_words:
        print(f"{word:<20} {prob/total_prob:.3f}")

# Calculate topic distribution
# lda_output already holds the document-topic matrix for doc_term_matrix
# (it is the result of fit_transform on that same matrix), so reuse it
# instead of running transform a second time.
doc_topics = lda_output
dominant_topics = doc_topics.argmax(axis=1)
# minlength keeps a row for topics that no document is assigned to.
topic_distribution = np.bincount(dominant_topics, minlength=len(topics))
print("\nTopic Distribution:")
print("==================")
for topic_idx, count in enumerate(topic_distribution):
    print(f"Topic {topic_idx}: {count} documents")


Topic Word Probabilities:

Topic 0:
health               0.234
workshops            0.227
informative          0.206
good                 0.176
helps                0.157

Topic 1:
team                 0.405
activities           0.163
fun                  0.163
meditation           0.135
room                 0.133

Topic 2:
helps                0.324
mentoring            0.233
standing             0.172
works                0.138
home                 0.133

Topic 3:
work                 0.390
lots                 0.212
generous             0.176
shadowing            0.111
job                  0.111

Topic 4:
really               0.230
like                 0.230
great                0.183
classes              0.181
nice                 0.175

Topic Distribution:
Topic 0: 106 documents
Topic 1: 114 documents
Topic 2: 110 documents
Topic 3: 99 documents
Topic 4: 71 documents
