# Word Permutation Analysis

This notebook analyzes how many valid English words can be formed from permutations of 5 to 7 letters of the alphabet.

## Overview
- Load valid English words from CSV
- Generate permutations of letters
- Count valid words for each permutation
- Visualize distributions across different word lengths
- Compare 5-letter, 6-letter, and 7-letter word patterns

## Setup
Import required libraries and modules

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from word_anal.analyzer import WordPermutationAnalyzer
from word_anal.data_processing import DataProcessor
from word_anal.visualizations import VisualizationGenerator
from word_anal.kaggle_helper import get_dictionary_dataset

# Configure plotting for the whole notebook.
# The matplotlib style sheet is applied first and the seaborn palette second.
# NOTE(review): presumably this order keeps the style sheet from overriding
# the "husl" palette — confirm before reordering these two calls.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
# Render figures inline in the notebook output.
%matplotlib inline

ModuleNotFoundError: No module named 'pandas'

## Download Dataset (Optional)
Download the Kaggle English dictionary dataset

**Note**: If you already have the dataset downloaded, skip this cell. The Configuration section below expects the CSV at `data/dict.csv`.

In [None]:
from word_anal.kaggle_helper import get_dictionary_dataset
import os

# Kaggle credentials are read from environment variables so that no secret is
# stored in the notebook. If you temporarily edit the fallbacks below for
# local testing, never commit them.
KAGGLE_CREDENTIALS = {
    "username": os.getenv("KAGGLE_USERNAME", "YOUR_USERNAME"),
    "key": os.getenv("KAGGLE_KEY", "YOUR_API_KEY")
}

# The fallbacks are placeholders, not working credentials. Report them up
# front instead of silently handing them to the download helper.
_placeholder_values = {"username": "YOUR_USERNAME", "key": "YOUR_API_KEY"}
_unset_fields = [
    field
    for field, placeholder in _placeholder_values.items()
    if KAGGLE_CREDENTIALS[field] in ("", placeholder)
]
if _unset_fields:
    print(
        "Warning: Kaggle credential field(s) not set: "
        + ", ".join(_unset_fields)
        + ". Set KAGGLE_USERNAME and KAGGLE_KEY in the environment; "
        "a download that requires authentication will likely fail."
    )

# Download the dataset (set force=True to re-download)
csv_path = get_dictionary_dataset(
    credentials=KAGGLE_CREDENTIALS,
    download_path="data",
    force=False
)

print(f"Dataset ready at: {csv_path}")

## Configuration
Set your parameters here

In [None]:
# --- Input data ---
WORDS_CSV_PATH = "data/dict.csv"  # CSV file of the downloaded Kaggle dataset
WORD_COLUMN = "word"  # name of the word column in that dataset

# --- Analysis parameters ---
WORD_LENGTHS = list(range(5, 8))  # word lengths to analyze: 5, 6 and 7
SAMPLE_SIZE = 10_000  # permutations sampled per length; None means all
ALPHABET_SUBSET = None  # restrict the letters (e.g. "aeiou"); None = full alphabet

# --- Output ---
HTML_OUTPUT_PATH = "word_analysis_visualization.html"

## Load Words
Initialize the analyzer with your word list

In [None]:
# Build the analyzer from the configured CSV file and word column.
analyzer = WordPermutationAnalyzer(words_csv_path=WORDS_CSV_PATH, word_column=WORD_COLUMN)

print(f"Total valid words loaded: {len(analyzer.valid_words):,}")
print("\nWord count by length:")
# One summary line per configured word length.
for length in WORD_LENGTHS:
    print(f"  {length}-letter words: {len(analyzer.get_words_by_length(length)):,}")

## Run Analysis
Analyze permutations for each word length

In [None]:
# Gather the configured parameters, then run the comparison in one call.
# `results` is iterated later in the notebook as (word length, DataFrame) pairs.
analysis_options = {
    "lengths": WORD_LENGTHS,
    "sample_size": SAMPLE_SIZE,
    "alphabet_subset": ALPHABET_SUBSET,
}
results = analyzer.compare_word_lengths(**analysis_options)

## Process Data
Prepare data for visualization

In [None]:
# Register the results of every analyzed word length with a fresh processor.
processor = DataProcessor()
for length_key, length_df in results.items():
    processor.add_results(length_key, length_df)

# Show the comparison table produced by the processor.
comparison_df = processor.get_comparison_data()
print("\nComparison Statistics:")
display(comparison_df)

## Top and Bottom Permutations
See which letter combinations yield the most and fewest valid words

In [None]:
# Columns shown in every ranking table, and the rule used for section headers.
ranking_columns = ['permutation', 'word_count', 'words']
banner = '=' * 60

for word_length in WORD_LENGTHS:
    # Section header for this word length.
    print("\n" + banner)
    print(f"{word_length}-LETTER WORDS")
    print(banner)

    print("\nTop 10 Permutations:")
    top = processor.get_top_permutations(word_length, n=10)
    display(top[ranking_columns])

    print("\nBottom 10 Permutations:")
    bottom = processor.get_bottom_permutations(word_length, n=10)
    display(bottom[ranking_columns])

## Static Visualizations (Matplotlib)
Quick visualizations using matplotlib and seaborn

In [None]:
# Distribution comparison: one histogram panel per analyzed word length.
fig, axes = plt.subplots(1, len(WORD_LENGTHS), figsize=(15, 4))
if len(WORD_LENGTHS) == 1:
    # With a single panel the axes object is wrapped so it can be iterated.
    axes = [axes]

for ax, word_length in zip(axes, WORD_LENGTHS):
    word_counts = results[word_length]['word_count']
    ax.hist(word_counts, bins=50, alpha=0.7, edgecolor='black')
    ax.set(
        xlabel='Number of Valid Words',
        ylabel='Frequency',
        title=f'{word_length}-Letter Word Distribution',
    )
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Box plot comparison: one box per analyzed word length.
plt.figure(figsize=(10, 6))

data_for_boxplot = [results[word_length]['word_count'] for word_length in WORD_LENGTHS]
labels = [f"{word_length}-letter" for word_length in WORD_LENGTHS]

# The `labels=` keyword of boxplot is deprecated since Matplotlib 3.9 (renamed
# to `tick_labels`). Setting the tick labels separately works on both old and
# new releases. Boxes are drawn at positions 1..N by default.
plt.boxplot(data_for_boxplot)
plt.xticks(range(1, len(labels) + 1), labels)

plt.ylabel('Number of Valid Words per Permutation')
plt.xlabel('Word Length')
plt.title('Distribution Comparison: Valid Words per Permutation')
plt.grid(True, alpha=0.3)
plt.show()

In [None]:
# Violin plot of the word-count distribution for each word length.
plt.figure(figsize=(12, 6))

# Tag each word_count column with its length label, then stack the pieces
# into one long-format frame for seaborn.
combined_data = [
    results[word_length][['word_count']].assign(word_length=f"{word_length}-letter")
    for word_length in WORD_LENGTHS
]
combined_df = pd.concat(combined_data, ignore_index=True)

sns.violinplot(data=combined_df, x='word_length', y='word_count')
plt.xlabel('Word Length')
plt.ylabel('Number of Valid Words per Permutation')
plt.title('Distribution Shape Comparison')
plt.grid(True, alpha=0.3)
plt.show()

## Interactive D3.js Visualizations
Generate and display interactive visualizations

In [None]:
# Build the visualization generator on top of the populated processor.
viz_gen = VisualizationGenerator(processor)

# Write the HTML file, then tell the reader where it is and how to open it.
viz_gen.generate_html(output_path=HTML_OUTPUT_PATH)
print(
    f"\nInteractive visualization saved to: {HTML_OUTPUT_PATH}",
    "Open this file in a web browser to view the interactive D3.js visualizations.",
    sep="\n",
)

In [None]:
# Display the interactive visualization in the notebook.
# The call is deliberately the cell's final bare expression: if it returns a
# value, Jupyter renders that value as the cell output.
# NOTE(review): whether generate_notebook_display() returns a displayable
# object or displays as a side effect is not visible here — confirm.
viz_gen.generate_notebook_display()

## Export Results
Save processed data to CSV files

In [None]:
# Export the processor's results as CSV, using the current directory as output_dir.
processor.export_all_to_csv(output_dir=".")

# Save the comparison table as well, without the index column.
comparison_csv = "comparison_statistics.csv"
comparison_df.to_csv(comparison_csv, index=False)
print(f"\nExported comparison statistics to: {comparison_csv}")

## Statistical Analysis
Deeper statistical insights

In [None]:
# Distribution characteristics: skewness and kurtosis for each word length,
# followed by a plain-language reading of each value.
print("Distribution Characteristics:\n")

for word_length in WORD_LENGTHS:
    stats = processor.get_stats(word_length)
    print(f"{word_length}-letter words:")
    print(f"  Skewness: {stats.skewness:.3f}")
    print(f"  Kurtosis: {stats.kurtosis:.3f}")

    # NaN compares False against every number, so without this guard an
    # undefined skewness would fall through to the "Symmetric" branch.
    if pd.isna(stats.skewness):
        print("  → Skewness undefined (not enough variation in the data)")
    elif stats.skewness > 0:
        print("  → Right-skewed distribution (tail extends right)")
    elif stats.skewness < 0:
        print("  → Left-skewed distribution (tail extends left)")
    else:
        print("  → Symmetric distribution")

    # NOTE(review): comparing against 0 assumes stats.kurtosis is *excess*
    # kurtosis (normal distribution = 0) — confirm against DataProcessor.
    if pd.isna(stats.kurtosis):
        print("  → Kurtosis undefined (not enough variation in the data)")
    elif stats.kurtosis > 0:
        print("  → Heavy tails (more outliers than normal distribution)")
    elif stats.kurtosis < 0:
        print("  → Light tails (fewer outliers than normal distribution)")
    else:
        # Previously an exact zero printed no interpretation at all.
        print("  → Tails comparable to a normal distribution")
    print()

## Summary
Key findings and insights

In [None]:
# Horizontal rule reused for the summary header and footer.
divider = "=" * 60

print("ANALYSIS SUMMARY")
print(divider)
print(f"\nWords analyzed: {len(analyzer.valid_words):,}")
print(f"Word lengths: {', '.join(str(length) for length in WORD_LENGTHS)}")
# A falsy SAMPLE_SIZE (None or 0) is reported as 'All'.
print(f"Permutations per length: {SAMPLE_SIZE or 'All'}")

print("\nKey Findings:")
for word_length in WORD_LENGTHS:
    length_stats = processor.get_stats(word_length)
    print(f"\n{word_length}-letter words:")
    print(f"  Average valid words per permutation: {length_stats.mean:.2f}")
    print(f"  Range: {length_stats.min} to {length_stats.max}")
    print(f"  50% of permutations have between {length_stats.q25:.0f} and {length_stats.q75:.0f} valid words")

print("\n" + divider)
print("Analysis complete!")