Research the Tree-sitter Grammar + +Before starting, gather information: + +- **Grammar repository**: Find the official tree-sitter grammar repository (e.g., `https://github.com/tree-sitter-grammars/tree-sitter-LANGUAGE`) +- **Package name**: Identify the exact crate name (e.g., `tree-sitter-zig`) +- **Version**: Determine the version to use (check crates.io or user specification) +- **Node types**: Fetch the `queries/highlights.scm` file from the repository to understand node structure + +**Key files to examine in the grammar repository:** +- `queries/highlights.scm` - Shows what node types exist +- `src/node-types.json` - Complete node type definitions (if available) +- Example code in the repository's tests + +### 2. Add Workspace Dependency + +Edit `Cargo.toml` (workspace root): + +```toml +[workspace.dependencies] +# ... existing dependencies ... +tree-sitter-LANGUAGE = "VERSION" +``` + +**Example:** +```toml +tree-sitter-zig = "1.1.2" +``` + +### 3. Add Crate Dependency + +Edit `crates/codebook/Cargo.toml`: + +```toml +[dependencies] +# ... existing dependencies ... +tree-sitter-LANGUAGE.workspace = true +``` + +Add in alphabetical order with other tree-sitter dependencies. + +### 4. Update Language Type Enum + +Edit `crates/codebook/src/queries.rs`: + +Add variant to `LanguageType` enum in **alphabetical order**: + +```rust +#[derive(Debug, Clone, PartialEq, Copy)] +pub enum LanguageType { + Bash, + C, + // ... other languages ... + YourLanguage, // Add here + Zig, +} +``` + +### 5. 
Add Language Setting

In `crates/codebook/src/queries.rs`, add entry to `LANGUAGE_SETTINGS` array:

```rust
LanguageSetting {
    type_: LanguageType::YourLanguage,
    ids: &["language_id"], // LSP language identifier
    dictionary_ids: &["language_id"], // Dictionary lookup
    query: include_str!("queries/yourlanguage.scm"),
    extensions: &["ext1", "ext2"], // File extensions
},
```

**Important notes:**
- `ids`: Language identifiers from [VSCode language identifiers](https://code.visualstudio.com/docs/languages/identifiers)
- `extensions`: Common file extensions without the dot
- Place in the array (order doesn't matter functionally but keep consistent)

### 6. Add Language Function Match Arm

In `crates/codebook/src/queries.rs`, update the `language()` method in `impl LanguageSetting`:

```rust
pub fn language(&self) -> Option<Language> {
    match self.type_ {
        // ... existing matches ...
        LanguageType::YourLanguage => Some(tree_sitter_language::LANGUAGE.into()),
        // OR if the crate has a function:
        LanguageType::YourLanguage => Some(tree_sitter_language::language().into()),
    }
}
```

**Note:** Check the tree-sitter crate's API. Most expose either:
- `LANGUAGE` constant (older style)
- `language()` function (newer style)
- `LANGUAGE_TYPENAME` for multi-language crates (e.g., `LANGUAGE_PHP`, `LANGUAGE_TYPESCRIPT`)

### 7. 
Create Tree-sitter Query File + +Create `crates/codebook/src/queries/yourlanguage.scm` + +**Query file structure:** +```scheme +; Comments - capture all comment types +(line_comment) @comment +(block_comment) @comment +(doc_comment) @comment + +; Identifiers - capture DEFINITIONS only, not usages +(function_declaration + name: (identifier) @identifier) + +(variable_declaration + (identifier) @identifier) + +(parameter + (identifier) @identifier) + +; Struct/Type definitions +(struct_declaration + name: (type_identifier) @identifier) + +(field_declaration + name: (field_identifier) @identifier) + +; String literals - capture string content +(string_content) @string +(string) @string +``` + +**Critical guidelines:** +- Focus on **definitions**, not references/usages +- Capture user-defined names, not keywords +- Include comments (all types) +- Include string literals +- Test the query thoroughly - invalid queries will fail compilation + +**How to discover node types:** +1. Visit the grammar's GitHub repository +2. Check `queries/highlights.scm` for existing patterns +3. Use [Tree-sitter Playground](https://tree-sitter.github.io/tree-sitter/playground.html) to test +4. Copy sample code, paste into playground with your grammar +5. Inspect the AST structure to identify node types + +**Common node type patterns by language:** +- Identifiers: `identifier`, `IDENTIFIER`, `name` +- Strings: `string`, `string_content`, `string_literal` +- Comments: `comment`, `line_comment`, `block_comment`, `doc_comment` +- Functions: `function_declaration`, `function_definition`, `FnProto` +- Variables: `variable_declaration`, `var_decl`, `VarDecl` + +### 8. 
Create Example File + +Create `examples/example.LANGUAGE` with intentional spelling errors: + +**Requirements:** +- Must contain at least one spelling error (for integration tests) +- Include various language constructs: functions, variables, comments, strings +- Use realistic code patterns +- Include misspellings in: identifiers, strings, comments + +**Example structure:** +```language +// Comment with speling error +const myVarible = "Hello Wolrd"; + +function processDatta(inputt) { + const resullt = inputt + 1; + return resullt; +} +``` + +### 9. Create Test File + +Create `crates/codebook/tests/test_yourlanguage.rs`: + +**Template:** +```rust +use codebook::{ + parser::{TextRange, WordLocation}, + queries::LanguageType, +}; + +mod utils; + +#[test] +fn test_yourlanguage_location() { + utils::init_logging(); + let sample_text = r#" +// Your sample code with misspellings +const speling = "error"; +"#; + + let expected = vec![ + WordLocation::new( + "speling".to_string(), + vec![TextRange { + start_byte: 6, // Calculate exact byte positions + end_byte: 13, + }], + ), + // Add more expected misspellings + ]; + + let not_expected = ["const", "std"]; // Keywords that should NOT be flagged + + let processor = utils::get_processor(); + let misspelled = processor + .spell_check(sample_text, Some(LanguageType::YourLanguage), None) + .to_vec(); + + println!("Misspelled words: {misspelled:?}"); + + for e in &expected { + println!("Expecting: {e:?}"); + let miss = misspelled.iter().find(|r| r.word == e.word).unwrap(); + assert_eq!(miss.locations, e.locations); + } + + for result in misspelled { + assert!(!not_expected.contains(&result.word.as_str())); + } +} +``` + +**Test requirements:** +- Include multiple types of misspellings +- Verify byte positions are exact +- Ensure keywords are NOT captured +- Test comments, strings, and identifiers separately + +### 10. Run Tests + +Execute in order: + +```bash +# 1. 
Verify query is valid +cargo test -p codebook queries::tests::test_all_queries_are_valid + +# 2. Run language-specific test +cargo test -p codebook test_yourlanguage + +# 3. Run all tests +cargo test -p codebook +``` + +## Common Issues and Solutions + +### Issue: Invalid query error with node type + +**Error:** `QueryError { message: "NodeTypeName", kind: NodeType }` + +**Solution:** +- The node type doesn't exist in the grammar +- Check the grammar's `queries/highlights.scm` for correct node names +- Node types are case-sensitive +- Use tree-sitter playground to verify AST structure + +### Issue: Capturing too many or too few occurrences + +**Problem:** Test fails because word appears more times than expected + +**Solution:** +- Refine query to capture only definitions, not usages +- Use field names in captures: `name: (identifier)` instead of just `(identifier)` +- Check if you're capturing both definition and reference + +### Issue: Keywords being captured + +**Problem:** Language keywords appear in misspelled words + +**Solution:** +- Don't capture `(keyword)` nodes +- Be specific in queries - use parent node context +- Only capture user-defined names + +### Issue: Wrong language() function syntax + +**Error:** Compilation error in `language()` match arm + +**Solution:** +- Check the tree-sitter crate documentation +- Try: `LANGUAGE.into()`, `language().into()`, or `LANGUAGE_VARIANT.into()` +- Look at the crate's lib.rs for the public API + +## Testing Checklist + +Before considering the implementation complete: + +- [ ] Query file compiles without errors +- [ ] `test_all_queries_are_valid` passes +- [ ] Language-specific test passes +- [ ] Example file exists with intentional errors +- [ ] All expected misspellings are caught +- [ ] No keywords are captured +- [ ] Byte positions in tests are accurate +- [ ] Comments are captured +- [ ] String literals are captured +- [ ] Identifier definitions are captured + +## File Modification Summary + +Files that MUST be 
modified: + +1. `Cargo.toml` - Add workspace dependency +2. `crates/codebook/Cargo.toml` - Add crate dependency +3. `crates/codebook/src/queries.rs` - Add enum variant, setting, and language match +4. `crates/codebook/src/queries/LANGUAGE.scm` - Create query file +5. `examples/example.LANGUAGE` - Create example file +6. `crates/codebook/tests/test_LANGUAGE.rs` - Create test file + +## Query File Best Practices + +1. **Start simple**: Begin with basic captures (comments, simple identifiers) +2. **Test incrementally**: Add one capture type at a time +3. **Use field names**: `name: (identifier)` is better than `(identifier)` +4. **Check highlights.scm**: The language's highlight query is your best reference +5. **Avoid ambiguity**: Be specific about what context you're capturing +6. **Comment your queries**: Explain what each section captures + +## Example: Real Implementation Reference + +For a complete reference implementation, examine existing languages: +- Simple: `queries/go.scm`, `tests/test_go.rs` +- Complex: `queries/rust.scm`, `tests/test_rust.rs` +- With strings: `queries/python.scm`, `tests/test_python.rs` + +## Byte Position Calculation + +Tests require exact byte positions. To calculate: + +1. Copy your sample text exactly as in the test +2. Count UTF-8 bytes from start of string to word start +3. Count UTF-8 bytes from start of string to word end +4. Remember: Most ASCII characters are 1 byte, but check UTF-8 encoding + +**Pro tip**: Print actual results first, then use those byte positions in your test expectations. + +## Final Verification + +Run the full test suite: +```bash +cargo test -p codebook +``` + +All tests should pass. If not, review error messages and adjust queries or test expectations. 
\ No newline at end of file From 83a5b538eb09e8ebeff6979425d44c79dc44422d Mon Sep 17 00:00:00 2001 From: Bo Lopker Date: Tue, 14 Oct 2025 22:44:49 -0700 Subject: [PATCH 2/2] Add LaTeX support --- Cargo.lock | 11 ++ README.md | 2 + crates/codebook/Cargo.toml | 1 + crates/codebook/src/parser.rs | 26 ++- crates/codebook/src/queries.rs | 9 + crates/codebook/src/queries/latex.scm | 5 + crates/codebook/tests/test_elixir.rs | 5 +- crates/codebook/tests/test_go.rs | 6 +- crates/codebook/tests/test_latex.rs | 234 ++++++++++++++++++++++++++ crates/codebook/tests/test_php.rs | 5 +- examples/example.tex | 111 +++++++----- journal/add_language_support.md | 18 +- word_lists/latex.txt | 2 + 13 files changed, 375 insertions(+), 60 deletions(-) create mode 100644 crates/codebook/src/queries/latex.scm create mode 100644 crates/codebook/tests/test_latex.rs create mode 100644 word_lists/latex.txt diff --git a/Cargo.lock b/Cargo.lock index 0328364..81f08b6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -414,6 +414,7 @@ dependencies = [ name = "codebook" version = "0.3.13" dependencies = [ + "codebook-tree-sitter-latex", "codebook_config", "codebook_downloader", "env_logger", @@ -472,6 +473,16 @@ dependencies = [ "tower-lsp", ] +[[package]] +name = "codebook-tree-sitter-latex" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b1ee613183ae5dd1f992921539d573f9b0c38a7cefbde8e97092c3824ba2fb1" +dependencies = [ + "cc", + "tree-sitter", +] + [[package]] name = "codebook_config" version = "0.3.13" diff --git a/README.md b/README.md index f08216b..035d1cc 100644 --- a/README.md +++ b/README.md @@ -131,6 +131,7 @@ Codebook is in active development. As better dictionaries are added, words that | Haskell | ⚠️ | | Java | ✅ | | JavaScript | ✅ | +| LaTeX | ⚠️ | | Lua | ✅ | | Markdown | ✅ | | PHP | ⚠️ | @@ -140,6 +141,7 @@ Codebook is in active development. 
As better dictionaries are added, words that | Rust | ✅ | | TOML | ✅ | | TypeScript | ✅ | +| Zig | ✅ | ✅ = Good to go. ⚠️ = Supported, but needs more testing. Help us improve! diff --git a/crates/codebook/Cargo.toml b/crates/codebook/Cargo.toml index 7cee939..550b4a5 100644 --- a/crates/codebook/Cargo.toml +++ b/crates/codebook/Cargo.toml @@ -39,6 +39,7 @@ tree-sitter-haskell.workspace = true tree-sitter-html.workspace = true tree-sitter-java.workspace = true tree-sitter-javascript.workspace = true +codebook-tree-sitter-latex.workspace = true tree-sitter-lua.workspace = true tree-sitter-php.workspace = true tree-sitter-python.workspace = true diff --git a/crates/codebook/src/parser.rs b/crates/codebook/src/parser.rs index 031197c..4afa164 100644 --- a/crates/codebook/src/parser.rs +++ b/crates/codebook/src/parser.rs @@ -2,12 +2,12 @@ use crate::splitter::{self}; use crate::queries::{LanguageType, get_language_setting}; use regex::Regex; -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use streaming_iterator::StreamingIterator; use tree_sitter::{Parser, Query, QueryCursor}; use unicode_segmentation::UnicodeSegmentation; -#[derive(Debug, Clone, Copy, PartialEq, Ord, Eq, PartialOrd)] +#[derive(Debug, Clone, Copy, PartialEq, Ord, Eq, PartialOrd, Hash)] pub struct TextRange { /// Start position in utf-8 byte offset pub start_byte: usize, @@ -195,7 +195,7 @@ fn find_locations_code( let query = Query::new(&language, language_setting.query).unwrap(); let mut cursor = QueryCursor::new(); - let mut word_locations: HashMap<String, Vec<TextRange>> = HashMap::new(); + let mut word_locations: HashMap<String, HashSet<TextRange>> = HashMap::new(); let provider = text.as_bytes(); let mut matches_query = cursor.matches(&query, root_node, provider); @@ -217,13 +217,18 @@ fn find_locations_code( end_byte: range.end_byte + node_start_byte, }; if let Some(existing_result) = word_locations.get_mut(&word_pos.word) { + let added = existing_result.insert(location); #[cfg(debug_assertions)] if 
existing_result.contains(&location) { - panic!("Two of the same locations found. Make a better query.") + if !added { + let word = word_pos.word.clone(); + panic!( + "Two of the same locations found. Make a better query. Word: {word}, Location: {location:?}" + ) } - existing_result.push(location); } else { - word_locations.insert(word_pos.word.clone(), vec![location]); + let mut set = HashSet::new(); + set.insert(location); + word_locations.insert(word_pos.word.clone(), set); } } } @@ -235,7 +240,12 @@ fn find_locations_code( .keys() .map(|word| WordLocation { word: word.clone(), - locations: word_locations.get(word).cloned().unwrap_or_default(), + locations: word_locations + .get(word) + .cloned() + .unwrap_or_default() + .into_iter() + .collect(), }) .collect() } diff --git a/crates/codebook/src/queries.rs b/crates/codebook/src/queries.rs index e4ecd6a..8d29392 100644 --- a/crates/codebook/src/queries.rs +++ b/crates/codebook/src/queries.rs @@ -14,6 +14,7 @@ pub enum LanguageType { HTML, Java, Javascript, + Latex, Lua, Php, Python, @@ -106,6 +107,13 @@ pub static LANGUAGE_SETTINGS: &[LanguageSetting] = &[ query: include_str!("queries/javascript.scm"), extensions: &["js", "jsx"], }, + LanguageSetting { + type_: LanguageType::Latex, + ids: &["latex"], + dictionary_ids: &["latex"], + query: include_str!("queries/latex.scm"), + extensions: &["tex", "latex", "ltx"], + }, LanguageSetting { type_: LanguageType::Typescript, ids: &["typescript", "typescriptreact"], @@ -216,6 +224,7 @@ impl LanguageSetting { LanguageType::HTML => Some(tree_sitter_html::LANGUAGE.into()), LanguageType::Java => Some(tree_sitter_java::LANGUAGE.into()), LanguageType::Javascript => Some(tree_sitter_javascript::LANGUAGE.into()), + LanguageType::Latex => Some(codebook_tree_sitter_latex::language()), LanguageType::Lua => Some(tree_sitter_lua::LANGUAGE.into()), LanguageType::Php => Some(tree_sitter_php::LANGUAGE_PHP.into()), LanguageType::Python => Some(tree_sitter_python::LANGUAGE.into()), diff 
--git a/crates/codebook/src/queries/latex.scm b/crates/codebook/src/queries/latex.scm new file mode 100644 index 0000000..1cc2017 --- /dev/null +++ b/crates/codebook/src/queries/latex.scm @@ -0,0 +1,5 @@ +; Comments - capture LaTeX comments (lines starting with %) +(line_comment) @comment + +; Text content in the document +(text) @string diff --git a/crates/codebook/tests/test_elixir.rs b/crates/codebook/tests/test_elixir.rs index ad003dd..0ee2d86 100644 --- a/crates/codebook/tests/test_elixir.rs +++ b/crates/codebook/tests/test_elixir.rs @@ -163,7 +163,10 @@ fn test_elixir_module() { println!("Expecting {}", expect.word); let result = misspelled.iter().find(|r| r.word == expect.word).unwrap(); assert_eq!(result.word, expect.word); - assert_eq!(result.locations, expect.locations); + assert!(result.locations.len() == expect.locations.len()); + for location in result.locations.iter() { + assert!(expect.locations.contains(location)) + } } } diff --git a/crates/codebook/tests/test_go.rs b/crates/codebook/tests/test_go.rs index 0815aba..db51c08 100644 --- a/crates/codebook/tests/test_go.rs +++ b/crates/codebook/tests/test_go.rs @@ -245,7 +245,11 @@ fn test_go_location() { for e in &expected { println!("Expecting: {e:?}"); let miss = misspelled.iter().find(|r| r.word == e.word).unwrap(); - assert_eq!(miss.locations, e.locations); + // assert_eq!(miss.locations, e.locations); + assert!(miss.locations.len() == e.locations.len()); + for location in &miss.locations { + assert!(e.locations.contains(location)); + } } for result in misspelled { assert!(!not_expected.contains(&result.word.as_str())); diff --git a/crates/codebook/tests/test_latex.rs b/crates/codebook/tests/test_latex.rs new file mode 100644 index 0000000..1601ae1 --- /dev/null +++ b/crates/codebook/tests/test_latex.rs @@ -0,0 +1,234 @@ +use codebook::{ + parser::{TextRange, WordLocation}, + queries::LanguageType, +}; + +mod utils; + +#[test] +fn test_latex_comments() { + utils::init_logging(); + let sample_text = 
r#" +% This is a coment with a typo +% Another commnet with wrng spelling +\documentclass{article} + "#; + let expected = vec![ + WordLocation::new( + "coment".to_string(), + vec![TextRange { + start_byte: 13, + end_byte: 19, + }], + ), + WordLocation::new( + "commnet".to_string(), + vec![TextRange { + start_byte: 42, + end_byte: 49, + }], + ), + WordLocation::new( + "wrng".to_string(), + vec![TextRange { + start_byte: 55, + end_byte: 59, + }], + ), + ]; + let not_expected = vec!["documentclass", "article"]; + let processor = utils::get_processor(); + let misspelled = processor + .spell_check(sample_text, Some(LanguageType::Latex), None) + .to_vec(); + println!("Misspelled words: {misspelled:?}"); + for e in &expected { + println!("Expecting: {e:?}"); + let miss = misspelled + .iter() + .find(|r| r.word == e.word) + .unwrap_or_else(|| panic!("Word '{}' not found in misspelled list", e.word)); + assert_eq!(miss.locations, e.locations); + } + for word in not_expected { + assert!(!misspelled.iter().any(|r| r.word == word)); + } +} + +#[test] +fn test_latex_text_content() { + utils::init_logging(); + let sample_text = r#" +\section{Introducton} + +This is an exampl of text with speling errors. + "#; + let expected = vec!["Introducton", "exampl", "speling"]; + let processor = utils::get_processor(); + let binding = processor + .spell_check(sample_text, Some(LanguageType::Latex), None) + .to_vec(); + let mut misspelled = binding + .iter() + .map(|r| r.word.as_str()) + .collect::<Vec<_>>(); + misspelled.sort(); + println!("Misspelled words: {misspelled:?}"); + assert_eq!(misspelled, expected); +} + +#[test] +fn test_latex_sections_and_text() { + utils::init_logging(); + let sample_text = r#" +\section{Methology} + +The methology section describs the approach. + +\subsection{Bakground} + +In this secion we discuss importnt concepts. 
+ "#; + let expected = vec![ + "Bakground", + "Methology", + "describs", + "importnt", + "methology", + "secion", + ]; + let processor = utils::get_processor(); + let binding = processor + .spell_check(sample_text, Some(LanguageType::Latex), None) + .to_vec(); + let mut misspelled = binding + .iter() + .map(|r| r.word.as_str()) + .collect::<Vec<_>>(); + misspelled.sort(); + println!("Misspelled words: {misspelled:?}"); + assert_eq!(misspelled, expected); +} + +#[test] +fn test_latex_itemize() { + utils::init_logging(); + let sample_text = r#" +\begin{itemize} + \item First itm with algoritm + \item Second itm about formulas +\end{itemize} + "#; + let expected = vec!["algoritm", "itm"]; + let processor = utils::get_processor(); + let binding = processor + .spell_check(sample_text, Some(LanguageType::Latex), None) + .to_vec(); + let mut misspelled = binding + .iter() + .map(|r| r.word.as_str()) + .collect::<Vec<_>>(); + misspelled.sort(); + println!("Misspelled words: {misspelled:?}"); + assert_eq!(misspelled, expected); +} + +#[test] +fn test_latex_mixed_content() { + utils::init_logging(); + let sample_text = r#" +% Comment: calcuate the result +\section{Resuts} + +The resuts show our aproach is efective. + +\begin{equation} + E = mc^2 \label{eq:enrgy} +\end{equation} + +As shown in Equation~\ref{eq:enrgy}, the relatioship is clear. 
+ "#; + let expected = vec![ + "Resuts", + "aproach", + "calcuate", + "efective", + "enrgy", + "relatioship", + "resuts", + ]; + let not_expected = vec!["equation", "label", "ref", "begin", "end", "section"]; + let processor = utils::get_processor(); + let binding = processor + .spell_check(sample_text, Some(LanguageType::Latex), None) + .to_vec(); + let mut misspelled = binding + .iter() + .map(|r| r.word.as_str()) + .collect::<Vec<_>>(); + misspelled.sort(); + println!("Misspelled words: {misspelled:?}"); + assert_eq!(misspelled, expected); + for word in not_expected { + assert!(!misspelled.contains(&word)); + } +} + +#[test] +fn test_latex_comprehensive() { + utils::init_logging(); + let sample_text = r#" +\documentclass{article} + +% This coment has typos: wrng and speling +\title{A Sampel Document} + +\begin{document} + +\section{Introducton} + +This docment demonstrates the spel checker. + +\subsection{Analyss} + +The analyss reveals paterns in the data. + +\end{document} + "#; + let expected = vec![ + "Analyss", + "Introducton", + "Sampel", + "analyss", + "coment", + "docment", + "paterns", + "spel", + "speling", + "wrng", + ]; + let not_expected = vec![ + "documentclass", + "article", + "title", + "begin", + "end", + "document", + "section", + "subsection", + ]; + let processor = utils::get_processor(); + let binding = processor + .spell_check(sample_text, Some(LanguageType::Latex), None) + .to_vec(); + let mut misspelled = binding + .iter() + .map(|r| r.word.as_str()) + .collect::<Vec<_>>(); + misspelled.sort(); + println!("Misspelled words: {misspelled:?}"); + assert_eq!(misspelled, expected); + for word in not_expected { + assert!(!misspelled.contains(&word)); + } +} diff --git a/crates/codebook/tests/test_php.rs b/crates/codebook/tests/test_php.rs index be80ae0..166fcdf 100644 --- a/crates/codebook/tests/test_php.rs +++ b/crates/codebook/tests/test_php.rs @@ -204,7 +204,10 @@ $userDetails = $userr->getUserDeetails(); .iter() .find(|r| r.word == e.word) .expect("Word 
not found"); - assert_eq!(miss.locations, e.locations); + assert!(miss.locations.len() == e.locations.len()); + for location in &miss.locations { + assert!(e.locations.contains(location)); + } } for result in misspelled { diff --git a/examples/example.tex b/examples/example.tex index 07271fc..4abe015 100644 --- a/examples/example.tex +++ b/examples/example.tex @@ -1,46 +1,73 @@ -\documentclass[12pt]{article} -\usepackage{lingmacros} -\usepackage{tree-dvips} +\documentclass{article} +\usepackage{amsmath} +\usepackage{graphicx} + +\title{A Sampel Document with Speling Erors} +\author{John Doe} +\date{\today} + \begin{document} -\section*{Notes for My Paper} - -Don't forget to include examples of topicalization. -They look like this: - -{\small -\enumsentence{Topicalization from sentential subject:\\ -\shortex{7}{a John$_i$ [a & kltukl & [el & - {\bf l-}oltoir & er & ngii$_i$ & a Mary]]} -{ & {\bf R-}clear & {\sc comp} & - {\bf IR}.{\sc 3s}-love & P & him & } -{John, (it's) clear that Mary loves (him).}} -} - -\subsection*{How to handle topicalization} - -I'll just assume a tree structure like (\ex{1}). - -{\small -\enumsentence{Structure of A$'$ Projections:\\ [2ex] -\begin{tabular}[t]{cccc} - & \node{i}{CP}\\ [2ex] - \node{ii}{Spec} & &\node{iii}{C$'$}\\ [2ex] - &\node{iv}{C} & & \node{v}{SAgrP} -\end{tabular} -\nodeconnect{i}{ii} -\nodeconnect{i}{iii} -\nodeconnect{iii}{iv} -\nodeconnect{iii}{v} -} -} - -\subsection*{Mood} - -Mood changes when there is a topic, as well as when -there is WH-movement. \emph{Irrealis} is the mood when -there is a non-subject topic or WH-phrase in Comp. -\emph{Realis} is the mood when there is a subject topic -or WH-phrase. +\maketitle + +% This is a coment with a typo: calcuate +% Another commnet with wrng spellingg + +\section{Introducton} + +This is an exampl of a LaTeX document with intentinal spelling errors. +The purpse of this docment is to test the spel checker functionality. 
+ +\subsection{Bakground} + +In this secion, we will discuss some importnt concepts. +The folowing list contains severl items: + +\begin{itemize} + \item First itm with a typo: algoritm + \item Second itm about mathmatical formulas + \item Third itm discussing resarch methods +\end{itemize} + +\section{Methology} + +The methology section describs the approach used in this reserch. +We will demonstate several techniqes for procesing data. + +\subsection{Data Colection} + +Data was colected from varous sources including: +\begin{enumerate} + \item Primry sources + \item Secondry sources + \item Teriary references +\end{enumerate} + +\section{Resuts} + +The resuts show that our aproach is efective. +We can see this in the folowing equation: + +\begin{equation} + E = mc^2 \label{eq:enrgy} +\end{equation} + +As shown in Equation~\ref{eq:enrgy}, the relatioship is clear. + +\subsection{Analyss} + +The analyss reveals severl interesting paternzs. +These paterns suggest that our hypothsis was corect. + +% Comment: More detals needed here + +\section{Concluson} + +In concluson, this docment demonstrates variuos spelling errors +that should be detectd by the spell checker. The implimentation +should be able to identfy errors in text, coments, and labels. 
+ +\bibliographystyle{plain} +\bibliography{refereces} \end{document} diff --git a/journal/add_language_support.md b/journal/add_language_support.md index 41fd43e..bb799b1 100644 --- a/journal/add_language_support.md +++ b/journal/add_language_support.md @@ -204,7 +204,7 @@ fn test_yourlanguage_location() { // Your sample code with misspellings const speling = "error"; "#; - + let expected = vec![ WordLocation::new( "speling".to_string(), @@ -215,22 +215,22 @@ const speling = "error"; ), // Add more expected misspellings ]; - + let not_expected = ["const", "std"]; // Keywords that should NOT be flagged - + let processor = utils::get_processor(); let misspelled = processor .spell_check(sample_text, Some(LanguageType::YourLanguage), None) .to_vec(); - + println!("Misspelled words: {misspelled:?}"); - + for e in &expected { println!("Expecting: {e:?}"); let miss = misspelled.iter().find(|r| r.word == e.word).unwrap(); assert_eq!(miss.locations, e.locations); } - + for result in misspelled { assert!(!not_expected.contains(&result.word.as_str())); } @@ -260,6 +260,10 @@ cargo test -p codebook ## Common Issues and Solutions +### Issue: CamelCase words are getting split + +- Codebook processing splits words based on common word boundaries in programming like CamelCase and snake_case. Expect that when making tests. + ### Issue: Invalid query error with node type **Error:** `QueryError { message: "NodeTypeName", kind: NodeType }` @@ -357,4 +361,4 @@ Run the full test suite: cargo test -p codebook ``` -All tests should pass. If not, review error messages and adjust queries or test expectations. \ No newline at end of file +All tests should pass. If not, review error messages and adjust queries or test expectations. diff --git a/word_lists/latex.txt b/word_lists/latex.txt new file mode 100644 index 0000000..18ee27f --- /dev/null +++ b/word_lists/latex.txt @@ -0,0 +1,2 @@ +bibliographystyle +emph