celloopa · celloopa · Jan 16, 2026 · Jan 16, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,6 +6,16 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
 
 ## [Unreleased]
 
+### Added
+
+- **Microsoft Careers Fetcher** ([#18](https://github.com/celloopa/ghosted/issues/18))
+  - Added specialized extractor for `careers.microsoft.com` and `apply.careers.microsoft.com`
+  - Parses job data from Next.js `__NEXT_DATA__` JSON embedded in page
+  - Extracts title, description, qualifications, responsibilities, location, and employment type
+  - Falls back to meta tags when JSON parsing fails
+  - Validates data to reject numeric company names and empty, very short, or numeric positions
+  - 7 new tests for comprehensive coverage
+
 ## [0.7.0-beta] - 2026-01-16
 
 ### Added

diff --git a/PROGRESS.md b/PROGRESS.md
@@ -1,6 +1,6 @@
 # Multi-Agent Document Generation Pipeline - Progress Tracker
 
-> **Last Updated:** 2026-01-16 (unified fetch command complete)
+> **Last Updated:** 2026-01-16 (Microsoft fetcher #18 completed)
 > **Project:** ghosted
 > **Kanban Project ID:** `b666852b-0ef9-4ee0-8d91-a7f341697897`
 > **GitHub Repo:** `celloopa/ghosted`
@@ -48,6 +48,12 @@ When a user drops a job posting into `local/postings/`, agents will:
 | `[x]` | Add `ghosted apply` CLI command | `b9615c5d-fd52-418c-89da-4bec8c724f83` | ✅ Implemented with --dry-run, --auto-approve |
 | `[ ]` | Add watch mode for automatic processing | `2cdc1317-ddc0-408b-ab3d-b6fb92e2887b` | Nice-to-have: monitor folder |
 
+### Bug Fixes & Improvements
+
+| Status | Task | Issue | Notes |
+|--------|------|-------|-------|
+| `[x]` | Add Microsoft Careers site fetcher | [#18](https://github.com/celloopa/ghosted/issues/18) | ✅ `extractMicrosoft()` parses `__NEXT_DATA__` JSON. 7 tests added. |
+
 ### Phase 4: Agent Automation & Training Data (After Phase 3)
 
 *These tasks depend on Phase 3 completion. Priority order within phase:*
@@ -69,6 +75,7 @@ All remaining tasks are tracked as GitHub issues. Each issue includes full imple
 
 | Issue | Title | Phase |
 |-------|-------|-------|
+| [#18](https://github.com/celloopa/ghosted/issues/18) | Add Microsoft Careers site fetcher | Bug Fix (Completed) |
 | [#1](https://github.com/celloopa/ghosted/issues/1) | Implement Resume Generator Agent | Core |
 | [#2](https://github.com/celloopa/ghosted/issues/2) | Implement Cover Letter Generator Agent | Core |
 | [#3](https://github.com/celloopa/ghosted/issues/3) | Implement Hiring Manager Review Agent | Core |
@@ -204,8 +211,8 @@ ghosted watch --auto-approve               # Auto-approve all
 - `ghosted compile <id|dir>` command - compiles Typst to PDF and updates tracker
 - `ghosted cv fetch <website>` command - fetches CV from remote websites
 
-**Next task to work on:** Phase 4 improvements:
-1. Add `--non-interactive` flag to ghosted apply (`bdfff0fc`) - for AI agent usage
+**Next task:**
+- Add `--non-interactive` flag to ghosted apply (`bdfff0fc`) - for AI agent usage
 
 **Blockers:** None - Phase 3 complete!
 
@@ -343,6 +350,34 @@ Replace Claude API calls with local model inference:
 
 ## Completed Work Log
 
+### 2026-01-16: Microsoft Careers Site Fetcher (#18)
+
+Added specialized extractor for `careers.microsoft.com` and `apply.careers.microsoft.com` URLs.
+
+**Problem solved:** Microsoft Careers uses React/Next.js with job data embedded in `<script id="__NEXT_DATA__">` JSON. The generic fetcher was returning truncated content.
+
+**Files modified:**
+- `internal/fetch/fetcher.go` - Added `extractMicrosoft()` and helper functions
+- `internal/fetch/fetcher_test.go` - Added 7 new tests
+
+**Implementation:**
+- `extractMicrosoft()` - Main extractor, parses `__NEXT_DATA__` JSON
+- `parseMicrosoftNextData()` - Navigates JSON structure (job, jobDetail, data)
+- `extractMicrosoftJobData()` - Extracts title, description, qualifications, responsibilities
+- `validateMicrosoftExtraction()` - Resets numeric company names to "Microsoft"; blanks positions that are empty, shorter than 3 characters, or purely numeric
+- `isNumeric()` - Helper for validation
+
+**Test coverage:**
+- `TestFetcher_ExtractMicrosoft_NextData` - Full `__NEXT_DATA__` parsing
+- `TestFetcher_ExtractMicrosoft_FallbackToMeta` - Meta tag fallback
+- `TestFetcher_ExtractMicrosoft_JobDetail` - Alternate JSON structure
+- `TestFetcher_ExtractMicrosoft_QualificationsArray` - Array handling
+- `TestIsNumeric` - Numeric validation
+- `TestValidateMicrosoftExtraction` - Data validation
+- `TestFetcher_ExtractMicrosoft_ApplySubdomain` - Subdomain support
+
+---
+
 ### 2026-01-16: Unified Fetch Command + TUI Fetch View
 
 Merged job posting fetch and CV fetch into a single `ghosted fetch` command with auto-detection.

diff --git a/README.md b/README.md
@@ -147,6 +147,7 @@ ghosted fetch https://example.com/cv.json     # Explicit CV URL
 **Supported job boards:**
 - Lever (`jobs.lever.co`)
 - Greenhouse (`boards.greenhouse.io`)
+- Microsoft Careers (`careers.microsoft.com`, `apply.careers.microsoft.com`)
 - Workday
 - LinkedIn Jobs
 - Ashby

diff --git a/internal/fetch/fetcher.go b/internal/fetch/fetcher.go
@@ -1,6 +1,7 @@
 package fetch
 
 import (
+	"encoding/json"
 	"fmt"
 	"io"
 	"net/http"
@@ -176,6 +177,8 @@ func (f *Fetcher) ExtractJobPosting(html string, parsedURL *url.URL) (content, c
 		return f.extractLinkedIn(html)
 	case strings.Contains(host, "ashbyhq.com"):
 		return f.extractAshby(html)
+	case strings.Contains(host, "careers.microsoft.com"):
+		return f.extractMicrosoft(html)
 	default:
 		return f.extractGeneric(html)
 	}
@@ -278,6 +281,184 @@ func (f *Fetcher) extractAshby(html string) (content, company, position string)
 	return content, company, position
 }
 
+// extractMicrosoft extracts a job posting from a Microsoft Careers page.
+//
+// Microsoft Careers is a Next.js app, so the job data is read from the JSON
+// embedded in <script id="__NEXT_DATA__">. Whenever that does not yield a
+// usable position or content, the og:title / og:description meta tags are
+// used instead.
+//
+// company starts out as "Microsoft". content and position may come back empty
+// when neither the JSON nor the meta tags provide usable values.
+func (f *Fetcher) extractMicrosoft(html string) (content, company, position string) {
+	company = "Microsoft"
+
+	// Locate the JSON payload between the end of the opening <script ...> tag
+	// and the next </script>.
+	if tagStart := strings.Index(html, `<script id="__NEXT_DATA__"`); tagStart != -1 {
+		if tagClose := strings.Index(html[tagStart:], ">"); tagClose != -1 {
+			jsonStart := tagStart + tagClose + 1
+			if jsonLen := strings.Index(html[jsonStart:], "</script>"); jsonLen != -1 {
+				content, position = f.parseMicrosoftNextData(html[jsonStart : jsonStart+jsonLen])
+			}
+		}
+	}
+
+	// Validate the JSON-derived values before deciding on the fallback, so a
+	// position that validation rejects (for example a bare numeric ID) still
+	// gets a chance to be replaced by the og:title value below.
+	content, company, position = f.validateMicrosoftExtraction(content, company, position)
+
+	if position == "" {
+		position = extractMetaContent(html, "og:title")
+		// Clean up Microsoft title format: "Job Title | Microsoft Careers"
+		if idx := strings.Index(position, " | "); idx != -1 {
+			position = position[:idx]
+		}
+	}
+
+	if content == "" {
+		content = extractMetaContent(html, "og:description")
+	}
+
+	// Validate again: the meta-tag values are held to the same rules.
+	content, company, position = f.validateMicrosoftExtraction(content, company, position)
+
+	content = cleanHTML(content)
+	company = cleanText(company)
+	position = cleanText(position)
+
+	return content, company, position
+}
+
+// parseMicrosoftNextData decodes the __NEXT_DATA__ payload and pulls the job
+// content and position out of props.pageProps.
+//
+// The job object is looked up under pageProps.job, pageProps.jobDetail and
+// pageProps.data, in that order; the first key that holds a JSON object is
+// used. If none does, pageProps itself is treated as the job object. Both
+// results are empty when the JSON is invalid or props.pageProps is missing.
+func (f *Fetcher) parseMicrosoftNextData(jsonData string) (content, position string) {
+	var root map[string]interface{}
+	if json.Unmarshal([]byte(jsonData), &root) != nil {
+		return "", ""
+	}
+
+	props, hasProps := root["props"].(map[string]interface{})
+	if !hasProps {
+		return "", ""
+	}
+	pageProps, hasPageProps := props["pageProps"].(map[string]interface{})
+	if !hasPageProps {
+		return "", ""
+	}
+
+	// Known containers for the job object, in priority order.
+	for _, key := range []string{"job", "jobDetail", "data"} {
+		if candidate, isObject := pageProps[key].(map[string]interface{}); isObject {
+			return f.extractMicrosoftJobData(candidate)
+		}
+	}
+
+	// No known container: the job fields may sit directly on pageProps.
+	return f.extractMicrosoftJobData(pageProps)
+}
+
+// extractMicrosoftJobData builds the posting content and position from one
+// decoded job object.
+//
+// position is the first non-empty string among the keys title, jobTitle, name
+// and positionTitle. content is the newline-joined concatenation of every
+// non-empty description field (description, jobDescription, fullDescription,
+// summary), followed by optional Qualifications and Responsibilities
+// sections, a location line (location, else primaryLocation) and an
+// employment type line. Both results are empty when no key matches.
+func (f *Fetcher) extractMicrosoftJobData(job map[string]interface{}) (content, position string) {
+	// Extract position/title
+	for _, key := range []string{"title", "jobTitle", "name", "positionTitle"} {
+		if val, ok := job[key].(string); ok && val != "" {
+			position = val
+			break
+		}
+	}
+
+	// Collect every description-like field that is present.
+	var descParts []string
+	for _, key := range []string{"description", "jobDescription", "fullDescription", "summary"} {
+		if val, ok := job[key].(string); ok && val != "" {
+			descParts = append(descParts, val)
+		}
+	}
+
+	// appendSection adds a markdown section for a field that may be either a
+	// single string or a list of strings. The heading is emitted only when
+	// there is something to put under it, so an empty list, or a list with no
+	// usable string items, never leaves a dangling heading in the content.
+	appendSection := func(key, heading string) {
+		switch val := job[key].(type) {
+		case string:
+			if val != "" {
+				descParts = append(descParts, "\n\n## "+heading+"\n\n"+val)
+			}
+		case []interface{}:
+			var bullets []string
+			for _, item := range val {
+				// Skip non-string and blank entries rather than emitting empty bullets.
+				if s, ok := item.(string); ok && strings.TrimSpace(s) != "" {
+					bullets = append(bullets, "- "+s)
+				}
+			}
+			if len(bullets) > 0 {
+				descParts = append(descParts, "\n\n## "+heading+"\n")
+				descParts = append(descParts, bullets...)
+			}
+		}
+	}
+	appendSection("qualifications", "Qualifications")
+	appendSection("responsibilities", "Responsibilities")
+
+	// Location: prefer "location", fall back to "primaryLocation".
+	if loc, ok := job["location"].(string); ok && loc != "" {
+		descParts = append(descParts, "\n\n**Location:** "+loc)
+	} else if loc, ok := job["primaryLocation"].(string); ok && loc != "" {
+		descParts = append(descParts, "\n\n**Location:** "+loc)
+	}
+
+	// Employment type
+	if empType, ok := job["employmentType"].(string); ok && empType != "" {
+		descParts = append(descParts, "\n\n**Employment Type:** "+empType)
+	}
+
+	content = strings.Join(descParts, "\n")
+	return content, position
+}
+
+// validateMicrosoftExtraction sanity-checks values extracted from a Microsoft
+// Careers page and returns them in the same order (content, company, position).
+//
+// A purely numeric company is replaced with "Microsoft". A position is blanked
+// when its whitespace-trimmed length is under 3 bytes or when it is purely
+// numeric. content is passed through unchanged.
+func (f *Fetcher) validateMicrosoftExtraction(content, company, position string) (string, string, string) {
+	// A numeric company name indicates a parsing error upstream.
+	if isNumeric(company) {
+		company = "Microsoft"
+	}
+
+	// len counts bytes, not runes, so the minimum is 3 bytes after trimming.
+	tooShort := len(strings.TrimSpace(position)) < 3
+	if tooShort || isNumeric(position) {
+		position = ""
+	}
+
+	return content, company, position
+}
+
+// isNumeric reports whether s, ignoring leading and trailing whitespace, is
+// non-empty and made up solely of the ASCII digits 0-9.
+func isNumeric(s string) bool {
+	trimmed := strings.TrimSpace(s)
+	if trimmed == "" {
+		return false
+	}
+	// IndexFunc returns -1 when no rune falls outside '0'..'9'.
+	notDigit := func(r rune) bool { return r < '0' || r > '9' }
+	return strings.IndexFunc(trimmed, notDigit) == -1
+}
+
 // extractGeneric extracts job posting from any HTML page
 func (f *Fetcher) extractGeneric(html string) (content, company, position string) {
 	// Try common meta tags