## This notebook outlines a workflow for using AI tools for text classification and clustering.

User-supplied information that we will pass to the AI model includes:

In [35]:
import json
import pandas as pd

In [36]:
# --- User-supplied analysis settings (edit before running) ---

# Reasoning level: 'low', 'medium', or 'high'
reasoning_level = 'high'

# Full corpus name, e.g. Corpus of Contemporary American English, GloWbE, EcoLexicon
corpus_fullname = 'Corpus of Contemporary American English'

# Short corpus id, e.g. coca, glowbe, ecolexicon
corpus_shortname = 'COCA'

# Search phrase, e.g. best system, best method, national system, industry standard
keyword = 'national system'

# Optional label set (currently unused); adjust as needed:
# classifications = ['literal', 'figurative', 'neither', 'unclear']

# Number of random KWIC lines to sample
random_KWIC_sample = 100

# Annotation export (JSON) to load
path = "./kwic_coca_national_system_annotations_export.json"


In [37]:
#path = "./kwic_coca_industry_standard_annotations_export.json"
#path = "./kwic_coca_national_system_annotations_export.json"

# Load the annotation export; `data` is reused by later cells.
with open(path, "r", encoding="utf-8") as f:
    data = json.load(f)

# Columns of the flattened frame. Passing them explicitly keeps the frame
# well-formed (and sortable) even when the export contains no annotations.
# The 'match', 'full_text', 'classification' and 'notes' fields of each item
# are intentionally left out of the flattened view.
flat_columns = ['genre_id', 'genre_idx', 'text_id', 'context', 'timestamp']

# The export keeps everything under the top-level 'annotations' key:
# {genre_id: {index-as-string: item_dict}}
annotations = data.get('annotations', {})

# One record per concordance item; the string index is converted to int so
# that sorting is numeric rather than lexicographic.
rows = [
    {
        'genre_id': genre_id,
        'genre_idx': int(idx_str),
        'text_id': item.get('text_id'),
        'context': item.get('context'),
        'timestamp': item.get('timestamp'),
    }
    for genre_id, items_dict in annotations.items()
    for idx_str, item in items_dict.items()
]

# Build the frame once from the record list and sort for readability.
df_flat = (
    pd.DataFrame(rows, columns=flat_columns)
    .sort_values(['genre_id', 'genre_idx'])
    .reset_index(drop=True)
)

print(f"Flattened dataframe shape: {df_flat.shape}")
print(f"Columns: {list(df_flat.columns)}")
df_flat.head(5)

Flattened dataframe shape: (51, 5)
Columns: ['genre_id', 'genre_idx', 'text_id', 'context', 'timestamp']


Unnamed: 0,genre_id,genre_idx,text_id,context,timestamp
0,acad_1991,0,159,147 ) . 16 <p> A rapid diffusion of rural scho...,2025-11-21T21:30:29.339228
1,acad_1992,0,13,"tariffs for infant industries , and public dev...",2025-11-22T13:52:22.167679
2,acad_1993,3,386,of social and cultural movements that were rea...,2025-11-21T21:41:53.940823
3,acad_1993,5,797,healthcare system . <p> A distinguished study ...,2025-11-22T13:53:35.114754
4,acad_1996,2,187,"quo to continue , over mutual cooperation , wh...",2025-11-22T13:54:24.086793


### Create AI model

In [38]:
# COCA Computational Forensic Linguistics Agent
# Adapted from SCOTUS analysis tools for corpus linguistics analysis

from langchain.tools import BaseTool
from langchain.pydantic_v1 import BaseModel, Field
from langchain.chat_models import init_chat_model
from typing import Optional, Type, Dict, Any, Union, List
import json
import re
from datetime import datetime

# Initialize AWS Bedrock model for COCA forensic linguistics analysis
model_id = 'openai.gpt-oss-120b-1:0'  # 128K context window

# Passed to init_chat_model as `max_tokens`.
# NOTE(review): this equals the full 128K context window quoted above; if
# `max_tokens` caps *output* tokens for this provider, a smaller value that
# leaves room for the prompt is probably intended -- confirm.
max_tokens = 128000

model = init_chat_model(
    model_id,
    model_provider="bedrock_converse",
    # Adjust to your AWS profile
    # (see https://docs.aws.amazon.com/cli/v1/userguide/cli-configure-files.html)
    credentials_profile_name='atn-developer',
    max_tokens=max_tokens,
    temperature=0.0,
)

print(f"✅ AWS Bedrock model initialized: {model_id}")
print("🔬 AI Assisted Forensic Linguistics Tool Ready")


✅ AWS Bedrock model initialized: openai.gpt-oss-120b-1:0
🔬 AI Assisted Forensic Linguistics Tool Ready


## Test model connectivity to AWS Bedrock

In [40]:
# Smoke test: send one trivial question and display the raw response.
connectivity_reply = model.invoke('What is the capital of Pennsylvania?')
connectivity_reply

AIMessage(content=[{'type': 'reasoning_content', 'reasoning_content': {'text': 'The user asks: "What is the capital of Pennsylvania?" Straightforward factual question. Answer: Harrisburg. Provide answer.', 'signature': ''}}, {'type': 'text', 'text': 'The capital of Pennsylvania is **Harrisburg**.'}], additional_kwargs={}, response_metadata={'ResponseMetadata': {'RequestId': '56655a46-0c37-4115-bf98-e97163131cd1', 'HTTPStatusCode': 200, 'HTTPHeaders': {'date': 'Mon, 24 Nov 2025 15:52:00 GMT', 'content-type': 'application/json', 'content-length': '427', 'connection': 'keep-alive', 'x-amzn-requestid': '56655a46-0c37-4115-bf98-e97163131cd1'}, 'RetryAttempts': 0}, 'stopReason': 'end_turn', 'metrics': {'latencyMs': [36346]}, 'model_name': 'openai.gpt-oss-120b-1:0'}, id='run--fbfa85e8-8fb6-4817-a4a6-8d2c64c62f32-0', usage_metadata={'input_tokens': 76, 'output_tokens': 46, 'total_tokens': 122, 'input_token_details': {'cache_creation': 0, 'cache_read': 0}})

### Let's create a base tool class for user-supplied corpus analysis

In [41]:
from langchain.tools import BaseTool
from langchain.pydantic_v1 import BaseModel, Field
from langchain.chat_models import init_chat_model
from typing import Optional, Type, Dict, Any, Union, List
import json
import re
from datetime import datetime

class KWICAnalysisInput(BaseModel):
    """Argument schema for the KWIC analysis tool.

    Field names mirror the keyword arguments of ``KWICAnalysisTool._run``.
    A ``classifications`` list of allowed labels is not part of the schema
    at present.
    """

    reasoning_level: str = Field(
        default="high",
        description="Reasoning level: 'low', 'medium', or 'high'",
    )
    corpus_fullname: str = Field(
        description="Full corpus name (e.g. 'Corpus of Contemporary American English')",
    )
    corpus_shortname: str = Field(
        description="Short corpus id (e.g. 'COCA')",
    )
    keyword: str = Field(
        description="Search keyword or phrase",
    )
    random_KWIC_sample: int = Field(
        default=30,
        description="Number of random KWIC samples to analyze",
    )
    kwic_data: Dict[str, Any] = Field(
        description="KWIC concordance data (JSON structure)",
    )

    
class KWICAnalysisTool(BaseTool):
    """Tool that sends KWIC concordance data to a chat model for analysis.

    The chat model is not created here; assign it to ``model`` before
    calling the tool. ``_run`` builds a single prompt containing the corpus
    metadata and the (optionally down-sampled) concordance JSON and returns
    whatever the model produced.
    """
    name: str = "kwic_analysis"
    description: str = """Analyzes KWIC (Key Word in Context) concordance data from corpus linguistics.
    Provides AI-assisted classification and analysis of concordance lines."""
    args_schema: Type[BaseModel] = KWICAnalysisInput

    # Chat model used for the analysis; must expose ``invoke(prompt)``.
    model: Any = None

    def _run(
        self,
        reasoning_level: str,
        corpus_fullname: str,
        corpus_shortname: str,
        keyword: str,
        random_KWIC_sample: int,
        kwic_data: Dict[str, Any]
    ) -> Union[str, List[Any]]:
        """Execute KWIC analysis using the attached chat model.

        Args:
            reasoning_level: Reasoning level label included in the prompt.
            corpus_fullname: Full corpus name included in the prompt.
            corpus_shortname: Short corpus id included in the prompt.
            keyword: Keyword or phrase under analysis.
            random_KWIC_sample: When positive and smaller than the number of
                concordance lines, that many lines are drawn at random and
                only those are sent to the model.
            kwic_data: Concordance data; lines are read from
                ``kwic_data['annotations'][genre_id][index]``. The dict is
                never modified.

        Returns:
            ``response.content`` of the model reply (a string or a list of
            content blocks, depending on the model), or an error message
            string starting with ``"Error"`` when the model is missing or
            the call fails.
        """
        # Fail fast before doing any work if no model has been attached.
        if self.model is None:
            return "Error: Model not initialized"

        # Prepare corpus metadata
        metadata = {
            "corpus_name": corpus_fullname,
            "corpus_id": corpus_shortname,
            "keyword": keyword,
            "reasoning_level": reasoning_level,
            "timestamp": datetime.now().isoformat()
        }

        # Count total concordance lines across all genres
        annotations = kwic_data.get('annotations') or {}
        total_lines = sum(len(items) for items in annotations.values())

        # Take a random sample of concordance lines if requested. The sample
        # goes into a new top-level dict so the caller's `kwic_data` keeps
        # its full set of annotations (overwriting it in place would make
        # every later call sample from an already-reduced data set).
        prompt_data = kwic_data
        if random_KWIC_sample > 0 and total_lines > random_KWIC_sample:
            import random
            all_items = [
                (genre_id, idx_str, item)
                for genre_id, items in annotations.items()
                for idx_str, item in items.items()
            ]
            sampled_items = {}
            for genre_id, idx_str, item in random.sample(all_items, random_KWIC_sample):
                sampled_items.setdefault(genre_id, {})[idx_str] = item
            prompt_data = {**kwic_data, 'annotations': sampled_items}

        # Construct prompt for Bedrock
        prompt = f"""You are a forensic linguistics expert analyzing concordance data from {corpus_fullname} ({corpus_shortname}).

                    **Analysis Task:**
                    - Reasoning level: {reasoning_level}
                    - Corpus Fullname: {corpus_fullname}
                    - Corpus Name: {corpus_shortname}
                    - Keyword/Phrase: "{keyword}"
                    - Total concordance lines: {total_lines}
                    - Random KWIC sample size: {random_KWIC_sample}

                    **Corpus Metadata:**
                    {json.dumps(metadata, indent=2)}

                    **KWIC Concordance Data:**
                    {json.dumps(prompt_data, indent=2)}

                    **Instructions:**
                    Focus on detailed linguistic analysis of concordance line patterns given the keyword/phrase of interest.
                    Do not reference information beyond what is provided in the text, and if you are not certain, indicate uncertainty.
                    The question to answer is "given the context of the full concordance line text provided, in what sense did the speaker mean when they used the keyword/phrase?"

                    1. Collocates
                    2. Semantic Prosody
                    3. Grammatical Patterns
                    4. Semantic Domains
                    5. Evaluation/Comparison
                    6. Qualification/Mitigation
                    7. Speaker/Writer Stance

                    Please provide a structured analysis with:
                    - Individual line classifications with reasoning
                    - Pattern identification
                    - Summary statistics
                    - Key linguistic insights
        """

        # Invoke the model; failures are reported as a string rather than
        # raised so a notebook run can continue and show the message.
        try:
            response = self.model.invoke(prompt)
            return response.content if hasattr(response, 'content') else str(response)
        except Exception as e:
            return f"Error during AI analysis: {str(e)}"

    async def _arun(self, *args, **kwargs):
        """Async version not implemented"""
        raise NotImplementedError("Async execution not supported")


### Initialize KWIC Analysis Tool with Bedrock Model

In [42]:
# Create the KWIC Analysis Tool instance
kwic_tool = KWICAnalysisTool()
kwic_tool.model = model  # Attach the Bedrock model we initialized earlier

# Status output (emoji restored from mis-encoded characters)
print("✅ KWIC Analysis Tool initialized")
print(f"📊 Tool name: {kwic_tool.name}")
print(f"📝 Description: {kwic_tool.description}")

✅ KWIC Analysis Tool initialized
📊 Tool name: kwic_analysis
📝 Description: Analyzes KWIC (Key Word in Context) concordance data from corpus linguistics.
    Provides AI-assisted classification and analysis of concordance lines.


### Run KWIC Analysis with Concordance Data

- optionally drop the classifications and notes so AI is just working with concordance hits

In [43]:
# Work on a deep copy so the loaded `data` keeps its human annotations.
import copy

clean_data = copy.deepcopy(data)

# Strip the human-supplied fields from every concordance item so the model
# only sees the concordance hits themselves.
for genre_items in clean_data.get('annotations', {}).values():
    for entry in genre_items.values():
        for human_field in ('classification', 'notes'):
            # pop with a default: the field may be absent on some items
            entry.pop(human_field, None)

In [44]:
# Execute the KWIC analysis by passing the cleaned concordance data.
# The sample size comes from the configuration cell (instead of a separate
# hard-coded number) so the value used here is the same one the exported
# report prints under "Random Sample Size".
result = kwic_tool._run(
    reasoning_level=reasoning_level,
    corpus_fullname=corpus_fullname,
    corpus_shortname=corpus_shortname,
    keyword=keyword,
    random_KWIC_sample=random_KWIC_sample,
    kwic_data=clean_data  # Pass the cleaned JSON data
)

print("=" * 80)
print("KWIC ANALYSIS RESULTS")
print("=" * 80)
print(result)

KWIC ANALYSIS RESULTS
Error during AI analysis: Read timeout on endpoint URL: "https://bedrock-runtime.us-east-1.amazonaws.com/model/openai.gpt-oss-120b-1%3A0/converse"


### Optional: Save Analysis Results

In [34]:
# Export to a Markdown Report
# One timestamp for both the file name and the report body so they agree.
report_time = datetime.now()
markdown_output_file = f"./kwic_ai_analysis_{corpus_shortname}_{keyword.replace(' ', '_')}_{report_time.strftime('%Y%m%d_%H%M%S')}.md"

# The tool reports failures as strings starting with "Error"; flag that
# clearly so an error message is not mistaken for a real analysis.
if isinstance(result, str) and result.startswith("Error"):
    print(f"WARNING: the analysis did not succeed; the report will contain the error text: {result}")

# Extract content from the result. A list result is a sequence of content
# blocks: dicts tagged 'reasoning_content' or 'text', or plain strings.
# All blocks of a kind are kept (joined), not just the last one.
reasoning_parts = []
markdown_parts = []

if isinstance(result, list):
    for item in result:
        if isinstance(item, str):
            markdown_parts.append(item)
        elif isinstance(item, dict):
            if item.get('type') == 'reasoning_content':
                reasoning_data = item.get('reasoning_content') or {}
                reasoning_parts.append(reasoning_data.get('text', ''))
            elif item.get('type') == 'text':
                markdown_parts.append(item.get('text', ''))
else:
    markdown_parts.append(str(result))

reasoning_content = "\n\n".join(part for part in reasoning_parts if part)
markdown_content = "\n\n".join(part for part in markdown_parts if part)

# Header lines are joined with "  \n" (two trailing spaces = Markdown hard
# line break) so every field renders on its own line.
header = "  \n".join([
    f"**Corpus:** {corpus_fullname} ({corpus_shortname})",
    f"**Keyword/Phrase:** \"{keyword}\"",
    f"**Reasoning Level:** {reasoning_level}",
    f"**Random Sample Size:** {random_KWIC_sample}",
    f"**Analysis Date:** {report_time.strftime('%Y-%m-%d %H:%M:%S')}",
    f"**Generated by:** AWS Bedrock AI Model ({model_id})",
])

# Create the full markdown report
full_report = f"""# KWIC Analysis Report: {keyword}

{header}

**Reasoning Content:**
```text
{reasoning_content}
```

---

{markdown_content}

---

**Report Metadata:**
- Total concordance lines analyzed: {len(df_flat)}
- Source data file: `{path}`
- Analysis tool: KWICAnalysisTool (LangChain + AWS Bedrock)
"""

# Write to file
with open(markdown_output_file, 'w', encoding='utf-8') as f:
    f.write(full_report)

print(f"✅ Markdown report exported to: {markdown_output_file}")
print(f"📄 Report contains {len(markdown_content)} characters")
print(f"📊 Based on {len(df_flat)} concordance lines")

✅ Markdown report exported to: ./kwic_ai_analysis_COCA_national_system_20251122_174138.md
📄 Report contains 21022 characters
📊 Based on 51 concordance lines
