# Java EE Project Analyzer with Groq API
This notebook reads a Java EE project directory, processes files using LangChain, and analyzes them using Groq's model.

INSTRUCTIONS:<br>
1. Quick run of the Java code analysis:<br>
   - Set PROJECT_PATH and GROQ_API_KEY in the function run_with_predefined_paths() <br>
   - Run **run_with_predefined_paths()** <br>
2. The analysis may take several minutes for large projects due to API rate limiting.<br>
3. Results are saved to 'java_ee_analysis.json' by run_with_predefined_paths(), or to 'java_ee_analysis_groq.json' by main().<br>


In [77]:
!pip install langchain langchain-community langchain-groq python-magic-bin chardet groq

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com



[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [79]:
import os
import json
import re
import magic
from pathlib import Path
from typing import List, Dict, Any
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_groq import ChatGroq
import chardet
import time

In [80]:
class RateLimiter:
    """Throttle calls to at most ``calls_per_second`` per second.

    ``wait_if_needed`` blocks the calling thread with ``time.sleep`` until the
    minimum interval since the previously recorded call has elapsed.
    """

    def __init__(self, calls_per_second=2):
        """Store the permitted call rate.

        Args:
            calls_per_second: Maximum number of calls per second; must be > 0.

        Raises:
            ValueError: If ``calls_per_second`` is zero or negative.
        """
        # Fail here rather than with a ZeroDivisionError on the first wait.
        if calls_per_second <= 0:
            raise ValueError("calls_per_second must be a positive number")
        self.calls_per_second = calls_per_second
        # Monotonic timestamp of the most recent call. -inf means "never
        # called", so the very first call is never delayed.
        self.last_call_time = float("-inf")

    def wait_if_needed(self):
        """Sleep if the previous call was too recent, then record this call."""
        min_interval = 1.0 / self.calls_per_second
        # time.monotonic() never goes backwards, so a system clock adjustment
        # cannot produce a negative elapsed time and an oversized sleep.
        elapsed = time.monotonic() - self.last_call_time

        if elapsed < min_interval:
            time.sleep(min_interval - elapsed)

        self.last_call_time = time.monotonic()

In [81]:
class JavaEEProjectAnalyzer:
    def __init__(self, project_path: str, groq_api_key: str = None):
        """Set up the splitter, the Groq chat model and the rate limiter.

        Args:
            project_path: Root directory of the Java project to analyze.
            groq_api_key: Groq API key. When falsy, the GROQ_API_KEY
                environment variable is used instead.

        Raises:
            ValueError: If no API key is passed and GROQ_API_KEY is not set.
        """
        self.project_path = Path(project_path)

        splitter_options = dict(
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len,
        )
        self.text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

        # An explicit argument wins; otherwise fall back to the environment.
        resolved_key = groq_api_key if groq_api_key else os.getenv('GROQ_API_KEY')
        self.groq_api_key = resolved_key
        if not resolved_key:
            raise ValueError("Groq API key is required. Set GROQ_API_KEY environment variable or pass it to constructor")

        llm_options = {
            'groq_api_key': resolved_key,
            'model_name': "meta-llama/llama-4-maverick-17b-128e-instruct",
            'temperature': 0.1,
            'max_tokens': 4000,
        }
        self.llm = ChatGroq(**llm_options)

        # Requests to the LLM are spaced out to 1.5 per second.
        self.rate_limiter = RateLimiter(calls_per_second=1.5)
    
    def detect_encoding(self, file_path: str) -> str:
        """Guess a file's text encoding from its raw bytes.

        The whole file is read in binary mode and handed to ``chardet``.
        Returns the detected encoding name, or 'utf-8' when chardet reports
        no encoding.
        """
        with open(file_path, 'rb') as handle:
            guess = chardet.detect(handle.read())
            detected = guess['encoding']
            return detected if detected else 'utf-8'
    
    def read_java_files(self) -> List[Document]:
        """Read all Java files from the project directory"""
        java_files = []
        
        for root, _, files in os.walk(self.project_path):
            for file in files:
                if file.endswith('.java'):
                    file_path = Path(root) / file
                    try:
                        encoding = self.detect_encoding(file_path)
                        with open(file_path, 'r', encoding=encoding, errors='ignore') as f:
                            content = f.read()
                        
                        java_files.append(Document(
                            page_content=content,
                            metadata={
                                'file_path': str(file_path),
                                'file_name': file,
                                'relative_path': str(file_path.relative_to(self.project_path))
                            }
                        ))
                    except Exception as e:
                        print(f"Error reading {file_path}: {e}")
        
        return java_files
    
    def categorize_class(self, content: str, file_path: str) -> str:
        """Categorize Java class based on naming conventions and annotations"""
        filename = Path(file_path).stem.lower()
        
        # Check for common annotations
        annotations = {
            '@Controller': 'Controller',
            '@RestController': 'Controller',
            '@Service': 'Service',
            '@Repository': 'DAO',
            '@Entity': 'Entity',
            '@Component': 'Component',
            '@Configuration': 'Configuration',
            '@Bean': 'Configuration'
        }
        
        for annotation, category in annotations.items():
            if annotation in content:
                return category
        
        # Check filename patterns
        if any(pattern in filename for pattern in ['controller', 'ctrl']):
            return 'Controller'
        elif any(pattern in filename for pattern in ['service', 'impl']):
            return 'Service'
        elif any(pattern in filename for pattern in ['repository', 'dao', 'data']):
            return 'DAO'
        elif any(pattern in filename for pattern in ['entity', 'model', 'dto']):
            return 'Entity'
        elif any(pattern in filename for pattern in ['config', 'configuration']):
            return 'Configuration'
        elif any(pattern in filename for pattern in ['util', 'helper', 'utility']):
            return 'Utility'
        
        return 'Other'
    
    def extract_class_info(self, content: str) -> Dict[str, Any]:
        """Extract class name and basic information"""
        class_pattern = r'public\s+(?:class|interface|enum|record)\s+(\w+)'
        class_match = re.search(class_pattern, content)
        class_name = class_match.group(1) if class_match else "Unknown"
        
        return {
            'name': class_name,
            'content': content
        }
    
    def extract_methods(self, content: str) -> List[Dict[str, str]]:
        """Extract methods from Java class content"""
        methods = []
        
        # Pattern to match method signatures
        method_pattern = r'(@?\w+\s+)*((public|private|protected)\s+)?(\w+(?:<\w+>)?\s+\w+\([^)]*\))\s*(?:throws\s+\w+(?:\s*,\s*\w+)*)?\s*\{'
        
        matches = re.finditer(method_pattern, content, re.MULTILINE)
        
        for match in matches:
            full_match = match.group(0)
            method_signature = match.group(4)
            
            # Clean up the signature
            method_signature = re.sub(r'\s+', ' ', method_signature).strip()
            
            # Extract method name
            method_name_match = re.search(r'(\w+)\(', method_signature)
            method_name = method_name_match.group(1) if method_name_match else "Unknown"
            
            methods.append({
                'name': method_name,
                'signature': method_signature,
                'description': '',
                'complexity': ''
            })
        
        return methods
    
    def analyze_with_llm_actual(self, chunks: List[Document]) -> Dict[str, Any]:
        """Analyze code chunks using Groq API with meta-llama/llama-4-maverick-17b-128e-instruct"""
        result = {
            "projectOverview": "",
            "Class": []
        }
        
        print("Getting project overview with LLM...")
        # Get project overview from first few chunks
        overview_content = "\n".join([chunk.page_content[:500] for chunk in chunks[:5]])
        
        overview_prompt = f"""
        Analyze this Java EE project code and provide a comprehensive high-level overview of its business purpose, 
        main functionalities, and architecture. Be specific about what the project does.
        
        Sample code snippets:
        {overview_content}
        
        Provide a concise but detailed project overview in 3-4 sentences.
        """
        
        try:
            self.rate_limiter.wait_if_needed()
            overview_response = self.llm.invoke(overview_prompt)
            result["projectOverview"] = overview_response.content.strip()
            print("Project overview generated")
        except Exception as e:
            print(f"Error getting project overview: {e}")
            result["projectOverview"] = "Java EE application with layered architecture"
        
        print("Analyzing individual classes with LLM...")
        # Analyze each class
        for i, chunk in enumerate(chunks):
            print(f"Analyzing class {i+1}/{len(chunks)}...")
            
            class_info = self.extract_class_info(chunk.page_content)
            category = self.categorize_class(chunk.page_content, chunk.metadata['file_path'])
            methods = self.extract_methods(chunk.page_content)
            
            # Skip if no class name found or too many methods (likely not a class file)
            if class_info['name'] == "Unknown" or len(methods) > 20:
                continue
            
            # Prepare analysis prompt for this class
            analysis_prompt = f"""
            Analyze this Java class and provide detailed information in JSON format.
            
            Class Name: {class_info['name']}
            Category: {category}
            File: {chunk.metadata['file_name']}
            
            Class Content:
            {chunk.page_content}
            
            Requirements:
            1. Provide a comprehensive description of the class purpose and responsibilities
            2. For each method in the class, provide:
               - A detailed description of what the method does
               - Complexity estimate (Low/Medium/High/Very High) based on logic, dependencies, and operations
            3. Return ONLY valid JSON in this exact format:
            {{
                "class_description": "detailed description here",
                "methods": [
                    {{
                        "name": "methodName",
                        "description": "detailed method description",
                        "complexity": "Low/Medium/High/Very High"
                    }}
                ]
            }}
            
            Important: Return ONLY JSON, no additional text or explanations.
            """
            
            try:
                self.rate_limiter.wait_if_needed()
                analysis_response = self.llm.invoke(analysis_prompt)
                analysis_text = analysis_response.content.strip()
                
                # Try to extract JSON from the response
                json_match = re.search(r'\{.*\}', analysis_text, re.DOTALL)
                if json_match:
                    analysis_data = json.loads(json_match.group())
                    
                    # Map the analyzed methods to our extracted methods
                    analyzed_methods = []
                    for extracted_method in methods:
                        # Find matching analyzed method
                        analyzed_method = next(
                            (am for am in analysis_data.get('methods', []) 
                             if am['name'] == extracted_method['name']),
                            None
                        )
                        
                        if analyzed_method:
                            analyzed_methods.append({
                                "name": extracted_method['name'],
                                "signature": extracted_method['signature'],
                                "description": analyzed_method['description'],
                                "complexity": analyzed_method['complexity']
                            })
                        else:
                            # Fallback if LLM didn't analyze this method
                            analyzed_methods.append({
                                "name": extracted_method['name'],
                                "signature": extracted_method['signature'],
                                "description": f"Method {extracted_method['name']} implementation",
                                "complexity": "Medium"
                            })
                    
                    # Add class to result
                    class_data = {
                        "name": class_info['name'],
                        "Category": category,
                        "description": analysis_data.get('class_description', 
                                                       f"{category} class {class_info['name']}"),
                        "methods": analyzed_methods
                    }
                    
                    result["Class"].append(class_data)
                    print(f"Analyzed {class_info['name']} ({category})")
                    
                else:
                    print(f"Could not parse JSON from LLM response for {class_info['name']}")
                    # Fallback for this class
                    self.add_fallback_class(result, class_info, category, methods)
                    
            except json.JSONDecodeError as e:
                print(f"JSON parsing error for {class_info['name']}: {e}")
                self.add_fallback_class(result, class_info, category, methods)
            except Exception as e:
                print(f"Error analyzing class {class_info['name']}: {e}")
                self.add_fallback_class(result, class_info, category, methods)
        
        return result
    
    def add_fallback_class(self, result: Dict[str, Any], class_info: Dict[str, Any], 
                          category: str, methods: List[Dict[str, str]]):
        """Add fallback class data when LLM analysis fails"""
        methods_with_fallback = []
        for method in methods:
            methods_with_fallback.append({
                "name": method['name'],
                "signature": method['signature'],
                "description": f"Method {method['name']} implementation",
                "complexity": "Medium"
            })
        
        class_data = {
            "name": class_info['name'],
            "Category": category,
            "description": f"{category} class {class_info['name']}",
            "methods": methods_with_fallback
        }
        result["Class"].append(class_data)
        print(f"âœ“ Added fallback for {class_info['name']}")
    
    def analyze_project(self) -> Dict[str, Any]:
        """Main analysis method with Groq LLM"""
        print("Reading Java files...")
        java_files = self.read_java_files()
        
        if not java_files:
            raise ValueError("No Java files found in the specified directory")
        
        print(f"Found {len(java_files)} Java files")
        
        print("Chunking files...")
        all_chunks = []
        for doc in java_files:
            chunks = self.text_splitter.split_documents([doc])
            all_chunks.extend(chunks)
        
        print(f"Created {len(all_chunks)} chunks")
        
        print("Analyzing with Groq LLM...")
        analysis_result = self.analyze_with_llm_actual(all_chunks)
        
        return analysis_result

In [82]:
def test_groq_connection(api_key: str) -> bool:
    """Test if Groq API is working

    Sends one short prompt to the Groq chat model and prints the reply.

    Args:
        api_key: Groq API key to authenticate with.

    Returns:
        True if the request succeeded, False if any exception was raised
        (the error is printed, not propagated).
    """
    try:
        llm = ChatGroq(
            groq_api_key=api_key,
            model_name="meta-llama/llama-4-maverick-17b-128e-instruct",
            temperature=0.1
        )
        response = llm.invoke("Hello, are you working? Respond with 'OK' if ready.")
        # \u2713 is the check mark; the escape keeps it intact even if this
        # file is re-saved in a different encoding (it had become "âœ“").
        print("\u2713 Groq connection successful!")
        print(f"Response: {response.content}")
        return True
    except Exception as e:
        print(f"Groq connection failed: {e}")
        return False

In [83]:
def main():
    """Interactively analyze a Java EE project and save the result as JSON.

    Prompts for the project path and the Groq API key (falling back to the
    GROQ_API_KEY environment variable), checks the connection, runs the
    analysis, writes 'java_ee_analysis_groq.json' and prints a summary.
    Problems are reported on stdout; nothing is raised to the caller.
    """
    # getpass is used so the API key is not echoed while it is typed.
    import getpass

    # Configuration
    PROJECT_PATH = input("Enter the path to your Java EE project: ").strip()
    GROQ_API_KEY = getpass.getpass(
        "Enter your Groq API key (or press Enter to use environment variable): "
    ).strip()

    if not GROQ_API_KEY:
        GROQ_API_KEY = os.getenv('GROQ_API_KEY')

    if not GROQ_API_KEY:
        print("Error: Groq API key is required")
        return

    # Same check as run_with_predefined_paths(); done before any API call so
    # a mistyped path does not cost a request.
    if not os.path.exists(PROJECT_PATH):
        print(f"Project path does not exist: {PROJECT_PATH}")
        return

    # Test connection first
    print("Testing Groq connection...")
    if not test_groq_connection(GROQ_API_KEY):
        return

    # Initialize analyzer with Groq
    analyzer = JavaEEProjectAnalyzer(PROJECT_PATH, GROQ_API_KEY)

    # Analyze project
    try:
        print("\nStarting project analysis...")
        result = analyzer.analyze_project()

        # Save results to JSON file
        output_file = "java_ee_analysis_groq.json"
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(result, f, indent=2, ensure_ascii=False)

        print(f"\nAnalysis complete! Results saved to {output_file}")

        # Print summary
        print(f"\nProject Overview: {result['projectOverview']}")
        print(f"Total classes analyzed: {len(result['Class'])}")

        # Count by category (insertion order is kept for the listing below)
        categories = {}
        for cls in result['Class']:
            categories[cls['Category']] = categories.get(cls['Category'], 0) + 1

        print("\nClasses by category:")
        for category, count in categories.items():
            print(f"  {category}: {count}")

        # Show sample of results
        if result['Class']:
            print("\nSample class analysis:")
            sample_class = result['Class'][0]
            print(f"Class: {sample_class['name']} ({sample_class['Category']})")
            print(f"Description: {sample_class['description']}")
            if sample_class['methods']:
                sample_method = sample_class['methods'][0]
                print(f"Sample method: {sample_method['name']} - {sample_method['complexity']} complexity")

    except Exception as e:
        print(f"Error during analysis: {e}")

In [84]:
def run_with_predefined_paths(project_path=None, groq_api_key=None):
    """Run analysis with predefined paths for testing

    Args:
        project_path: Project directory to analyze. Defaults to the
            PROJECT_PATH constant set below.
        groq_api_key: Groq API key. Defaults to the GROQ_API_KEY environment
            variable.

    The result is written to 'java_ee_analysis.json'. Problems (missing path,
    missing key, analysis errors) are printed and the function returns None.
    """
    PROJECT_PATH = project_path or r"C:\Users\mrash\Downloads\simple-springboot-app-master"  # Change this
    # SECURITY: never hard-code the API key in this file. A key committed to
    # source is exposed to everyone who can read it and must be revoked.
    # Pass it as an argument or set the GROQ_API_KEY environment variable.
    GROQ_API_KEY = groq_api_key or os.getenv('GROQ_API_KEY')

    if not os.path.exists(PROJECT_PATH):
        print(f"Project path does not exist: {PROJECT_PATH}")
        return

    if not GROQ_API_KEY:
        print("Error: Groq API key is required")
        return

    try:
        # Constructed inside the try so setup failures are reported the same
        # way as analysis failures.
        analyzer = JavaEEProjectAnalyzer(PROJECT_PATH, GROQ_API_KEY)
        result = analyzer.analyze_project()

        output_file = "java_ee_analysis.json"
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(result, f, indent=2, ensure_ascii=False)

        print(f"Analysis saved to {output_file}")

    except Exception as e:
        print(f"Error: {e}")

In [85]:
# Notebook entry point: run the analysis with the settings configured inside
# run_with_predefined_paths().
run_with_predefined_paths()

Reading Java files...
Found 20 Java files
Chunking files...
Created 21 chunks
Analyzing with Groq LLM...
Getting project overview with LLM...
Project overview generated
Analyzing individual classes with LLM...
Analyzing class 1/21...
Analyzed StudentApplication (Other)
Analyzing class 2/21...
Analyzed StudentConfig (Configuration)
Analyzing class 3/21...
Analyzed AutowiredController (Controller)
Analyzing class 4/21...
Analyzing class 5/21...
Analyzed ConfigurationPropertyController (Controller)
Analyzing class 6/21...
Analyzed StudentController (Controller)
Analyzing class 7/21...
Analyzed ValuePropertyController (Controller)
Analyzing class 8/21...
Analyzed InvalidFieldException (Other)
Analyzing class 9/21...
Analyzed InvalidHeaderFieldException (Other)
Analyzing class 10/21...
Analyzed StudentExceptionHandler (Controller)
Analyzing class 11/21...
Analyzed RequestHeaderInterceptor (Component)
Analyzing class 12/21...
Analyzed Student (Other)
Analyzing class 13/21...
Analyzing class 