In [7]:
# !pip install mistralai

In [8]:
from dotenv import load_dotenv

# Load settings from a local .env file before anything reads os.environ.
# NOTE(review): presumably this is what supplies MISTRAL_API_KEY used by the cells below — confirm.
load_dotenv()

True

In [35]:
import os
from mistralai import Mistral

def create_chat():
    """Send one fixed question to the Mistral chat endpoint and print the reply.

    The API key is read from the ``MISTRAL_API_KEY`` environment variable, so a
    ``KeyError`` is raised when it is not set. Nothing is returned; the reply
    text is written to stdout prefixed with ``Answer:``.
    """
    client = Mistral(api_key=os.environ["MISTRAL_API_KEY"])

    question = {"role": "user", "content": "What is the capital of France?"}
    completion = client.chat.complete(
        model="mistral-large-latest",
        messages=[question],
        temperature=0.0,
    )

    # The reply text is carried by the first entry of the response's choices.
    first_choice = completion.choices[0]
    print("Answer:", first_choice.message.content)


def create_embeddings(content, client=None, max_batch_size=8000):
    """Embed every non-empty line of ``content`` with the ``mistral-embed`` model.

    Lines are grouped greedily, in order, into batches whose combined length
    (in characters, as measured by ``len``) does not exceed ``max_batch_size``;
    one embeddings request is issued per batch.

    Args:
        content: Text to embed. It is split with ``str.splitlines`` and empty
            lines are discarded.
        client: Optional object exposing ``embeddings.create(model=..., inputs=...)``.
            When omitted, a ``Mistral`` client is built from the
            ``MISTRAL_API_KEY`` environment variable.
        max_batch_size: Upper bound on the summed character length of the
            lines placed in one request.

    Returns:
        A list with one response object per request, in input order. The list
        is empty when ``content`` has no non-empty lines.

    Raises:
        KeyError: If ``client`` is omitted and ``MISTRAL_API_KEY`` is not set.
    """
    if client is None:
        client = Mistral(api_key=os.environ["MISTRAL_API_KEY"])

    model = "mistral-embed"
    lines = [line for line in content.splitlines() if line != ""]

    embeddings = []
    batch = []
    batch_size = 0

    for line in lines:
        # Flush before the batch would overflow. The `batch` guard means a
        # single line longer than max_batch_size is still sent (alone) instead
        # of never being consumed, which previously made the loop spin forever.
        if batch and batch_size + len(line) > max_batch_size:
            embeddings.append(client.embeddings.create(model=model, inputs=batch))
            batch = []
            batch_size = 0

        batch.append(line)
        batch_size += len(line)

    # Send whatever is left over after the last line.
    if batch:
        embeddings.append(client.embeddings.create(model=model, inputs=batch))

    return embeddings

# Smoke-test both helpers. create_chat() prints its answer itself; the list returned
# by create_embeddings() is the cell's last expression, so the notebook displays it.
create_chat()
create_embeddings("Hi\nHow are you?")

Answer: The capital of France is **Paris**.
0 2
0
1


[EmbeddingResponse(id='519b82084e224f1dad979a7222bc5ab4', object='list', model='mistral-embed', usage=UsageInfo(prompt_tokens=9, completion_tokens=0, total_tokens=9, prompt_audio_seconds=None, request_count=None, prompt_token_details=None), data=[EmbeddingResponseData(object='embedding', embedding=[-0.01568603515625, -0.0247802734375, 0.049041748046875, 0.0208282470703125, 0.0447998046875, 0.02978515625, 0.036376953125, 0.03399658203125, -0.017791748046875, 0.0021419525146484375, -0.059844970703125, 0.0245208740234375, -0.0200347900390625, -0.0200347900390625, -0.039794921875, 0.006885528564453125, -0.03228759765625, 0.04400634765625, 0.008636474609375, 0.022674560546875, -0.01107025146484375, 0.004547119140625, -0.0263671875, 0.0154876708984375, -0.004482269287109375, -0.0019197463989257812, -0.0010423660278320312, -0.0295257568359375, -0.01337432861328125, -0.022674560546875, 0.005138397216796875, -0.03240966796875, 0.002536773681640625, 0.0098876953125, 0.04638671875, -0.00160598754

In [None]:
from glob import glob

# Embed every Java source file of the app; `embeddings` gets one entry per file,
# each entry being the list of responses returned by create_embeddings().
embeddings = []
# sorted() makes the processing order (and so the order of `embeddings`) repeatable.
for path in sorted(glob("./ShikshaReMastered/app/src/main/java/com/example/shiksharemastered/*.java")):
    # Decode explicitly as UTF-8 rather than with the platform default encoding,
    # which varies between machines (notably on Windows).
    with open(path, mode="r", encoding="utf-8") as finput:
        print("Creating for file:", path)
        content = finput.read()
    # The file is closed before the network calls start.
    embedding = create_embeddings(content)
    embeddings.append(embedding)

In [14]:
import os
import re
from glob import glob
from mistralai import Mistral

# Keep the conversation history in a list.
# This module-level list is shared state: do_chat() reads it and appends to it on every call.
# NOTE(review): the analyst instructions are sent with role "user"; confirm whether a
# "system" role was intended.
messages = [
    {
        "role": "user",
        "content": """You are a professional software security analyst. Given a unit of code, identify all potential vulnerabilities, security risks, or bad practices present. For each issue, provide:
                1. The type of vulnerability or risk (e.g., SQL injection, XSS, insecure API usage, buffer overflow, etc.).
                2. A brief explanation of why it is a risk.
                3. The exact location in the code (line number or code snippet).
                If you do not find any vulnerabilities in the unit, explicitly write:
                    "No vulnerabilities in this unit."
                Respond in a clear, structured format that can be easily read and referenced."""
    }
]

# Build the client from the MISTRAL_API_KEY environment variable (KeyError if it is unset).
api_key = os.environ["MISTRAL_API_KEY"]
client = Mistral(api_key=api_key)
model = "mistral-large-2411"
# Prime the conversation: send the instructions once and record the model's reply,
# so the history holds one user turn and one assistant turn before any code unit is sent.
response = client.chat.complete(model=model, messages=messages)
answer = response.choices[0].message.content
messages.append({"role": "assistant", "content": answer})

In [15]:
def do_chat(client, model, content, history=None):
    """Ask the model to find vulnerabilities in one code unit and record the exchange.

    Args:
        client: Object exposing ``chat.complete(model=..., messages=...)``.
        model: Model name forwarded to ``chat.complete``.
        content: Source text of the code unit to analyse.
        history: Conversation list to extend. Defaults to the module-level
            ``messages`` list, which keeps the original behaviour.

    Returns:
        The content of the first choice of the model's response.

    The request is sent with the full history plus the new user turn. Both the
    user turn and the assistant reply are appended to ``history`` only after
    the call succeeds, so a failed or interrupted request does not leave an
    unanswered user message in the conversation. Any exception raised by the
    client propagates unchanged.
    """
    if history is None:
        history = messages

    user_turn = {
        "role": "user",
        "content": f"Find vulnerabilities in the code unit below:\n{content}",
    }
    # Send a copy so the shared history is untouched until we have an answer.
    response = client.chat.complete(model=model, messages=history + [user_turn])
    answer = response.choices[0].message.content

    history.append(user_turn)
    history.append({"role": "assistant", "content": answer})
    return answer
    


def split_java_sections(code_lines):
    """Split Java source lines into text units using line-based heuristics.

    Args:
        code_lines: Sequence of source lines (strings without line endings).
            The sequence is not modified.

    Returns:
        A list of strings, one per unit, in source order.

    How lines are grouped:
      * ``package`` / ``import`` lines each become their own unit.
      * A line that looks like a ``public class`` header, a method header
        ending in ``{``, or an annotation opens a block. Following lines are
        accumulated until the running ``{`` / ``}`` balance returns to zero.
      * Any other non-blank line outside a block becomes its own unit.
      * Finally, every unit (except the first) with fewer than three
        non-blank, non-``//`` lines is merged into the unit before it.

    Braces are counted with ``str.count``, so braces inside string literals or
    comments are counted too. A block that is still open when the input ends
    (unbalanced braces, or a trailing annotation) is emitted as a final unit
    rather than being discarded.
    """
    sections = []
    buffer = []
    brace_count = 0
    in_block = False

    for line in code_lines:
        stripped = line.strip()

        # Always include package/import lines as separate units
        if re.match(r"^package\s+[\w.]+;|^import(\s+static)?\s+[\w.]+(\.[\w*]+)?;", stripped):
            sections.append(stripped)
            continue

        # Detect start of a class or method block
        if re.match(r"^public\s+class\s+\w+.*\{", stripped) or \
           re.match(r"^(public|private|protected)?\s*[\w<>\[\]]+\s+\w+\s*\([^)]*\)\s*\{", stripped) or \
           re.match(r"^\s*@\w+", stripped):  # annotation
            buffer.append(line)
            brace_count += line.count("{") - line.count("}")
            in_block = True
            continue

        # If inside a block, accumulate lines until the braces balance out
        if in_block:
            buffer.append(line)
            brace_count += line.count("{") - line.count("}")
            if brace_count == 0:
                sections.append("\n".join(buffer).strip())
                buffer = []
                in_block = False
            continue

        # Otherwise, treat as standalone (fields, etc.)
        if stripped:
            sections.append(stripped)

    # Input ended while a block was still open: keep what was collected
    # instead of silently dropping it.
    if buffer:
        sections.append("\n".join(buffer).strip())

    # Fold very short units into their predecessor so each unit carries
    # some context. The first unit has no predecessor and is left alone.
    idx = 0
    while idx < len(sections):
        section = sections[idx]
        statement_count = sum(
            1 for text in section.splitlines()
            if text.strip() and not text.strip().startswith('//')
        )
        if statement_count < 3 and idx != 0:
            sections[idx - 1] += "\n" + sections.pop(idx)
        else:
            idx += 1

    return sections

In [16]:
# Analyse every Java source file unit by unit and write the findings to outputs/<name>.txt.
# Create the report directory up front so the first open() for writing cannot fail on it.
os.makedirs('outputs', exist_ok=True)

# sorted() makes the processing order repeatable between runs.
for path in sorted(glob("./ShikshaReMastered/app/src/main/java/com/example/shiksharemastered/*.java")):
    # Read as UTF-8 explicitly (the report below is written as UTF-8 too) instead of
    # relying on the platform default encoding.
    with open(path, mode="r", encoding="utf-8") as finput:
        print("Reading file:", path)
        content = finput.read()

    # The source file is closed before the chat requests start.
    lines = content.splitlines()
    lines = [line for line in lines if line != '']
    units = split_java_sections(lines)

    with open(os.path.join('outputs', f"{os.path.basename(path).split('.')[0]}.txt"), mode='w', encoding='utf-8') as foutput:
        for unit in units:
            answer = do_chat(client, model, unit)
            foutput.write(unit + "\n" + answer)
            foutput.write("\n=====================================================================================\n")

Reading file: ./ShikshaReMastered/app/src/main/java/com/example/shiksharemastered\ConsentDetailedView.java
Reading file: ./ShikshaReMastered/app/src/main/java/com/example/shiksharemastered\ConsentRecyclerViewAdapter.java


KeyboardInterrupt: 