In [1]:
from langchain_community.llms import Ollama
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langgraph.graph import END, MessageGraph
import os
import git
import json
import re




def filter_important_files(file_paths):
    important_extensions = [
    '.php'
]

    important_files = []
    for file_path in file_paths:
        if any(file_path.endswith(ext) for ext in important_extensions):
            important_files.append(file_path)
    
    return important_files


def clone_repo(repo_url, clone_dir):
    if not os.path.exists(clone_dir):
        os.makedirs(clone_dir)
    repo = git.Repo.clone_from(repo_url, clone_dir)
    return repo.working_tree_dir



def list_files(directory):
    file_list=[]
    for root, dirs, files, in os.walk(directory):
        for file in files:
            if "." in file[0]:
                continue
            else:
                file_list.append(os.path.relpath(os.path.join(root,file,), directory))
    return file_list


def save_to_file(directory, file_name, content):
    """
    Function to save content to a file in the specified directory with a specific name.
    """   
    if not os.path.exists(directory):
        os.makedirs(directory)
    file_path = os.path.join(directory, file_name)
    with open(file_path, 'a') as file:
        file.write(content)
    return file_path

def read_file(file_path):
    
    try:
        with open(file_path, 'r') as file:
            content = file.read()
            return content
    except FileNotFoundError:
        print("File not found.")
        return None
    

def add_line_numbers(input_file, output_file):
    with open(input_file, 'r') as f_in, open(output_file, 'w') as f_out:
        line_number = 1
        for line in f_in:
            f_out.write(f"{line_number}:: {line}")
            line_number += 1



    
summarizer = """ 

Task: Analyzing GitHub Repository Files for Web Vulnerabilities

Description: Your task is to meticulously analyze GitHub repository files for potential web vulnerabilities that could compromise security. Specifically, focus on identifying vulnerabilities such as SQL injection (SQLi), cross-site scripting (XSS), and other common web attack vectors. Pinpoint any code segments or patterns that pose a risk to software security.

The provided file content has file number written infront of each line in this formate 1:: 2:: 3::, you should consider it when outputing line numbers, don't provide empty line number.

Output Format:
For each identified web vulnerability, provide the following details:

Vulnerability Type: Specify the type of vulnerability found (e.g., SQL injection, XSS).
Location: Full file path and line numbers where the vulnerability is present.
Code Context: Include a few lines of code surrounding the vulnerable segment to provide context. Only include the line numbers where the vulnerability exists.
Solution: Provide a solution to mitigate the vulnerability. Modify the specific line(s) of code causing the issue, ensuring to include the full solution.
Ensure thorough analysis to accurately identify and address potential web vulnerabilities, fortifying the GitHub repository against security threats.

Output Formate only json for text and markup for code:

vulnerable_code format '''php '''
submit_code format '''php '''
the code should be in markup format
Just stick to formate no extra text here is the code or I find this or that

{
    "vulnerability_type":"vulnerability_type",
    "any_vulnerability_found":"true/false",
    "location":"full_file_path",
    "line_number":[line_number],
    "vulnerable_code":"vulnerable_code",
    "vulnerability_description":"vulnerability_description",
    "solution_code:"solution_code",
    "solution_description":"solution_description"

}


"""





directory = os.path.abspath(os.getcwd()) 
file_name = "summary.txt"
model = Ollama(model="mistral", 
             callback_manager = CallbackManager([StreamingStdOutCallbackHandler()]),
             verbose=False,
             temperature=0)

graph = MessageGraph()

graph.add_node("oracle", model)
graph.add_edge("oracle", END)

graph.set_entry_point("oracle")

runnable = graph.compile()


github_url = "https://github.com/appsecco/sqlinjection-training-app.git"
match = re.search(r"\/([^\/]+)\/?$", github_url)
clone_file_name = match.group(1).replace(".git","")
repo_path = clone_repo(github_url, clone_file_name)
files = list_files(repo_path)

print(files)

file_list = filter_important_files(files)

print(file_list)
print(len(file_list))




for files in file_list:
    
    file_path = os.path.abspath(os.getcwd())+"/"+clone_file_name+"/"+files
    print(file_path)
    
    output_file = "output.txt"
    add_line_numbers(file_path, output_file)

    file_content = read_file('output.txt')
    
    finalize = (file_content + f"file_name:{files} " + summarizer)
    result = runnable.invoke(finalize)
    string_message = str(result[1].content)
    output_message = f"""{string_message}"""

    output_directory = directory+"/"+clone_file_name+"_analysis"
    match = re.search(r"/([^/]+)$", files)

    if match:
        last_part = match.group(1)
        last_part = last_part+".txt"
        print(last_part)  # Output: secondorder_changepass.php
    else:
        print("No match found")

    save_to_file(output_directory, last_part, output_message)
    



['LICENSE', 'Dockerfile', 'README.md', 'docker-compose.yml', 'walkthrough.md', 'www/sqlitraining.sql', 'www/secondorder_changepass.php', 'www/favicon.ico', 'www/index.php', 'www/register.php', 'www/logout.php', 'www/blindsqli.php', 'www/login2.php', 'www/login1.php', 'www/resetdb.php', 'www/db_config.php', 'www/secondorder_home.php', 'www/secondorder_register.php', 'www/robots.txt', 'www/searchproducts.php', 'www/os_sqli.php', 'www/css/htmlstyles.css', 'udf/lib_mysqludf_sys.so', 'udf/udf.hex', '.git/config', '.git/HEAD', '.git/description', '.git/index', '.git/packed-refs', '.git/objects/pack/pack-30d492ff9dd7c7cce37c90f2b099323264a20ea6.rev', '.git/objects/pack/pack-30d492ff9dd7c7cce37c90f2b099323264a20ea6.idx', '.git/objects/pack/pack-30d492ff9dd7c7cce37c90f2b099323264a20ea6.pack', '.git/info/exclude', '.git/logs/HEAD', '.git/logs/refs/heads/master', '.git/logs/refs/remotes/origin/HEAD', '.git/hooks/commit-msg.sample', '.git/hooks/pre-rebase.sample', '.git/hooks/sendemail-validate.sa

KeyboardInterrupt: 

In [1]:
def add_line_numbers(input_file, output_file):
    with open(input_file, 'r') as f_in, open(output_file, 'w') as f_out:
        line_number = 1
        for line in f_in:
            f_out.write(f"{line_number}: {line}")
            line_number += 1

input_file = "websocket.py"
output_file = "output.txt"
h = add_line_numbers(input_file, output_file)