In [2]:
# DeepTrace-AI
# AI-Powered Secret Leak Detector for GitHub Repositories

import requests
import re
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification


# Load AI Model (CodeBERT)

# Load the tokenizer and a 2-label sequence-classification model from the
# "microsoft/codebert-base" checkpoint. Both are module-level globals read by
# is_sensitive_ai(); loading happens as soon as this code runs.
tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
# NOTE(review): when this loads, transformers warns that the classifier.*
# weights are newly initialized and that the model should be trained on a
# downstream task first. Until it is fine-tuned, the label it predicts is
# presumably not a meaningful "sensitive" signal -- confirm before relying on it.
model = AutoModelForSequenceClassification.from_pretrained("microsoft/codebert-base", num_labels=2)

# When True, lines that no regex pattern flags are also passed to is_sensitive_ai().
USE_AI = True


# Secret Detection Functions

def is_sensitive_ai(line):
    """Classify one line of text with the module-level model.

    The line is tokenized (truncated to at most 512 tokens) and run through
    the model with gradient tracking disabled. The line is reported as
    sensitive when the position of the largest logit is 1.

    Args:
        line: The text to classify.

    Returns:
        True if the arg-max over the model's logits equals 1, else False.
    """
    encoded = tokenizer(line, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        logits = model(**encoded).logits
    predicted_label = torch.argmax(logits).item()
    return predicted_label == 1

# Named regular expressions for common secret formats. Each value is a pattern
# string suitable for re.search(); insertion order matters because the first
# matching name is the one reported for a line.
regex_patterns = {
    "AWS Access Key": r"AKIA[0-9A-Z]{16}",
    # Hyphen is placed last in the class so it is unambiguously a literal.
    "Google API Key": r"AIza[0-9A-Za-z_-]{35}",
    # Assignment-style patterns require at least one non-whitespace character
    # after "=", so empty placeholders such as "DB_PASSWORD=" are not flagged.
    "Generic Password": r"(?i)password\s*=\s*\S.*",
    "JWT Secret": r"(?i)jwt[_\-]?secret\s*=\s*\S.*",
    "Stripe Secret Key": r"sk_live_[0-9a-zA-Z]{24,}",
    "Stripe Test Key": r"sk_test_[0-9a-zA-Z]{24,}",
    "Generic API Key": r"(?i)api[_\-]?key\s*=\s*\S.*",
    "Bearer Token": r"Bearer\s+[A-Za-z0-9\-\._~\+\/]+=*",
    "Database URL": r"(?i)database[_\-]?url\s*=\s*\S.*",
    # Matches the bare PKCS#8 header as well as typed variants such as
    # "RSA PRIVATE KEY", "EC PRIVATE KEY", "OPENSSH PRIVATE KEY",
    # "ENCRYPTED PRIVATE KEY" and "PGP PRIVATE KEY BLOCK".
    "Private Key": r"-----BEGIN (?:[A-Z0-9]+ )*PRIVATE KEY(?: BLOCK)?-----",
}

def is_sensitive_regex(line):
    """Return the name of the first pattern in ``regex_patterns`` found in *line*.

    Patterns are tried in the dictionary's iteration order using
    ``re.search``, so a match anywhere in the line counts.

    Args:
        line: The text to test.

    Returns:
        The matching pattern's name, or None when no pattern matches.
    """
    matches = (
        label
        for label, expression in regex_patterns.items()
        if re.search(expression, line)
    )
    return next(matches, None)


# Scan Raw GitHub File

def scan_file_from_url(file_url, file_name):
    """Download a text file and print every line that looks like a secret.

    Each line is first checked with is_sensitive_regex(); if no pattern
    matches and USE_AI is true, is_sensitive_ai() is consulted. Findings are
    printed, as is a debug dump of the fetched contents.

    Args:
        file_url: URL the file is fetched from with an HTTP GET.
        file_name: Label used for the file in printed messages.

    Returns:
        None. The function returns early (after printing a message) when the
        request fails or the server does not answer with HTTP 200.
    """
    print(f"\nScanning {file_name}...\n")
    try:
        # requests applies no timeout by default; without one a stalled
        # server would hang the whole scan.
        response = requests.get(file_url, timeout=30)
    except requests.RequestException:
        # Only network/HTTP-layer failures are handled here, so Ctrl-C and
        # programming errors are no longer swallowed.
        print("Failed to fetch file.")
        return

    if response.status_code == 404:
        print(f"File not found (404): {file_url}")
        return
    if response.status_code != 200:
        # Do not scan an error page (403, 429, 5xx, ...) as if it were the file.
        print(f"Failed to fetch file (HTTP {response.status_code}): {file_url}")
        return

    lines = response.text.splitlines()
    print("DEBUG: Raw file contents:")
    for line_no, line in enumerate(lines, start=1):
        print(f"{line_no}: {line}")

    found_any = False
    for line_no, line in enumerate(lines, start=1):
        regex_result = is_sensitive_regex(line)
        if regex_result:
            print(f"Error [Regex] {regex_result} in {file_name}, line {line_no}: {line}")
            found_any = True
        elif USE_AI and is_sensitive_ai(line):
            print(f"Error [AI] Sensitive line in {file_name}, line {line_no}: {line}")
            found_any = True

    if not found_any:
        print("No sensitive lines detected.")

# Scan GitHub Repo (Recursive)

def scan_github_repo(owner, repo, path=""):
    """Recursively scan the files of a GitHub repository via the contents API.

    Files are handed to scan_file_from_url(); directories are descended into
    by calling this function again with the extended path.

    Args:
        owner: Repository owner (user or organisation).
        repo: Repository name.
        path: Path inside the repository to start from. A directory path may
            be given with or without a trailing "/"; a path to a single file
            is scanned directly.

    Returns:
        None. Prints an error and returns when the listing cannot be fetched.
    """
    url = f"https://api.github.com/repos/{owner}/{repo}/contents/{path}"
    try:
        # requests applies no timeout by default; bound the wait explicitly.
        response = requests.get(url, timeout=30)
    except requests.RequestException:
        print("Error accessing repo or folder.")
        return
    if response.status_code != 200:
        print("Error accessing repo or folder.")
        return

    contents = response.json()
    if isinstance(contents, dict):
        # The contents API answers with a single object (not a list) when the
        # path names one entry; iterating it would walk its keys and crash.
        if contents.get('type') == 'file':
            scan_file_from_url(contents['download_url'], path)
        return

    # Make sure names are joined with exactly one "/" even when the caller
    # passed a directory path without a trailing slash.
    prefix = path if not path or path.endswith('/') else path + '/'
    for item in contents:
        if item['type'] == 'file':
            scan_file_from_url(item['download_url'], prefix + item['name'])
        elif item['type'] == 'dir':
            scan_github_repo(owner, repo, prefix + item['name'] + '/')

# Summary Report

# Running totals for the summary report; every counter starts at zero.
scan_results = dict.fromkeys(
    ("files_scanned", "issues_found", "secure_files"),
    0,
)

# Keep a reference to the plain (non-counting) scanner, taken before main()
# rebinds the global name scan_file_from_url to scan_file_with_summary.
# NOTE(review): nothing in the visible code reads this alias afterwards.
original_scan_file_from_url = scan_file_from_url

def scan_file_with_summary(file_url, file_name):
    """Scan one file like scan_file_from_url() and update ``scan_results``.

    ``files_scanned`` is incremented for every call, including ones where the
    download fails. ``issues_found`` is incremented once per flagged line and
    ``secure_files`` once for a fetched file with no flagged lines.

    Args:
        file_url: URL the file is fetched from with an HTTP GET.
        file_name: Label used for the file in printed messages.

    Returns:
        None. The function returns early (after printing a message) when the
        request fails or the server does not answer with HTTP 200.
    """
    scan_results["files_scanned"] += 1
    print(f"\nScanning {file_name}...\n")
    try:
        # requests applies no timeout by default; without one a stalled
        # server would hang the whole scan.
        response = requests.get(file_url, timeout=30)
    except requests.RequestException:
        # Only network/HTTP-layer failures are handled here, so Ctrl-C and
        # programming errors are no longer swallowed.
        print("Failed to fetch file.")
        return

    if response.status_code == 404:
        print(f"File not found (404): {file_url}")
        return
    if response.status_code != 200:
        # Do not scan an error page (403, 429, 5xx, ...) as if it were the
        # file, and do not count it as a secure file either.
        print(f"Failed to fetch file (HTTP {response.status_code}): {file_url}")
        return

    lines = response.text.splitlines()
    print("DEBUG: Raw file contents:")
    for line_no, line in enumerate(lines, start=1):
        print(f"{line_no}: {line}")

    found_any = False
    for line_no, line in enumerate(lines, start=1):
        regex_result = is_sensitive_regex(line)
        if regex_result:
            print(f"Error [Regex] {regex_result} in {file_name}, line {line_no}: {line}")
            scan_results["issues_found"] += 1
            found_any = True
        elif USE_AI and is_sensitive_ai(line):
            print(f"Error [AI] Sensitive line in {file_name}, line {line_no}: {line}")
            scan_results["issues_found"] += 1
            found_any = True

    if not found_any:
        print("No sensitive lines detected.")
        scan_results["secure_files"] += 1

# ------------------------------
# Interactive CLI entry point
# ------------------------------
def main():
    """Run the interactive scanner: prompt for a scan mode, scan, print totals.

    Modes:
        1 -- scan Laravel's default ``.env.example``.
        2 -- recursively scan a GitHub repository given owner and name.
        3 -- scan a single raw file URL.

    An unrecognised choice prints an error and returns without a summary,
    since no scan took place.
    """
    global scan_file_from_url

    print("Welcome to DeepTrace-AI Scanner\n")
    print("Choose scan option:")
    print("1. Scan default .env.example (Laravel)")
    print("2. Scan GitHub repository")
    print("3. Scan raw file URL")
    choice = input("Enter choice (1/2/3): ").strip()

    # Route every scan (including the ones scan_github_repo triggers through
    # the global name) to the variant that updates scan_results.
    scan_file_from_url = scan_file_with_summary

    # Start each run from zero so calling main() repeatedly (e.g. re-running
    # it in a notebook session) does not accumulate totals from earlier runs.
    for counter in scan_results:
        scan_results[counter] = 0

    if choice == "1":
        laravel_env_url = "https://raw.githubusercontent.com/laravel/laravel/master/.env.example"
        scan_file_from_url(laravel_env_url, ".env.example")

    elif choice == "2":
        owner = input("Enter GitHub owner (e.g., laravel): ").strip()
        repo = input("Enter repository name (e.g., laravel): ").strip()
        scan_github_repo(owner, repo)

    elif choice == "3":
        file_url = input("Enter raw file URL: ").strip()
        # Use the last path segment as the display name; fall back to the
        # whole URL when that segment is empty (e.g. trailing slash).
        file_name = file_url.rstrip("/").split("/")[-1] or file_url
        scan_file_from_url(file_url, file_name)

    else:
        print("Invalid choice. Please enter 1, 2, or 3.")
        return

    print("\nSummary Report:")
    print(f"Files Scanned: {scan_results['files_scanned']}")
    print(f"Issues Found: {scan_results['issues_found']}")
    print(f"Secure Files: {scan_results['secure_files']}")
    print("\nScan Complete.")

# Run the CLI App

# Start the interactive scanner only when this code is executed as the main
# module, not when it is imported.
if __name__ == "__main__":
    main()



Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Welcome to DeepTrace-AI Scanner

Choose scan option:
1. Scan default .env.example (Laravel)
2. Scan GitHub repository
3. Scan raw file URL


Enter choice (1/2/3):  1



Scanning .env.example...

DEBUG: Raw file contents:
1: APP_NAME=Laravel
2: APP_ENV=local
3: APP_KEY=
4: APP_DEBUG=true
5: APP_URL=http://localhost
6: 
7: APP_LOCALE=en
8: APP_FALLBACK_LOCALE=en
9: APP_FAKER_LOCALE=en_US
10: 
11: APP_MAINTENANCE_DRIVER=file
12: # APP_MAINTENANCE_STORE=database
13: 
14: PHP_CLI_SERVER_WORKERS=4
15: 
16: BCRYPT_ROUNDS=12
17: 
18: LOG_CHANNEL=stack
19: LOG_STACK=single
20: LOG_DEPRECATIONS_CHANNEL=null
21: LOG_LEVEL=debug
22: 
23: DB_CONNECTION=sqlite
24: # DB_HOST=127.0.0.1
25: # DB_PORT=3306
26: # DB_DATABASE=laravel
27: # DB_USERNAME=root
28: # DB_PASSWORD=
29: 
30: SESSION_DRIVER=database
31: SESSION_LIFETIME=120
32: SESSION_ENCRYPT=false
33: SESSION_PATH=/
34: SESSION_DOMAIN=null
35: 
36: BROADCAST_CONNECTION=log
37: FILESYSTEM_DISK=local
38: QUEUE_CONNECTION=database
39: 
40: CACHE_STORE=database
41: # CACHE_PREFIX=
42: 
43: MEMCACHED_HOST=127.0.0.1
44: 
45: REDIS_CLIENT=phpredis
46: REDIS_HOST=127.0.0.1
47: REDIS_PASSWORD=null
48: REDIS_PORT=6379
4