# Python

In [30]:
# !pip install ast-comments
# %%writefile index.py
import ast_comments
#import ast

class SensitiveDataVisitor(ast_comments.NodeVisitor):
    """Redact string literals assigned to sensitively named targets.

    The visitor inspects every ``Assign`` node. When the assignment target is
    a variable, an attribute, or a subscript with a literal string key, and
    that name/attribute/key contains one of ``sensitive_strings``
    (case-insensitive on the target side), every string literal found in the
    assigned value is reported with ``print`` and overwritten in place with
    ``"FILTERED"``.

    Args:
        sensitive_strings: iterable of substrings to look for. They are
            compared against the lower-cased target name, so they are
            expected to be lower case themselves.
    """

    def __init__(self, sensitive_strings) -> None:
        super().__init__()
        self.sensitive_strings = sensitive_strings

    def _is_sensitive(self, name):
        """Return True if *name* contains any configured marker substring."""
        lowered = name.lower()
        return any(marker in lowered for marker in self.sensitive_strings)

    @staticmethod
    def _string_constants(expr):
        """Yield every string-literal node found anywhere inside *expr*."""
        for candidate in ast_comments.walk(expr):
            # ``Constant`` replaces the deprecated ``Str`` node class.
            if isinstance(candidate, ast_comments.Constant) and isinstance(candidate.value, str):
                yield candidate

    def visit_Assign(self, node):
        """Report and overwrite string literals assigned to sensitive targets."""
        for target in node.targets:
            # check variable assignment
            if isinstance(target, ast_comments.Name):
                if self._is_sensitive(target.id):
                    for value in self._string_constants(node.value):
                        print(f"Possible sensitive data found on line {value.lineno}: with {target.id} = {value.value}")
                        value.value = "FILTERED"

            # check attribute assignment
            elif isinstance(target, ast_comments.Attribute):
                if self._is_sensitive(target.attr):
                    # The owner may itself be an attribute chain (a.b.password),
                    # so render it as source text instead of assuming a Name.
                    owner = ast_comments.unparse(target.value)
                    for value in self._string_constants(node.value):
                        print(f"Possible sensitive data found on line {value.lineno}: {owner}.{target.attr} = {value.value}")
                        value.value = "FILTERED"

            # Check dictionary assignments
            elif isinstance(target, ast_comments.Subscript):
                key = target.slice
                # Only literal string keys can be matched; variable keys,
                # integer indexes and slices are skipped instead of crashing.
                # NOTE(review): this is the Python 3.9+ AST layout, where the
                # key is stored directly on ``slice``.
                if (
                    isinstance(key, ast_comments.Constant)
                    and isinstance(key.value, str)
                    and self._is_sensitive(key.value)
                ):
                    owner = ast_comments.unparse(target.value)
                    for value in self._string_constants(node.value):
                        print(f"Possible sensitive data found on line {target.value.lineno}: {owner}[{key.value}] = {value.value}")
                        value.value = "FILTERED"

        self.generic_visit(node)
               
class ImportsVisitor(ast_comments.NodeVisitor):
    """Collect the imports that occur while at least one scope name is active.

    Attributes:
        imports: maps the imported name (``module`` for ``import module``,
            ``module.name`` for ``from module import name``) to the local
            alias it is bound to.
        scope: set of names currently considered "entered". Callers may seed
            it (``visitor.scope.add(...)``) before visiting; imports are only
            recorded while it is non-empty.
    """

    def __init__(self):
        super().__init__()
        self.imports = {}
        self.scope = set()

    def _visit_within(self, name, node):
        """Visit the children of *node* with *name* present in ``scope``.

        The name is removed afterwards only if this call added it. That keeps
        recursive calls (``f()`` inside ``f``), nested calls to the same
        function (``g(g(x))``) and caller-seeded names from being dropped
        early, which previously ended in ``KeyError`` on the outer removal.
        """
        newly_added = name not in self.scope
        if newly_added:
            self.scope.add(name)
        try:
            self.generic_visit(node)
        finally:
            if newly_added:
                self.scope.discard(name)

    def visit_Import(self, node):
        """Record ``import x [as y]`` when inside an active scope."""
        if self.scope:
            for alias in node.names:
                self.imports[alias.name] = alias.asname or alias.name

    def visit_ImportFrom(self, node):
        """Record ``from m import x [as y]`` when inside an active scope."""
        if self.scope:
            # ``node.module`` is None for ``from . import x``.
            module = node.module or ""
            for alias in node.names:
                name = alias.name
                asname = alias.asname or name
                full_name = f"{module}.{name}" if module else name
                self.imports[full_name] = asname

    def visit_FunctionDef(self, node):
        """Descend into a function with its name pushed onto ``scope``."""
        self._visit_within(node.name, node)

    def visit_ClassDef(self, node):
        """Descend into a class with its name pushed onto ``scope``."""
        self._visit_within(node.name, node)

    def visit_Call(self, node):
        """Descend into ``name(...)`` / ``name.attr(...)`` calls under ``name``."""
        func = node.func
        if isinstance(func, ast_comments.Name):
            self._visit_within(func.id, node)
        elif isinstance(func, ast_comments.Attribute) and isinstance(func.value, ast_comments.Name):
            self._visit_within(func.value.id, node)
        # Other call shapes are not descended into, as before; import
        # statements cannot appear inside a call expression, so nothing that
        # ends up in ``imports`` is missed.


def _comment_texts(body):
    """Return the text (without the leading '#') of each comment node directly in *body*."""
    return [
        stmt.value[1:].strip()
        for stmt in body
        if isinstance(stmt, ast_comments.Comment)
    ]


def _function_record(node, qualified_name, source, lines):
    """Build the ``functions`` tuple for one function or method node.

    *source* and *lines* must be the same text the node was parsed from, so
    that the node's line/column positions address the right characters.
    """
    # get imports made inside the function
    visitor = ImportsVisitor()
    visitor.scope.add(node.name)
    visitor.visit(node)

    docstring = ast_comments.get_docstring(node)
    if docstring:
        # Body lines after the ``def`` line, with the docstring statement cut out.
        doc_stmt = node.body[0]
        func_body = lines[node.lineno:doc_stmt.lineno - 1] + lines[doc_stmt.end_lineno:node.end_lineno]
        docstring = docstring.strip()
    else:
        # Whole definition as a single string (kept as in the original API).
        func_body = ast_comments.get_source_segment(source, node).strip()
        docstring = None
    return (qualified_name, func_body, docstring, visitor.imports, node.lineno, node.end_lineno)


def extract_functions_classes_comments(filename, sensitive_strings=None):
    """Extract functions, classes, comments and imports from a Python file.

    The file is parsed, string literals assigned to sensitively named targets
    are replaced via ``SensitiveDataVisitor``, and the filtered source is
    re-parsed. Everything returned is taken from that filtered source, so
    line numbers refer to the unparsed (filtered) text, not the file on disk.

    Args:
        filename: path of the Python source file to analyse.
        sensitive_strings: optional list of lower-case substrings that mark a
            target name as sensitive. Defaults to the built-in list below.

    Returns:
        A 6-tuple ``(functions, classes, comments, comments_by_function,
        comments_by_class, module_level_imports)`` where

        * ``functions`` is a list of ``(name, body, docstring, imports,
          lineno, end_lineno)``; methods are named ``Class.method``. ``body``
          is a list of lines when a docstring exists, otherwise one string.
        * ``classes`` is a list of ``(name, source)`` where ``source`` joins
          the class-level statements that are neither methods nor comments.
        * ``comments`` lists module-level comment texts.
        * ``comments_by_function`` is a list of ``(function, [comments])``.
        * ``comments_by_class`` is a list of ``(class, {method: [comments]})``;
          class-level comments are stored under the ``""`` key.
        * ``module_level_imports`` maps imported names to their aliases.
    """
    with open(filename, "r", encoding="utf-8") as f:
        file = f.read()

    tree = ast_comments.parse(file)

    # filter out sensitive data
    if sensitive_strings is None:
        sensitive_strings = [
            "password",
            "pwd",
            "api_key",
            "secret",
            "token",
            "private_key",
            "access_key",
            "credit_card",
            "social_security_number",
            "personal_identification_number",
            "passport_number",
            "license_number"
        ]
    SensitiveDataVisitor(sensitive_strings).visit(tree)
    filtered_file = ast_comments.unparse(tree)
    filtered_tree = ast_comments.parse(filtered_file)
    lines = filtered_file.splitlines()

    function_nodes = (ast_comments.FunctionDef, ast_comments.AsyncFunctionDef)

    # extract classes, functions, comments and imports
    functions = []
    classes = []
    comments = []
    comments_by_function = {}
    comments_by_class = {}
    module_level_imports = {}

    for node in filtered_tree.body:

        if isinstance(node, ast_comments.Import):
            for alias in node.names:
                module_level_imports[alias.name] = alias.asname or alias.name

        elif isinstance(node, ast_comments.ImportFrom):
            module = node.module or ""
            for alias in node.names:
                name = alias.name
                full_name = f"{module}.{name}" if module else name
                module_level_imports[full_name] = alias.asname or name

        # node is a function
        elif isinstance(node, function_nodes):
            # Source segments must come from the filtered text: the node
            # positions belong to it, and the original text still holds the
            # unredacted secrets.
            functions.append(_function_record(node, node.name, filtered_file, lines))
            comments_by_function.setdefault(node.name, []).extend(_comment_texts(node.body))

        # free standing comments
        elif isinstance(node, ast_comments.Comment):
            comments.append(node.value[1:].strip())

        # classes
        elif isinstance(node, ast_comments.ClassDef):
            class_name = node.name
            class_body = []

            for subnode in node.body:
                if isinstance(subnode, function_nodes):
                    functions.append(
                        _function_record(subnode, f"{class_name}.{subnode.name}", filtered_file, lines)
                    )
                    method_comments = comments_by_class.setdefault(class_name, {}).setdefault(subnode.name, [])
                    method_comments.extend(_comment_texts(subnode.body))

                # comments placed directly in the class body
                elif isinstance(subnode, ast_comments.Comment):
                    class_level = comments_by_class.setdefault(class_name, {}).setdefault("", [])
                    class_level.append(subnode.value[1:].strip())

                else:
                    class_body.append(ast_comments.get_source_segment(filtered_file, subnode).strip())

            classes.append((class_name, "\n".join(class_body)))

    # Convert comments_by_function and comments_by_class dictionaries to lists of tuples
    return (
        functions,
        classes,
        comments,
        list(comments_by_function.items()),
        list(comments_by_class.items()),
        module_level_imports,
    )


# Example usage: run the extractor on one file and keep the six results as
# notebook globals so the following cells can inspect them.
# NOTE(review): the path below is an absolute, machine-specific location.
filename = "/home/ruzickal/Code/Fingerprints/fp-research/fp-research/fingerprint/Essential/Config.py"
functions, classes, comments, comments_by_function, comments_by_class, module_level_imports = extract_functions_classes_comments(filename)
# Uncomment to dump the individual results:
# print("Functions:", functions)
# print("Comments:", comments)
# print("Classes:", classes)
# print("Imports:", module_level_imports)
# print("Comments by function:", comments_by_function)
# print("Comments by class:", comments_by_class)

Possible sensitive data found on line 11: with pwd = 123
import json
from typing import Any, Optional
from datetime import datetime
import logging
import yaml
import sqlite3
import os
pwd = 'FILTERED'
logger = logging.getLogger('fingerprint.Config')

class Config:

    def __init__(self, run_id, auto_save_path: Optional[str]=None, yaml=True, save_at_init=True, **kwargs) -> None:
        """
        Initialize the configuration object.

        Args:
            auto_save_path (str): path to where the config file will be automatically saved. Set
            to None to avoid auto saving.
        """
        self.kwargs = kwargs
        self.kwargs['date'] = datetime.now().strftime('%d.%m.%Y %H:%M')
        self.auto_save_path = auto_save_path
        self.yaml = yaml
        self.run_id = run_id
        # save to file
        if not self.auto_save_path is None and save_at_init:
            self.to_file(self.auto_save_path)

    def __repr__(self) -> str:
        output = f'ID: {self.run_

In [37]:
module_level_imports

{'json': 'json',
 'typing.Any': 'Any',
 'typing.Optional': 'Optional',
 'datetime.datetime': 'datetime',
 'logging': 'logging',
 'yaml': 'yaml',
 'sqlite3': 'sqlite3',
 'os': 'os'}

In [38]:
print([f[3] for f in functions])



In [39]:
comments_by_class

[('Config',
  {'__init__': ['save to file'],
   '__repr__': [],
   '__getitem__': ['if not self.auto_save_path is None:',
    'self.to_file(self.auto_save_path)'],
   '__setitem__': [],
   '__delitem__': [],
   '__contains__': [],
   '__iter__': [],
   'to_dict': [],
   '': ['return self.kwargs'],
   'to_file': [],
   'to_file_sql': [],
   'to_file_yml': [],
   'save': [],
   'write_versioned': [],
   'from_file': [],
   'from_file_yml': [],
   'from_file_sql': ['return last config'],
   'get_run_ids_sql': [],
   'get_run_ids_yml': [],
   'now': []})]

In [3]:
comments_by_function

[]

In [4]:
comments_by_class

[('Config',
  {'__init__': ['save to file'],
   '__repr__': [],
   '__getitem__': ['if not self.auto_save_path is None:',
    'self.to_file(self.auto_save_path)'],
   '__setitem__': [],
   '__delitem__': [],
   '__contains__': [],
   '__iter__': [],
   'to_dict': [],
   '': ['return self.kwargs'],
   'to_file': [],
   'to_file_sql': [],
   'to_file_yml': [],
   'save': [],
   'write_versioned': [],
   'from_file': [],
   'from_file_yml': [],
   'from_file_sql': ['return last config'],
   'get_run_ids_sql': [],
   'get_run_ids_yml': [],
   'now': []})]

In [5]:
classes

[('Config', '')]

# Others

In [38]:
import re

# Define regular expressions to match authentication tokens and credentials

# git lab
token_pattern = r'(https://[^:]+:)[^@]+(@.*)'
# NOTE: matches the same strings as token_pattern, so inside combined_pattern
# this alternative is never the one that matches.
username_password_pattern = r'(https://)([^:]+):([^@]+)@(.*)'

# general configuration
username_pattern = r'username\s*=\s*[^\s]+'
password_pattern = r'password\s*=\s*[^\s]+'
apikey_pattern = r'apikey\s*=\s*[^\s]+'

combined_pattern = re.compile(f"{token_pattern}|{username_password_pattern}|{username_pattern}|{password_pattern}|{apikey_pattern}")


def _redact_match(match):
    """Return the replacement text for one combined_pattern match."""
    # Groups 1-2 belong to token_pattern: keep "https://user:" and "@host...".
    if match.group(1) is not None:
        return f"{match.group(1)}[REPLACED]{match.group(2)}"
    # Groups 3-6 belong to username_password_pattern (kept for completeness).
    if match.group(3) is not None:
        return f"{match.group(3)}[REPLACED]:[REPLACED]@{match.group(6)}"
    # key = value alternatives have no groups: keep the key and the '=' with
    # its spacing, and replace only the value.
    return re.sub(r'(=\s*)\S+', r'\g<1>[REPLACED]', match.group(0), count=1)


def redact_credentials(text):
    """Replace credentials found in *text* with ``[REPLACED]``.

    A fixed ``\\g<1>[REPLACED]\\g<2>`` template only suits the URL
    alternative; for ``password=...`` style matches groups 1 and 2 are empty,
    so the key itself would be wiped out. The callback picks the right
    replacement per alternative.
    """
    return combined_pattern.sub(_redact_match, text)


# Sample string containing authentication information
sample_string = 'pip install --extra-index-url=https://__token__:MY_TOKEN@example.com'

# Replace token
sample_string = redact_credentials(sample_string)

# Print updated string
print(sample_string)


pip install --extra-index-url=https://__token__:[REPLACED]@example.com
