In [1]:
import git
import tempfile
from tqdm import tqdm
import json


In [14]:
class gitRepoData:
    """Metadata for one repository together with the files extracted from it.

    ``repo_features`` always starts out as an empty list. ``repo_files`` is
    stored as given; ``to_dict`` calls ``to_dict()`` on each of its items.
    """

    def __init__(self, repo_url, repo_name, repo_description, repo_branch, repo_files):
        self.repo_url, self.repo_name = repo_url, repo_name
        self.repo_description, self.repo_branch = repo_description, repo_branch
        self.repo_features = []
        self.repo_files = repo_files

    def to_dict(self):
        """Return the repository as a plain dict, with each file serialized too.

        The result always carries ``"repo_status": "raw"``.
        """
        serialized_files = []
        for entry in self.repo_files:
            serialized_files.append(entry.to_dict())

        return dict(
            repo_url=self.repo_url,
            repo_name=self.repo_name,
            repo_description=self.repo_description,
            repo_branch=self.repo_branch,
            repo_features=self.repo_features,
            repo_files=serialized_files,
            repo_status="raw",
        )


class gitRepoFiles:
    """Record for a single file: its path, its text, and empty annotation slots.

    Every annotation slot starts empty and every status flag starts as "raw".
    """

    def __init__(self, folder_structure, content):
        self.folder_structure = folder_structure

        # Annotation slots, all empty at construction time.
        self.description = {}
        self.variables, self.functions = [], []
        self.imports, self.props = [], []
        self.action = {}
        self.developer_comments, self.tester_comments = {}, {}
        self.known_issues = {}

        self.content = content

        # Overall status plus one status flag per annotated aspect.
        for flag_name in (
            "status",
            "description_status",
            "variables_status",
            "functions_status",
            "imports_status",
            "props_status",
        ):
            setattr(self, flag_name, "raw")

    def to_dict(self):
        """Return the record as a plain dict (values are the live attribute objects)."""
        snapshot = {"folder_structure": self.folder_structure}

        for field in ("description", "variables", "functions"):
            snapshot[field] = getattr(self, field)

        # The imports list is exported under the singular key "import".
        snapshot["import"] = self.imports

        for field in (
            "props",
            "action",
            "developer_comments",
            "tester_comments",
            "known_issues",
            "content",
            "status",
            "description_status",
            "variables_status",
            "functions_status",
            "imports_status",
            "props_status",
        ):
            snapshot[field] = getattr(self, field)

        return snapshot

def extract_repo_files(repo):
    """Read every blob reachable from ``repo.tree()`` into ``gitRepoFiles`` records.

    Each blob is read in full and decoded as UTF-8. A blob that cannot be
    processed (for example, content that is not valid UTF-8) is reported
    together with the reason and counted as skipped; it does not abort the run.
    If traversing the tree itself fails, the error is reported and whatever was
    collected up to that point is returned.

    Args:
        repo: Object whose ``tree().traverse()`` yields items exposing
            ``type``, ``path`` and ``data_stream``.

    Returns:
        A tuple ``(data, total_file_count, processed_file_count,
        skipped_file_count)`` where ``data`` is the list of ``gitRepoFiles``
        built from the successfully decoded blobs.
    """
    data = []
    total_file_count = 0
    processed_file_count = 0
    skipped_file_count = 0

    try:
        for blob in repo.tree().traverse():
            # traverse() also yields non-file entries; only blobs are files.
            if blob.type != 'blob':
                continue

            total_file_count += 1
            try:
                file_content = blob.data_stream.read().decode("utf-8")
                data.append(gitRepoFiles(blob.path, file_content))
                processed_file_count += 1
            except Exception as file_error:
                # Best-effort: keep going, but say why the file was skipped.
                print(f"Error processing file {blob.path}: {file_error}")
                skipped_file_count += 1

    except Exception as e:
        print(f"Error traversing the repository tree: {e}")

    finally:
        print(f"\nProcessed {processed_file_count} files from {total_file_count}\n")

    return data, total_file_count, processed_file_count, skipped_file_count


In [15]:
# Repositories to process, one dict per repository. Replace the placeholder
# values: "repo_url" is the clone URL, "dir" a name for the repository, and
# "branch_name" the branch to clone.
repo_details = [
    dict(
        repo_url="REPO URL",
        dir="DIR NAME",
        branch_name="BRANCH NAME",
    ),
]


In [None]:
# Per-repository results and counters; the counters are summed for the report below.
repo_data = []
final_total_file_count = []
final_processed_file_count = []
final_skipped_file_count = []

for repo in tqdm(repo_details):
    # Clone into a scratch directory that is removed as soon as the file
    # contents have been read into memory, so clones do not pile up on disk.
    with tempfile.TemporaryDirectory() as repo_dir:
        repo_clone = git.Repo.clone_from(repo["repo_url"], repo_dir, branch=repo["branch_name"])
        try:
            repo_files, total_file_count, processed_file_count, skipped_file_count = extract_repo_files(repo_clone)
        finally:
            # Release the repository's resources before the directory is deleted.
            repo_clone.close()

    repo_data.append(gitRepoData(repo["repo_url"], repo["dir"], "", repo["branch_name"], repo_files))
    final_total_file_count.append(total_file_count)
    final_processed_file_count.append(processed_file_count)
    final_skipped_file_count.append(skipped_file_count)

print(f"total_file_count: {sum(final_total_file_count)} \n processed_file_count: {sum(final_processed_file_count)} \n skipped_file_count: {sum(final_skipped_file_count)}")

repo_data_dicts = [repo_entry.to_dict() for repo_entry in repo_data]

# Save the list of dictionaries as a JSON file
with open('repo_data_sample.json', 'w') as json_file:
    json.dump(repo_data_dicts, json_file, indent=4)
