In [None]:
import os
import json
import subprocess

def read_file(file):
    """Return the UTF-8 text content of ``file``, or ``''`` if it cannot be read.

    Args:
        file: Path of the file to read.

    Returns:
        The decoded content of the file. When the file cannot be opened or
        decoded, the failure is reported on stdout and an empty string is
        returned instead of raising.
    """
    try:
        with open(file, 'r', encoding='utf-8') as f:
            return f.read()
    except (OSError, ValueError) as exc:
        # OSError covers missing files, permission problems and directories;
        # ValueError covers UnicodeDecodeError (non-UTF-8 bytes) and invalid
        # paths. KeyboardInterrupt/SystemExit are deliberately not caught so
        # that a long run can still be interrupted.
        print(f"File Open Error: {file}: {exc}")
        return ''

def write_file(file_path, content):
    """Write ``content`` to ``file_path`` as UTF-8 text, replacing any existing file.

    Args:
        file_path: Destination path. Its parent directory must already exist.
        content: Text to write.

    Returns:
        None. When the file cannot be written, the failure is reported on
        stdout instead of raising.
    """
    try:
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(content)
    except (OSError, ValueError) as exc:
        # OSError covers a missing parent directory, permission problems, etc.;
        # ValueError covers UnicodeEncodeError and invalid paths.
        # KeyboardInterrupt/SystemExit are deliberately not caught.
        print(f"File Write Error: {file_path}: {exc}")

def checkout_file_before_commit(parent_hash, file, sanitized_path, directory, cnt):
    """Save the content of ``file`` as of ``parent_hash`` into ``../commit-files``.

    Must be called with the repository as the current working directory. The
    file is temporarily checked out at ``parent_hash``, copied to
    ``../commit-files/<directory>/<cnt>_before_<sanitized_path>``, and then
    restored to its ``HEAD`` version.

    Args:
        parent_hash: Revision from which the file is taken.
        file: Path of the file, relative to the repository root.
        sanitized_path: Flattened form of ``file`` used in the output file name.
        directory: Sub-directory of ``../commit-files`` that receives the copy.
        cnt: Numeric prefix of the output file name.
    """
    file_path = os.path.join('..', 'commit-files', f'{directory}', f'{cnt}_before_{sanitized_path}')
    checkout = subprocess.run(['git', 'checkout', parent_hash, '--', file])
    try:
        if checkout.returncode == 0:
            before_content = read_file(file)
        else:
            # The checkout did not succeed (for example the path does not exist
            # at that revision), so the working tree was not switched. Reading
            # it now would store some other version under the "before" name;
            # store an empty snapshot instead.
            print(f"git checkout failed for {file} at {parent_hash}; writing empty snapshot")
            before_content = ''
        write_file(file_path, before_content)
    finally:
        # Always put the HEAD version back, even if reading/writing raised.
        subprocess.run(['git', 'checkout', 'HEAD', '--', file])
    
def checkout_file_after_commit(commit_hash, file, sanitized_path, directory, cnt):
    """Save the content of ``file`` as of ``commit_hash`` into ``../commit-files``.

    Must be called with the repository as the current working directory. The
    file is temporarily checked out at ``commit_hash``, copied to
    ``../commit-files/<directory>/<cnt>_after_<sanitized_path>``, and then
    restored to its ``HEAD`` version.

    Args:
        commit_hash: Revision from which the file is taken.
        file: Path of the file, relative to the repository root.
        sanitized_path: Flattened form of ``file`` used in the output file name.
        directory: Sub-directory of ``../commit-files`` that receives the copy.
        cnt: Numeric prefix of the output file name.
    """
    file_path = os.path.join('..', 'commit-files', f'{directory}', f'{cnt}_after_{sanitized_path}')
    checkout = subprocess.run(['git', 'checkout', commit_hash, '--', file])
    try:
        if checkout.returncode == 0:
            after_content = read_file(file)
        else:
            # The checkout did not succeed (for example the path does not exist
            # at that revision), so the working tree was not switched. Reading
            # it now would store some other version under the "after" name;
            # store an empty snapshot instead.
            print(f"git checkout failed for {file} at {commit_hash}; writing empty snapshot")
            after_content = ''
        write_file(file_path, after_content)
    finally:
        # Always put the HEAD version back, even if reading/writing raised.
        subprocess.run(['git', 'checkout', 'HEAD', '--', file])
    
def diff_file_of_commit(commit_hash, file, sanitized_path, directory, cnt):
    """Write the diff of ``file`` introduced by ``commit_hash`` into ``../commit-files``.

    Runs ``git diff`` between ``<commit_hash>~1`` and ``commit_hash`` for the
    single path ``file`` and lets git write the result to
    ``../commit-files/<directory>/<cnt>_diff_<sanitized_path>``.

    Args:
        commit_hash: Commit whose change to ``file`` is diffed against ``~1``.
        file: Path of the file, relative to the repository root.
        sanitized_path: Flattened form of ``file`` used in the output file name.
        directory: Sub-directory of ``../commit-files`` that receives the diff.
        cnt: Numeric prefix of the output file name.
    """
    # Fixed, very large number of context lines requested via -U.
    # NOTE(review): presumably chosen so the diff shows the whole file — confirm.
    context_lines = 9999
    diff_target = os.path.join('..', 'commit-files', f'{directory}', f'{cnt}_diff_{sanitized_path}')

    git_command = [
        'git',
        'diff',
        "-U" + str(context_lines),
        "--output=" + diff_target,
        commit_hash + "~1",
        commit_hash,
        "--",
        file,
    ]
    subprocess.run(git_command)
    
    
def _load_commit_log(log_file):
    """Return the parsed JSON content of ``log_file``, or ``[]`` if it is malformed."""
    with open(log_file, "r", encoding='utf-8') as f:
        try:
            return json.load(f)
        except json.JSONDecodeError as e:
            print(f"JSON Decoding Error: {e}")
            return []


def file_tracker(directories, output_directory):
    """Export before/after/diff snapshots for every changed file of every logged commit.

    For each repository name in ``directories`` this reads
    ``commit-logs/<name>-files-log.json`` (relative to the starting working
    directory), changes into the repository directory ``<name>``, and for each
    commit with a non-empty ``changed_file_list`` calls
    ``checkout_file_before_commit``, ``checkout_file_after_commit`` and
    ``diff_file_of_commit`` for every listed file.

    The helpers write to ``../commit-files/<name>`` relative to the repository
    directory, so ``output_directory`` only controls which directories are
    created here.

    The working directory is restored to its starting value when the function
    returns or raises.

    Args:
        directories: Repository directory names located in the current
            working directory.
        output_directory: Directory in which one sub-directory per repository
            is created.

    Raises:
        FileNotFoundError: If a repository directory or its commit log file
            does not exist.
    """
    root = os.getcwd()

    # If the output_directory directory does not exist, create it.
    os.makedirs(output_directory, exist_ok=True)

    try:
        for directory in directories:
            # Absolute paths anchored at `root` avoid chdir-ing back and forth
            # (which only worked for a single-level output_directory).
            os.makedirs(os.path.join(root, output_directory, directory), exist_ok=True)
            log_file = os.path.join(root, "commit-logs", f"{directory}-files-log.json")

            # Go to the repository directory
            os.chdir(os.path.join(root, directory))

            commits = _load_commit_log(log_file)

            # cnt numbers only the commits that actually changed files.
            cnt = 0
            for commit in commits:
                changed_files = commit['changed_file_list']
                if not changed_files:
                    continue
                cnt += 1

                for file in changed_files:
                    # Flatten the repository path so it can be used as a file name.
                    sanitized_path = file.replace("/", "_")

                    # Choose your job to do
                    checkout_file_before_commit(commit['parent'], file, sanitized_path, directory, cnt)
                    checkout_file_after_commit(commit['commitHash'], file, sanitized_path, directory, cnt)
                    diff_file_of_commit(commit['commitHash'], file, sanitized_path, directory, cnt)

            print(directory, "files creation complete")
    finally:
        # Never leave the kernel stranded inside a repository directory.
        os.chdir(root)
    print("All files have been created")

In [None]:
if __name__ == "__main__":
    # Repository directories to process. file_tracker changes into each one and
    # reads its commit log from commit-logs/<name>-files-log.json.
    directories = [
        "h2database",
        "bc-java",
        "pgjdbc",
        "junit4",
        "gson",
        "guava",
    ]

    # Directory that receives one sub-directory of exported files per repository.
    output_directory = "commit-files"

    # commit_logger(directories)
    file_tracker(directories, output_directory)

In [None]:
# Show the kernel's current working directory.
os.getcwd()

In [None]:
# Move the kernel's working directory up one level.
# NOTE(review): presumably a manual recovery step for when file_tracker stops
# while inside a repository directory — confirm. Running this cell after a
# successful file_tracker run leaves the kernel one level above the start.
os.chdir("..")