In [2]:
import pandas as pd

In [3]:
# Relative path of the mock-commit CSV; the cells below write the generated
# data to this file and read it back from it.
DATASET_PATH = "data/mock1.csv"

In [4]:
import pandas as pd
import random
from datetime import datetime, timedelta
import json

def generate_mock_commit_data(num_commits, seed=None):
    """
    Generates mock data for a set of Git commits.

    The first commit is a root commit with no parents. Every later commit
    draws one or two distinct parents from the commits generated before it,
    so the parent links always form a DAG. A commit is never timestamped
    earlier than any of its parents.

    Args:
        num_commits (int): The number of mock commits to generate.
        seed (int, optional): Seed for a private random generator, making the
            random draws (parents, offsets, diff sizes, people) repeatable.
            When omitted, the module-level ``random`` state is used, exactly
            as before. The root timestamp is still anchored to
            ``datetime.now()``, so absolute times differ between runs.

    Returns:
        pandas.DataFrame: One row per commit with the columns
            ``commit`` (7-digit zero-padded hex string),
            ``parents`` (list of commit strings, empty for the root),
            ``author_time`` and ``commit_time`` (timestamps),
            ``insertions`` and ``deletions`` (ints),
            ``tags`` (JSON string with "author" and "committer" keys).
            With ``num_commits <= 0`` the frame is empty but still carries
            these columns.
    """
    columns = [
        "commit", "parents", "author_time", "commit_time",
        "insertions", "deletions", "tags",
    ]

    # The random module exposes the same choice/sample/randint API as a
    # random.Random instance, so callers that seed the global state keep working.
    rng = random if seed is None else random.Random(seed)

    # The name pools never change, so build them once instead of per commit.
    first_names = ["Alice", "Bob", "Charlie", "David", "Eve"]
    people = [x + y + z for x in first_names for y in first_names for z in first_names]
    authors = people
    committers = people[:25]  # committers come from a smaller pool than authors

    # Generate unique commit hashes
    commit_hashes = [f"{i:07x}" for i in range(num_commits)]
    commit_index = {}  # commit hash -> commit_time, used to look up parent times
    data = []

    for i, commit in enumerate(commit_hashes):
        if i == 0:
            # Root commit: no parents, placed somewhere within the past year.
            parents = []
            commit_time = datetime.now() - timedelta(days=rng.randint(0, 365),
                                                     hours=rng.randint(0, 23),
                                                     minutes=rng.randint(0, 59))
        else:
            # Parents are always sampled from previously generated commits;
            # the second commit can only have a single parent.
            num_parents = rng.choice([1, 2])
            parents = rng.sample(commit_hashes[:i], min(num_parents, i))

            # Anchor on the *latest* parent so a merge commit can never be
            # older than one of the commits it merges.
            latest_parent_time = max(commit_index[parent] for parent in parents)
            commit_time = latest_parent_time + timedelta(days=rng.randint(0, 15),
                                                         hours=rng.randint(0, 23),
                                                         minutes=rng.randint(0, 59))

        # Author time can be slightly before commit time
        author_time = commit_time - timedelta(minutes=rng.randint(0, 60))

        commit_index[commit] = commit_time

        # Plus/Minus Diff:
        insertions = rng.randint(0, 500)
        deletions = rng.randint(0, 300)

        # Tags (Author/Committer):
        tags = {
            "author": rng.choice(authors),
            "committer": rng.choice(committers)
        }

        data.append({
            "commit": commit,
            "parents": parents,
            "author_time": author_time,
            "commit_time": commit_time,
            "insertions": insertions,
            "deletions": deletions,
            "tags": json.dumps(tags)  # Store JSON as a string
        })

    # Passing the columns explicitly keeps the schema stable for empty output.
    df = pd.DataFrame(data, columns=columns)
    return df

In [5]:
from pathlib import Path

# DataFrame.to_csv does not create missing directories, so make sure the
# folder that holds DATASET_PATH exists before writing (no-op if it does).
Path(DATASET_PATH).parent.mkdir(parents=True, exist_ok=True)
generate_mock_commit_data(500).to_csv(DATASET_PATH, index=False)

In [6]:
# Reload the CSV from disk and show its `tags` column; each entry is the
# JSON string written by generate_mock_commit_data.
loaded_mock = pd.read_csv(DATASET_PATH)
loaded_mock["tags"]

0      {"author": "DavidDavidDavid", "committer": "Al...
1      {"author": "DavidCharlieEve", "committer": "Al...
2      {"author": "EveDavidEve", "committer": "AliceB...
3      {"author": "EveBobBob", "committer": "AliceCha...
4      {"author": "AliceBobEve", "committer": "AliceA...
                             ...                        
495    {"author": "DavidAliceDavid", "committer": "Al...
496    {"author": "DavidAliceCharlie", "committer": "...
497    {"author": "AliceDavidBob", "committer": "Alic...
498    {"author": "BobAliceCharlie", "committer": "Al...
499    {"author": "CharlieAliceBob", "committer": "Al...
Name: tags, Length: 500, dtype: object