In [None]:
!pip install opendatasets
import opendatasets as od
od.download(
    "https://www.kaggle.com/datasets/dhruvildave/github-commit-messages-dataset")

In [None]:
# Read the raw dump, keeping only the two columns used by this notebook.
import pandas as pd

df = pd.read_csv(
    "github-commit-messages-dataset/full.csv",
    usecols=["author", "message"],
)

# Number of top authors whose commits are kept by the first truncation step.
init_num_authors = 100

In [None]:
# Sort rows so that authors with the most commits come first; authors with the
# same commit count are ordered alphabetically. The helper column `new` holds
# each row's per-author commit count and is dropped again after sorting.
df = (
    df.assign(new=df['author'].map(df['author'].value_counts()))
    .sort_values(['new', 'author'], ascending=[False, True])
    .drop(columns='new')
)

In [None]:
#Remove bots and accounts representing multiple people
words = ["auto", "queue", "admin", "Gardener", "robot", "noreply", "TensorFlower"]

# Plain alternation without a capturing group: the parenthesised form made
# pandas emit a "pattern has match groups" UserWarning on every run.
# Matching is a case-sensitive substring search on the author name.
bot_pattern = '|'.join(words)

# na=True treats a missing author as a match so those rows are dropped as well,
# which is exactly what the former `== False` comparison did (NaN == False is False).
is_bot = df['author'].str.contains(bot_pattern, na=True)
df = df[~is_bot]

In [None]:
# Truncate to the commits of the first `init_num_authors` authors, which makes
# the email-removal step much faster.
# Assumes df is still ordered by per-author commit count (most commits first),
# so the first N rows are exactly those authors' commits.
rows_to_keep = df["author"].value_counts().iloc[:init_num_authors].sum()
df = df.head(rows_to_keep)

In [None]:
#Remove all sign-off emails
#Specifically, any line with the following: "<email>"

import re

totaldocs = len(df)
# Matches a whole line (plus its trailing newline, if any) containing an
# angle-bracketed email address.
# NOTE(review): the dots in `(.[a-z]{2,}|.[0-9]{1,})` are unescaped, so they match
# any character rather than a literal "." -- left unchanged to keep the same
# cleaning results; confirm whether `\.` was intended.
pattern = ".*(<(?:(?!.*?[.]{2})[a-zA-Z0-9](?:[a-zA-Z0-9.+!%-]{1,64}|)|\"[a-zA-Z0-9.+!% -]{1,64}\")@[a-zA-Z0-9][a-zA-Z0-9.-]+(.[a-z]{2,}|.[0-9]{1,})>).*\n?"
# Compile once instead of re-parsing the pattern for every row.
email_line_re = re.compile(pattern)
matches = 0

# Build the cleaned column in row order and assign it in one go. The previous
# `df["message"][i] = ...` was chained assignment: it raises
# SettingWithCopyWarning and has no effect at all under pandas copy-on-write.
cleaned_messages = []
for loops, message in enumerate(df["message"]):
    if loops%10000==0: print(f"{loops} / {totaldocs}")
    # subn does the search and the replacement in a single pass.
    stripped, hits = email_line_re.subn('', str(message))
    if hits:
        cleaned_messages.append(stripped)
        matches += 1
    else:
        # Leave untouched values exactly as they were (including missing ones).
        cleaned_messages.append(message)

df = df.assign(message=cleaned_messages)

print(f"{matches} matches")

In [None]:
# Drop commits whose message (as text) is 60 characters or shorter.
message_lengths = df['message'].map(lambda text: len(str(text)))
df = df[message_lengths > 60]

In [None]:
# Preview the current DataFrame (a bare last expression is shown as the cell's output).
df

In [None]:
#Limit the data

# NOTE(review): no cell visible in this notebook writes processed.csv; presumably
# it is the cleaned frame saved separately -- confirm before a fresh run.
df = pd.read_csv('github-commit-messages-dataset/processed.csv', usecols=["author", "message"])
# Most prolific authors first, ties broken alphabetically by author.
df = df.assign(new =df['author'].map(df['author'].value_counts())).sort_values(['new','author'], ascending=[False, True]).drop('new', axis=1)

num_authors = 100
num_commits_per_author = 200 #≤printed value

# Per-author commit counts (descending), computed once and reused below.
author_counts = df['author'].value_counts()
if len(author_counts) < num_authors:
    raise ValueError(
        f"only {len(author_counts)} authors available, need num_authors={num_authors}")

# Commit count of the least prolific author that would be kept. `.iloc` is used
# because the index holds author names; a bare integer key on such a Series is a
# deprecated positional fallback.
min_commits = int(author_counts.iloc[num_authors - 1])
print(min_commits)

if min_commits >= num_commits_per_author:
    # Keep only the rows of the top `num_authors` authors, then cap each author
    # at `num_commits_per_author` commits.
    df = df.head(author_counts.iloc[:num_authors].sum())
    df = df.groupby("author").head(num_commits_per_author).reset_index(drop=True).head(num_authors*num_commits_per_author)
else:
    # Previously this case silently left df unlimited; make it visible.
    print(f"Data NOT limited: author #{num_authors} has only {min_commits} commits, "
          f"fewer than num_commits_per_author={num_commits_per_author}")

df

In [None]:
#Export to files
import os

dirname = "github_100_200_very_clean"

# Writes one text file per commit to <dirname>/<test|train>/<author>/<index>.txt.
# Rows with an even index label go to "test", odd ones to "train".
# NOTE(review): the author value is used verbatim as a directory name; an author
# containing a path separator would create nested directories -- confirm the data.
for i, row in df.iterrows():

    testrain = "test" if i%2==0 else "train"

    path = f"{dirname}/{testrain}/{row['author']}"

    # exist_ok avoids the separate exists() check.
    os.makedirs(path, exist_ok=True)

    filename = f"{i}.txt"
    # `with` guarantees the handle is closed even if the write fails; an explicit
    # UTF-8 encoding keeps non-ASCII commit messages from failing on platforms
    # whose default encoding cannot represent them.
    with open(os.path.join(path, filename), "w", encoding="utf-8") as f:
        f.write(str(row["message"]))

    if (i+1)%1000==0:print(f"{i+1}/{num_authors*num_commits_per_author}")