In [9]:
import multiprocessing
import os
import pickle
import subprocess
from collections import defaultdict

import pandas as pd
from tqdm.auto import tqdm


In [6]:
from Commit import CommitFactory
# CommitFactory() returns the callable bound to the name `Commit`; further down it is
# invoked as Commit(sha, repo_path) to build one commit object per SHA.
Commit = CommitFactory()

In [7]:
# Load the commit table and keep only small commits: at most 50 changed diff lines
# and at most 8 touched files. Both limits are applied as a single boolean mask.
RAW_DATA = (
    pd.read_csv("../data/all_apache_commits.csv")
    .loc[lambda frame: (frame['diff_line_count'] <= 50) & (frame['files_changed'] <= 8)]
)

# Empty list-valued mapping; nothing in this part of the notebook fills it yet.
REPO_LOOKUP = defaultdict(list)

In [None]:
# Every distinct repository referenced by the filtered commits.
repos = set(RAW_DATA["repo"])

# Local clones live under ../clones; exist_ok avoids a separate existence check.
os.makedirs("../clones", exist_ok=True)

# Clone each repo that is not already present on disk.
failed_clones = []
for repo in tqdm(repos):
    target = f"../clones/{repo}"
    if os.path.exists(target):
        continue

    # Presumably the CSV stores repos as "<owner>-<name>" (TODO confirm). Only the
    # FIRST hyphen separates owner from name; replacing all of them would turn
    # "apache-commons-lang" into the invalid path "apache/commons/lang".
    url = f"https://github.com/{repo.replace('-', '/', 1)}.git"

    # Argument list instead of a shell string: repo names come from data and must
    # not be interpreted by a shell.
    try:
        returncode = subprocess.run(["git", "clone", url, target], check=False).returncode
    except OSError as e:
        # git itself could not be started (e.g. not installed); retrying is pointless.
        print("could not run git:", e)
        break
    if returncode != 0:
        failed_clones.append(repo)

# Surface clone failures instead of silently ignoring the exit status.
if failed_clones:
    print("Failed to clone:", sorted(failed_clones))

In [8]:
print("Total commits found", len(RAW_DATA))
# [start, end) row positions of RAW_DATA handled in this run.
DATA_SLICE = [0,100]

RAW_DATA_SLICE = RAW_DATA.iloc[DATA_SLICE[0]: DATA_SLICE[1]]

# Unique (sha, repo) pairs in first-seen order. dict.fromkeys de-duplicates like set()
# but keeps a stable order; set() ordering of strings changes between kernel starts,
# which would shuffle which commits end up in which indexed checkpoint file.
TUPLES = list(dict.fromkeys(zip(RAW_DATA_SLICE['sha'], RAW_DATA_SLICE['repo'])))

# Lookup from commit SHA to the Commit object with all its data + bag of paths.
# This is only a placeholder; it is reassigned with the result of create_lookup_mp.
COMMIT_DATA_LOOKUP = defaultdict(list)

Total commits found 266493


In [None]:
def _handle_errors(e):
    print("BIG ERROR, WE SHOULD NEVER GET HERE", e)
    return None

def _to_commit_mp(pair):
    try:
        sha = pair[0]
        repo = pair[1]

        commit = Commit(sha, f"../clones/{repo}")
        commit._populate_commit_info()  
        commit._generate_bags_of_contexts()
    except Exception as e:
        return str(e)


    return commit

def _dump_lookup_chunk(out_dir, chunk, lo, hi):
    """Pickle one chunk of the lookup to ``commit_data_lookup<lo>-<hi>.pickle``."""
    path = os.path.join(out_dir, f"commit_data_lookup{lo}-{hi}.pickle")
    with open(path, 'wb') as file:
        pickle.dump(chunk, file, protocol=pickle.HIGHEST_PROTOCOL)


def create_lookup_mp(pairs, data_slice, save_step=10, processes=8,
                     out_dir='../data/commit_lookups'):
    """Process ``(sha, repo)`` pairs in a worker pool and checkpoint the results.

    Each pair is handed to ``_to_commit_mp``. Results are collected in submission
    order; after every ``save_step`` jobs the commits that succeeded in that window
    are pickled to ``<out_dir>/commit_data_lookup<lo>-<hi>.pickle``, where ``lo`` and
    ``hi`` are job positions offset by ``data_slice[0]``. Any leftover successes
    after the last full window are written to a final file ending at
    ``data_slice[1]``.

    Args:
        pairs: Iterable of ``(sha, repo)`` pairs.
        data_slice: Two-item sequence ``[start, end]`` used only to label the
            checkpoint files.
        save_step: Number of jobs per checkpoint file (must be >= 1).
        processes: Number of worker processes.
        out_dir: Directory for the checkpoint pickles; created if missing.

    Returns:
        dict mapping ``commit.sha`` to the commit object for every job that
        succeeded (all chunks, not just the trailing one).

    Raises:
        ValueError: If ``save_step`` is smaller than 1.
    """
    if save_step < 1:
        raise ValueError("save_step must be a positive integer")

    pairs = list(pairs)  # indexed below to report which pair failed
    start, end = data_slice[0], data_slice[1]
    os.makedirs(out_dir, exist_ok=True)

    lookup = {}        # every successful commit; returned to the caller
    chunk = {}         # successes since the last checkpoint
    failed_pairs = []

    # The context manager tears the pool down even if collection raises; every
    # result has been fetched by the time the block exits.
    with multiprocessing.Pool(processes=processes) as pool:
        print("APPENDING JOBS...")
        jobs = [
            pool.apply_async(_to_commit_mp, args=(pair,), error_callback=_handle_errors)
            for pair in pairs
        ]
        print("FINISHED APPENDING JOBS")

        for i, job in tqdm(enumerate(jobs), desc="Processing commit", total=len(jobs)):
            try:
                outcome = job.get()
            except Exception as e:
                # The job failed outside _to_commit_mp's own try block (for example
                # the result could not be sent back). Treat it like any other
                # failed job instead of losing the whole run.
                outcome = str(e)

            # _to_commit_mp signals failure by returning the error text.
            if isinstance(outcome, str):
                print("error in pool job", i, ":", outcome)
                failed_pairs.append(pairs[i])
            else:
                chunk[outcome.sha] = outcome
                lookup[outcome.sha] = outcome

            # Checkpoint on job position (failed jobs count too), so each file
            # covers exactly the index range in its name.
            if (i + 1) % save_step == 0:
                _dump_lookup_chunk(out_dir, chunk, start + i + 1 - save_step, start + i + 1)
                chunk = {}

    # Leftover successes after the last full window.
    if chunk:
        full_windows_end = len(jobs) - len(jobs) % save_step
        _dump_lookup_chunk(out_dir, chunk, start + full_windows_end, end)

    if failed_pairs:
        print(len(failed_pairs), "of", len(jobs), "jobs failed:", failed_pairs)

    return lookup

# Process the selected (sha, repo) pairs in worker processes. As a side effect,
# create_lookup_mp also writes pickle checkpoints under ../data/commit_lookups.
COMMIT_DATA_LOOKUP = create_lookup_mp(TUPLES, DATA_SLICE)
print("DONE PROCESING COMMITS")