In [6]:
import pandas as pd
from typing import List, Tuple, Dict, Any
import subprocess
from threading import Thread
import copy

In [3]:
data_prefix: str = 'data'
repo_prefix: str = f'{data_prefix}/repos'

data_name: str = '500_sampled_no_code.parquet'

repo_df: pd.DataFrame = pd.read_parquet(f'{data_prefix}/{data_name}', engine = 'pyarrow')

In [15]:
process_cnt: int = 10
thread_cnt: int = 20

threads: List[Any] = [[] for i in range(process_cnt)]
output_queue_str: List[Any] = [{} for i in range(process_cnt)]

processes: List[Any] = []
output_queue: List[Any] = []

In [26]:
# define template to crawl data
get_prev_commit_template: str = '''
cd ./{}/{}

git rev-parse {}^
'''

get_diff_2_commit_template: str = '''
cd ./{}/{}

git diff --name-only {} {}
'''

get_file_at_commit_template: str = '''
cd ./{}/{}

git show {}:{}
'''

get_diff_file_template: str = '''
cd ./{}/{}

git diff {}..{} -- {}
'''

necessary_cols: List[str] = [
    'id',
    'fromLib',
    'toLib',
    'repoName',
    'repoOwner',
    'repoSplitName',
    'startCommit',
    'endCommit',
    'fileName',
    'startCode',
    'endCode',
    'diff',
    'startCommitChanges',
    'endCommitChanges',
]
sample_template: Dict[str, Any] = {k: None for k in necessary_cols}
final_df: pd.DataFrame = pd.DataFrame(columns = necessary_cols)
# final_df.set_index(id)

def get_prev_commit(repo_prefix: str, repo_name: str, changed_commit: str) -> str:
    get_prev_commit_script: str = get_prev_commit_template.format(repo_prefix, repo_name, changed_commit)
    sub: subprocess.CompletedProcess = subprocess.run(get_prev_commit_script, shell = True, capture_output = True, encoding = 'utf-8', errors = 'ignore')

    prev_commit: str = sub.stdout.strip()

    return prev_commit

def get_diff_2_commit(repo_prefix: str, repo_name: str, commit1: str, commit2: str) -> List[str]:
    get_diff_2_commit_script: str = get_diff_2_commit_template.format(repo_prefix, repo_name, commit1, commit2)
    sub: subprocess.CompletedProcess = subprocess.run(get_diff_2_commit_script, shell = True, capture_output = True, encoding = 'utf-8', errors = 'ignore')

    diff: str = sub.stdout
    diff_files: List[str] = diff.split('\n')

    return diff_files

def get_start_end_commit_code(repo_prefix: str, repo_name: str, file_name: str, start_commit: str, end_commit: str) -> Tuple[str, str]:
    try:
        get_file_script: str = get_file_at_commit_template.format(repo_prefix, repo_name, start_commit, file_name)
        sub: subprocess.CompletedProcess = subprocess.run(get_file_script, shell = True, capture_output = True, encoding = 'utf-8', errors = 'ignore')
        start_commit_code: str = sub.stdout
    except Exception as e:
        start_commit_code: str = ''


    try:
        get_file_script: str = get_file_at_commit_template.format(repo_prefix, repo_name, end_commit, file_name)
        sub: subprocess.CompletedProcess = subprocess.run(get_file_script, shell = True, capture_output = True, encoding='utf-8', errors='ignore')
        end_commit_code: str = sub.stdout
    except Exception as e:
        end_commit_code: str = ''

    return start_commit_code, end_commit_code

def get_diff_file(repo_prefix: str, repo_name: str, file_name: str, start_commit: str, end_commit: str) -> str:
    get_diff_file_script: str = get_diff_file_template.format(repo_prefix, repo_name, start_commit, end_commit, file_name)
    sub: subprocess.CompletedProcess = subprocess.run(get_diff_file_script, shell = True, capture_output = True, encoding = 'utf-8', errors = 'ignore')
    diff: str = sub.stdout

    return diff

class get_file_by_thread(Thread):
    def __init__(self, _pid: int, id: int, repo_prefix: str, repo_name: str, file_name: str, start_commit: str, end_commit: str):
        Thread.__init__(self)

        self._pid = _pid
        self.id = id
        self.repo_prefix = repo_prefix
        self.repo_name = repo_name
        self.file_name = file_name
        self.start_commit = start_commit
        self.end_commit = end_commit

    def run(self):
        _pid = self._pid
        id = self.id
        repo_prefix = self.repo_prefix
        repo_name = self.repo_name
        file_name = self.file_name
        start_commit = self.start_commit
        end_commit = self.end_commit

        start_code, end_code = get_start_end_commit_code(repo_prefix = repo_prefix, repo_name = repo_name, file_name = file_name,
                                                                start_commit = start_commit, end_commit = end_commit)

        diff = get_diff_file(repo_prefix = repo_prefix, repo_name = repo_name, file_name = file_name,
                            start_commit = start_commit, end_commit = end_commit)

        output_queue_str[_pid][file_name] = [id, start_code, end_code, diff]

def str_normalize(x: str) -> str:
    if (x is None):
        return ''
    elif (len(x) == 0):
        return ''

    return x

def create_data_rows(samples: pd.DataFrame, repo_name: str, sample_template: Dict[str, Any] = sample_template, sample_cnt: int = 0) -> Tuple[int, pd.DataFrame]:
    sample_template.update({
        # 'id': sample_cnt,
        'repoName': repo_name,
        'fromLib': samples.iloc[0]['fromLib'],
        'toLib': samples.iloc[0]['toLib'],
        'repoOwner': samples.iloc[0]['repoOwner'],
        'repoSplitName': samples.iloc[0]['repoSplitName'],
        'startCommit': samples.iloc[0]['startCommit'],
        'endCommit': samples.iloc[0]['endCommit'],
        'startCode': '',
        'endCode': '',
        'startCommitChanges': samples.iloc[0]['startCommitChanges'],
        'endCommitChanges': samples.iloc[0]['endCommitChanges']
    })

    # get unique startCommit values for this repository's samples
    changed_commits: List[str] = samples['startCommit'].unique().tolist()

    res_df: pd.DataFrame = pd.DataFrame(columns = necessary_cols)

    # get the diff of each commit and its previous commib
    for commit_id in range(len(changed_commits)):
        changed_commit: str = changed_commits[commit_id]

        # get the previous commit hash and the diff
        prev_commit: str = get_prev_commit(repo_prefix = repo_prefix, repo_name = repo_name, changed_commit = changed_commit)
        diff_files: str = get_diff_2_commit(repo_prefix = repo_prefix, repo_name = repo_name,
                                        commit1 = changed_commit, commit2 = prev_commit)

        for file_name in diff_files:
            try:
                start_code, end_code = get_start_end_commit_code(repo_prefix = repo_prefix, repo_name = repo_name, file_name = file_name,
                                                                start_commit = prev_commit, end_commit = changed_commit)
                diff = get_diff_file(repo_prefix = repo_prefix, repo_name = repo_name, file_name = file_name,
                                    start_commit = prev_commit, end_commit = changed_commit)
            except Exception as e:
                print(e)
                print(f'file: {file_name}')
                print(f'start: {prev_commit}, end: {changed_commit}')
                print(f'start code: {start_code}')
                print(f'end code: {end_code}')
                print('-' * 50)

                return None

            sample_template['id'] = sample_cnt
            sample_template['fileName'] = file_name
            sample_template['startCode'], sample_template['endCode'] = str_normalize(start_code), str_normalize(end_code)
            sample_template['diff'] = diff

            res_df = pd.concat([res_df, pd.DataFrame([sample_template], columns = necessary_cols)], ignore_index = True)

            sample_cnt += 1

    return sample_cnt, res_df

def create_data_rows_by_thread(_pid: int, samples: pd.DataFrame, repo_name: str, sample_template: Dict[str, Any] = sample_template, sample_cnt: int = 0) -> Tuple[int, pd.DataFrame]:
    current_sample_template = copy.deepcopy(sample_template)
    current_sample_template.update({
        # 'id': sample_cnt,
        'repoName': repo_name,
        'fromLib': samples.iloc[0]['fromLib'],
        'toLib': samples.iloc[0]['toLib'],
        'repoOwner': samples.iloc[0]['repoOwner'],
        'repoSplitName': samples.iloc[0]['repoSplitName'],
        'startCommit': samples.iloc[0]['startCommit'],
        'endCommit': samples.iloc[0]['endCommit'],
        'startCode': '',
        'endCode': '',
        'startCommitChanges': samples.iloc[0]['startCommitChanges'],
        'endCommitChanges': samples.iloc[0]['endCommitChanges']
    })

    # get unique startCommit values for this repository's samples
    changed_commits: List[str] = samples['startCommit'].unique().tolist()

    res_df: pd.DataFrame = pd.DataFrame(columns = necessary_cols)

    threads[_pid] = []
    output_queue_str[_pid] = {}
    file_queue = []

    # get the diff of each commit and its previous commib
    for commit_id in range(len(changed_commits)):
        changed_commit: str = changed_commits[commit_id]

        # get the previous commit hash and the diff
        prev_commit: str = get_prev_commit(repo_prefix = repo_prefix, repo_name = repo_name, changed_commit = changed_commit)
        diff_files: str = get_diff_2_commit(repo_prefix = repo_prefix, repo_name = repo_name,
                                        commit1 = changed_commit, commit2 = prev_commit)

        for file_name in diff_files:
            thread: Thread = get_file_by_thread(_pid = _pid, id = sample_cnt, repo_prefix = repo_prefix, repo_name = repo_name, file_name = file_name,
                                                start_commit = prev_commit, end_commit = changed_commit)
            threads[_pid].append(thread)
            file_queue.append(file_name)

            sample_cnt += 1

            if ((len(threads[_pid]) == thread_cnt) or (file_name == diff_files[-1])):
                for thread in threads[_pid]:
                    thread.start()

                for thread in threads[_pid]:
                    thread.join()

                for key in file_queue:
                    current_sample_template['id'] = output_queue_str[_pid][key][0]
                    current_sample_template['fileName'] = key
                    current_sample_template['startCode'], current_sample_template['endCode'] = str_normalize(output_queue_str[_pid][key][1]), str_normalize(output_queue_str[_pid][key][2])
                    current_sample_template['diff'] = str_normalize(output_queue_str[_pid][key][3])

                    res_df = pd.concat([res_df, pd.DataFrame([current_sample_template], columns = necessary_cols)], ignore_index = True)

                output_queue_str[_pid] = {}
                threads[_pid] = []
                file_queue = []

    return sample_cnt, res_df

class create_data_rows_by_process(Thread):
    def __init__(self, _pid: int, repo_name: str, samples: pd.DataFrame, sample_cnt: int = 0):
        Thread.__init__(self)

        self._pid = _pid
        self.repo_name = repo_name
        self.samples = samples
        self.sample_cnt = sample_cnt

    def run(self):
        _pid = self._pid
        repo_name = self.repo_name
        samples = self.samples
        sample_cnt = self.sample_cnt

        sample_cnt, res_df = create_data_rows_by_thread(_pid = _pid, samples = samples, repo_name = repo_name, sample_cnt = 0)

        output_queue[_pid] = res_df

        print(f'finished: {repo_name}')
        print('-' * 50)

In [None]:
# retrieve unique repository names
unique_repos: List[str] = repo_df['repoName'].unique().tolist()

sample_cnt: int = 0
for repo_name_id in range(len(unique_repos)):
    repo_name: str = unique_repos[repo_name_id]

    # filter the DataFrame for the current repository's samples
    samples: pd.DataFrame = repo_df[repo_df['repoName'] == repo_name]
    res_df: pd.DataFrame = None

    try:
        sample_cnt, res_df = create_data_rows(samples = samples, repo_name = repo_name, sample_cnt = sample_cnt)

        final_df = pd.concat([final_df, res_df], ignore_index = True)

        break
    except Exception as e:
        print(e)
        print(repo_name, samples.iloc[0]['id'])

        continue

In [25]:
final_df

Unnamed: 0,id,fromLib,toLib,repoName,repoOwner,repoSplitName,startCommit,endCommit,fileName,startCode,endCode,diff,startCommitChanges,endCommitChanges
0,0,junit:junit,org.junit.jupiter:junit-jupiter-params,svenkubiak_mangooio,svenkubiak,mangooio,3a6b91fa396307230aba6394ceb70ccf52cd4184,3a6b91fa396307230aba6394ceb70ccf52cd4184,mangooio-benchmarks/pom.xml,"<project xmlns=""http://maven.apache.org/POM/4....","<project xmlns=""http://maven.apache.org/POM/4....",diff --git a/mangooio-benchmarks/pom.xml b/man...,+org.junit.jupiter:junit-jupiter-params\n+org....,+org.junit.jupiter:junit-jupiter-params\n+org....
1,1,junit:junit,org.junit.jupiter:junit-jupiter-params,svenkubiak_mangooio,svenkubiak,mangooio,3a6b91fa396307230aba6394ceb70ccf52cd4184,3a6b91fa396307230aba6394ceb70ccf52cd4184,mangooio-benchmarks/src/main/java/Main.java,\n\nimport io.mangoo.core.Application;\nimport...,,diff --git a/mangooio-benchmarks/src/main/java...,+org.junit.jupiter:junit-jupiter-params\n+org....,+org.junit.jupiter:junit-jupiter-params\n+org....
2,2,junit:junit,org.junit.jupiter:junit-jupiter-params,svenkubiak_mangooio,svenkubiak,mangooio,3a6b91fa396307230aba6394ceb70ccf52cd4184,3a6b91fa396307230aba6394ceb70ccf52cd4184,mangooio-benchmarks/src/test/java/mangooio/Tes...,,package mangooio;\n\nimport de.svenkubiak.embe...,diff --git a/mangooio-benchmarks/src/test/java...,+org.junit.jupiter:junit-jupiter-params\n+org....,+org.junit.jupiter:junit-jupiter-params\n+org....
3,3,junit:junit,org.junit.jupiter:junit-jupiter-params,svenkubiak_mangooio,svenkubiak,mangooio,3a6b91fa396307230aba6394ceb70ccf52cd4184,3a6b91fa396307230aba6394ceb70ccf52cd4184,mangooio-benchmarks/src/test/java/mangooio/Tes...,package mangooio;\n\nimport org.junit.BeforeCl...,,diff --git a/mangooio-benchmarks/src/test/java...,+org.junit.jupiter:junit-jupiter-params\n+org....,+org.junit.jupiter:junit-jupiter-params\n+org....
4,4,junit:junit,org.junit.jupiter:junit-jupiter-params,svenkubiak_mangooio,svenkubiak,mangooio,3a6b91fa396307230aba6394ceb70ccf52cd4184,3a6b91fa396307230aba6394ceb70ccf52cd4184,mangooio-benchmarks/src/test/java/mangooio/con...,package mangooio.controllers;\n\nimport static...,package mangooio.controllers;\n\nimport static...,diff --git a/mangooio-benchmarks/src/test/java...,+org.junit.jupiter:junit-jupiter-params\n+org....,+org.junit.jupiter:junit-jupiter-params\n+org....
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,72,junit:junit,org.junit.jupiter:junit-jupiter-params,svenkubiak_mangooio,svenkubiak,mangooio,3a6b91fa396307230aba6394ceb70ccf52cd4184,3a6b91fa396307230aba6394ceb70ccf52cd4184,mangooio-test/pom.xml,"<project xmlns=""http://maven.apache.org/POM/4....","<project xmlns=""http://maven.apache.org/POM/4....",diff --git a/mangooio-test/pom.xml b/mangooio-...,+org.junit.jupiter:junit-jupiter-params\n+org....,+org.junit.jupiter:junit-jupiter-params\n+org....
73,73,junit:junit,org.junit.jupiter:junit-jupiter-params,svenkubiak_mangooio,svenkubiak,mangooio,3a6b91fa396307230aba6394ceb70ccf52cd4184,3a6b91fa396307230aba6394ceb70ccf52cd4184,mangooio-test/src/main/java/io/mangoo/test/Sim...,package io.mangoo.test;\n\nimport org.junit.ru...,,diff --git a/mangooio-test/src/main/java/io/ma...,+org.junit.jupiter:junit-jupiter-params\n+org....,+org.junit.jupiter:junit-jupiter-params\n+org....
74,74,junit:junit,org.junit.jupiter:junit-jupiter-params,svenkubiak_mangooio,svenkubiak,mangooio,3a6b91fa396307230aba6394ceb70ccf52cd4184,3a6b91fa396307230aba6394ceb70ccf52cd4184,mangooio-test/src/main/java/io/mangoo/test/Tes...,package io.mangoo.test;\n\nimport org.junit.Be...,package io.mangoo.test;\n\nimport org.junit.ju...,diff --git a/mangooio-test/src/main/java/io/ma...,+org.junit.jupiter:junit-jupiter-params\n+org....,+org.junit.jupiter:junit-jupiter-params\n+org....
75,75,junit:junit,org.junit.jupiter:junit-jupiter-params,svenkubiak_mangooio,svenkubiak,mangooio,3a6b91fa396307230aba6394ceb70ccf52cd4184,3a6b91fa396307230aba6394ceb70ccf52cd4184,pom.xml,"<project xmlns=""http://maven.apache.org/POM/4....","<project xmlns=""http://maven.apache.org/POM/4....",diff --git a/pom.xml b/pom.xml\nindex 34a9c3dc...,+org.junit.jupiter:junit-jupiter-params\n+org....,+org.junit.jupiter:junit-jupiter-params\n+org....


In [20]:
unique_repos: List[str] = repo_df['repoName'].unique().tolist()

for repo_name_id in range(len(unique_repos)):
    repo_name: str = unique_repos[repo_name_id]

    samples: pd.DataFrame = repo_df[repo_df['repoName'] == repo_name]
    res_df: pd.DataFrame = None

    _pid: int = repo_name_id % process_cnt
    proc: Thread = create_data_rows_by_process(_pid = _pid, repo_name = repo_name, samples = samples)
    processes.append(proc)

    if ((len(processes) == process_cnt) or (repo_name_id == len(unique_repos) - 1) or repo_name_id == 1):
        for proc in processes:
            proc.start()

        for proc in processes:
            proc.join()

        for res_df in output_queue:
            final_df = pd.concat([final_df, res_df], ignore_index = True)

        output_queue = []
        processes = []

        # final_df.to_parquet(f'{data_prefix}/first_dataset.parquet')

        print()
        print('()' * 25)
        print(' ' * 20 + f'finished: {repo_name_id + 1}/{len(unique_repos)}')
        print(' ' * 20 + f'len: {len(final_df)}')
        print(' ' * 20 + f'checkpointed!')
        print('()' * 25)
        print()

        break

finished: payara_Payara
--------------------------------------------------
finished: svenkubiak_mangooio
--------------------------------------------------

()()()()()()()()()()()()()()()()()()()()()()()()()
                    finished: 2/283
                    len: 244
                    checkpointed!
()()()()()()()()()()()()()()()()()()()()()()()()()



In [22]:
final_df

Unnamed: 0,id,fromLib,toLib,repoName,repoOwner,repoSplitName,startCommit,endCommit,fileName,startCode,endCode,diff,startCommitChanges,endCommitChanges
0,2,org.codehaus.jackson:jackson-xc,com.fasterxml.jackson.jaxrs:jackson-jaxrs-base,payara_Payara,payara,Payara,01ae58600c4069cd4df5081d52e81e05411694d9,01ae58600c4069cd4df5081d52e81e05411694d9,nucleus/admin/util/pom.xml,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<!--\n...","<?xml version=""1.0"" encoding=""UTF-8""?>\n<!--\n...",diff --git a/nucleus/admin/util/pom.xml b/nucl...,+com.fasterxml.jackson.jaxrs:jackson-jaxrs-jso...,+com.fasterxml.jackson.jaxrs:jackson-jaxrs-jso...
1,0,org.codehaus.jackson:jackson-xc,com.fasterxml.jackson.jaxrs:jackson-jaxrs-base,payara_Payara,payara,Payara,01ae58600c4069cd4df5081d52e81e05411694d9,01ae58600c4069cd4df5081d52e81e05411694d9,nucleus/admin/rest/rest-service/src/main/java/...,/*\n * DO NOT ALTER OR REMOVE COPYRIGHT NOTICE...,/*\n * DO NOT ALTER OR REMOVE COPYRIGHT NOTICE...,diff --git a/nucleus/admin/rest/rest-service/s...,+com.fasterxml.jackson.jaxrs:jackson-jaxrs-jso...,+com.fasterxml.jackson.jaxrs:jackson-jaxrs-jso...
2,1,org.codehaus.jackson:jackson-xc,com.fasterxml.jackson.jaxrs:jackson-jaxrs-base,payara_Payara,payara,Payara,01ae58600c4069cd4df5081d52e81e05411694d9,01ae58600c4069cd4df5081d52e81e05411694d9,nucleus/admin/rest/rest-service/src/main/java/...,/*\n * DO NOT ALTER OR REMOVE COPYRIGHT NOTICE...,/*\n * DO NOT ALTER OR REMOVE COPYRIGHT NOTICE...,diff --git a/nucleus/admin/rest/rest-service/s...,+com.fasterxml.jackson.jaxrs:jackson-jaxrs-jso...,+com.fasterxml.jackson.jaxrs:jackson-jaxrs-jso...
3,4,org.codehaus.jackson:jackson-xc,com.fasterxml.jackson.jaxrs:jackson-jaxrs-base,payara_Payara,payara,Payara,01ae58600c4069cd4df5081d52e81e05411694d9,01ae58600c4069cd4df5081d52e81e05411694d9,nucleus/admin/util/src/main/java/com/sun/enter...,/*\n * DO NOT ALTER OR REMOVE COPYRIGHT NOTICE...,/*\n * DO NOT ALTER OR REMOVE COPYRIGHT NOTICE...,diff --git a/nucleus/admin/util/src/main/java/...,+com.fasterxml.jackson.jaxrs:jackson-jaxrs-jso...,+com.fasterxml.jackson.jaxrs:jackson-jaxrs-jso...
4,5,org.codehaus.jackson:jackson-xc,com.fasterxml.jackson.jaxrs:jackson-jaxrs-base,payara_Payara,payara,Payara,01ae58600c4069cd4df5081d52e81e05411694d9,01ae58600c4069cd4df5081d52e81e05411694d9,nucleus/packager/nucleus-jersey/pom.xml,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<!--\n...","<?xml version=""1.0"" encoding=""UTF-8""?>\n<!--\n...",diff --git a/nucleus/packager/nucleus-jersey/p...,+com.fasterxml.jackson.jaxrs:jackson-jaxrs-jso...,+com.fasterxml.jackson.jaxrs:jackson-jaxrs-jso...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
239,70,org.codehaus.jackson:jackson-xc,com.fasterxml.jackson.jaxrs:jackson-jaxrs-base,payara_Payara,payara,Payara,01ae58600c4069cd4df5081d52e81e05411694d9,01ae58600c4069cd4df5081d52e81e05411694d9,mangooio-maven-plugin/pom.xml,"<project xmlns=""http://maven.apache.org/POM/4....","<project xmlns=""http://maven.apache.org/POM/4....",diff --git a/mangooio-maven-plugin/pom.xml b/m...,+com.fasterxml.jackson.jaxrs:jackson-jaxrs-jso...,+com.fasterxml.jackson.jaxrs:jackson-jaxrs-jso...
240,68,org.codehaus.jackson:jackson-xc,com.fasterxml.jackson.jaxrs:jackson-jaxrs-base,payara_Payara,payara,Payara,01ae58600c4069cd4df5081d52e81e05411694d9,01ae58600c4069cd4df5081d52e81e05411694d9,mangooio-integration-test/src/test/java/io/man...,package io.mangoo.utils;\n\nimport static org....,package io.mangoo.utils;\n\nimport static org....,diff --git a/mangooio-integration-test/src/tes...,+com.fasterxml.jackson.jaxrs:jackson-jaxrs-jso...,+com.fasterxml.jackson.jaxrs:jackson-jaxrs-jso...
241,74,org.codehaus.jackson:jackson-xc,com.fasterxml.jackson.jaxrs:jackson-jaxrs-base,payara_Payara,payara,Payara,01ae58600c4069cd4df5081d52e81e05411694d9,01ae58600c4069cd4df5081d52e81e05411694d9,mangooio-test/src/main/java/io/mangoo/test/Tes...,package io.mangoo.test;\n\nimport org.junit.Be...,package io.mangoo.test;\n\nimport org.junit.ju...,diff --git a/mangooio-test/src/main/java/io/ma...,+com.fasterxml.jackson.jaxrs:jackson-jaxrs-jso...,+com.fasterxml.jackson.jaxrs:jackson-jaxrs-jso...
242,75,org.codehaus.jackson:jackson-xc,com.fasterxml.jackson.jaxrs:jackson-jaxrs-base,payara_Payara,payara,Payara,01ae58600c4069cd4df5081d52e81e05411694d9,01ae58600c4069cd4df5081d52e81e05411694d9,pom.xml,"<project xmlns=""http://maven.apache.org/POM/4....","<project xmlns=""http://maven.apache.org/POM/4....",diff --git a/pom.xml b/pom.xml\nindex 34a9c3dc...,+com.fasterxml.jackson.jaxrs:jackson-jaxrs-jso...,+com.fasterxml.jackson.jaxrs:jackson-jaxrs-jso...


In [21]:
print(final_df['diff'][0])

diff --git a/nucleus/admin/util/pom.xml b/nucleus/admin/util/pom.xml
index c6417d31ab..5da59f93b2 100755
--- a/nucleus/admin/util/pom.xml
+++ b/nucleus/admin/util/pom.xml
@@ -3,7 +3,7 @@
 
     DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
 
-    Copyright (c) 1997-2013 Oracle and/or its affiliates. All rights reserved.
+    Copyright (c) 1997-2014 Oracle and/or its affiliates. All rights reserved.
 
     The contents of this file are subject to the terms of either the GNU
     General Public License Version 2 only ("GPL") or the Common Development
@@ -50,7 +50,7 @@
     </parent>
     <artifactId>admin-util</artifactId>
     <packaging>glassfish-jar</packaging>
-    
+
     <name>admin-util</name>
     <description>Admin Utilities</description>
 
@@ -174,8 +174,8 @@
             <scope>provided</scope>
         </dependency>
         <dependency>
-            <groupId>org.codehaus.jackson</groupId>
-            <artifactId>jackson-core-asl</artifactId>
+            <groupId

In [82]:
final_df.to_parquet(f'{data_prefix}/first_dataset.parquet')