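# Reads the `<class>` tags in the clone-classes XML file and creates a data frame (`buckets_df`)
- Per-class columns: `classid` (the bucket id), `nlines`, `similarity`
- Per-fragment columns: `repoName`, `userName`, `fileName`, `startline`, `endline`, `pcid`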

In [1]:
import pickle
import sys
from ast import literal_eval
from pathlib import Path

import pandas as pd
from lxml import etree

sys.path.append('../types/')
from cloneTypes import Bucket, CloneFragment

In [2]:
def parseFilePath(pathToParse):
    """Split a clone source path into user, repository and file parts.

    The path is split on "/". The third component is read as
    ``<user>_<repo>`` and cut at its first underscore, so any further
    underscores stay in the repository name. The fourth component is
    returned unchanged as the file name; deeper components are ignored.

    Args:
        pathToParse: "/"-separated path string with at least four components.

    Returns:
        dict with the keys "repoName", "userName" and "fileName".

    Raises:
        IndexError: if the path has fewer than four components.
    """
    segments = pathToParse.split("/")
    # partition() cuts at the first "_" only; with no underscore the whole
    # component becomes the user name and the repository name is empty.
    owner, _, repository = segments[2].partition("_")
    return {
        "repoName": repository,
        "userName": owner,
        "fileName": segments[3],
    }

In [3]:
def parse_xml_to_objects(xml_content):
    """Build one Bucket per <class> element of a clone-classes XML document.

    Args:
        xml_content: passed unchanged to ``etree.parse``; the notebook
            calls this function with a path to the XML file.

    Returns:
        list of Bucket objects, in the order the <class> elements are
        found. Each bucket holds one CloneFragment per nested <source>
        element.
    """
    # recover=True lets lxml keep going on malformed XML instead of raising.
    lenient_parser = etree.XMLParser(recover=True)
    document_root = etree.parse(xml_content, lenient_parser).getroot()

    def fragment_from(source_el):
        # The 'file' attribute carries user, repository and file name.
        location = parseFilePath(source_el.get('file'))
        return CloneFragment(
            location["repoName"],
            location["userName"],
            location["fileName"],
            int(source_el.get('startline')),
            int(source_el.get('endline')),
            source_el.get('pcid'),  # passed through as the raw attribute string
        )

    collected = []
    for class_el in document_root.findall('.//class'):
        current = Bucket(
            int(class_el.get('classid')),
            int(class_el.get('nlines')),
            int(class_el.get('similarity')),
        )
        # Attach every <source> found anywhere below this <class>.
        for source_el in class_el.findall('.//source'):
            current.add_cloneFrag(fragment_from(source_el))
        collected.append(current)

    return collected


In [4]:
import pandas as pd
from lxml import etree

def parse_xml_to_dataframe(xml_content):
    """Flatten a clone-classes XML document into one row per <source> element.

    Args:
        xml_content: passed unchanged to ``etree.parse``; the notebook
            calls this function with a path to the XML file.

    Returns:
        pandas.DataFrame with the columns classid, nlines, similarity,
        repoName, userName, fileName, startline, endline and pcid. The
        first three are repeated for every fragment of the same <class>.
        pcid is kept as the raw attribute string; the other numeric
        attributes are converted with ``int``.
    """
    # recover=True lets lxml keep going on malformed XML instead of raising.
    lenient_parser = etree.XMLParser(recover=True)
    document_root = etree.parse(xml_content, lenient_parser).getroot()

    rows = []
    for class_el in document_root.findall('.//class'):
        # Class-level attributes are converted once and shared by all rows
        # produced from this <class>.
        class_fields = {
            'classid': int(class_el.get('classid')),
            'nlines': int(class_el.get('nlines')),
            'similarity': int(class_el.get('similarity')),
        }

        for source_el in class_el.findall('.//source'):
            location = parseFilePath(source_el.get('file'))
            rows.append({
                **class_fields,
                'repoName': location["repoName"],
                'userName': location["userName"],
                'fileName': location["fileName"],
                'startline': int(source_el.get('startline')),
                'endline': int(source_el.get('endline')),
                'pcid': source_el.get('pcid'),
            })

    # Build the frame in one go from the accumulated row dicts.
    return pd.DataFrame(rows)


In [5]:
# Parse the clone-classes XML file into a list of Bucket objects
# (one per <class> element, each holding its CloneFragment entries).
buckets = parse_xml_to_objects("../GHData_functions-blind-clones/GHData_functions-blind-clones-0.30-classes.xml")

In [6]:
# Parse the same XML file into a flat DataFrame with one row per <source>
# element (clone fragment).
buckets_df = parse_xml_to_dataframe("../GHData_functions-blind-clones/GHData_functions-blind-clones-0.30-classes.xml")

In [8]:
# Display the parsed clone-fragment table as the cell output.
buckets_df

Unnamed: 0,classid,nlines,similarity,repoName,userName,fileName,startline,endline,pcid
0,1,34,74,Flask-PyTorch-Chatbot,koenry,chat.py,5,50,32
1,1,34,74,AI-Chat-Bot-Using-PyTorch,abdulghaffaransari,processor.py,10,57,20150
2,2,21,70,NSRR-PyTorch,IMAC-projects,train.py,21,56,33
3,2,21,70,StarNet-PyTorch,ModelBunker,train.py,20,49,22957
4,2,21,70,torch_challenge,pcmin03,train.py,21,56,12996
...,...,...,...,...,...,...,...,...,...
11837,3325,15,93,DeepVis-PredDiff-PyTorch,VikHerr,main.py,104,127,66114
11838,3326,21,77,ProgGAN-PyTorch,ConnorJL,ProgGAN.py,186,209,66292
11839,3326,21,77,ProgGAN-PyTorch,ConnorJL,ProgGAN.py,286,313,66296
11840,3327,22,86,ProgGAN-PyTorch,ConnorJL,ProgGAN.py,210,234,66293


In [9]:
# Persist the parsed frame so later notebooks can load it without re-parsing
# the XML. The output directory is created first, so a run on a fresh
# checkout does not fail with FileNotFoundError when ./parsed/ is missing.
output_path = Path('./parsed/buckets_df.pkl')
output_path.parent.mkdir(parents=True, exist_ok=True)
with output_path.open('wb') as f:
    pickle.dump(buckets_df, f)