In [11]:
import pandas as pd
import numpy as np

In [12]:
# Directory holding the sample datasets; a relative path, so it is resolved
# against the notebook's current working directory.
data_prefix: str = 'data'

# Load the three CSV samples into DataFrames.
# NOTE(review): the file names suggest 500 / 500 / 2000 rows — not checked here.
test_data = pd.read_csv(f'{data_prefix}/500_test_sample_dataset.csv')
log_data = pd.read_csv(f'{data_prefix}/500_log_sample_dataset.csv')
all_data = pd.read_csv(f'{data_prefix}/2000_code_samples_dataset.csv')

In [13]:
# Unlike the CSV samples, the "others" sample is stored as Parquet.
others_data =  pd.read_parquet(f'{data_prefix}/500_others_sample_dataset.parquet')

In [18]:
def process_diff(diff: str):
    """Summarise a unified diff, one entry per changed ``@@`` section.

    Args:
        diff: Unified diff text. Anything that is not a ``str`` (for example
            the ``None``/``NaN`` pandas uses for a missing value) is treated
            as an empty diff.

    Returns:
        A list of dicts in diff order, one for every section that contains at
        least one added or removed line. Keys:

        * ``section_count``: 1-based ordinal of the ``@@`` header the lines
          belong to (0 for ``+``/``-`` lines seen before any header).
        * ``section_identifier``: ``{'old_start': int, 'new_start': int}``
          parsed from the header, or ``None`` before any header.
        * ``added_count`` / ``removed_count``: number of ``+`` / ``-`` lines.
        * ``line_change``: ``added_count - removed_count``.
        * ``lib_changes``: number of those lines containing the substring
          ``"import"``.

    Raises:
        IndexError, ValueError: if a line starting with ``@@`` does not have
            the ``@@ -old[,n] +new[,n] @@`` shape.
    """
    # Missing diffs arrive from pandas as None/NaN; there is nothing to count.
    if not isinstance(diff, str):
        return []

    changes = []
    section_identifier = None  # start lines of the current @@ section
    section_count = 0          # number of @@ headers seen so far
    added_count = 0
    removed_count = 0
    lib_changes_count = 0

    def flush_section():
        # Record the section being closed; sections without any +/- line
        # are not reported.
        if added_count > 0 or removed_count > 0:
            changes.append({
                'section_count': section_count,
                'section_identifier': section_identifier,
                'added_count': added_count,
                'removed_count': removed_count,
                'line_change': added_count - removed_count,
                'lib_changes': lib_changes_count
            })

    for line in diff.splitlines():
        if line.startswith('@@'):
            # A new section starts: close the previous one and reset counters.
            flush_section()
            added_count = 0
            removed_count = 0
            lib_changes_count = 0

            # Header looks like "@@ -16,14 +16,14 @@ optional context".
            parts = line.split()
            old_line_info = parts[1]  # e.g., -16,14
            new_line_info = parts[2]  # e.g., +16,14

            section_identifier = {
                'old_start': int(old_line_info.split(',')[0][1:]),  # skip '-'
                'new_start': int(new_line_info.split(',')[0][1:]),  # skip '+'
            }
            section_count += 1

        elif section_identifier is None and line.startswith(('--- ', '+++ ')):
            # "--- a/file" / "+++ b/file" headers precede the first hunk and
            # are not content changes, so they must not be counted.
            # NOTE: only the preamble is skipped; inside a hunk a line that
            # starts with "---"/"+++" is real content and is still counted.
            continue

        elif line.startswith('+'):
            added_count += 1
            # Substring heuristic: any added line mentioning "import".
            if "import" in line:
                lib_changes_count += 1

        elif line.startswith('-'):
            removed_count += 1
            if "import" in line:
                lib_changes_count += 1

    # Close the last section.
    flush_section()

    return changes


def statistic_dataframe(df):
    """Compute per-commit, per-file and per-library statistics for ``df``.

    Args:
        df: DataFrame with at least the columns ``repoName``, ``startCommit``,
            ``endCommit``, ``startCode``, ``endCode``, ``diff``, ``fromLib``
            and ``toLib``. It is not modified.

    Returns:
        A dict with these entries:

        * ``unique_commit_repo_mapping``: one row per
          (``startCommit``, ``endCommit``) pair with its first ``repoName``
          and the number of rows in the pair, sorted by ``repoName``.
        * ``repos_more_than_once_counts``: Series of repositories that appear
          in more than one commit pair, with the number of pairs.
        * ``new_file_count``: rows whose ``startCode`` is missing and
          ``endCode`` is present (a DataFrame, despite the key name).
        * ``deleted_file_count``: rows whose ``startCode`` is present and
          ``endCode`` is missing (a DataFrame, despite the key name).
        * ``remain_df``: copy of all other rows, in their original order,
          extended with ``total_added``, ``total_removed``,
          ``total_position``, ``detailed_changes`` and ``lib_percentage``
          derived from each row's ``diff``.
        * ``unique_lib_mapping``: row count per (``fromLib``, ``toLib``)
          pair, most frequent first.
    """
    # Number of rows per commit pair; sorting by repoName keeps the pairs of
    # one repository next to each other.
    unique_commit_repo_mapping = (
        df.groupby(['startCommit', 'endCommit'])
        .agg(repoName=('repoName', 'first'), count=('repoName', 'size'))
        .reset_index()
        .sort_values(by='repoName')
    )

    # Repositories that occur in more than one commit pair.
    repoName_counts = unique_commit_repo_mapping['repoName'].value_counts()
    repos_more_than_once_counts = repoName_counts[repoName_counts > 1]

    start_missing = df['startCode'].isna()
    end_missing = df['endCode'].isna()

    # File created by the commit: nothing before, something after.
    new_file_df = df[start_missing & ~end_missing]

    # File deleted by the commit: something before, nothing after.
    deleted_df = df[~start_missing & end_missing]

    # Everything that is neither created nor deleted, i.e. both sides present
    # or both missing. A boolean mask selects the same rows as an anti-join on
    # every column, but keeps the original row order and index.
    remain_df = df[start_missing == end_missing].copy()

    # Diff analysis: collect one value per remaining row, then attach the
    # columns in one go instead of writing cell by cell.
    added_totals = []
    removed_totals = []
    position_totals = []
    detailed_changes = []
    lib_percentages = []
    for diff in remain_df['diff']:
        # A missing diff (None/NaN) has no changes to analyse.
        changes = process_diff(diff) if isinstance(diff, str) else []
        # Only entries tied to a real @@ section count; entries without an
        # identifier come from lines that precede the first section header.
        hunks = [c for c in changes if c['section_identifier'] is not None]

        total_added = sum(c['added_count'] for c in hunks)
        total_removed = sum(c['removed_count'] for c in hunks)
        total_lib_change = sum(c['lib_changes'] for c in hunks)
        touched = total_added + total_removed

        added_totals.append(total_added)
        removed_totals.append(total_removed)
        position_totals.append(len(hunks))
        detailed_changes.append(changes)
        # Share of changed lines that mention an import; 0 when nothing changed.
        lib_percentages.append(total_lib_change / touched if touched else 0.0)

    # Explicit dtypes keep the columns well-typed even when no rows remain;
    # the object dtype stores each row's list of change dicts in a single cell.
    new_columns = [
        ('total_added', added_totals, 'int64'),
        ('total_removed', removed_totals, 'int64'),
        ('total_position', position_totals, 'int64'),
        ('detailed_changes', detailed_changes, 'object'),
        ('lib_percentage', lib_percentages, 'float64'),
    ]
    for name, values, dtype in new_columns:
        remain_df[name] = pd.Series(values, index=remain_df.index, dtype=dtype)

    # Library analysis: how often each (fromLib, toLib) pair occurs.
    unique_lib_mapping = (
        df.groupby(['fromLib', 'toLib'])
        .size()
        .reset_index(name='count')
        .sort_values(by='count', ascending=False)
    )

    return {
        'unique_commit_repo_mapping': unique_commit_repo_mapping,
        'repos_more_than_once_counts': repos_more_than_once_counts,
        'new_file_count': new_file_df,
        'deleted_file_count': deleted_df,
        'remain_df': remain_df,
        'unique_lib_mapping': unique_lib_mapping
    }

In [None]:
# Compute all statistics for the test sample.
result = statistic_dataframe(test_data)
# Uncomment to lift the row limit when displaying large frames:
# pd.set_option('display.max_row', None)
# Distinct repositories among the rows flagged as newly created files
# ('new_file_count' holds the filtered rows themselves, not a number).
# NOTE(review): the recorded AttributeError would occur if ['repoName'] returned
# a DataFrame (e.g. duplicate column labels) — confirm on a fresh kernel run.
result['new_file_count']['repoName'].unique()

AttributeError: 'DataFrame' object has no attribute 'unique'