In [11]:
import pandas as pd

# Input files, loaded in this order:
#   1. the original dataset utilized when conducting the predictions
#   2. the file generated in the original "feature_extraction.ipynb", but
#      including the test-refactoring-only commit data
original_dataset, commit_level_features_w_test = map(
    pd.read_csv,
    ("commit_all_features.csv", 'commit_level_features_w_test.csv'),
)


In [12]:
# Column labels of both frames as sets, so they can be compared with set algebra.
all_columns = {*original_dataset.columns}
columns_w_test_data = {*commit_level_features_w_test.columns}

# Labels present in the original dataset but absent from the file with test data.
columns_missing_test_data = all_columns.difference(columns_w_test_data)
# Labels present only in the file with test data (computed here, not printed).
columns_only_in_commit = columns_w_test_data.difference(all_columns)

print(f"Columns missing test data: {columns_missing_test_data}")


Columns missing test data: {'num_touched_before', 'id', 'dev_ref_com_exp', 'dev_ref_exp', 'age'}


In [13]:
# Derive two 0/1 indicator columns from the raw 'test' column, then remove it.
#
#   TR = 1 when 'test' is missing or its string form contains 'True', else 0
#   SR = 0 when 'test' is missing, else 1
#
# NOTE(review): a missing 'test' value presumably marks a commit that comes from
# the test-refactoring-only data added to this file -- confirm against the
# feature-extraction notebook.
test_column = commit_level_features_w_test['test']
test_is_missing = test_column.isna()
# `na=False` keeps missing entries out of the match no matter how the installed
# pandas version represents them after the string conversion.
test_mentions_true = test_column.astype(str).str.contains('True', regex=False, na=False)

# Build the result as a new frame rather than mutating in place. Only 'test' is
# dropped; this cell does not touch any 'label' column.
commit_level_features_w_test = (
    commit_level_features_w_test
    .assign(
        TR=(test_is_missing | test_mentions_true).astype('int64'),
        SR=(~test_is_missing).astype('int64'),
    )
    .drop(columns=['test'])
)

print(commit_level_features_w_test)



                      commit_id  refactoryCount  leftLocationCount  \
0      5a85632d77d2b433df71b6b2               9                 19   
1      5a8562cc77d2b433dc711f29               2                  4   
2      5a8562cc77d2b433df712a61              35                 47   
3      5a85633977d2b433df71c388               7                 10   
4      5a85633a77d2b433df71c3a3              12                 22   
...                         ...             ...                ...   
53824  6076b764617b52e8e406de9d              11                 14   
53825  6076b765617b52e8e406df47               3                  5   
53826  6076b766617b52e8e406dfb8               4                  7   
53827  6076b766617b52e8e406dfc5              10                 13   
53828  6076b767617b52e8e406e08b               1                  2   

       rightLocationCount  leftLocationDiff  rightLocationDiff  \
0                      29               173                191   
1                       4  

In [14]:
# Sanity check of the two indicator columns: the mean of each column is printed
# so the proportions can be eyeballed.
sr_flags = commit_level_features_w_test.loc[:, 'SR']
tr_flags = commit_level_features_w_test.loc[:, 'TR']
ratio_1s_SR = sr_flags.mean()
ratio_1s_TR = tr_flags.mean()

print(
    "Ratio of success in TR: ", ratio_1s_TR,
    "\nRatio of success in SR: ", ratio_1s_SR,
)

Ratio of success in TR:  0.2566274684649538 
Ratio of success in SR:  0.9228482788088205


In [15]:
# Persist the transformed frame as csv: the data utilized in the predictions,
# with test code refactoring data included for the columns that were easy to
# retrieve. The row index is not written.
transformed_csv_path = 'commit_level_features_w_test_transformed.csv'
commit_level_features_w_test.to_csv(transformed_csv_path, index=False)

In [16]:
# Columns of the smaller DAG, the one showcasing the total effect.
columns_to_include = [
    'TR',
    'SR',
    'rightLocationDiff',
    'rightLocationCount',
]

In [17]:
# Keep only the columns listed in `columns_to_include`, in that order. Selecting
# with a list of labels returns a new frame directly and raises KeyError if a
# listed column is absent; `.copy()` makes the subset independent of the full
# frame.
commit_level_features_w_test_subset = commit_level_features_w_test[columns_to_include].copy()

print(commit_level_features_w_test_subset)


       TR  SR  rightLocationDiff  rightLocationCount
0       1   0                191                  29
1       1   0                 57                   4
2       1   0                561                  55
3       1   1                 99                  10
4       1   1                219                  22
...    ..  ..                ...                 ...
53824   0   1                201                  11
53825   0   1                171                   5
53826   0   1                316                   8
53827   0   1                180                  13
53828   0   1                 95                   3

[53829 rows x 4 columns]


In [18]:
# Write the subset frame (only the four columns wanted for the total effect
# model) to its own csv file, without the row index.
subset_csv_path = 'commit_level_features_w_test_transformed_small.csv'
commit_level_features_w_test_subset.to_csv(subset_csv_path, index=False)