In [14]:
import pandas as pd

# Read the merged dataset from disk; later cells refer to it as `data`.
file_path = './data/merged_with_type.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

# Preview: the leading rows together with the dtype pandas inferred for each column.
(data.head(5), data.dtypes)


(         Date            repo_name  new_contributors  \
 0  2020-01-01  vue-dropdown-filter               1.0   
 1  2020-09-01  vue-dropdown-filter               1.0   
 2  2020-07-01  vue-dropdown-filter               1.0   
 3  2017-01-01    aurelia-slickgrid               1.0   
 4  2018-01-01    aurelia-slickgrid               1.0   
 
    change_request_response_time_avg  issue_age_avg  code_change_lines_sum  \
 0                              0.00            0.0                  -34.0   
 1                              0.00            0.0                  -34.0   
 2                              0.00            0.0                  -34.0   
 3                              0.00            0.0                -8192.0   
 4                              2.32          257.5                -9327.0   
 
    issues_new  issues_and_change_request_active  code_change_lines_add  \
 0         0.0                               2.0                   14.0   
 1         0.0                      

In [18]:
from scipy.stats import mannwhitneyu

# Numeric metric columns to compare between the two project groups.
numeric_cols = ['new_contributors', 'change_request_response_time_avg', 'issue_age_avg', 'code_change_lines_sum',
                'issues_new', 'issues_and_change_request_active', 'code_change_lines_add', 'attention',
                'issue_comments', 'change_requests_accepted', 'change_request_age_avg', 'participants', 'bus_factor',
                'code_change_lines_remove', 'inactive_contributors', 'change_requests_reviews', 'activity',
                'change_request_resolution_duration_avg', 'issues_closed', 'change_requests', 'issue_response_time_avg',
                'issue_resolution_duration_avg', 'stars']

# Split the rows once, up front: group membership does not depend on the metric
# being tested, so there is no need to re-filter the frame for every column.
# NOTE(review): rows with a missing 'type' would also fall into the non-normal
# group (NaN != 'normal' is True for object columns) - confirm that is intended.
normal_rows = data[data['type'] == 'normal']
other_rows = data[data['type'] != 'normal']

# Two-sided Mann-Whitney U test per metric, 'normal' vs. every other type.
# Missing values are dropped per column, so sample sizes may differ between metrics.
# NOTE(review): the p-values are raw; no multiple-comparison correction is applied
# even though one test is run per entry of numeric_cols.
p_values = {}
for col in numeric_cols:
    group1 = normal_rows[col].dropna()
    group2 = other_rows[col].dropna()
    result = mannwhitneyu(group1, group2, alternative='two-sided')
    p_values[col] = result.pvalue

# One row per metric: (column name, p-value), in the order of numeric_cols.
p_values_df = pd.DataFrame(list(p_values.items()), columns=['Feature', 'P-value'])

# Human-readable label for every raw column name.
feature_mapping = {
    "new_contributors": "Number of New Contributors",
    "change_request_response_time_avg": "Average Response Time for Code Change Requests",
    "issue_age_avg": "Average Duration of Open Issues",
    "code_change_lines_sum": "Total Number of Lines Changed",
    "issues_new": "Number of Newly Raised Issues",
    "issues_and_change_request_active": "Number of Active Issues and Pull Requests",
    "code_change_lines_add": "Number of New Code Lines Added",
    "attention": "Number of Project Forks",
    "issue_comments": "Number of Comments on Issues",
    "change_requests_accepted": "Number of Accepted Pull Requests",
    "change_request_age_avg": "Average Duration of Open Pull Requests",
    "participants": "Number of Participants",
    "bus_factor": "Number of Key Contributors the Project Relies On",
    "code_change_lines_remove": "Number of Code Lines Deleted",
    "inactive_contributors": "Number of Long-Inactive Contributors",
    "change_requests_reviews": "Number of Reviews for Pull Requests",
    "activity": "Overall Activity Events of the Project",
    "change_request_resolution_duration_avg": "Average Time to Resolve Pull Requests",
    "issues_closed": "Number of Closed Issues",
    "change_requests": "Total Number of Pull Requests",
    "issue_response_time_avg": "Average Response Time for Issues",
    "issue_resolution_duration_avg": "Average Time to Resolve Issues",
    "stars": "Number of Stars the Project Has Received"
}

# Replace raw column names with their labels; a name without an entry in
# feature_mapping would become NaN, so keep the mapping in sync with numeric_cols.
p_values_df['Feature'] = p_values_df['Feature'].map(feature_mapping)

# Display the labelled p-value table.
p_values_df



Unnamed: 0,Feature,P-value
0,Number of New Contributors,0.04143817
1,Average Response Time for Code Change Requests,0.8339074
2,Average Duration of Open Issues,5.47005e-08
3,Total Number of Lines Changed,0.007780115
4,Number of Newly Raised Issues,0.02106131
5,Number of Active Issues and Pull Requests,5.953238999999999e-19
6,Number of New Code Lines Added,6.339719e-25
7,Number of Project Forks,0.0006721739
8,Number of Comments on Issues,9.975407e-05
9,Number of Accepted Pull Requests,8.656965e-13


In [13]:
import pandas as pd
# Per-feature Mann-Whitney U p-values for 'normal' vs. every other project type,
# presented as a table indexed by a human-readable feature label.
# Relies on `data`, `numeric_cols` and `mannwhitneyu` from the earlier cells.
# Only the p-value is reported; the U statistic and the Cliff's Delta effect size
# are not part of the table, so the unused effect-size computation was dropped.

# Split the rows once, up front: group membership does not depend on the metric.
normal_rows = data[data['type'] == 'normal']
other_rows = data[data['type'] != 'normal']

# {column name: {'P-value': p}} - the nested dict leaves room for further statistics.
stats_results = {}
for col in numeric_cols:
    # Missing values are dropped per column before testing.
    group1 = normal_rows[col].dropna()
    group2 = other_rows[col].dropna()
    mwu_result = mannwhitneyu(group1, group2, alternative='two-sided')
    stats_results[col] = {
        'P-value': mwu_result.pvalue,
    }

# Convert to a DataFrame; transposing puts features on the rows and statistics on the columns.
stats_results_df = pd.DataFrame(stats_results).transpose()

# Human-readable label for every raw column name.
feature_mapping = {
    "new_contributors": "Number of New Contributors",
    "change_request_response_time_avg": "Average Response Time for Code Change Requests",
    "issue_age_avg": "Average Duration of Open Issues",
    "code_change_lines_sum": "Total Number of Lines Changed",
    "issues_new": "Number of Newly Raised Issues",
    "issues_and_change_request_active": "Number of Active Issues and Pull Requests",
    "code_change_lines_add": "Number of New Code Lines Added",
    "attention": "Number of Project Forks",
    "issue_comments": "Number of Comments on Issues",
    "change_requests_accepted": "Number of Accepted Pull Requests",
    "change_request_age_avg": "Average Duration of Open Pull Requests",
    "participants": "Number of Participants",
    "bus_factor": "Number of Key Contributors the Project Relies On",
    "code_change_lines_remove": "Number of Code Lines Deleted",
    "inactive_contributors": "Number of Long-Inactive Contributors",
    "change_requests_reviews": "Number of Reviews for Pull Requests",
    "activity": "Overall Activity Events of the Project",
    "change_request_resolution_duration_avg": "Average Time to Resolve Pull Requests",
    "issues_closed": "Number of Closed Issues",
    "change_requests": "Total Number of Pull Requests",
    "issue_response_time_avg": "Average Response Time for Issues",
    "issue_resolution_duration_avg": "Average Time to Resolve Issues",
    "stars": "Number of Stars the Project Has Received"
}
# Relabel the index with the descriptive names.
stats_results_df.index = stats_results_df.index.map(feature_mapping)

stats_results_df


Unnamed: 0,P-value
Number of New Contributors,0.04143817
Average Response Time for Code Change Requests,0.8339074
Average Duration of Open Issues,5.47005e-08
Total Number of Lines Changed,0.007780115
Number of Newly Raised Issues,0.02106131
Number of Active Issues and Pull Requests,5.953238999999999e-19
Number of New Code Lines Added,6.339719e-25
Number of Project Forks,0.0006721739
Number of Comments on Issues,9.975407e-05
Number of Accepted Pull Requests,8.656965e-13
