**RQ2: CREATE DATASETS FOR LOC, FANIN, LOOSE AND TIGHT CLASS COHESION (OR OTHER METRICS) FOR THE IDENTIFIED CATEGORIES**

In [None]:
import pandas as pd
import json

In [None]:
# Load the systems list; its keys are used as the project names.
with open('./groups/systems_list.json') as systems_fh:
    systems = json.load(systems_fh)

# Lower-case the names so they can be compared case-insensitively later on.
project_names = [name.lower() for name in systems]


def _lower_if_str(cell):
    """Return ``cell`` lower-cased when it is a string, otherwise unchanged."""
    if isinstance(cell, str):
        return cell.lower()
    return cell


# Load the category table and lower-case every string cell.
# NOTE(review): `applymap` is deprecated in recent pandas releases in favour of
# `DataFrame.map`; kept as-is for compatibility with older versions.
df = pd.read_csv('./groups/categories.csv').applymap(_lower_if_str)
df

In [None]:
# Map every category column to the distinct (non-null) values listed in it.
categories = {
    column: list(df[column].dropna().unique())
    for column in df.columns
}

# Share of projects per category, as a percentage rounded to two decimals.
# NOTE(review): 46 is hard-coded; presumably the total number of projects
# analysed -- confirm, and consider deriving it from the data instead.
percentages = {
    name: round(len(members) / 46 * 100, 2)
    for name, members in categories.items()
}
print(percentages)

*Load Fede's metrics and adjust the index names*

In [None]:
# Read the LOC and fan-in tables; the first CSV column becomes the row index.
loc_metric, fanin_metric = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('./metrics/loc.csv', './metrics/fanin.csv')
)

# TODO: the other metrics to analyze are still missing here.

In [None]:
# helper that scores how similar two strings are (ratio between 0 and 1)
from difflib import SequenceMatcher

def string_similarity(s1, s2):
    """Return the ``difflib.SequenceMatcher`` similarity ratio of two strings.

    Parameters
    ----------
    s1, s2 : str
        The sequences to compare. No normalisation (such as case folding) is
        applied here; callers do that themselves if they need it.

    Returns
    -------
    float
        A value in ``[0, 1]``; ``1.0`` means the inputs are identical.
    """
    return SequenceMatcher(None, s1, s2).ratio()

In [None]:
# Rename the metric index labels so that they match the project names.

# Minimum similarity ratio (exclusive) for an index label to be treated as the
# same project as a name from `project_names`.
SIMILARITY_THRESHOLD = 0.8

# Labels that the fuzzy matching does not bring in line with the names used in
# the category table; they are fixed by hand afterwards.
MANUAL_INDEX_RENAMES = {
    'graylog2-server': 'graylog',
    'k-9': 'k-9 mail',
    'hibernate orm': 'hibernate',
}


def align_index_to_projects(metric, names, threshold=SIMILARITY_THRESHOLD):
    """Rename the index labels of ``metric`` to the closest project names.

    Parameters
    ----------
    metric : pandas.DataFrame
        Table whose index holds string labels.
    names : iterable of str
        Canonical project names.
    threshold : float, optional
        A label is renamed to a project name when their case-insensitive
        similarity ratio is strictly greater than this value.

    Returns
    -------
    pandas.DataFrame
        A renamed copy; the input frame is not modified.
    """
    for name in names:
        # The labels are read once per project name, so renames made for
        # earlier names are visible here while the inner loop itself walks a
        # fixed snapshot.
        for label in metric.index.values:
            if string_similarity(name.upper(), label.upper()) > threshold:
                metric = metric.rename(index={label: name})
    return metric


loc_metric = align_index_to_projects(loc_metric, project_names).rename(index=MANUAL_INDEX_RENAMES)
fanin_metric = align_index_to_projects(fanin_metric, project_names).rename(index=MANUAL_INDEX_RENAMES)

*separate LOC metric for each group*

In [None]:
# Separate the LOC metric for each group and save one CSV per group.
def _loc_for_group(column, csv_path):
    """Select the LOC rows of one category and write them to ``csv_path``.

    Parameters
    ----------
    column : str
        Name of the category column in ``df`` listing the group's projects.
    csv_path : str
        Destination of the CSV file for this group.

    Returns
    -------
    tuple
        ``(projects, subset)``: the distinct non-null project names of the
        category and the rows of ``loc_metric`` selected by those names.

    Notes
    -----
    The rows are looked up with ``.loc``, so a project that is absent from the
    ``loc_metric`` index makes the lookup fail (``KeyError`` on current pandas)
    instead of being dropped silently.
    """
    projects = df[column].dropna().unique()
    subset = loc_metric.loc[projects]
    subset.to_csv(csv_path)
    return projects, subset


app_projects, app_loc_metric = _loc_for_group('App', './groups/loc/app_loc.csv')
database_projects, database_loc_metric = _loc_for_group('Database', './groups/loc/database_loc.csv')
nlp_projects, nlp_loc_metric = _loc_for_group('NLP (Machine Learning)', './groups/loc/nlp_loc.csv')
ide_projects, ide_loc_metric = _loc_for_group('IDE / Development tool', './groups/loc/ide_loc.csv')
desk_projects, desk_loc_metric = _loc_for_group('Desktop Application', './groups/loc/desk_loc.csv')
web_projects, web_loc_metric = _loc_for_group('Web Framework', './groups/loc/web_loc.csv')
lib_projects, lib_loc_metric = _loc_for_group('Library', './groups/loc/lib_loc.csv')
cicd_projects, cicd_loc_metric = _loc_for_group('CI/CD', './groups/loc/cicd_loc.csv')

*separate FANIN metric for each group*

In [None]:
# Separate the fan-in metric for each group and save one CSV per group.
def _fanin_for_group(column, csv_path):
    """Select the fan-in rows of one category and write them to ``csv_path``.

    Parameters
    ----------
    column : str
        Name of the category column in ``df`` listing the group's projects.
    csv_path : str
        Destination of the CSV file for this group.

    Returns
    -------
    tuple
        ``(projects, subset)``: the distinct non-null project names of the
        category and the rows of ``fanin_metric`` selected by those names.

    Notes
    -----
    The rows are looked up with ``.loc``, so a project that is absent from the
    ``fanin_metric`` index makes the lookup fail (``KeyError`` on current
    pandas) instead of being dropped silently.
    """
    projects = df[column].dropna().unique()
    subset = fanin_metric.loc[projects]
    subset.to_csv(csv_path)
    return projects, subset


app_projects, app_fanin_metric = _fanin_for_group('App', './groups/fanin/app_fanin.csv')
database_projects, database_fanin_metric = _fanin_for_group('Database', './groups/fanin/database_fanin.csv')
nlp_projects, nlp_fanin_metric = _fanin_for_group('NLP (Machine Learning)', './groups/fanin/nlp_fanin.csv')
ide_projects, ide_fanin_metric = _fanin_for_group('IDE / Development tool', './groups/fanin/ide_fanin.csv')
desk_projects, desk_fanin_metric = _fanin_for_group('Desktop Application', './groups/fanin/desk_fanin.csv')
web_projects, web_fanin_metric = _fanin_for_group('Web Framework', './groups/fanin/web_fanin.csv')
lib_projects, lib_fanin_metric = _fanin_for_group('Library', './groups/fanin/lib_fanin.csv')
cicd_projects, cicd_fanin_metric = _fanin_for_group('CI/CD', './groups/fanin/cicd_fanin.csv')

**DESCRIPTIVE STATISTICAL TESTS**