In [None]:
import pandas as pd
import json
import re

# Let tables of up to 200 rows render in full before pandas truncates the display.
pd.set_option("display.max_rows", 200)

In [3]:
# Read the JSON file holding the train/test name lists. The context manager
# closes the file handle immediately instead of leaving it to the garbage
# collector, and the explicit encoding avoids depending on the platform default.
with open('./optuna_results/final/json/names.json', encoding='utf-8') as names_file:
    names = json.load(names_file)

# Extract train and test names
train_names = names["train_names"]
test_names = names["test_names"]

In [4]:
# Function to categorize airfoils based on naming conventions
def categorize_airfoils(names):
    """Group names into families keyed by their leading alphabetic prefix.

    Args:
        names: Iterable of name strings.

    Returns:
        dict mapping each family prefix to the list of names starting with
        it, in input order. Names that do not begin with an ASCII letter are
        collected under "Unknown".
    """
    leading_letters = re.compile(r"([a-zA-Z]+)")
    grouped = {}
    for airfoil in names:
        prefix_match = leading_letters.match(airfoil)
        if prefix_match is None:
            key = "Unknown"
        else:
            key = prefix_match.group(1)
        # setdefault creates the family's list the first time the prefix is seen.
        grouped.setdefault(key, []).append(airfoil)
    return grouped

In [None]:
# Group the train and test names into families (alphabetic prefix of each name).
train_categories = categorize_airfoils(train_names)
test_categories = categorize_airfoils(test_names)

# One row per family with the number of names it contains in each split.
train_df = pd.DataFrame(
    [(family, len(members)) for family, members in train_categories.items()],
    columns=["Family", "Train Count"],
)
test_df = pd.DataFrame(
    [(family, len(members)) for family, members in test_categories.items()],
    columns=["Family", "Test Count"],
)

# The outer merge keeps families that occur in only one split; their missing
# count comes back as NaN, which also upcasts the column to float. Fill those
# with 0, sort, and only then cast back to int so the counts stay whole numbers
# without affecting the row order produced by the sort.
merged_df = (
    pd.merge(train_df, test_df, on="Family", how="outer")
    .fillna(0)
    .sort_values(by="Train Count", ascending=False)
    .astype({"Train Count": int, "Test Count": int})
)


In [None]:
# Preview the first rows of the per-family train/test counts.
merged_df.head()

In [None]:
# Test families that have no training examples, in the order they appear in
# test_categories. The counts are indexed by family once so every family is a
# direct lookup rather than a full pass over merged_df per test family.
# Relies on merged_df holding one row per family (it is built from dict keys).
counts_by_family = merged_df.set_index('Family')
no_train_rep = [
    family
    for family in test_categories
    if family in counts_by_family.index
    and counts_by_family.at[family, 'Train Count'] == 0
    and counts_by_family.at[family, 'Test Count'] > 0
]

In [88]:
# Rows for families seen in the test split that have test names but no training names.
is_test_family = merged_df['Family'].isin(test_categories)
absent_from_train = merged_df['Train Count'].eq(0)
present_in_test = merged_df['Test Count'].gt(0)

filtered_df = merged_df.loc[is_test_family & absent_from_train & present_in_test]

# Family names only, in merged_df row order.
no_train_rep = filtered_df['Family'].tolist()

In [None]:
# Display the families that have test names but a training count of zero.
no_train_rep

In [None]:
# Spot check: is 'fx63137' among the training names of the 'fx' family?
# .get returns an empty list (so the check is False) instead of raising
# KeyError when the training split contains no 'fx' family at all.
'fx63137' in train_categories.get('fx', [])

In [12]:
# Renumber rows 0..n-1 after the sort. Rebinding the result (rather than
# inplace=True) avoids mutating the frame object other cells already reference.
merged_df = merged_df.reset_index(drop=True)

In [None]:
def _add_share_column(df, count_col, share_col):
    """Add ``share_col``: ``count_col`` as a percentage of train + test counts.

    The column is written onto ``df`` itself and the same object is returned,
    so the function can be used with ``DataFrame.pipe``.

    Args:
        df: Frame with numeric "Train Count" and "Test Count" columns.
        count_col: Name of the count column used as the numerator.
        share_col: Name of the column to create.

    Returns:
        ``df`` with ``share_col`` holding strings with two decimals such as
        "25.00%". Rows whose combined count is not positive get "0.00%", so
        the column contains strings only.
    """
    total = df["Train Count"] + df["Test Count"]
    # Mask non-positive totals to NaN so the division never sees a zero denominator.
    share = df[count_col] / total.where(total > 0)
    # Double-quoted f-string: no quote reuse inside the replacement field, so
    # this also parses on Python versions older than 3.12.
    df[share_col] = share.map(
        lambda value: f"{value:.2%}" if pd.notna(value) else "0.00%"
    )
    return df


def compute_test_perc(df):
    """Add a "Test Perc" column: test count as a share of train + test counts.

    Mutates ``df`` and returns it; see ``_add_share_column`` for the format.
    """
    return _add_share_column(df, "Test Count", "Test Perc")


def compute_train_perc(df):
    """Add a "Train Perc" column: train count as a share of train + test counts.

    Mutates ``df`` and returns it; see ``_add_share_column`` for the format.
    """
    return _add_share_column(df, "Train Count", "Train Perc")

In [95]:
# Work on a copy so the percentage columns added to df do not end up on merged_df.
df = merged_df.copy()

In [None]:
# Add the "Train Perc" column first, then the "Test Perc" column.
df = df.pipe(compute_train_perc).pipe(compute_test_perc)

In [102]:
# Upper-case the family labels with the vectorized string accessor; unlike a
# per-element lambda it passes missing values through instead of raising.
df['Family'] = df['Family'].str.upper()

In [103]:
# NOTE(review): repeats the upper-casing already done in the previous cell;
# harmless because upper() is idempotent, but one of the two cells can go.
df['Family'] = df['Family'].str.upper()

In [104]:
# Write the per-family counts and percentages to names.csv in the working
# directory. to_csv also writes the row index as the first (unnamed) column.
df.to_csv('names.csv')

In [None]:
# Display the per-family counts. The percentage columns are not shown here:
# they were added to the copy `df`, not to merged_df.
merged_df