In [38]:
import os
import pandas as pd
from config import Config
from evaluators.perfect_evaluator import PerfectEvaluator
from server.query_util import MigrationInfo

# Directory that is walked for query-log CSV files. The historical location is
# kept as the default; set the QLOGS_PATH environment variable to point the
# notebook at another directory without editing this cell.
log_path = os.environ.get(
    'QLOGS_PATH',
    '/Users/usiusi/Documents/Report/Test-Reuse/Reports/report22/qlogs',
)
# Shared module-level list for loaded log frames.
logs = []
# Project configuration object; `config.ground_truth` is read by the ground-truth cells.
config = Config()

In [39]:
def read_log(log_path):
    """Load every non-raw CSV log found under ``log_path`` into one DataFrame.

    The directory tree is walked recursively. A file is loaded when its name
    contains ``csv`` and does not contain ``raw``. Missing values in the
    concatenated result are replaced with empty strings.

    The frames are collected in a local list, so calling this function more
    than once (for example when the cell is re-run) does not duplicate rows.

    :param log_path: root directory to search.
    :return: concatenation of all loaded logs with NaN replaced by ``''``.
    :raises ValueError: if no matching CSV file exists under ``log_path``.
    """
    loaded = []
    for path, _subdirs, files in os.walk(log_path):
        for name in files:
            if 'csv' in name and 'raw' not in name:
                loaded.append(pd.read_csv(os.path.join(path, name)))
    if not loaded:
        # pd.concat([]) would raise a ValueError with a generic message; keep the
        # exception type but say what is actually wrong.
        raise ValueError(f'No non-raw CSV logs found under {log_path!r}')
    return pd.concat(loaded).fillna('')


def select_config(df, sm_config):
    """Return the rows of ``df`` that belong to one configuration.

    A row is kept when its ``word_embedding``, ``algorithm``, ``training_set``
    and ``descriptors`` values equal those of ``sm_config`` and its
    ``src_class`` is not ``'Button'``. Rows whose ``src`` matches the pattern
    ``a6|a7|a8`` are then dropped.

    :param df: log frame containing the columns named above plus ``src``.
    :param sm_config: mapping or Series providing the four configuration values.
    :return: filtered view of ``df``.
    """
    matches = df['word_embedding'] == sm_config['word_embedding']
    for column in ('algorithm', 'training_set', 'descriptors'):
        matches = matches & (df[column] == sm_config[column])
    matches = matches & (df['src_class'] != 'Button')

    candidates = df[matches]
    excluded_src = candidates['src'].str.contains('a6|a7|a8')
    return candidates[~excluded_src]


def remove_duplicated_q(df: pd.DataFrame):
    """Drop repeated queries, keeping the occurrence with the highest ``index``.

    Two rows are duplicates when they agree on the source/target identification
    columns listed in ``columns``. The ``index`` column is cast to float and the
    rows are sorted by it in descending order before de-duplication, so the
    retained row of each duplicate set is the one with the largest ``index``.

    The work is done on a copy: the caller's frame is neither re-typed nor
    re-ordered, which also avoids SettingWithCopy problems when ``df`` is a
    slice of a larger frame.

    :param df: frame with an ``index`` column and the columns in ``columns``.
    :return: new de-duplicated frame, sorted by ``index`` descending.
    """
    columns = ['src', 'target', 'task', 'src_id', 'src_text', 'src_content_desc', 'target_id', 'target_text',
               'target_content_desc']
    ordered = df.copy()
    ordered['index'] = ordered['index'].astype(float)
    ordered = ordered.sort_values(by=['index'], ascending=False)
    return ordered.drop_duplicates(subset=columns, keep='first')


def select_top_scored(df: pd.DataFrame):
    """Return, for every ``index`` value, the row with the highest positive score.

    Rows whose score is not greater than zero are ignored. For each remaining
    ``index`` group the complete row holding the maximum ``score`` is selected
    (first occurrence on ties). A column-wise ``max`` aggregation would combine
    the largest value of every column independently and could produce a row
    that mixes fields of different candidates, so the row is picked by
    ``idxmax`` instead.

    :param df: frame with ``index`` and ``score`` columns. Its ``score`` column
        is converted to numeric in place, as before, because callers keep using
        the same frame afterwards.
    :return: frame indexed by ``index`` (sorted by group key) with the
        remaining columns in their original order.
    """
    df['score'] = pd.to_numeric(df['score'])
    # Positional labels are rebuilt because concatenated logs may carry repeated
    # row labels, which would make the .loc lookup below ambiguous.
    positive = df[df['score'] > 0].reset_index(drop=True)
    if positive.empty:
        return positive.set_index('index')
    best_rows = positive.groupby('index')['score'].idxmax()
    return positive.loc[best_rows].set_index('index')

def get_sm_configs():
    """Read ``config_sample.csv`` and append empty metric columns.

    The columns ``unique_q``, ``correct_q``, ``gt_exist`` and
    ``ratio_corr_ext`` are added (in that order) and initialised to ``''``.

    :return: the configuration frame with the four extra columns.
    """
    sm_configs = pd.read_csv('config_sample.csv')
    for metric_column in ('unique_q', 'correct_q', 'gt_exist', 'ratio_corr_ext'):
        sm_configs[metric_column] = ''
    return sm_configs

def count_q_in_gt(selected_q, top_scored_q):
    """Count ground-truth hits among the top-scored and among all selected rows.

    Every row is passed to ``MigrationInfo.set_info_from_log`` and then, with
    the resulting object, to ``PerfectEvaluator.event_exist_in_gt``; the return
    values are summed (presumably a 0/1 or boolean flag per row -- confirm
    against the evaluator).

    :param selected_q: all selected query rows.
    :param top_scored_q: the top-scored rows.
    :return: tuple ``(sum over top_scored_q, sum over selected_q)``.
    """
    def gt_hit(record):
        migration = MigrationInfo.set_info_from_log(record)
        return PerfectEvaluator.event_exist_in_gt(record, migration)

    # The top-scored rows are evaluated first, then the full selection.
    hits_in_top = sum(gt_hit(record) for _, record in top_scored_q.iterrows())
    hits_in_selected = sum(gt_hit(record) for _, record in selected_q.iterrows())
    return hits_in_top, hits_in_selected


def add_mig_info_to_series(name, result_for_mig):
    """Store a migration key on ``result_for_mig`` and return it.

    The first three items of ``name`` are written, in order, under the keys
    ``src``, ``target`` and ``task``. The container is modified in place.

    :param name: indexable group key; positions 0, 1 and 2 are used.
    :param result_for_mig: dict-like object (dict or Series) to update.
    :return: the same ``result_for_mig`` object.
    """
    for position, key in enumerate(('src', 'target', 'task')):
        result_for_mig[key] = name[position]
    return result_for_mig


def get_migration_groups(sm):
    """Group the queries of one configuration by migration.

    The module-level ``total`` frame is filtered with ``select_config``,
    de-duplicated with ``remove_duplicated_q`` and reduced with
    ``select_top_scored``; both the de-duplicated rows and the top-scored rows
    are then grouped by ``src``, ``target`` and ``task``.

    :param sm: configuration row passed on to ``select_config``.
    :return: tuple ``(groups of all selected rows, groups of top-scored rows)``.
    """
    migration_keys = ['src', 'target', 'task']
    deduplicated = remove_duplicated_q(select_config(total, sm))
    best_per_query = select_top_scored(deduplicated)
    grouped_top = best_per_query.reset_index().groupby(migration_keys)
    grouped_selected = deduplicated.reset_index().groupby(migration_keys)
    return grouped_selected, grouped_top

def calc_migration_metrics(selected_q_group, sm, top_group):
    """Compute the metrics of one migration group for one configuration.

    The counts come from ``count_q_in_gt``. The result starts as a copy of
    ``sm`` and receives four entries:

    * ``unique_q`` -- number of rows in ``top_group``
    * ``correct_q`` -- first value returned by ``count_q_in_gt``
    * ``gt_exist`` -- second value returned by ``count_q_in_gt``
    * ``ratio_corr_ext`` -- ``correct_q / gt_exist``, or 0 when ``gt_exist`` is 0

    ``sm`` itself is left untouched: the metrics are written to an independent
    copy, so results of earlier groups cannot be altered through a shared row
    object when the same ``sm`` is reused for the next group.

    :param selected_q_group: all selected rows of the migration.
    :param sm: configuration row (Series or mapping).
    :param top_group: top-scored rows of the migration.
    :return: new Series holding the configuration values and the metrics.
    """
    correct_count, exist_gt_count = count_q_in_gt(selected_q_group, top_group)
    result_for_mig = pd.Series(data=sm, copy=True)
    result_for_mig['unique_q'] = top_group.shape[0]
    result_for_mig['correct_q'] = correct_count
    result_for_mig['gt_exist'] = exist_gt_count
    result_for_mig['ratio_corr_ext'] = correct_count / exist_gt_count if exist_gt_count else 0
    return result_for_mig


In [3]:
# Notebook-wide inputs used by the cells below:
#   total      - all query logs found under log_path, concatenated into one frame
#   sm_configs - configurations from config_sample.csv plus empty metric columns
total = read_log(log_path)
sm_configs = get_sm_configs()

In [None]:

def calc_gt_log_per_config():
    """Compute configuration-level metrics and save them to ``atm_gt.csv``.

    For every row of the module-level ``sm_configs`` frame the matching queries
    of ``total`` are selected, de-duplicated and reduced to their top-scored
    rows; the resulting counts are stored in the ``unique_q``, ``correct_q``,
    ``gt_exist`` and ``ratio_corr_ext`` columns of ``sm_configs``.

    The values are written with ``sm_configs.loc`` because the row object
    yielded by ``iterrows()`` is a copy, so assigning to it would leave
    ``sm_configs`` (and therefore the CSV) without the metrics. The ratio is 0
    when no query exists in the ground truth, matching
    ``calc_migration_metrics``, instead of raising ``ZeroDivisionError``.
    """
    # Materialise the rows first so the frame is not modified while iterating.
    for index, sm in list(sm_configs.iterrows()):
        selected_q = select_config(total, sm)
        selected_q = remove_duplicated_q(selected_q)
        top_scored_q = select_top_scored(selected_q)
        correct_count, exist_gt_count = count_q_in_gt(selected_q, top_scored_q)
        sm_configs.loc[index, 'unique_q'] = top_scored_q.shape[0]
        sm_configs.loc[index, 'correct_q'] = correct_count
        sm_configs.loc[index, 'gt_exist'] = exist_gt_count
        sm_configs.loc[index, 'ratio_corr_ext'] = correct_count / exist_gt_count if exist_gt_count else 0
        # Written after every configuration, so the results computed so far are
        # on disk if the run is interrupted.
        sm_configs.to_csv('atm_gt.csv', index=False)


In [4]:



# Migration-level evaluation: for every configuration row, compute one metrics
# Series per (src, target, task) group and collect them all in one list.
results_for_a_config = []
for index, sm in sm_configs.iterrows():
    selected_q_groups, top_score_groups = get_migration_groups(sm)
    for name, top_group in top_score_groups:
        # `name` is the (src, target, task) key of the top-scored group; the
        # same key is used to fetch all selected rows of that migration.
        selected_q_group = selected_q_groups.get_group(name)
        result_for_mig = calc_migration_metrics(selected_q_group, sm, top_group)
        result_for_mig = add_mig_info_to_series(name, result_for_mig)
        results_for_a_config.append(result_for_mig)

# One row per (configuration, migration) pair, saved without the row index.
config_df_result = pd.DataFrame(results_for_a_config)
config_df_result.to_csv('mig_lvl_results_craft.csv', index=False)


In [43]:
def get_craft_gt():
    """Load the ground-truth rows whose ``src_app`` does not match ``Ex|Sh|No``.

    The file at ``config.ground_truth`` is read and NaN is replaced by ``''``.
    For the retained rows:

    * ``src_app`` becomes the text before its first ``'b'``;
    * ``task`` becomes ``'b'`` plus the second ``'b'``-separated piece of
      ``target_app``;
    * ``target_app`` becomes the text before its first ``'b'``.

    NOTE(review): this presumes names shaped like ``<app>b<task>``; a
    ``target_app`` without ``'b'`` raises ``IndexError`` -- confirm against the
    ground-truth file.

    The transformations use ``Series.map`` on the individual columns. A
    row-wise ``DataFrame.apply`` returns a DataFrame when the frame is empty,
    which cannot be assigned to a single column, so an empty selection now
    yields an empty result instead of failing.

    :return: frame with ``src_app``/``target_app`` renamed to ``src``/``target``
        and an added ``task`` column.
    """
    gt_table = pd.read_csv(config.ground_truth).fillna('')
    craft_gt = gt_table[~gt_table['src_app'].str.contains('Ex|Sh|No')].copy()
    # Capture the untouched target names: `task` and the shortened target are
    # both derived from them.
    target_names = craft_gt['target_app']
    craft_gt['src_app'] = craft_gt['src_app'].map(lambda app: app.split('b')[0])
    craft_gt['task'] = target_names.map(lambda app: 'b' + app.split('b')[1])
    craft_gt['target_app'] = target_names.map(lambda app: app.split('b')[0])
    return craft_gt.rename(columns={'src_app': 'src', 'target_app': 'target'})


def get_atm_gt():
    """Load the ground-truth rows whose ``src_app`` matches ``Ex|Sh|No``.

    The file at ``config.ground_truth`` is read and NaN is replaced by ``''``.
    In ``src_app`` and ``target_app`` the substrings ``ExpenseTracker``,
    ``NoteTaking`` and ``ShoppingList`` are replaced by ``a6``, ``a7`` and
    ``a8``. ``task`` is then built as ``'b'`` + the second character of the
    rewritten ``target_app`` + ``'1'``.

    NOTE(review): a rewritten ``target_app`` shorter than two characters raises
    ``IndexError`` -- confirm the names in the ground-truth file.

    The transformations use ``Series.map`` on the individual columns. A
    row-wise ``DataFrame.apply`` returns a DataFrame when the frame is empty,
    which cannot be assigned to a single column, so an empty selection now
    yields an empty result instead of failing.

    :return: frame with ``src_app``/``target_app`` renamed to ``src``/``target``
        and an added ``task`` column.
    """
    gt_table = pd.read_csv(config.ground_truth).fillna('')
    atm_gt = gt_table[gt_table['src_app'].str.contains('Ex|Sh|No')].copy()
    atm_craft_map = {'ExpenseTracker': 'a6', 'NoteTaking': 'a7', 'ShoppingList': 'a8'}

    def to_craft_name(app_name):
        # Apply every replacement in the order the map declares them.
        for atm_name, craft_name in atm_craft_map.items():
            app_name = app_name.replace(atm_name, craft_name)
        return app_name

    atm_gt['src_app'] = atm_gt['src_app'].map(to_craft_name)
    atm_gt['target_app'] = atm_gt['target_app'].map(to_craft_name)
    atm_gt['task'] = atm_gt['target_app'].map(lambda app: 'b' + app[1] + '1')
    return atm_gt.rename(columns={'src_app': 'src', 'target_app': 'target'})


def group_gt_by_migration(gt_table, subjects):
    """Count ground-truth rows per migration and save the summary as CSV.

    Rows are grouped by ``src``, ``target`` and ``task``; the size of each
    group is stored as ``correct_q`` and a ``mig_name`` label of the form
    ``'<src> - <target> - <task>'`` is added. The ``mig_name`` and
    ``correct_q`` columns are written to ``gt_lvl_results_{subjects}.csv``.

    The summary frame is returned so callers that bind the result receive the
    data rather than ``None``. An empty ``gt_table`` produces a header-only
    file instead of a ``KeyError``.

    :param gt_table: ground-truth frame with ``src``, ``target`` and ``task``.
    :param subjects: suffix used in the output file name.
    :return: frame with ``src``, ``target``, ``task``, ``correct_q`` and
        ``mig_name`` columns, one row per migration.
    """
    group_by = ['src', 'target', 'task']
    if gt_table.empty:
        config_df_result = pd.DataFrame(columns=group_by + ['correct_q'])
    else:
        config_df_result = gt_table.groupby(group_by).size().reset_index(name='correct_q')
    config_df_result['mig_name'] = (
        config_df_result['src'] + ' - ' + config_df_result['target'] + ' - ' + config_df_result['task']
    )
    config_df_result[['mig_name', 'correct_q']].to_csv(f'gt_lvl_results_{subjects}.csv', index=False)
    return config_df_result

# Split the ground-truth table into the two subject sets: rows whose src_app
# does not match 'Ex|Sh|No' (craft) and rows whose src_app does (atm).
craft_gt = get_craft_gt()
atm_gt = get_atm_gt()

# Only the craft summary (gt_lvl_results_craft.csv) is produced here; the atm
# call is left commented out.
# atm_gt_mig_lvl = group_gt_by_migration(atm_gt, 'atm')
craft_gt_mig_lvl = group_gt_by_migration(craft_gt, 'craft')






KeyboardInterrupt: 