In [1]:
import pandas as pd

from config import Config
from descriptor_processes.load_data import clean_id
from descriptor_processes.text_pre_process import pre_process

important_filed = ['text', 'content-desc', 'resource-id', 'activity']
config = Config()


# %%

def remove_oracles(events):
    return events[events['event_type'] != 'oracle'].reset_index()


def remove_unimportant_columns(events):
    return events[['class', 'content-desc', 'text', 'activity', 'resource-id']]


def add_type(events, type):
    events.rename(columns={'content-desc': 'content_desc', 'resource-id': 'id'}, inplace=True)
    for col in events.columns:
        events.rename(columns={col: type + '_' + col}, inplace=True)
    return events


def clean_test(events, type):
    # events = remove_oracles(events)
    events = remove_unimportant_columns(events)
    events = add_type(events, type)
    return events


def clean_df(df):
    return df


def get_mig_events(series, type, subjects):
    app_name = series[type + '_app']
    events = pd.read_json(subjects + '_gt/' + app_name + '.json').fillna('')
    events = clean_test(events, type)
    events[type + '_app'] = app_name.replace('-', '')
    return events


In [2]:
craft_map = pd.read_csv('craft_map.csv')
s_list = []
for i, series in craft_map.iterrows():
    src_events = get_mig_events(series, 'src', 'craft')
    target_events = get_mig_events(series, 'target', 'craft')
    src_index = series['src_index']
    target_index = series['target_index']
    src_event = src_events.iloc[src_index]
    target_event = target_events.iloc[target_index]
    row = pd.concat([src_event, target_event], axis=0)
    s_list.append(row)

df = pd.concat(s_list, axis=1).T.reindex()
df = clean_df(df)
craft_events = df.reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [3]:
def get_mig_events_atm(series, type):
    mig_filename = series['src_app'] + '-' +series['target_app']
    events = pd.read_json('atm_gt/' + mig_filename + '.json').fillna('')
    events = clean_test(events, type)
    events[type + '_app'] = series['target_app']
    return events

atm_map = pd.read_csv('atm_map.csv')

s_list = []
for i, series in atm_map.iterrows():
    src_events = get_mig_events(series, 'src', 'atm')
    target_events = get_mig_events_atm(series, 'target')
    src_index = series['src_index']
    target_index = series['target_index']

    src_event = src_events.iloc[src_index]
    try:
        target_event = target_events.iloc[target_index]

    except Exception as e:
        print()
    row = pd.concat([src_event, target_event], axis=0)
    s_list.append(row)

df = pd.concat(s_list, axis=1).T.reindex()
df = clean_df(df)
atm_events = df.reset_index(drop=True)
total_gt = pd.concat([craft_events, atm_events])

In [4]:
cleaning_columns = ['target_text', 'target_content_desc', 'target_id','src_text', 'src_content_desc', 'src_id']
total_gt.loc[:, ['target_id', 'src_id']] = total_gt.loc[:, ['target_id', 'src_id']].applymap(lambda s: clean_id(s))
total_gt = total_gt.astype(str)
total_gt.loc[:, cleaning_columns] = pre_process(total_gt.loc[:, cleaning_columns], False)
total_gt.to_csv(config.ground_truth, index=False)