In [None]:
import re
import sys
import pandas as pd
import numpy as np
import seaborn as sns
from scipy import stats
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.stats.multicomp as multcomp
import matplotlib.pyplot as plt


from importlib import reload
import utils # in this directory

%matplotlib inline

In [None]:
reload(utils)  # re-import the local `utils` module so edits made to it since the first import take effect

In [None]:
# Load the CSV found at utils.pit_mutations_path; it is only needed to build
# the derived frames below and is dropped at the end of the cell.
pit_mutations = pd.read_csv(utils.pit_mutations_path)

# Submissions restricted to the users present in the mutation data and to the
# single assignment 'Project 2'.
submissions = utils.getsubmissions(
    webcat_path=utils.webcat_path,
    users=pit_mutations['userName'].unique(),
    assignments=['Project 2'],
)

mutators = utils.get_mutator_specific_data(
    pit_mutations=pit_mutations,
    submissions=submissions,
)

# Two views of the mutator data, selected by the 'cov' / 'surv' keys.
coverage = utils.all_mutator_data(mutators, 'cov')
survival = utils.all_mutator_data(mutators, 'surv')

# Mean and standard deviation of every column, grouped by mutator.
agg_mutators = mutators.groupby('mutator').agg(['mean', 'std'])

joined = utils.get_main_subset_data(mutators, submissions=submissions)

# The raw frame is no longer referenced after this point; release it.
del pit_mutations

In [None]:
def forward_selection(data, response):
    """Build a linear model by greedy forward selection, scored by BIC.

    Credit: https://planspace.org/20150423-forward_selection_with_statsmodels/

    Starting with no predictors, each round fits one OLS model per remaining
    candidate column (added to the predictors selected so far) and keeps the
    candidate whose model has the lowest BIC. Selection stops as soon as no
    candidate strictly lowers the BIC, or when no candidates remain.

    Besides fitting, the function prints how many columns were selected and
    the ratio of the summed 'num' column for the selected subset to that of
    all candidates. Both sums come from `utils.get_data_for_subset`, called
    with the notebook-level `mutators` and `submissions` objects, so those
    must be defined before this function is called.

    Args:
        data (pd.DataFrame): All possible predictors and the response. Every
            column other than `response` is treated as a candidate predictor.
        response (str): Name of the response column in `data`.

    Returns:
        tuple: `(model, selected)` where `model` is the fitted statsmodels
            OLS result (with an intercept) using the selected predictors, and
            `selected` is the list of selected column names in the order in
            which they were added.
    """
    candidates = list(data.columns)
    candidates.remove(response)
    remaining = list(candidates)  # keep candidates intact to measure savings
    selected = []
    current_score = float('inf')  # lower BIC is better, so start at +inf

    while remaining:
        # Score every remaining candidate when added to the current selection.
        scores_with_candidates = []
        for candidate in remaining:
            features = selected + [candidate]
            formula = '{} ~ {}'.format(response, ' + '.join(features + ['1']))
            bic = smf.ols(formula, data).fit().bic
            scores_with_candidates.append((bic, candidate))

        best_new_score, best_candidate = min(scores_with_candidates)

        # Stop unless the best candidate is a strict improvement. A tie must
        # terminate as well: nothing would be removed from `remaining`, so the
        # same candidates would otherwise be rescored forever.
        if not best_new_score < current_score:
            break

        remaining.remove(best_candidate)
        selected.append(best_candidate)
        current_score = best_new_score

    # Appending '1' to the term list keeps the formula well-formed even when
    # nothing was selected (it then reads '<response> ~ 1').
    formula = '{} ~ {}'.format(response, ' + '.join(selected + ['1']))
    print('Selected {} / {} mutators'.format(len(selected), len(candidates)))

    selected_num = utils.get_data_for_subset(mutators, submissions=submissions, subset=selected)['num'].sum()
    original_num = utils.get_data_for_subset(mutators, submissions=submissions, subset=candidates)['num'].sum()
    print('{:.2%} of initial mutants'.format(selected_num / original_num))

    model = smf.ols(formula, data).fit()
    return model, selected

In [None]:
depvar = 'full_cov'
response = joined[depvar]

# Predictor frame: coverage[utils.pit_deletion] joined index-to-index with the
# response column, with missing values replaced by 0.
d = (
    coverage[utils.pit_deletion]
    .merge(response, left_index=True, right_index=True)
    .fillna(0)
)

model, subset = forward_selection(d, depvar)
print(format(model.rsquared_adj, '.2f'), subset)

In [None]:
# Refit on only the first three predictors chosen by forward selection and
# display the regression summary.
smf.ols(
    '{} ~ {} + 1'.format(depvar, ' + '.join(subset[:3])),
    data=d,
).fit().summary()

In [None]:
def inc_subset_data(subsetlist):
    """Stack the metrics of every growing prefix of `subsetlist`.

    For i = 1 .. len(subsetlist), the first i entries are passed to
    `utils.get_data_for_subset` (together with the notebook-level `mutators`
    and `submissions`). The 'cov' and 'mpl' columns of the result are kept,
    renamed to 'Mutation Coverage' and '# Mutants per KLoC', and tagged with
    an 'Operator Subset' column holding 'Subset i'.

    The 'mpl' values are passed through unchanged; no scaling is applied here.

    Args:
        subsetlist (list): Ordered entries whose prefixes are evaluated.

    Returns:
        pd.DataFrame or None: The per-prefix frames concatenated row-wise in
            prefix order, or None when `subsetlist` is empty.
    """
    frames = []
    for size in range(1, len(subsetlist) + 1):
        subset_data = utils.get_data_for_subset(mutators, subset=subsetlist[:size], submissions=submissions)
        # Copy the two-column selection so the relabelling below operates on
        # an independent frame rather than on a slice of the helper's result.
        subset_data = subset_data[['cov', 'mpl']].copy()
        subset_data.columns = ['Mutation Coverage', '# Mutants per KLoC']
        subset_data['Operator Subset'] = 'Subset {}'.format(size)
        frames.append(subset_data)

    if not frames:
        return None
    # Concatenate once at the end instead of re-copying the result each round.
    return pd.concat(frames)

In [None]:
# Number of leading operators (in forward-selection order) to evaluate.
inc_num = 2
s = subset[:inc_num]

s1 = utils.get_data_for_subset(mutators, subset=s, submissions=submissions)
s1['runningtime'] = utils.get_running_time(
    resultfile=utils.pit_results_path + '/inc-{}-results.ndjson'.format(inc_num)
)
s1['mpl'] = s1['mpl'].mul(1000)
s1 = s1.merge(joined, left_index=True, right_index=True)

print('Subset = ', s)
# Disabled: mean running time relative to the deletion set.
# '{:.2%}'.format((s1['runningtime'] / (joined['deletion_runningtime'])).mean())

# Regress the 'full_cov' column on this subset's 'cov' column.
sns.lmplot(data=s1, x='cov', y='full_cov')

In [None]:
def labelled_metrics(frame, prefix, label):
    """Extract one operator set's two metric columns and tag them for plotting.

    Selects the columns of `frame` whose names start with '<prefix>_mpl' or
    '<prefix>_cov', relabels them positionally as 'Mutation Coverage' and
    '# Mutants per KLoC', and adds an 'Operator Subset' column set to `label`.

    The pattern uses a group, `(?:mpl|cov)`. A character class such as
    `[mpl|cov]` would instead match any column whose first character after
    the prefix is one of `m p l | c o v`.

    Args:
        frame (pd.DataFrame): Frame holding the prefixed metric columns.
        prefix (str): Column-name prefix, e.g. 'deletion'.
        label (str): Value written to the 'Operator Subset' column.

    Returns:
        pd.DataFrame: An independent copy with the three columns above.
    """
    metrics = frame.filter(regex='^{}_(?:mpl|cov)'.format(re.escape(prefix))).copy()
    # NOTE(review): the relabelling is positional, so it assumes the coverage
    # column precedes the mutants-per-line column in `frame` -- confirm.
    metrics.columns = ['Mutation Coverage', '# Mutants per KLoC']
    metrics['Operator Subset'] = label
    return metrics


deletion = labelled_metrics(joined, 'deletion', 'Deletion')
# `sufficient` and `full` are built for parity with `deletion` but are not
# part of the plot below.
sufficient = labelled_metrics(joined, 'sufficient', 'Sufficient')
full = labelled_metrics(joined, 'full', 'Full')

inc_subsets = inc_subset_data(subset)

s = pd.concat([deletion, inc_subsets])
# Scale the stacked mutants column by 1000 to match its per-KLoC axis label.
s['# Mutants per KLoC'] = s['# Mutants per KLoC'] * 1000
ax = sns.scatterplot(x='# Mutants per KLoC', y='Mutation Coverage', hue='Operator Subset', data=s, style='Operator Subset')
# Anchor the legend's upper-left corner at the axes' upper-right corner,
# placing it outside the plotting area.
ax.legend(loc='upper left', bbox_to_anchor=(1.0, 1.0))
sns.despine()