In [None]:
import re
import pandas as pd
import statsmodels.formula.api as smf

from importlib import reload
import utils # in this directory

In [None]:
# Re-import the local `utils` module so that edits made to it after the first
# import are picked up by the cells below.
reload(utils)

In [None]:
# Raw PIT mutation records, read from the path configured in `utils`.
pit_mutations = pd.read_csv(utils.pit_mutations_path)

# Submissions for 'Project 2', limited to the users present in the mutation data.
submissions = utils.getsubmissions(
    webcat_path=utils.webcat_path,
    users=pit_mutations['userName'].unique(),
    assignments=['Project 2'],
)

# Mutator-specific data, and the 'cov' / 'surv' views derived from it.
mutators = utils.get_mutator_specific_data(
    pit_mutations=pit_mutations,
    submissions=submissions,
)
coverage = utils.all_mutator_data(mutators, 'cov')
survival = utils.all_mutator_data(mutators, 'surv')

# Column means grouped by the 'mutator' column.
agg_mutators = mutators.groupby('mutator').agg('mean')

# The raw frame is not used again in this cell; drop the name to free it.
del pit_mutations

In [None]:
def forward_selection(data, response):
    """Build a linear model by greedy forward selection, scored by AIC.

    Adapted from https://planspace.org/20150423-forward_selection_with_statsmodels/
    (that recipe maximises adjusted R-squared; this version minimises AIC).

    Starting from the intercept-only model, each round fits one OLS model per
    remaining candidate column and keeps the candidate whose model has the
    lowest AIC, provided that AIC is strictly lower than the current model's.
    Selection stops at the first round without a strict improvement, or when
    no candidates remain.

    Besides fitting, the function prints how many predictors were selected and
    the ratio of the summed 'num' values for the selected subset to that for
    all candidates. Those figures come from `utils.get_data_for_subset`, called
    with the notebook-level `mutators` and `submissions` objects.

    Args:
        data (pd.DataFrame): Response column plus every candidate predictor.
            Every column other than `response` is treated as a candidate, and
            column names are inserted verbatim into the model formula.
        response (str): Name of the response column in `data`.

    Returns:
        tuple: `(model, selected)`, where `model` is the fitted statsmodels OLS
            results object (with an intercept) for the selected predictors and
            `selected` is the list of chosen column names in selection order.

    Raises:
        ValueError: If `response` is not a column of `data`.
    """
    def _formula(features):
        # The intercept term is always present, so an empty feature list
        # yields the null model `response ~ 1`.
        return '{} ~ {}'.format(response, ' + '.join(features + ['1']))

    candidates = list(data.columns)
    candidates.remove(response)
    remaining = candidates.copy()  # keep candidates intact to measure savings
    selected = []

    # AIC is "lower is better" and may be positive or negative, so the only
    # meaningful baseline is the AIC of the intercept-only model.
    current_score = smf.ols(_formula([]), data).fit().aic

    while remaining:
        # Best (lowest-AIC) single addition to the current selection.
        best_new_score, best_candidate = min(
            (smf.ols(_formula(selected + [candidate]), data).fit().aic, candidate)
            for candidate in remaining
        )
        # Stop unless the AIC strictly improves; this also ends the loop on
        # ties and on NaN scores instead of spinning forever.
        if not best_new_score < current_score:
            break
        remaining.remove(best_candidate)
        selected.append(best_candidate)
        current_score = best_new_score

    print('Selected {} / {} mutators'.format(len(selected), len(candidates)))

    selected_num = utils.get_data_for_subset(mutators, submissions=submissions, subset=selected)['num'].sum()
    original_num = utils.get_data_for_subset(mutators, submissions=submissions, subset=candidates)['num'].sum()
    print('{:.2%} of initial mutants'.format(selected_num / original_num))

    model = smf.ols(_formula(selected), data).fit()
    return model, selected

In [None]:
# Response variable: the 'cov' column of the data for the `utils.pit_sufficient` subset.
depvar = 'cov'
response = utils.get_data_for_subset(
    df=mutators,
    submissions=submissions,
    subset=utils.pit_sufficient,
)[depvar]

# Predictors: the `utils.pit_deletion` columns of `coverage`, joined to the
# response on the index; missing values are replaced with 0.
d = (
    coverage[utils.pit_deletion]
    .merge(response, left_index=True, right_index=True)
    .fillna(0)
)

model, subset = forward_selection(d, depvar)
model.summary()