In [1]:
#default_exp ch1

# basic

The file wire.csv contains data relating the pull strength (pstren) of a wire bond (which we’ll treat as a response) to six characteristics which we shall treat as design variables: die height (dieh), post height (posth), loop height (looph), wire length (wlen), bond width on the die (diew), and bond width on the post (postw). (Derived from exercise 2.3 in Myers, Montgomery, and Anderson–Cook (2016) using data from Table E2.1.)

* Write code that converts natural variables in the file to coded variables in the unit hypercube. Also, normalize responses to have a mean of zero and a range of 1.
* Use model selection techniques to select a parsimonious linear model for the coded data including, potentially, second-order and interaction effects.
* Use the fitted model to make a prediction for pull strength, when the explanatory variables take on the values c(6, 20, 30, 90, 2, 2), in the order above, with a full accounting of uncertainty. Make sure the predictive quantities are on the original scale of the data.

In [2]:
%load_ext autoreload
%autoreload 2
%config Completer.use_jedi = False

In [3]:
#hide
from nbdev.showdoc import *
import pandas as pd
import numpy as np
from typing import List

In [24]:
def length(x: pd.Series):
    return np.sqrt(np.sum(np.square(x)))

def _predictor_to_unit_length(x: pd.Series) -> pd.Series:
    return x / length(x)


class DataManager:
    def __init__(self, response="postw", datafile="http://bobby.gramacy.com/surrogates/wire.csv"):
        self.data = pd.read_csv(datafile)
        self.data.columns = [s.strip() for s in self.data.columns]
        self.response: str = response
        self.predictors: List[str] = self._get_predictors()
        self.normalized = False
        

    
    def _get_predictors(self) -> List[str]:
        return [colname for colname in self.data.columns if colname != self.response]

    
    def predictors_to_hypercube(self):
        for colname in self.predictors:
            self.data[colname] = _predictor_to_unit_length(self.data[colname])
            
    def normalize_response(self):
        if self.normalized:
            return
        self.orig_mean = np.mean(self.data[self.response])
        self.orig_std = np.std(self.data[self.response])
        self.data[self.response] -= self.orig_mean
        self.data[self.response] /= self.orig_std
        self.normalized = True
        
    def make_quadratic_variables_wide(self) -> pd.DataFrame:
        interactions = {}
        for coli in self.data.columns:
            for colj in self.data.columns:
                interactions[f"{coli}*{colj}"] = self.data[coli] * self.data[coli]
        return pd.DataFrame(interactions)
    
    def make_quadratic_variables_long(self, wide=None):
        if wide is None:
            wide = self.make_quadratic_variables_wide()
        dfs = []
        for colname in wide.columns:
            df = pd.DataFrame({'val': wide[colname]})
            df['var'] = colname
            dfs.append(df)
        response_df = pd.DataFrame({'val': self.data[self.response]})
        response_df['var'] = self.response
        dfs.append(response_df)
        return pd.concat(dfs)[['var', 'val']]


In [25]:
dm = DataManager()
dm.data.head()

Unnamed: 0,pstren,dieh,posth,looph,wlen,diew,postw
0,8.0,5.2,19.6,29.6,94.9,2.1,2.3
1,8.3,5.2,19.8,32.4,89.7,2.1,1.8
2,8.5,5.8,19.6,31.0,96.2,2.0,2.0
3,8.8,6.4,19.4,32.4,95.6,2.2,2.1
4,9.0,5.8,18.6,28.6,86.5,2.0,1.8


---------
### Write code that converts natural variables in the file to coded variables in the unit hypercube. Also, normalize responses to have a mean of zero and a range of 1.

In [26]:
dm.predictors_to_hypercube()
dm.normalize_response()
dm.data.head()

Unnamed: 0,pstren,dieh,posth,looph,wlen,diew,postw
0,0.179626,0.205221,0.231706,0.210582,0.245585,0.230895,2.11661
1,0.186362,0.205221,0.234071,0.230502,0.232129,0.230895,-0.884554
2,0.190852,0.228901,0.231706,0.220542,0.24895,0.2199,0.315912
3,0.197588,0.25258,0.229342,0.230502,0.247397,0.24189,0.916145
4,0.202079,0.228901,0.219884,0.203468,0.223848,0.2199,-0.884554


In [27]:
# Test data standardization.
for colname in dm.predictors:
    x = dm.data[colname]
    assert x.max() <= 1    
    assert x.min() >= 0
    1 - length(x) <= 10e-7

---------
### Use model selection techniques to select a parsimonious linear model for the coded data including, potentially, second-order and interaction effects.

In [28]:
quadratics_wide = dm.make_quadratic_variables_wide()
quadratics_long = dm.make_quadratic_variables_long(quadratics_wide)

In [29]:
quadratics_wide.head()

Unnamed: 0,pstren*pstren,pstren*dieh,pstren*posth,pstren*looph,pstren*wlen,pstren*diew,pstren*postw,dieh*pstren,dieh*dieh,dieh*posth,...,diew*wlen,diew*diew,diew*postw,postw*pstren,postw*dieh,postw*posth,postw*looph,postw*wlen,postw*diew,postw*postw
0,0.032265,0.032265,0.032265,0.032265,0.032265,0.032265,0.032265,0.042116,0.042116,0.042116,...,0.053312,0.053312,0.053312,4.48004,4.48004,4.48004,4.48004,4.48004,4.48004,4.48004
1,0.034731,0.034731,0.034731,0.034731,0.034731,0.034731,0.034731,0.042116,0.042116,0.042116,...,0.053312,0.053312,0.053312,0.782435,0.782435,0.782435,0.782435,0.782435,0.782435,0.782435
2,0.036425,0.036425,0.036425,0.036425,0.036425,0.036425,0.036425,0.052395,0.052395,0.052395,...,0.048356,0.048356,0.048356,0.0998,0.0998,0.0998,0.0998,0.0998,0.0998,0.0998
3,0.039041,0.039041,0.039041,0.039041,0.039041,0.039041,0.039041,0.063797,0.063797,0.063797,...,0.058511,0.058511,0.058511,0.839321,0.839321,0.839321,0.839321,0.839321,0.839321,0.839321
4,0.040836,0.040836,0.040836,0.040836,0.040836,0.040836,0.040836,0.052395,0.052395,0.052395,...,0.048356,0.048356,0.048356,0.782435,0.782435,0.782435,0.782435,0.782435,0.782435,0.782435


In [30]:
quadratics_long

Unnamed: 0,var,val
0,pstren*pstren,0.032265
1,pstren*pstren,0.034731
2,pstren*pstren,0.036425
3,pstren*pstren,0.039041
4,pstren*pstren,0.040836
...,...,...
14,postw,-0.884554
15,postw,0.916145
16,postw,-0.884554
17,postw,-0.884554


In [31]:
(pn.ggpl)

NameError: name 'pn' is not defined