### Let's do a fuller-scale analysis of our data with a few different types of models

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor 
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import xgboost as xgb

First, we'll extract the data for a single run. It should feature 10 different gate types.

In [None]:
# Read the run-1 measurements into `df` and preview the first two rows.
df = pd.read_csv(filepath_or_buffer='data/run1.csv')
df.head(n=2)

Unnamed: 0,gate_type,voltage,thickness,clock_cycle,t_rise,t_fall,t_delay
0,buffer,0.3,0.1,100,3.131324e-12,3.17279e-12,9.921646e-12
1,buffer,0.4,0.1,100,8.473565e-13,1.456597e-12,7.269524e-12


In [None]:
# Distinct values of the gate_type column, kept as a plain Python list for later loops.
unique_gates = df['gate_type'].unique().tolist()
np.array(unique_gates)

array(['buffer', 'inv_x1', 'inv_x2', 'inv_x4', 'nand2', 'nand3', 'nand4',
       'nor2', 'nor3', 'nor4'], dtype='<U6')

We'll need to drop some failed values for our nand4 gate in order to proceed. You can see the heatmap at the bottom of notebook #3 to confirm the need for this.

In [None]:
#these are the bad t_fall values in nand4 that need to be dropped
nand4_drop_idxs = ((df['gate_type'] == 'nand4') & (df['t_fall'] == ' FAILED')).values
nand4_drop_idxs.sum()

36

In [None]:
# Keep only the rows that were not flagged and renumber the index from 0.
# NOTE: this rebinds `df`, so the mask computed in the previous cell no longer
# matches its length afterwards; re-run both cells together, not this one alone.
df = df[~nand4_drop_idxs].reset_index(drop=True)
df.shape

(11304, 7)

Let's flesh out a few functions that will make our lives easier by indexing into our data appropriately and dropping whatever items need to be removed.

In [None]:
def get_gate(gate_type='buffer'):
    """Return the rows of the module-level ``df`` whose ``gate_type`` column equals ``gate_type``."""
    matches_gate = df['gate_type'] == gate_type
    return df[matches_gate]

In [None]:
def get_gate_xy(gate_type='buffer'):
    """Build the float input and target arrays for one gate type.

    Inputs are the clock_cycle, thickness and voltage columns (in that order).
    Targets are t_delay plus one of t_rise / t_fall: 'buffer' keeps t_rise,
    every other gate type keeps t_fall.

    Returns
    -------
    (x, y) : tuple of 2-D float arrays built from the rows returned by get_gate.
    """
    feature_cols = ['clock_cycle', 'thickness', 'voltage']
    excluded = 't_fall' if gate_type == 'buffer' else 't_rise'
    target_cols = [col for col in ('t_delay', 't_rise', 't_fall') if col != excluded]

    gate_rows = get_gate(gate_type)
    features = np.vstack(gate_rows[feature_cols].values).astype('float')
    # One vstack already yields the stacked 2-D array; stacking that result again returns the same array.
    targets = np.vstack(gate_rows[target_cols].values).astype('float')
    return features, targets

In [None]:
# Quick check of the helper on the nor3 gate: inspect the array shapes.
x, y = get_gate_xy(gate_type='nor3')
(x.shape, y.shape)

((1134, 3), (1134, 2))

In [None]:
# One placeholder entry (None) per gate name; the values are filled in by the next cell.
gate_dict = {gate_name: None for gate_name in unique_gates}
gate_dict

{'buffer': None,
 'inv_x1': None,
 'inv_x2': None,
 'inv_x4': None,
 'nand2': None,
 'nand3': None,
 'nand4': None,
 'nor2': None,
 'nor3': None,
 'nor4': None}

In [None]:
# Replace each placeholder with the (x, y) pair returned by get_gate_xy for that gate.
for gate in unique_gates:
    gate_dict[gate] = get_gate_xy(gate)

In [None]:
# Hold out a validation split of the buffer gate for the quick experiments below.
# random_state is fixed so the split, and therefore the scores printed in the
# following cells, come out the same every time the notebook is re-run.
x, y = gate_dict['buffer']
x_tr, x_val, y_tr, y_val = train_test_split(x, y, random_state=0)
x_tr.shape, x_val.shape, y_tr.shape, y_val.shape

((850, 3), (284, 3), (850, 2), (284, 2))

### Ok, we have a dictionary with our gates that we can quickly index into and do our model training and evaluation with

## Let's flesh out and then add some new models into the mix for our analysis.

We'll add the following models: 

- ElasticNet: (https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNet.html)
- Lasso: (https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html)
- LassoCV: (https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LassoCV.html)
- ElasticNetCV: (https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNetCV.html)

To our previously used models:
- Ridge Regression: (https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html)
- RandomForests: (https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html)
- XGBoost: (https://xgboost.readthedocs.io/en/stable/) --> Gradient Boosted Trees

We'll also begin to use Polynomial Features (https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PolynomialFeatures.html) to help our models by introducing polynomial combinations of our independent variables (x).

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import ElasticNet, LassoCV, MultiTaskLassoCV, MultiTaskElasticNetCV
from sklearn import linear_model

In [None]:
# Shape of x after a degree-2 polynomial expansion of its columns.
PolynomialFeatures(degree=2).fit_transform(x).shape

(1134, 10)

In [None]:
# Baseline: ElasticNet with default settings on the raw inputs (no polynomial features).
regr = ElasticNet().fit(x_tr, y_tr)
print(regr.score(x_val, y_val))

-0.0036199686533131548


In [None]:
# ElasticNet again, this time on degree-2 polynomial features.
# The expansion is fitted once on the training inputs and the same fitted
# transformer is re-used for the validation inputs, instead of fitting a
# second transformer on the validation data.
poly = PolynomialFeatures(2)
poly_x = poly.fit_transform(x_tr)
poly_x_val = poly.transform(x_val)
regr = ElasticNet()
regr.fit(poly_x, y_tr);
# NOTE(review): if this prints the same score as the run without polynomial
# features, the default ElasticNet penalty may be shrinking every coefficient
# to zero -- confirm by inspecting regr.coef_.
print(regr.score(poly_x_val, y_val))

-0.0036199686533131548


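The models all follow the same pattern in terms of how they are called, so let's wrap them all into a function.
The cells above were ugly and repeated too much code. (Note: `do_simple_model` is defined a few cells further down; run that cell before the calls that follow.)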

In [None]:
# NOTE: do_simple_model is defined in a later cell of this notebook; that cell must be run first.
do_simple_model(x, y, model_name='random_forest', num_poly=None, norm_y=True)

for a random_forest model, the validation score is: 1.0


In [None]:
# NOTE: do_simple_model is defined in a later cell of this notebook; that cell must be run first.
do_simple_model(x, y, model_name='xgboost', num_poly=None, norm_y=True)

for a xgboost model, the validation score is: 0.9999995886556486


In [None]:
def normalize(x):
    """Divide ``x`` by its smallest element (``x.min()``) and return the result."""
    smallest = x.min()
    return x / smallest

def do_simple_model(x, y, model_name='ridge', num_poly=2, alpha=1.0, norm_y=False, random_state=None):
    """Fit one model on a train/validation split of (x, y) and print its validation score.

    Parameters
    ----------
    x, y : arrays handed to ``train_test_split``.
    model_name : one of 'ridge', 'lasso', 'elastic', 'lasso-cv', 'elastic-cv',
        'random_forest' or 'xgboost'.
    num_poly : degree passed to ``PolynomialFeatures``; a falsy value (None or 0)
        skips the polynomial expansion.
    alpha : regularisation strength, used by the 'ridge' and 'lasso' models only.
    norm_y : when true, both target splits are divided by the minimum of the
        training targets.
    random_state : forwarded to ``train_test_split``; the default None leaves the
        split unseeded, as before.

    Raises
    ------
    ValueError
        If ``model_name`` is not one of the supported names.
    """
    supported = ('ridge', 'lasso', 'elastic', 'lasso-cv', 'elastic-cv',
                 'random_forest', 'xgboost')
    # Reject unknown names up front instead of silently training some other model.
    if model_name not in supported:
        raise ValueError(f'unknown model_name {model_name!r}; expected one of {supported}')

    x_tr, x_val, y_tr, y_val = train_test_split(x, y, random_state=random_state)
    if num_poly:
        # Fit the expansion on the training inputs and re-use it for validation.
        poly = PolynomialFeatures(num_poly)
        x_tr = poly.fit_transform(x_tr)
        x_val = poly.transform(x_val)

    # Each branch compares against model_name explicitly. A bare string such as
    # `elif 'lasso':` is always truthy, which made every non-ridge name train Lasso.
    if model_name == 'ridge': model = Ridge(alpha=alpha)
    elif model_name == 'lasso': model = linear_model.Lasso(alpha=alpha)
    elif model_name == 'elastic': model = ElasticNet()
    elif model_name == 'lasso-cv': model = MultiTaskLassoCV(cv=5, random_state=0)
    elif model_name == 'elastic-cv': model = MultiTaskElasticNetCV(cv=5, random_state=0)
    elif model_name == 'random_forest': model = RandomForestRegressor(max_depth=100, random_state=0)
    elif model_name == 'xgboost': model = xgb.XGBRegressor(eval_metric='rmse')

    if norm_y:
        # Use one scale factor, taken from the training targets, for both splits.
        # Scaling each split by its own minimum would score the model against
        # targets on a different scale from the one it was trained on.
        scale = y_tr.min()
        y_tr, y_val = y_tr / scale, y_val / scale

    model.fit(x_tr, y_tr)
    r2score = model.score(x_val, y_val)
    print(f'for a {model_name} model, the validation score is: {r2score}')

In [None]:
def run_models(gate_type='buffer', random_state=0, normalize_y=True):
    """Fit and score every model in the comparison for one gate type.

    Parameters
    ----------
    gate_type : key into the module-level ``gate_dict``, whose value is unpacked as (x, y).
    random_state : seed applied to NumPy's global random generator before the
        first model is run, so the unseeded train/validation splits made by
        do_simple_model are repeatable. Pass None to leave the generator untouched.
        Note that seeding changes global random state for later cells as well.
    normalize_y : forwarded as ``norm_y`` to the random forest and xgboost runs.
    """
    print(f'-----Analyzing {gate_type} gates-----')
    x, y = gate_dict[gate_type]
    if random_state is not None:
        np.random.seed(random_state)
    do_simple_model(x, y, model_name='ridge')
    do_simple_model(x, y, model_name='lasso')
    # The plain 'elastic' model is left disabled, as before.
    #do_simple_model(x, y, model_name='elastic')
    do_simple_model(x, y, model_name='lasso-cv')
    do_simple_model(x, y, model_name='elastic-cv')
    # The tree-based models skip the polynomial expansion (num_poly=0).
    do_simple_model(x, y, model_name='random_forest', norm_y=normalize_y, num_poly=0)
    do_simple_model(x, y, model_name='xgboost', norm_y=normalize_y, num_poly=0)
    print('\n')

In [None]:
# Smoke test: run the comparison on the buffer gate only.
run_models(gate_type='buffer')

-----Analyzing buffer gates-----
for a ridge model, the validation score is: 0.7926424083227464
for a lasso model, the validation score is: -0.002686280311227085
for a lasso-cv model, the validation score is: -0.008146830944338768
for a elastic-cv model, the validation score is: -0.023607705305650928
for a random_forest model, the validation score is: 0.6397995094142909
for a xgboost model, the validation score is: 0.6717446251421435




In [None]:
# Repeat the comparison for every gate stored in gate_dict.
for gate_type in gate_dict:
    run_models(gate_type, random_state=123)

-----Analyzing buffer gates-----
for a ridge model, the validation score is: 0.8222329564046105
for a lasso-cv model, the validation score is: 0.5947528630137102
for a elastic-cv model, the validation score is: 0.5831564609135782
for a random_forest model, the validation score is: 1.0
for a xgboost model, the validation score is: 0.9999997444053901


-----Analyzing inv_x1 gates-----
for a ridge model, the validation score is: 0.8728247712581148
for a lasso-cv model, the validation score is: 0.6285965653386434
for a elastic-cv model, the validation score is: 0.6670710824457344
for a random_forest model, the validation score is: 1.0
for a xgboost model, the validation score is: 0.9999986111712638


-----Analyzing inv_x2 gates-----
for a ridge model, the validation score is: 0.8723129154048755
for a lasso-cv model, the validation score is: 0.6085649832761226
for a elastic-cv model, the validation score is: 0.6109733502268793
for a random_forest model, the validation score is: 1.0
for a xg

### Looks like our Random Forest and XGBoost models are still doing the best at the moment, but will that hold after a parameter search? (all of the models are using their default parameters at the moment)

# To do:

- Plotting functionality
- Save results into CSV