# XGBoost! 

## ...can we improve our random forest results?

"xgboost is all you need" --> https://arxiv.org/pdf/2110.01889.pdf (for tabular data)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor 
import xgboost as xgb
import pandas as pd
import numpy as np

In [None]:
# Load the raw results table.
df = pd.read_csv('final.csv')

# Relabel the circuit-type column: the i-th distinct value encountered in the
# file is given the i-th name from `types`.
types = ['inverter', 'buffer', 'nand', 'nor']
type_dict = {raw: label for raw, label in zip(df['type'].unique(), types)}
df['type'] = df['type'].map(type_dict)

# Both columns hold strings; drop the last two characters of each entry
# (presumably a unit suffix -- verify against the CSV) and convert to float.
df['clock_cycle'] = df['clock_cycle'].str[:-2].astype(float)
df['thickness tFE'] = df['thickness tFE'].str[:-2].astype(float)

# Remove every row whose clock cycle equals 50. np.where yields row positions,
# which are used here as index labels.
drop_50ps_idx = np.where(df['clock_cycle'] == 50)
df = df.drop(index=drop_50ps_idx[0])

# One sub-frame per circuit type.
inverter_df = df[df['type'] == 'inverter']
buffer_df = df[df['type'] == 'buffer']
nand_df = df[df['type'] == 'nand']
nor_df = df[df['type'] == 'nor']


In [None]:
# Preview the first two rows of the cleaned frame.
df.head(2)

Unnamed: 0,type,clock_cycle,thickness tFE,Vdd,T_RISE,T_FALL,T_DELAY
0,inverter,100.0,0.0,0.3,2.410636e-12,2.278608e-12,4.401267e-12
1,inverter,100.0,0.0,0.4,2.254149e-12,2.258426e-12,2.525869e-12


In [None]:
# Display every column from the fifth one onward (selected by position).
df.iloc[:, 4:]

Unnamed: 0,T_RISE,T_FALL,T_DELAY
0,2.410636e-12,2.278608e-12,4.401267e-12
1,2.254149e-12,2.258426e-12,2.525869e-12
2,1.891338e-12,1.891455e-12,1.588824e-12
3,1.771861e-12,1.770314e-12,1.168491e-12
4,1.934444e-12,1.938007e-12,9.872374e-13
...,...,...,...
7527,1.219473e-11,9.754274e-12,-1.304326e-12
7528,1.240898e-11,1.007428e-11,-3.400690e-12
7529,1.357794e-11,1.163782e-11,-5.207976e-12
7530,1.502617e-11,1.367299e-11,-6.807476e-12


In [None]:
def get_xy_df(df):
    """Split a frame into a feature matrix and a target matrix.

    Parameters
    ----------
    df : DataFrame providing the columns ``clock_cycle``, ``thickness tFE``,
        ``Vdd``, ``'T_RISE '``, ``'T_FALL '`` and ``'T_DELAY '``.

    Returns
    -------
    (X, y) : two arrays with one row per row of ``df``. ``X`` holds the three
        feature columns and ``y`` the three target columns, in the order
        listed above.
    """
    # The trailing spaces in the target keys are intentional: they are the
    # exact keys this function looks up (presumably matching the CSV headers).
    feature_names = ('clock_cycle', 'thickness tFE', 'Vdd')
    target_names = ('T_RISE ', 'T_FALL ', 'T_DELAY ')

    # Stack the columns as rows, then transpose so each column is a variable.
    features = np.vstack([df[name] for name in feature_names]).transpose()
    targets = np.vstack([df[name] for name in target_names]).transpose()
    return features, targets

# Build one (features, targets) pair per circuit type.
X_buffer, y_buffer = get_xy_df(buffer_df)
X_inverter, y_inverter = get_xy_df(inverter_df)
X_nor, y_nor = get_xy_df(nor_df)
X_nand, y_nand = get_xy_df(nand_df)
# Sanity check: features and targets should have the same number of rows.
X_buffer.shape, y_buffer.shape

((1673, 3), (1673, 3))

array([[1.e+02, 0.e+00, 3.e-01],
       [1.e+02, 0.e+00, 4.e-01],
       [1.e+02, 0.e+00, 5.e-01],
       ...,
       [9.e+02, 9.e+00, 7.e-01],
       [9.e+02, 9.e+00, 8.e-01],
       [9.e+02, 9.e+00, 9.e-01]])

In [None]:
# XGBoost regressor with default hyperparameters, evaluated with RMSE.
regressor=xgb.XGBRegressor(eval_metric='rmse')

In [None]:
# Hold out 10% of the buffer data for validation. The split is seeded so that
# results computed from it are reproducible after a kernel restart.
x_train, x_valid, y_train, y_valid = train_test_split(
    X_buffer, y_buffer, test_size=0.1, random_state=0
)

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values per hyperparameter; every combination is evaluated.
param_grid = {
    "max_depth": [4, 5],
    "n_estimators": [500, 600, 700],
    "learning_rate": [0.01, 0.015],
}

# Exhaustive search with 5-fold cross-validation on the training split.
search = GridSearchCV(regressor, param_grid, cv=5)
search.fit(x_train, y_train)

print("The best hyperparameters are ", search.best_params_)

The best hyperparameters are  {'learning_rate': 0.015, 'max_depth': 4, 'n_estimators': 700}


In [None]:
#regressor.fit(X_buffer, y_buffer)

In [None]:
# Rebuild the regressor with the best hyperparameters from the grid search and
# train it on the training split. A freshly constructed estimator is unfitted,
# so without this fit any later predict/score call fails on a fresh kernel.
best_params = search.best_params_
regressor = xgb.XGBRegressor(
    learning_rate=best_params["learning_rate"],
    n_estimators=best_params["n_estimators"],
    max_depth=best_params["max_depth"],
).fit(x_train, y_train)

In [None]:
# Predict on the full buffer feature matrix and display the prediction shape.
preds = regressor.predict(X_buffer)
preds.shape

(1673, 3)

In [None]:
# (R^2, MSE) on the full buffer data. score() returns the coefficient of
# determination, not the RMSE, so it must not be squared and is not expected
# to equal the mean squared error.
regressor.score(X_buffer, y_buffer), mean_squared_error(y_buffer, preds)

(5.231047707025684e-19, 4.8802681707286987e-23)

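Should the two values above be the same? No: `score` returns the coefficient of determination ($R^2$), not the RMSE, so it is not expected to match the mean squared error, whether or not it is squared.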

Let's keep our random forest for comparison.

In [None]:
def do_rf(x_train, y_train, x_valid, y_valid):
    """Fit a random forest on the training split and print its metrics.

    Prints, in order: the model's score on the training split, its score on
    the validation split, and the MSE and R^2 of the validation predictions.

    Returns the predictions for ``x_valid``.
    """
    print('\n\ndoing RF.....')
    forest = RandomForestRegressor(max_depth=100, random_state=0).fit(x_train, y_train)
    valid_pred = forest.predict(x_valid)

    # Assemble the report first, then emit it line by line.
    report = (
        f'the training score is {forest.score(x_train, y_train):3f}',
        f'the validation score is {forest.score(x_valid, y_valid):3f}',
        f'the mse is {mean_squared_error(y_valid, valid_pred):3f}',
        f"the r2 score is: {r2_score(y_valid, valid_pred):3f}",
    )
    for line in report:
        print(line)
    return valid_pred

def do_xgb(x_train, y_train, x_valid, y_valid):
    """Fit an XGBoost regressor on the training split and print its metrics.

    Prints, in order: the model's R^2 score on the training split, its R^2
    score on the validation split, and the MSE and R^2 of the validation
    predictions.

    Returns the predictions for ``x_valid``.
    """
    print('\n\nboosting the trees.....')
    regressor = xgb.XGBRegressor(eval_metric='rmse')
    regressor.fit(x_train, y_train)
    # score() returns R^2 regardless of eval_metric, so it is reported as-is
    # (not squared) and stays on the same scale as the validation score.
    train_score = regressor.score(x_train, y_train)
    y_pred = regressor.predict(x_valid)
    mse = mean_squared_error(y_valid, y_pred)
    r2 = r2_score(y_valid, y_pred)  # coefficient of determination
    valid_score = regressor.score(x_valid, y_valid)

    print(f'the training score is {train_score:3f}')
    print(f"the validation score is {valid_score:3f}")
    print(f'the mse is {mse:3f}')
    print(f"the r2 score is: {r2:3f}")
    return y_pred

def train_models(x, y, random_state=0):
    """Train the random forest and the XGBoost model on one shared split.

    The data is split 90/10 into training and validation parts using
    ``random_state``; both models are fitted and print their own metrics.

    Returns the random-forest predictions on the validation part; the
    XGBoost predictions are discarded.
    """
    split = train_test_split(x, y, test_size=0.1, random_state=random_state)
    x_train, x_valid, y_train, y_valid = split

    rf_pred = do_rf(x_train, y_train, x_valid, y_valid)
    do_xgb(x_train, y_train, x_valid, y_valid)
    return rf_pred

### Let's start with our buffer circuit

In [None]:
# Scale the buffer targets by their own minimum (not the NOR minimum), the
# same way the other circuits' targets are scaled before training.
_ = train_models(X_buffer, y_buffer/y_buffer.min())



doing RF.....
the training score is 0.999672
the validation score is 0.998115
the mse is 0.001012
the r2 score is: 0.998115


boosting the trees.....
the training score is 0.999900
the validation score is 0.999629
the mse is 0.000198
the r2 score is: 0.999629


### Inverter

In [None]:
# Train both models on the inverter data, with targets divided by their minimum.
_ = train_models(X_inverter, y_inverter/y_inverter.min(), random_state=0)



doing RF.....
the training score is 0.999537
the validation score is 0.997311
the mse is 0.187244
the r2 score is: 0.997311


boosting the trees.....
the training score is 0.999910
the validation score is 0.999442
the mse is 0.042107
the r2 score is: 0.999442


### Nand

In [None]:
# Train both models on the NAND data, with targets divided by their minimum.
# NOTE(review): this call uses random_state=1 while the other circuits use 0 -- confirm this is intended.
_ = train_models(X_nand, y_nand/y_nand.min(), random_state=1)



doing RF.....
the training score is 0.999502
the validation score is 0.996710
the mse is 0.067688
the r2 score is: 0.996710


boosting the trees.....
the training score is 0.999933
the validation score is 0.999440
the mse is 0.012227
the r2 score is: 0.999440


### Nor

In [None]:
# Train both models on the NOR data, with targets divided by their minimum.
_ = train_models(X_nor, y_nor/y_nor.min())



doing RF.....
the training score is 0.999497
the validation score is 0.997362
the mse is 0.001057
the r2 score is: 0.997362


boosting the trees.....
the training score is 0.999862
the validation score is 0.999463
the mse is 0.000218
the r2 score is: 0.999463
