In [68]:
import numpy as np
import pandas as pd

from sklearn.feature_selection import SelectFromModel
from sklearn.datasets import make_regression

from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error

Assume X has only 1 categorical variable (GC Algorithm) and it only has 2 classes. Assume that variable is already encoded into 0 and 1.

In [9]:
rs = np.random.RandomState(seed=1)

In [45]:
X.shape

(100, 6)

In [85]:
X, y = make_regression(n_features=6, n_informative=2,random_state=rs, shuffle=False)
y = np.absolute(y)

# assuming we have 100 rows in total
X_train = X[:80]
X_test = X[80:]
y_train = y[:80]
y_test = y[80:]


In [86]:
df = pd.DataFrame(X_train, columns=["gcAlg", "driverHeap", "execHeap", "numCores", "ramSize", "dataSize"])
df["time"] = y_train
df.head()

X_df = df.iloc[:,:-1]
y_series = df['time']

Unnamed: 0,gcAlg,driverHeap,execHeap,numCores,ramSize,dataSize,time
0,-0.757263,-0.435493,-1.355148,0.732072,-1.402643,-1.289446,19.218846
1,1.108302,0.619715,-1.027266,-0.786797,-0.488268,0.359245,28.028086
2,-0.177895,-1.496273,1.514419,-0.043701,-1.267784,-0.1958,12.400262
3,-1.19833,-0.064364,-0.310832,-1.096277,0.243537,-1.372882,26.878563
4,-0.469576,-1.33794,-1.124056,-0.417228,2.070826,-1.300761,17.95839


## Linear Regression

In [96]:
reg = LinearRegression().fit(X_train, y_train)
print(reg.score(X, y))
y_pred_reg = reg.predict(X_test)
mean_squared_error(y_test, y_pred_reg)

0.029228546034133607


102.70360483262463

## LASSO

In [92]:
lasso = Lasso(random_state=rs).fit(X_df, y_series)
print(lasso.score(X_df, y_series))
print(lasso.coef_)

0.010336923009531707
[ 0.         -0.         -0.         -0.71108585 -0.          0.        ]


In [97]:
y_pred_lasso = lasso.predict(X_test)
mean_squared_error(y_test, y_pred_lasso)

102.86317554907426

In [None]:
# if we want to use it as a feature selector
model = SelectFromModel(lasso, prefit=True)
lasso_X = model.transform(X)

new_labels = []
features = X_df.columns

for i, feature in zip(model.get_support(), features): # get headers, since they get lost after lasso
    if i:
        new_labels.append(feature)

lasso_df = pd.DataFrame(lasso_X, columns=features)
lasso_df['time'] = y_series

## Random Forest

In [60]:
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html
rf = RandomForestRegressor(random_state=rs)
rf.fit(X_df, y_series)

s = pd.DataFrame((rf.feature_importances_))
s = s.T
s.columns = X_df.columns

s

most important features: 


Unnamed: 0,gcAlg,driverHeap,execHeap,numCores,ramSize,dataSize
0,0.874584,0.052343,0.018159,0.029641,0.010825,0.014448


In [99]:
y_pred_rf = rf.predict(X_test)
y_pred_rf

array([141.13635051,  34.28200866,  99.31718274,  67.44845087,
        67.64626468,  73.0295003 ,  71.79153895,  43.58297522,
        21.73429102,  37.41333565,  42.06728991,  84.60223157,
        18.27991714, 110.62099489,  15.95942657,   8.99019717,
        20.46391465,  49.1765168 , 106.44734451,  32.80759559])

In [100]:
mean_squared_error(y_test, y_pred_rf)

2299.0120544496094